# Profiling Python code

### Example: Find duplicate movie titles

- Read 5000 movie titles
- Return a list of movie titles that occur twice
- Search is case insensitive

In [1]:
def read_movies():
    """ Read movies name from movies_name.txt file"""
    
    path = "movies_name.txt"
    with open(path) as fd:
        return fd.read().splitlines()

In [4]:
def is_duplicate(name , movie_list):
    
    for movie_name in movie_list:
        if movie_name.lower() == name.lower():
            return True
        
    return False

def find_duplicate_movies():
    
    movies_data = read_movies()
    duplicates = []
    while movies_data:
        movie = movies_data.pop()
        if is_duplicate(movie , movies_data):
            duplicates.append(movie)
    
    return duplicates

In [5]:
%time find_duplicate_movies()

CPU times: user 2.06 s, sys: 5.24 ms, total: 2.06 s
Wall time: 2.07 s


['Sabotage',
 'The Last House on the Left',
 'Night of the Living Dead',
 "A Dog's Breakfast",
 'Cat People',
 'The Texas Chain Saw Massacre',
 'The Calling',
 'Side Effects',
 '20,000 Leagues Under the Sea',
 'The Love Letter',
 'Across the Universe',
 'Halloween',
 'Oz the Great and Powerful',
 'Crossroads',
 'Home',
 'House of Wax',
 'King Kong',
 'The Unborn',
 'Snitch',
 'Trance',
 'The Lovely Bones',
 'History of the World: Part I',
 'Goosebumps',
 'The Full Monty',
 'Juno',
 'The Fog',
 'Lucky Number Slevin',
 'The Day the Earth Stood Still',
 'Mercury Rising',
 'Jack Reacher',
 'A Woman, a Gun and a Noodle Shop',
 'Dawn of the Dead',
 'A Nightmare on Elm Street',
 'Carrie',
 'Stealing Harvard',
 'Hamlet',
 'Lolita',
 'The French Connection',
 'Dodgeball: A True Underdog Story',
 'Halloween II',
 "The Astronaut's Wife",
 'The Omen',
 'Cinderella',
 'Alice in Wonderland',
 'The Gambler',
 'The Watch',
 'Day of the Dead',
 'Unknown',
 'Ben-Hur',
 'My Soul to Take',
 'Planet of the

## Using cProfile to improve the code

- Profiles the function using cProfile, and prints out a report
- Adapted from the Python 3.6 docs:
- https://docs.python.org/3/library/profile.html#profile.Profile

In [6]:
import cProfile, pstats, io

def profile():
    """uses cProfile to profile a function"""
    pr = cProfile.Profile()
    pr.enable()
    
    # Function to call
    find_duplicate_movies()
    
    pr.disable()
    s = io.StringIO()
    sortby = 'cumulative'
    ps = pstats.Stats(pr, stream=s).strip_dirs().sort_stats(sortby)
    ps.print_stats()
    print(s.getvalue())
    

In [7]:
profile()

         24980393 function calls in 4.710 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.003    0.003    4.710    4.710 <ipython-input-4-c10cde2d10e0>:9(find_duplicate_movies)
     5043    2.707    0.001    4.704    0.001 <ipython-input-4-c10cde2d10e0>:1(is_duplicate)
 24970168    1.998    0.000    1.998    0.000 {method 'lower' of 'str' objects}
        1    0.000    0.000    0.002    0.002 <ipython-input-1-c7689e016564>:1(read_movies)
     5043    0.001    0.000    0.001    0.000 {method 'pop' of 'list' objects}
        1    0.001    0.001    0.001    0.001 {method 'splitlines' of 'str' objects}
        1    0.001    0.001    0.001    0.001 {built-in method io.open}
        1    0.000    0.000    0.000    0.000 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.000    0.000 codecs.py:318(decode)
        1    0.000    0.000    0.000    0.000 {built-in method _codecs.utf_8_deco

In [8]:
def is_duplicate(name , movie_list):
    
    for movie_name in movie_list:
        if movie_name == name:
            return True
        
    return False

def find_duplicate_movies():
    
    movies_data = read_movies()
    movies_data = [movie.lower() for movie in movies_data]
    duplicates = []
    while movies_data:
        movie = movies_data.pop()
        if is_duplicate(movie , movies_data):
            duplicates.append(movie)
    
    return duplicates

In [9]:
profile()

         15269 function calls in 0.302 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.002    0.002    0.302    0.302 <ipython-input-8-c88e92635196>:9(find_duplicate_movies)
     5043    0.295    0.000    0.295    0.000 <ipython-input-8-c88e92635196>:1(is_duplicate)
        1    0.001    0.001    0.002    0.002 <ipython-input-8-c88e92635196>:12(<listcomp>)
        1    0.000    0.000    0.002    0.002 <ipython-input-1-c7689e016564>:1(read_movies)
     5043    0.001    0.000    0.001    0.000 {method 'lower' of 'str' objects}
        1    0.001    0.001    0.001    0.001 {method 'splitlines' of 'str' objects}
     5043    0.001    0.000    0.001    0.000 {method 'pop' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {built-in method io.open}
        1    0.000    0.000    0.000    0.000 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.000    0.000 codecs.py:318(

In [10]:
def find_duplicate_movies():
    
    movies_data = read_movies()
    movies_data = [movie.lower() for movie in movies_data]
    duplicates = []
    while movies_data:
        movie = movies_data.pop()
        if movie in movies_data:
            duplicates.append(movie)
    
    return duplicates

In [11]:
profile()

         10226 function calls in 0.150 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.146    0.146    0.150    0.150 <ipython-input-10-dc6c61275643>:1(find_duplicate_movies)
        1    0.001    0.001    0.002    0.002 <ipython-input-10-dc6c61275643>:4(<listcomp>)
        1    0.000    0.000    0.002    0.002 <ipython-input-1-c7689e016564>:1(read_movies)
     5043    0.001    0.000    0.001    0.000 {method 'lower' of 'str' objects}
        1    0.001    0.001    0.001    0.001 {method 'splitlines' of 'str' objects}
     5043    0.001    0.000    0.001    0.000 {method 'pop' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {built-in method io.open}
        1    0.000    0.000    0.000    0.000 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.000    0.000 codecs.py:318(decode)
        1    0.000    0.000    0.000    0.000 {built-in method _codecs.utf_8_decode}