In [41]:
%matplotlib inline
import pandas as pd
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

# Adjust display settings
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_colwidth', 120)

**Merging the datasets and cleaning the data**

In [42]:
# Data: MovieLens _ml
# Read in Movies and Ratings from MovieLens Data
data_dir = '/home/justin/Documents/Data Science/Data'
mov_ml = pd.read_csv(data_dir + '/MovieLens/ml-latest/movies.csv')
rat_ml = pd.read_csv(data_dir + '/MovieLens/ml-latest/ratings.csv')

# Convert out of 5 ratings, to out of 10 ratings
rat_ml['rating'] = rat_ml['rating'] * 2

In [43]:
# Data: MovieLens _ml
# Find the average movie rating and store in 'mr', remove those without a rating
mov_rat_ml = pd.DataFrame(rat_ml[['movieId','rating']].groupby('movieId').mean())
mov_rat_ml.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,7.789603
2,6.442171
3,6.360189
4,5.759455
5,6.161622


In [44]:
# Data: MovieLens _ml
# Count the number of ratings and store in 'mc'. Sort by highest number of ratings first
mov_num_rat_ml = pd.DataFrame(rat_ml[['movieId','rating']].groupby('movieId').size(), columns=['num_ratings']).sort_values(by='num_ratings', ascending=0)
mov_num_rat_ml.head()

Unnamed: 0_level_0,num_ratings
movieId,Unnamed: 1_level_1
356,81296
296,79091
318,77887
593,76271
480,69545


In [45]:
# Data: MovieLens
# Show all movies, number of ratings 'num_ratings' and average rating 'rating'
mov_rat_agg_ml = mov_num_rat_ml.merge(mov_rat_ml, right_index=1, left_index=1)
mov_rat_agg_ml.head()

Unnamed: 0_level_0,num_ratings,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,60424,7.789603
2,23950,6.442171
3,15267,6.360189
4,2935,5.759455
5,14769,6.161622


In [46]:
# Data: MovieLens _ml
# Aggregate all movies, and ratings stats into one Dataframe (Note: this removes unrated films)
# Sort by top movies if sorted by average rating, then number of ratings
agg_ml = (mov_ml.merge(mov_rat_agg_ml,how='inner', left_on='movieId',right_index=1).groupby(['title','genres']).mean().
    sort_values(['rating','num_ratings'], ascending=[0,0]))
agg_ml.reset_index(inplace=True)
agg_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33664 entries, 0 to 33663
Data columns (total 5 columns):
title          33664 non-null object
genres         33664 non-null object
movieId        33664 non-null float64
num_ratings    33664 non-null float64
rating         33664 non-null float64
dtypes: float64(3), object(2)
memory usage: 1.3+ MB


In [47]:
# Data: MovieLens
# Separate the year from the title into separate columns for MovieLens titles
df = agg_ml.copy()
df['year'] = df.title.str.extract('[(](\d{4})[)]') 
df['title'] = df.title.str.replace(r' [(]\d{4}[)]', '')
df['year'].fillna('0', inplace=True)
df['year'] = df['year'].astype(int)
agg_sepYr_ml = df
df.head()

Unnamed: 0,title,genres,movieId,num_ratings,rating,year
0,De la servitude moderne,Documentary,106517.0,2.0,10.0,2009
1,Dilwale,Action|Children|Comedy|Crime|Drama|Romance,150268.0,2.0,10.0,2015
2,Naked Among Wolves,Drama|War,148030.0,2.0,10.0,2015
3,The Fruit Hunters,Documentary,133323.0,2.0,10.0,2012
4,The Girl in the Book,Drama,148701.0,2.0,10.0,2015


In [48]:
# Data: MovieLens
# Move the 'The' to the start for MovieLens titles to help with matching
# (e.g. "Usual Suspects, The" in MovieLens is "The Usual Suspects" in IMDB & OMDB)
df = agg_sepYr_ml.copy()
df['has_the'] = df.title.str.contains(', The')
df.title = df.title.str.replace(r', The', '')
df.ix[df.has_the, 'title'] = 'The ' + df.ix[df.has_the, 'title']
agg_sepYr_moveThe_ml = df
df[df.has_the].head()

Unnamed: 0,title,genres,movieId,num_ratings,rating,year,has_the
42,The Barchester Chronicles,Drama,95517.0,1.0,10.0,1982,True
43,The Best of Ernie and Bert,Children,94972.0,1.0,10.0,1988,True
115,The Keeping the Promise (Sign of the Beaver),Children|Drama,93967.0,1.0,10.0,1997,True
142,The New Rulers of the World,Documentary,114011.0,1.0,10.0,2001,True
143,The On Any Sunday Next Chapter,Documentary,133964.0,1.0,10.0,2014,True


In [49]:
agg_sepYr_moveThe_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33664 entries, 0 to 33663
Data columns (total 7 columns):
title          33664 non-null object
genres         33664 non-null object
movieId        33664 non-null float64
num_ratings    33664 non-null float64
rating         33664 non-null float64
year           33664 non-null int64
has_the        33664 non-null bool
dtypes: bool(1), float64(3), int64(1), object(2)
memory usage: 1.6+ MB


In [50]:
# Data: MovieLens
# Move the 'A' to the start for MovieLens titles to help with matching
# (e.g. )
df = agg_sepYr_moveThe_ml.copy()
df['has_a'] = df.title.str.contains(', A')
df.title = df.title.str.replace(r', A', '')
df.ix[df.has_a, 'title'] = 'A ' + df.ix[df.has_a, 'title']
agg_sepYr_moveThe_moveA_ml = df
df[df.has_a].head()

Unnamed: 0,title,genres,movieId,num_ratings,rating,year,has_the,has_a
57,A Christmasgain,(no genres listed),148857.0,1.0,10.0,2015,False,True
305,A Child's Christmas in Wales,Children|Drama,95837.0,4.0,9.25,1987,False,True
323,A Season for Miracles,Children|Romance,98699.0,3.0,9.0,1999,False,True
343,A Fine Madness,Comedy|Drama|Romance,31638.0,2.0,9.0,1966,False,True
454,A Englishugust,Comedy|Drama,139895.0,1.0,9.0,1994,False,True


In [51]:
# Function to normalise title names, by removing : and - and making titles in "Title Case Which Is Like This"
# Will replace 'title' with normalised title
def normlise_title(df):
    df.title = df.title.str.replace(r'&', 'and')
    df.title = df.title.str.replace(r'([^(\w|\s)]+)', '') # regular expression to remove all matches of non alpha-numeric characters
    
    #df.title = df.title.str.replace(r'-', '')
    #df.title = df.title.str.replace(r':', '')
    df.title = df.title.map(lambda x: x.title())
    return df

In [52]:
# Normalise the film titles (to help later with matching)
agg_sepYr_moveThe_moveA_normed_ml = normlise_title(agg_sepYr_moveThe_moveA_ml.copy())

In [80]:
# Separate the multiple alternative film titles in () from movie lens into separate columns
df = agg_sepYr_moveThe_moveA_normed_ml.copy()
df['title'] = df['title'].str.replace('(A.K.A. )', '') # get rid of extraneous 'A.K.A' text
df['titles'] = df['title'].str.findall(r'\(([^()]+)\)') # regular expression to find all matches of strings inside parentheses ()
df['titles_len'] = df['titles'].map(lambda x: len(x))

#Get the first title (not in parentheses)
df['title1'] = df['title'].str.extract(r'(\A[^()]+) \(') # regular expression to match from the start of the string to first (
df.loc[df['title1'].isnull(), 'title1'] = df.loc[df['title1'].isnull(), 'title']

# Add separate columns for each alternative title (maximum of 3 alternative titles allowed)
# Starting from title2 to title4
df['title2'] = pd.np.nan
df['title3'] = pd.np.nan
df['title4'] = pd.np.nan
for i in range(2,5):
    df.loc[df['titles_len'] >= i - 1, 'title%s'%i] = df['titles'].map(lambda x: x[i - 2] if len(x) >= i - 1 else pd.np.nan)
    
#Clean up dataframe by removing now extraneous columns
df = df.drop(['title', 'titles', 'has_the', 'has_a'], axis = 1)
    
#Save and show results
agg_sepYr_moveThe_moveA_normed_sepTitles_ml = df
print('There are %s films with multiple titles in movielens'%len(df[df['title2'].notnull()]))
df.sort_values(by = 'titles_len', ascending = False).head()

There are 5198 films with multiple titles in movielens


Unnamed: 0,genres,movieId,num_ratings,rating,year,titles_len,title1,title2,title3,title4
28916,Horror,78084.0,9.0,4.888889,1980,4,The Anthropophagus The Grim Reaper,Antropophagus,Man Beast,Savage Island
7363,Drama|Romance,53835.0,54.0,7.203704,1954,3,Journey To Italy,Viaggio In Italia,Voyage To Italy,Voyage In Italy
1335,Drama|War,45899.0,9.0,8.0,1971,3,Trial Of The Road,Checkup On The Roads,Checkpoint,Proverka Na Dorogakh
18878,Comedy|Crime|Drama,68411.0,6.0,6.166667,1944,3,Black Magic,Meeting At Midnight,Charlie Chan In Meeting At Midnight,Charlie Chan In Black Magic
30166,Drama|Romance,105794.0,9.0,4.444444,1997,3,Another Nine And A Half Weeks,Love In Paris,9 12 Weeks Ii,Another 9 12 Weeks


In [54]:
# Data: IMDB
# Read in IMDB Data
cst_im = pd.read_csv(data_dir +'/IMDB/pycon-pandas-tutorial-master/data/cast.csv')
mov_im = pd.read_csv(data_dir +'/IMDB/pycon-pandas-tutorial-master/data/titles.csv')

# Create data frame with titles and films, it and with id 'index'
mov_im.reset_index(inplace=True)
mov_cst_im = cst_im.merge(mov_im)

mov_cst_im.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3580847 entries, 0 to 3580846
Data columns (total 7 columns):
title        object
year         int64
name         object
type         object
character    object
n            float64
index        int64
dtypes: float64(1), int64(2), object(4)
memory usage: 218.6+ MB


In [55]:
# Normalise the film titles (to help later with matching)
mov_normed_im = normlise_title(mov_im.copy())

In [56]:
# Data: IMDB
# Fix inserted roman numbers, and separate multiple titles to im_title1 and im_title2
df = mov_normed_im.copy()

roman_numerals_brackets = r' \([ixvIXV]+\)' # re for extracting bracket roman numerals from titles

# print some stats for what will be removed.
print('There are %s imdb title with roman numerals removed' % len(df[df['title'].str.contains(roman_numerals_brackets)]))
print('There are %s imdb titles with two or more (' %len(df[df['title'].str.contains(r'\w\(\w\(')]))

# remove roman numberals from titles
df['title'] = df['title'].str.replace(roman_numerals_brackets, '') 

# Seaprate title into im_title1 and im_title2
df['im_title1'] = df['title'].str.extract(r'(\A[^()]+) \(') # match from the start of the string to first (
df['im_title2'] = df['title'].str.extract(r'\w \(([^()]+)\)$') # match item within bracksets, at the end of string
df.loc[df['im_title1'].isnull(), 'im_title1'] = df[df['im_title1'].isnull()]['title'] # for those that dont match the 2 above (ie only one title)

# Rename title to avoid confusion down the line when merging with im
df = df.rename(columns = {'title':'im_title'}) 

# Print how much alternative titles have been found
print('There are %s alternative titles added' % len(df[df['im_title2'].notnull()]))

 # Display results of changed multiple name titles
mov_im_titles_fixed = df
df[df['im_title2'].notnull()].head()
df.head()

There are 3331 imdb title with roman numerals removed
There are 0 imdb titles with two or more (
There are 522 alternative titles added


Unnamed: 0,index,im_title,year,im_title1,im_title2
0,0,The Passing,1985,The Passing,
1,1,Kothewali,2000,Kothewali,
2,2,Nemuri Kyoshiro Manji Giri,1969,Nemuri Kyoshiro Manji Giri,
3,3,Goose On The Loose,2006,Goose On The Loose,
4,4,Parizhskaya Drama,1983,Parizhskaya Drama,


In [57]:
# Will match x, with the best 'close_match' in list 'lst',
# extending difflib's get_close_matches to get the type of matches desired
# - will return nan instead of [] for no matches
import difflib
def get_best_match(x, lst, cutoff):
    if type(x) != str:
        return pd.np.nan
    #print(x)
    #print(lst)
    ans = difflib.get_close_matches(x, lst, cutoff = cutoff)
    if len(ans) == 0:
        return pd.np.nan
    else:
        return ans[0]

In [58]:
# MAY REMOVE THIS AS IT PRODUCES ALOT OF FALSE POSITIVES, OR MAKE IT SUCH THAT IT CAN ONLY MATCH ON FULL WORDS AT THE START

# Will match x with first value in list 'lst', where x is starts with the same start of the string of any list element
# or vice-versa, 
# To avoid too many false matches, make sure each word is > 4 chars in length
# e.g. x = 'Star Wars' and lst = ['Star Wars episode 4'] will return 'Star Wars: episode 4'
# Will return nan for no match
def get_starts_with_match(x, lst):
    if type(x) != str:
        return pd.np.nan
    for s in list(lst):
        if type(s) != str:
            return pd.np.nan
        if (s.startswith(x) | x.startswith(s)) & (len(x) >= 4) & (len(s) >= 4):
            return s
    return pd.np.nan

In [139]:
# Will match x with first value in list 'lst', where words of x is contained within the string of any list element
# or vice-cersa, the string of the list element has words all contained within x
# e.g. x = 'Il Buono Il Brutto Il Cattivo' and lst = ['Buono Il Brutto Il Cattivo Il'] will return 'Buono Il Brutto Il Cattivo Il'
# Will return nan for no match
def get_word_subset_match(x, lst):
    if type(x) != str:
        return pd.np.nan
    x = x.split(' ')
    for s in list(lst):
        y = s.split(' ')
        if (set(x) <= set(y)) | (set(y) <= set(x)):
            return s
    return pd.np.nan

In [144]:
# e.g. x = 'Il Buono Il Brutto Il Cattivo' and lst = ['Buono Il Brutto Il Cattivo Il'] will return 'Buono Il Brutto Il Cattivo Il'
# Will return nan for no match
x = 'Buono Il Brutto Il Cattivo'
lst = ['Buono Il Brutto Il Cattivo Il']
get_word_subset_match(x, lst)


'Buono Il Brutto Il Cattivo Il'

In [60]:
# Create variables for imdb and movielens databases herafter (for brevity)
im = mov_im_titles_fixed # Movies from IMDB database (clean)
ml = agg_sepYr_moveThe_moveA_normed_sepTitles_ml # Movies from MovieLens database (clean)
con_flds = ['title', 'year'] # the most common connection fields between IMDB and Movielens

In [61]:
# Add prev and next years to dataset to help for better matching.
im['prev_year'] = im['year'] - 1
im['next_year'] = im['year'] + 1
im.head()

Unnamed: 0,index,im_title,year,im_title1,im_title2,prev_year,next_year
0,0,The Passing,1985,The Passing,,1984,1986
1,1,Kothewali,2000,Kothewali,,1999,2001
2,2,Nemuri Kyoshiro Manji Giri,1969,Nemuri Kyoshiro Manji Giri,,1968,1970
3,3,Goose On The Loose,2006,Goose On The Loose,,2005,2007
4,4,Parizhskaya Drama,1983,Parizhskaya Drama,,1982,1984


In [62]:
# Data: MovieLens & IMDB

# Parameters for filtering out films
min_num_ratings = 10
min_rating = 4
min_year = 1965

# Filter out films with few ratings
ml = ml[ml['num_ratings'] >= min_num_ratings]

# Filter out films with poor rating
ml = ml[ml['rating'] >= min_rating]

#Filter out old films
ml = ml[ml['year'] >= min_year]
im = im[im['year'] >= min_year]

# Merge MovieLens and IMDB dataframes together on title and year for exact match
# Starting from title1 and working up to title4 in movielens
# And similarly, starting from im_title1 then going to im_title2
df = pd.DataFrame()
films_all = pd.DataFrame()
for im_title in ['im_title1', 'im_title2']:
    for i in range (1, 5):

        # Iterate through year, prev_year and next_year of imdb to find matches 
        # prev_year and next_year are in case year is off by 1 between imdb and movielens
        for yr in ['year', 'prev_year', 'next_year']:
            df = ml.merge(im[im[im_title].notnull()], left_on = ['title%s' %i, 'year'], right_on = [im_title, yr], how = 'inner')
            df['title'] = df['title%s' %i]
            df['match_on'] = 'title%s' %i # indicate which title (title1 - title4) a match was found
            df['title_match_type'] = 'exact' # indicate that the exact title was matched
            df['year_match_on'] = yr # indicate if the year was matched correctly (year), or match on prev/next year
            print('%s exact title matches on title%s with %s %s' %(len(df), i, im_title, yr))
            films_all = films_all.append(df)
            
# Remove duplicate matches - ie films that match on a title two of more of: year, prev_year, next_year.
# Will keep match by (exact year) 'year' by default
cnt = len(films_all)
films_all = films_all.sort_values(by = ['movieId', 'year_match_on'], ascending = [True, False]).drop_duplicates(subset = 'movieId')
print('%s duplicate matches have been removed (ie match on a title and two or more of: year, prev_year, next_year)' % (cnt - len(films_all)))
    
# Add these matched films, to set of all movielens films. (Films not yet match from movielens to imdb, but be where 'index' = NaN)
films_all = ml.merge(films_all[['movieId', 'index', 'title', 'im_title1', 'im_title2', 'match_on', 'title_match_type', 'year_match_on']], how = 'left')
films_all.head()

10018 exact title matches on title1 with im_title1 year
214 exact title matches on title1 with im_title1 prev_year
334 exact title matches on title1 with im_title1 next_year
1099 exact title matches on title2 with im_title1 year
18 exact title matches on title2 with im_title1 prev_year
29 exact title matches on title2 with im_title1 next_year
46 exact title matches on title3 with im_title1 year
1 exact title matches on title3 with im_title1 prev_year
1 exact title matches on title3 with im_title1 next_year
0 exact title matches on title4 with im_title1 year
0 exact title matches on title4 with im_title1 prev_year
0 exact title matches on title4 with im_title1 next_year
4 exact title matches on title1 with im_title2 year
1 exact title matches on title1 with im_title2 prev_year
0 exact title matches on title1 with im_title2 next_year
9 exact title matches on title2 with im_title2 year
0 exact title matches on title2 with im_title2 prev_year
0 exact title matches on title2 with im_title2 

Unnamed: 0,genres,movieId,num_ratings,rating,year,titles_len,title1,title2,title3,title4,index,title,im_title1,im_title2,match_on,title_match_type,year_match_on
0,Action|Crime,146327.0,20.0,8.9,1979,0,Cant Change The Meeting Place,,,,,,,,,,
1,Crime|Drama,318.0,77887.0,8.883421,1994,0,The Shawshank Redemption,,,,64328.0,The Shawshank Redemption,The Shawshank Redemption,,title1,exact,year
2,Crime|Drama,858.0,49846.0,8.707278,1972,0,The Godfather,,,,180450.0,The Godfather,The Godfather,,title1,exact,year
3,(no genres listed),147330.0,10.0,8.7,1979,0,Sherlock Holmes And Dr Watson Acquaintance,,,,,,,,,,
4,Crime|Mystery|Thriller,50.0,53195.0,8.637973,1995,0,The Usual Suspects,,,,30856.0,The Usual Suspects,The Usual Suspects,,title1,exact,year


In [108]:
# Data: IMDB

# Find films in IMDB whose titles are not yet matched with movielens data, store im_mm
df = films_all
df = df[df['index'].notnull()]
im_mm = im.merge(df[['index', 'movieId']], how = 'left')
im_mm[im_mm['movieId'].isnull()]
im_mm.drop(['movieId'], axis = 1, inplace = True)
im_mm.head()

Unnamed: 0,index,im_title,year,im_title1,im_title2,prev_year,next_year
0,0,The Passing,1985,The Passing,,1984,1986
1,1,Kothewali,2000,Kothewali,,1999,2001
2,2,Nemuri Kyoshiro Manji Giri,1969,Nemuri Kyoshiro Manji Giri,,1968,1970
3,3,Goose On The Loose,2006,Goose On The Loose,,2005,2007
4,4,Parizhskaya Drama,1983,Parizhskaya Drama,,1982,1984


In [67]:
# For this row, return the first non-null value when starting from title1_close_match
# and going up to title4_close_match
def get_close_title_match(row):
    for i in range(1, 5):
        if pd.isnull(row['title%s_close_match' %i]) == False:
            return row['title%s_close_match' %i]
    return pd.np.nan

In [68]:
# For this row, return the title name (e.g. title1) of the first non-null value when starting from
# title1_close_match and going up to title4_close_match
def get_match_on(row):
    for i in range(1, 5):
        if pd.isnull(row['title%s_close_match' %i]) == False:
            return 'title%s'%i
    return pd.np.nan

In [69]:
# Update title with close matches, starting from title1_close_match and working right
# Indicate details from the combination c:
# - column match_on which title column it matched
# - match_type that it was 'close' match (as opposed to 'exact')
# - year field matched on
# close_match_type can be a string of of the following:
# - difflib - using difflib's get_close_matches and taking first value
# - contains_match - (as explained in method get_contains_match above)
# - word_subset_match - (as explained in method word_subset_match)
def update_close_match_title_and_year_fld(df, close_match_type, combo):
    
    # Filter for all of the new matches
    ftr = no_title_for_year(df, combo['year']) & has_title_close_match(df)
    
    # Update the fields for the new matches
    df.loc[ftr, 'title'] = df.loc[ftr].apply(lambda row: get_close_title_match(row), axis = 1)
    df.loc[ftr, 'title_match_type'] = 'close'
    df.loc[ftr, 'match_on'] = df.loc[ftr].apply(lambda row: get_match_on(row), axis = 1)
    df.loc[ftr, 'year_match_on'] = combo['year_fld']
    df.loc[ftr, 'close_match_type'] = close_match_type
    
    # Print the new matches that are found and their details
    movielens_titles = len( df[ftr] )
    imdb_titles = len(im_mm[im_mm['year'] == combo['year']])
    close_matches = len(df[ ftr & df[combo['title_cm_fld']].notnull() ])
    if close_matches:
        print('From %s movielens & %s imdb titles- %s %s %ses for movielens year %s & imdbs %s' 
        % (movielens_titles, imdb_titles, close_matches, close_match_type,
           combo['title_cm_fld'], combo['year'], combo['year_fld']))

In [70]:
def has_title_close_match(df):
    return df['title1_close_match'].notnull()|df['title2_close_match'].notnull()|df['title3_close_match'].notnull()|df['title4_close_match'].notnull()

In [71]:
# Return filter for all rows where there is no title, for year 'year'
def no_title_for_year(df, year):
    return df['title'].isnull() & (df['year'] == year)

In [72]:
# Return filter for all rows where there is no title, for year 'year'
def has_title_for_year(df, year):
    return df['title'].notnull() & (df['year'] == year)

In [73]:
# Generator for iterating over title field names: zipped titlei & titlei_close_match for i in [1, 4]
# nested by all film years in movielens
# nested by ['year'] if exact_year parameters is True, else ['prev_year', 'next_year']
# Yields a dictionary containing:
# title_fld : 'titlei' for i in [1, 4]
# title_cm_fld : 'titlei' for i in [1, 4]
# year : film year in movielens
# year_fld : either 'year', 'prev_year' or 'next_year'
from itertools import product
def titlefields_filmyears_yearfields(exact_year = True):
    stream = product(zip(['title%s'%i for i in range(1, 5) ], ['title%s_close_match'%i for i in range(1, 5) ]), 
                     range(min(ml['year']), max(ml['year'])+1),
                     ['year'] if exact_year else ['prev_year', 'next_year'])
    for x in stream:
        result = {}
        result['title_fld'] = x[0][0]
        result['title_cm_fld'] = x[0][1]
        result['year'] = x[1]
        result['year_fld'] = x[2]
        yield result

In [147]:
# For iterating through movielens years and finding close matches on titles1 - title4
# df - the dataframe to be modified
# exact_year - if True will need year's to line up, if False will only use prev_year or next_year
# (for speed its best to run this once first on True, then send a smaller list to df and use False)
def find_close_matches_on_movielens_titles(df, exact_year = True):
                
    # Iterate over all combinations, combo of: [title[1-4], title_close_match[1-4]]
    # with all years of movielens films AND
    # with all year field names if applicable (['year'] for exact_year, otherwise ['prev_year', 'next_year'])
    for combo in titlefields_filmyears_yearfields(exact_year):
        
        # Try both im_title1 and im_title2
        for im_title in ['im_title1', 'im_title2']:
            
            # List of im films to use as potential matches for movielens films
            im_films = im_mm[(im_mm[combo['year_fld']] == combo['year'])&im_mm[im_title].notnull()][im_title]
        
            # Using difflib's get_close_matches...
            # Find closest match for film from the i-th title field for this year
            # I match found, update fields 'title', 'title_match_type', 'year_match_on', 'match_type','close_match_type'
            ftr = no_title_for_year(df, combo['year'])
            df.loc[ftr, combo['title_cm_fld']] = df.loc[ftr, combo['title_fld']].map(
                lambda x: get_best_match(x, im_films, cutoff = 0.9))
            update_close_match_title_and_year_fld(df, 'difflib', combo)

            # Using my get_contains_match...
            # Find closest match for film from the i-th title field for this year
            # I match found, update fields 'title', 'title_match_type', 'year_match_on', 'match_type','close_match_type'
            ftr = no_title_for_year(df, combo['year'])
            df.loc[ftr, combo['title_cm_fld']] = df.loc[ftr, combo['title_fld']].map(
                lambda x: get_starts_with_match(x, im_films))
            update_close_match_title_and_year_fld(df, 'startswith', combo)
            
            # Using my get_subset_match...
            # Find closest match for film from the i-th title field for this year
            # I match found, update fields 'title', 'title_match_type', 'year_match_on', 'match_type','close_match_type'
            ftr = no_title_for_year(df, combo['year'])
            df.loc[ftr, combo['title_cm_fld']] = df.loc[ftr, combo['title_fld']].map(
                lambda x: get_word_subset_match(x, im_films))
            update_close_match_title_and_year_fld(df, 'wordsubset', combo)

In [75]:
# Run same iteration as above titlefields_filmyears_yearfields, but for get_contains_match
# Or maybe modify the above code, so that get_best_match can be chagned dynamically - this could be slower however than doing in stages...

In [76]:
# Run same iteration as above titlefields_filmyears_yearfields, but for get_subset_match

In [110]:
# for combo in titlefields_filmyears_yearfields():
#     # Try both im_title1 and im_title2
#     for im_title in ['im_title1', 'im_title2']:
#         print('%s %s' %(combo, im_title))

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15087 entries, 0 to 15086
Data columns (total 17 columns):
genres              15087 non-null object
movieId             15087 non-null float64
num_ratings         15087 non-null float64
rating              15087 non-null float64
year                15087 non-null int64
titles_len          15087 non-null int64
title1              15087 non-null object
title2              2511 non-null object
title3              128 non-null object
title4              6 non-null object
index               11394 non-null float64
title               11394 non-null object
im_title1           11394 non-null object
im_title2           15 non-null object
match_on            11394 non-null object
title_match_type    11394 non-null object
year_match_on       11394 non-null object
dtypes: float64(4), int64(2), object(11)
memory usage: 2.1+ MB


In [91]:
im_mm.head()

Unnamed: 0,genres,movieId,num_ratings,rating,year,titles_len,title1,title2,title3,title4,index,title,im_title1,im_title2,match_on,title_match_type,year_match_on
0,Action|Crime,146327.0,20.0,8.9,1979,0,Cant Change The Meeting Place,,,,,,,,,,
3,(no genres listed),147330.0,10.0,8.7,1979,0,Sherlock Holmes And Dr Watson Acquaintance,,,,,,,,,,
7,(no genres listed),142115.0,30.0,8.566667,2001,0,The Blue Planet,,,,,,,,,,
8,Adventure|Children|Comedy|Drama,139620.0,23.0,8.565217,1998,0,Everythings Gonna Be Great,,,,,,,,,,
9,Action|Fantasy|Mystery,140737.0,73.0,8.561644,2006,0,The Lost Room,,,,,,,,,,


In [148]:
# Data: MovieLens & IMDB

# Continue modifying the same dataframe
df = films_all.copy()

# Create new field for each title1 to title4 for close match
for i in range(1, 5):
    df['title%s_close_match' %i] = pd.np.nan
    
# Create new field to display which close match technique was used
# 'difflib', 'contains_match', 'word_subset'
df['close_match_type'] = pd.np.nan

find_close_matches_on_movielens_titles(df)
#find_close_matches_on_movielens_titles(df, exact_year = False)

# Save results
films_all_w_cmtitles = df

#Show close matches that have been found
close_match_ftr = df['title1_close_match'].notnull()|df['title2_close_match'].notnull()|df['title3_close_match'].notnull()|df['title4_close_match'].notnull()
df[close_match_ftr].head()

From 2 movielens & 1741 imdb titles- 2 difflib title1_close_matches for movielens year 1965 & imdbs year
From 1 movielens & 1741 imdb titles- 1 startswith title1_close_matches for movielens year 1965 & imdbs year
From 1 movielens & 1741 imdb titles- 1 wordsubset title1_close_matches for movielens year 1965 & imdbs year
From 3 movielens & 1837 imdb titles- 3 startswith title1_close_matches for movielens year 1966 & imdbs year
From 2 movielens & 1837 imdb titles- 2 wordsubset title1_close_matches for movielens year 1966 & imdbs year
From 1 movielens & 1840 imdb titles- 1 wordsubset title1_close_matches for movielens year 1967 & imdbs year
From 2 movielens & 1981 imdb titles- 2 difflib title1_close_matches for movielens year 1968 & imdbs year
From 1 movielens & 1981 imdb titles- 1 wordsubset title1_close_matches for movielens year 1968 & imdbs year
From 2 movielens & 2149 imdb titles- 2 difflib title1_close_matches for movielens year 1969 & imdbs year
From 1 movielens & 2109 imdb titles- 

Unnamed: 0,genres,movieId,num_ratings,rating,year,titles_len,title1,title2,title3,title4,...,im_title1,im_title2,match_on,title_match_type,year_match_on,title1_close_match,title2_close_match,title3_close_match,title4_close_match,close_match_type
9,Action|Fantasy|Mystery,140737.0,73.0,8.561644,2006,0,The Lost Room,,,,...,,,title1,close,year,The Lost,,,,startswith
23,Documentary,139090.0,10.0,8.4,2009,0,The U,,,,...,,,title1,close,year,The Uhoh Show,,,,startswith
38,Action|Adventure|Sci-Fi,260.0,67092.0,8.316103,1977,0,Star Wars Episode Iv A New Hope,,,,...,,,title1,close,year,Star Wars,,,,startswith
45,Documentary,110366.0,10.0,8.3,2010,0,Jeanmichel Basquiat The Radiant Child,,,,...,,,title1,close,year,The The The,,,,wordsubset
52,Documentary,86504.0,1345.0,8.26171,2004,0,Voices From The List,,,,...,,,title1,close,year,The List,,,,wordsubset


In [152]:
df = films_all_w_cmtitles
df[df['close_match_type'] == 'wordsubset'][['title', 'im_title1', 'im_title2', 'title1', 'title2', 'title3', 'title4', 'title1_close_match', 'title2_close_match', 'title3_close_match', 'title4_close_match', 'close_match_type']]

Unnamed: 0,title,im_title1,im_title2,title1,title2,title3,title4,title1_close_match,title2_close_match,title3_close_match,title4_close_match,close_match_type
45,The The The,,,Jeanmichel Basquiat The Radiant Child,,,,The The The,,,,wordsubset
52,The List,,,Voices From The List,,,,The List,,,,wordsubset
61,Tomorrow,,,World Of Tomorrow,,,,Tomorrow,,,,wordsubset
67,Il Buono Il Brutto Il Cattivo,,,The Good The Bad And The Ugly,Buono Il Brutto Il Cattivo Il,,,,Il Buono Il Brutto Il Cattivo,,,wordsubset
71,Das Boot,,,The Boot Das,Boat,,,Das Boot,,,,wordsubset
81,The Hunt,,,The Trials Of Darryl Hunt,,,,The Hunt,,,,wordsubset
99,Boy,,,Old Boy,,,,Boy,,,,wordsubset
104,Lost,,,Paradise Lost The Child Murders At Robin Hood Hills,,,,Lost,,,,wordsubset
114,Freedom,,,The Trap What Happened To Our Dream Of Freedom,,,,Freedom,,,,wordsubset
118,Se7En,,,Seven,Aka Se7En,,,,Se7En,,,wordsubset


In [125]:
# Show the results of all matches on all titles, exact or close on title and year
df = films_all_w_cmtitles
df.groupby(['title_match_type', 'year_match_on', 'match_on']).size()


title_match_type  year_match_on  match_on
close             year           title1       361
                                 title2        67
                                 title3         4
exact             next_year      title1       221
                                 title2        26
                                 title3         1
                  prev_year      title1        94
                                 title2        13
                                 title3         1
                  year           title1      9913
                                 title2      1079
                                 title3        46
dtype: int64

In [150]:
df = films_all_w_cmtitles
df = df[['genres', 'movieId', 'index', 'num_ratings', 'rating', 'year', 'title', 'close_match_type']][df['close_match_type'] == 'wordsubset']
df

Unnamed: 0,genres,movieId,index,num_ratings,rating,year,title,close_match_type
45,Documentary,110366.0,,10.0,8.300000,2010,The The The,wordsubset
52,Documentary,86504.0,,1345.0,8.261710,2004,The List,wordsubset
61,Animation|Comedy,148881.0,,12.0,8.250000,2015,Tomorrow,wordsubset
67,Action|Adventure|Western,1201.0,,16356.0,8.237100,1966,Il Buono Il Brutto Il Cattivo,wordsubset
71,Action|Drama|War,1233.0,,15108.0,8.212669,1981,Das Boot,wordsubset
81,Crime|Documentary,53885.0,,10.0,8.200000,2006,The Hunt,wordsubset
99,Mystery|Thriller,27773.0,,8357.0,8.153285,2003,Boy,wordsubset
104,Documentary,1361.0,,1683.0,8.147950,1996,Lost,wordsubset
114,Documentary,89985.0,,23.0,8.130435,2007,Freedom,wordsubset
118,Mystery|Thriller,47.0,,47630.0,8.125446,1995,Se7En,wordsubset


In [None]:
df[(df['title_match_type'] == 'close') & (df['year_match_on'] == 'next_year')]

In [None]:
# Show films still not matched by movielens to IMDB
df = films_all_w_cmtitles
df = df[df['title'].isnull()]

df_mm = df#[~df['genres'].str.contains('Documentary')&~df['genres'].str.contains('no genres listed')]

df = films_all_w_cmtitles
print('%s matches of movielens titles with imdb, from a total of %s movielens films.'
      %(len(df[df['title'].notnull()]),len(df)))
print('There are %s ''film'' films from movielens, still not matched from IMDb'%len(df_mm))
df_mm.head(30)

In [None]:
# Quick IMDB title searcher
def imdb(title):
    return im[im['title'].str.contains(title)]

In [None]:
#imdb('Star Wars') # IMDB 'Star Wars' 1977, MovieLens 'Star Wars Episode Iv A New Hope' 1977 - FIXABLE (Year diff)
#imdb('Cinema Paradiso') # Nuovo Cinema Paradiso in IMDB but 1988, instead of 1989 - FIXABLE (enclosed title)
#imdb('Sherlock') #'Sherlock Holmes And Dr Watson Acquaintance' only in imdb online
#imdb('The Blue Planet') #'The Blue Planet' only in imdb online (mini-series)
#imdb('Gonna be great') #Everythin's Gonna Be Great only in imdb online
#imdb('Lost Room') #The Lost Room online in imdb online (mini-series)
#imdb('Once Brothers') # Not even on imdb online (Documentary)
#imdb('Black Mirror') # Black Mirror 2011, only in imdb online (short)
#imdb('Sherlock') #The Adventures Of Sherlock Holmes And Dr Watson The Hound Of The Baskervilles not even on imdb online
#imdb('The Chaos Class') #The Dunce Class On Vacation only in imdb online
#imdb('Band Of Brothers') #Band Of Brothers, only in imdb online (mini-series)
#imdb('Life Is Beautiful') #Life is Beautiful 1997 only in imdb online - International -  NEED TO FIGURE OUt WHY
#imdb('Le fabuleux') #Amelie 2001 only in imdb online - International - NEED TO FIGURE OUT WHY
#imdb('Wallace And Gromit The Wrong Trousers') #Wallace And Gromit The Wrong Trousers not even in imdb
#imdb('Heart Of A Dog') #Heart Of A Dog only in imdb (TV movie)
#imdb('Kavkazskaya Plennitsa Ili Novye Priklyucheniya Shurika') - Title2 is Kavkazskaya Plennitsa - FIXABLE (enclosed title)
#imdb('Decalogue') #The Decalogue - only in imdb online (tv miniseries)
#imdb('World of tomorrow') #World of Tomorrow - only in imdb online (short)
#imdb('Il Buono Il Brutto Il Cattivo') Title2 Buono Il Brutto Il Cattivo Il - FIXABLE (use algorithm for same words)
#imdb('Wild Tales') # Only in imdb oneline - International - NEED TO FIGURE OUT WHY
#imdb('Das Boot') # The Boot Das - FIXABLE (use algorithm for same words)
#imdb('Creature Comforts') # Creature Comforts only in imdb online (short)
#imdb('Brainstorm') #Brainstorm, not even in imdb online
#imdb('Love And Honor') # Lovand And Honour 2007 in imdb, 2006 in movielens - FIXABLE (Year diff)
#imdb('Cowboy Bebop') # Only in imdb online (tv series)
#imdb('Formula of love') # Formula of love Only in imdb online (tv movie)
#imdb('To Live') # Only in imdb oneline - International - NEED TO FIGURE OUT WHY
#imdb('Some Folks Call It A Sling Blade') # only in imdb online (short)
#imdb('Oldboy') #Old Boy 2003 FIXABLE (need word joiner search algorithm - could apply to all)
#imdb('Que Horas') # The Second Mother - ALTERNATIVE IMDB TITLE WILL NEED MANUAL MAPPING
#imdb('Léon')  # Only in imdb oneline - International - NEED TO FIGURE OUT WHY (There's another AKA too!)

In [None]:
#Experiments for FIXABLE (use algorithm for same words)
#difflib.get_close_matches('Il Buono Il Brutto Il Cattivo', 'Buono Il Brutto Il Cattivo Il', cutoff = 0.1)
x = 'Il Buono Il Brutto Il Cattivo'.split(' ')
y = 'Buono Il Brutto Il Cattivo Il'.split(' ')
set(x) == set(y)
set(x) <= set(y)

In [None]:
#Experiments for FIXABLE (Year diff)
df = df_mm.merge(im, left_on = 'title1', right_on = 'title')
df[abs(df.year_x - df.year_y) <= 1]

**Experimentation with my fav films spreadsheet**

In [None]:
# Find out which of my top films are not matched in movielens or imdb
# To be uses as a basis to fine-tune matching algorithm
# (Need to make sure spreadhseet is not saved with - or ' autocorrect changes)
df = pd.read_excel(data_dir + '/Me/top_films.xls', encoding = 'utf-8')
my_films = df.merge(films_all, left_on = ['title'], right_on = ['title'], how = 'left')[['title', 'year', 'ml', 'im']]

In [None]:
# Break up my top films into various splices
df = my_films
my_films_no_ml = df[df['ml'].isnull() & df['im'].notnull()] # my top films that are NOT in movielens on title match alone
my_films_no_im = df[df['im'].isnull() & df['ml'].notnull()] # my top films that are NOT in imdb on title match alone
my_films_no_ml_im = df[df['ml'].isnull() & df['im'].isnull()] # my top films that are NOT in both of: movielens AND imdb on title match alone

In [None]:
# For my top films that mismatch between movielens and imdb, find closes matches to each set
import difflib
df = films_all
df1_mm = df[df['im'].isnull()] # mis-matches - movielens films that don't have an imdb record
df2_mm = df[df['ml'].isnull()] # mis-matches - imdb films that don't have a movielens record

cut_off = 0.4
my_films['ml_close_matches'] = my_films['title'].map(lambda x: difflib.get_close_matches(x, df1_mm['title'], cutoff = 0.2))
my_films['im_close_matches'] = my_films['title'].map(lambda x: difflib.get_close_matches(x, df2_mm['title'], cutoff = 0.2))
my_films#[my_films['ml'].isnull()|my_films['im'].isnull()]

In [None]:
df1[df1['title'].str.contains("Up in the Air")]['title'].iloc[0]

In [None]:
df2[df2['title'].str.contains("Love")&df2['title'].str.contains("Drugs")]['title']#.iloc[0]

In [None]:
df2_mm[df2_mm['title'].str.contains("Love")&df2_mm['title'].str.contains("Drugs")]['title']

In [None]:
difflib.get_close_matches('Love and Other Drugs', df2_mm['title'], cutoff = 0.2)

In [None]:
df2_mm.info()

In [None]:
my_films[my_films['title'].str.contains("When Harry")]['title'].iloc[0]

In [None]:
difflib.get_close_matches('Love and Other Drugs', ['Love & Other Drugs'])

**Creating surveys for new users**

In [None]:
# Create dataframe of films that users can review...

# Remove films without an english language
df = films_all
df = df.merge(lan_om, left_on='id', right_on='movie_id')
df = df[df.language_iso_639_1 == 'en']

# Remove films made before 1965
df = df[df.year >= 1965]

# Remove films with < 10 number of ratings
df = df[df.num_ratings >= 10]

# Get rid of duplicates (since some films have mulitple lead-actor (n=1) and/or genre)
df = df.drop_duplicates(subset=['index'])

# Sort by best films at the start
df = df.sort_values(by=['rating', 'num_ratings'], ascending=[False, False])

films_reviewable = df
df.head()


In [None]:
# Randomise DataFrame, by placing the top 500 films in random order, then the remaining films in random order
# This is in an attempt to reduce a bias, that people will typically rate more prestigious films higher 
# as they would appear that way in the list otherwise. For example Schindler's List, The Godfather etc are well known to rate highly
# Print out 30 random copies of this to be imported into google sheets
df = films_reviewable.head(1000).copy()
num_surveys = 30
for i in range(num_surveys):
    top500 = df.head(500)
    top500 = top500.reindex(pd.np.random.permutation(top500.index))
    remainder = df.tail(500)
    remainder = remainder.reindex(pd.np.random.permutation(remainder.index))
    films_reviewable_randomised = top500.append(remainder)
    films_reviewable_randomised.head()

    # Output to Excel to then be imported manually into Google Sheets for new user to review
    #(is quicker than writing directly to google sheets from pandas)
    films_reviewable_randomised[['title','year','lead actor','genres','index']].to_excel('output%d.xlsx' % i)

In [None]:
# Output the remaining fims to review into excel (not including top 1000)
# (Only for power users)
df = films_reviewable.tail(len(films_reviewable)-1000).copy()
df = df.reindex(pd.np.random.permutation(df.index))

# Output to Excel to then be imported manually into Google Sheets for new user to review
#(is quicker than writing directly to google sheets from pandas)
df[['title','year','lead actor','genres','index']].to_excel('after_top_1000.xlsx')

**Generating profiles for new users (see also below)**

In [None]:
# Function to read in and clean-up new user ratings data from Google Sheets
def read_new_user_ratings(workbook_name, sheet_name):
    # Read in data
    workbook = gc.open(workbook_name)
    df = pd.DataFrame(workbook.worksheet(sheet_name).get_all_records())
    
    # Convert 'out of 10' column to correct data
    df.loc[(df['out of 10']=='')|(df['out of 10']=='-'),'out of 10'] = -1
    df['out of 10'] = df['out of 10'].astype(int)
    
    # (For the time being) remove all films new user has not reviewed (ie they haven't yet seen)
    df = df[df['out of 10'] >= 0]
    
    # Add to dictionary
    users[workbook_name][sheet_name] = df

In [None]:
%%time
# Dynamically read new user ratings from various google sheets

# Adapted from merging code between: 
# http://gspread.readthedocs.io/en/latest/oauth2.html - Setting up the authentication
# http://pbpython.com/pandas-google-forms-part1.html - Structure of code (with different 'credentials' var to above)
# https://github.com/burnash/gspread - Reading and manipulating the google sheet

from __future__ import print_function
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
import json

# Authenication and Google Sheet Parameters & Variables
SCOPE = ["https://spreadsheets.google.com/feeds"]
SECRETS_FILE = "/Users/justinbarton/Documents/DevSetup/Justin-ee2a176f3e01.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(SECRETS_FILE, SCOPE)

# Authorise the Google Sheet to open it.
gc = gspread.authorize(credentials)

# Define variables to open spreadsheets and hold new user ratings
users = {}

# Read my own user ratings
users['Films_Justin'] = {}
read_new_user_ratings('Films_Justin', 'Reviewed')

# Read new user ratings
users['Films_Barton'] = {}
read_new_user_ratings('Films_Barton', 'Damian')
read_new_user_ratings('Films_Barton', 'Jess')

users['Films_Barker'] = {}
read_new_user_ratings('Films_Barker', 'Dave')
read_new_user_ratings('Films_Barker', 'Andreas')
read_new_user_ratings('Films_Barker', 'Below')
read_new_user_ratings('Films_Barker', 'LindaBelow')

**Creating feature tables**

In [None]:
# Data: MovieLens
# Create dataframe 'films_all_jobs' containing: all movies: 'movie_id' (OMDB) 'index' (IMDB), jobs 'job_name' and people 'person'
# (and various other info associated with those people and roles.)
job_full_om = job_om.merge(cst_om, left_on='job_id', right_on='job_id')
mov_jobs_om = job_full_om.merge(ppl_om, left_on='person_id', right_on='id')
mov_jobs_om.rename(columns={'name_x':'job_name', 'name_y':'person', 'id':'person_id'}, inplace=True) # Rename fields, to avoid future confusion upon merging
films_all_jobs = mov_jobs_om.merge(films_all[['index', 'id']], left_on = 'movie_id', right_on = 'id')
films_all_jobs.drop(['id'], axis=1, inplace = True)
films_all_jobs.head()

In [None]:
# Data: MovieLens & IMDB & OMDB
# Create dataframe 'films_all_actors' containing: all movies 'movie_id' (MovieLens) or 'index' (IMDB), actors 'actor', 
# 'actor_rank_for_movie' (the smaller the number the higher ranked in that film)

# Set parameter for the maximum number of actors to have for each film, retaining only the highest ranked.
max_actors = 10

# Get actor and rank info from imdb
df2 = mov_cst_im[['index','name','type','n']]
df2 = df2[df2.n.notnull()]
df2 = df2[df2.n <= max_actors]

# Add actor and rank info
df = films_all
df = df.merge(df2, left_on='index', right_on='index')
df.rename(columns={'name':'Actor', 'type':'is_male_actor', 'n':'actor_rank_for_movie'}, inplace=True)
df['is_male_actor'] = df['is_male_actor'] == 'actor'

df.drop_duplicates(inplace=True)
films_all_actors = df
df.info()

In [None]:
# Data: MovieLens & IMDB & OMDB
# Create dataframe 'films_all_det' containing all movies 'id' (OMDB) or 'index' (IMDB), 'revenue', 'budget', 'runtime'
df = det_om[['movie_id', 'runtime', 'budget', 'revenue']]
df = df[(df.runtime != 0)|(df.budget != 0)|df.revenue != 0] # Make sure at least one of 'runtime', 'budget' and 'revenue' is non-zero

df = df.merge(films_all, left_on = 'movie_id', right_on = 'id')
films_all_det = df
df.head()

In [None]:
# Define features, using feature_name as their key. And provide their:
# 1) df = DataFrame (must have films indexed with column name 'index')
# 2) (column_name_containing_feature, column_name_feature_data)  (use False if the feature_name (the key) is the column_name)
# 3) has_zero_records - is True if this feature has some records that are indicated by 0
#   Otherwise the value to indicate null record will be given (e.g. False, 0, NaN etc..)
#    (this only applies to those whose column_name = feature_name)
features = {}

feats = ['num_ratings', 'rating', 'year']
for f in feats:
    features[f] = (films_all, False, False)

features['Actor'] = (films_all_actors, False, False)

feats = ['Director', 'Screenplay', 'Producer', 'Director of Photography', 
         'Editor', 'Original Music Composer', 'Music', 'Executive Producer']
for f in feats:
    features[f] = (films_all_jobs, ('job_name', 'person'), False)

feats = ['runtime', 'budget', 'revenue']
for f in feats:
    features[f] = (films_all_det, False, True)
    
    
# Define feature selection tuning parameters
good_films_threshold = 7 # Films equal to or greater than this will be labelled 'GOOD'
bad_films_threshold = 5 # Films equal to or less than this will be labelled 'BAD'
min_feature_count_threshold = 1 # Need to have at least this many 'good' films of discrete features, for feature to be considered

In [None]:
# Function to add features to a dataframe df (has to have imdb linker 'index')
# Features are:
# --BASIC INFO--
# 'num_ratings', 'ratings', 'year'
# --JOB TITLES--
# 'Actor', 'Director', 'Screenplay', 'Producer', 'Director of Photography', 'Editor', 'Original Music Composer'
# 'Editor','Executive Producer', 'Music'
# --MOVIE DETAILS--
# runtime (mins), budget, revenue

def add_features(df, feature_names):

    # Go through and add all features to df
    for feature_name in feature_names:
        
        # Gather important variables for this feature
        feature = features[feature_name]
        feat_df = feature[0]
        column_name_is_feature_name = feature[1] == False
        if column_name_is_feature_name == False:
            column_name_containing_feature = feature[1][0]
            column_name_feature_data = feature[1][1]
        has_zero_records = feature[2]

        # Add features, when feature_name is column_name
        if column_name_is_feature_name:
            # Remove null records of feature (if applicable)
            df2 = feat_df[['index', feature_name]]
            if has_zero_records:
                df2 = df2[df2[feature_name] > 0]
            
            # Add feature
            df = df.merge(df2, how = 'left')

        # Add features, when needing to use 2 columns to get feature
        else:
            df2 = feat_df[feat_df[column_name_containing_feature] == feature_name][['index', column_name_feature_data]]
            df = df.merge(df2, how = 'left')
            df.rename(columns = {column_name_feature_data:feature_name}, inplace = True)

        # Delete duplicates that have crept in
        df.drop_duplicates(inplace=True)

    return df

In [None]:
# Add a column to show the values counts of a feature (per film in df)
# Then sort by that column count, and return dataframe
# df = dataframe to modify and output (must have imdb movie indentifer 'index')
# feature = the name of said feature
def sort_by_feature_count(df, feature):
    df2 = pd.DataFrame(df[['index', feature]].drop_duplicates()[feature].value_counts())
    df2 = df2.rename(columns = {feature : '%s_count' % feature})
    df = df.merge(df2, left_on = feature, right_index = True)
    df.sort_values('%s_count' % feature, ascending = False, inplace=True)
    return df

In [None]:
# Output a dataframe for this user, showing the maximum number of occurances within this feature
# This only works for discrete features, (NOT continuous features like revenue, budget and runtime etc)
# If bad_films is True, will show the results for users bad films, instead of the default, good films
def top_discrete_features(sheet_name, user_name, feature, bad_films = False):
    df = users[sheet_name][user_name]
    if bad_films:
        df = add_features(df[df['out of 10'] <= bad_films_threshold], [feature])
    else:
        df = add_features(df[df['out of 10'] >= good_films_threshold], [feature])
    df = sort_by_feature_count(df, feature)
    df[df['%s_count' % feature] >= min_feature_count_threshold]
    return df


**Generating profiles for new users (continued)**

In [None]:
# Graphing capability for function: top_discrete_features
# max_disp_results = maximum number of values to display on the graph 
# (e.g. if feature_name = 'Director' and max_disp_results = 5, it would display 5 top actors)
def graph_top_discrete_features(sheet_name, user_name, feature, max_disp_results, bad_films = False):
    df = top_discrete_features(sheet_name, user_name, feature, bad_films)
    df = df[[feature, '%s_count' % feature]].drop_duplicates()
    df = df.head(max_disp_results)
    if len(df != 0):
        df.plot(x = feature, y = '%s_count' % feature, kind = 'barh', 
                title = "%s's %spreferred %ss" % (user_name, ('','non-')[bad_films] ,feature), 
                legend = False)
    else:
        print('%s has no %spreferred %ss' % (user_name, ('','non-')[bad_films] ,feature))

In [None]:
# Display Top Preferred and Non-Preferred features for new users
test_features = ['Actor','Director', 'Producer', 'Director of Photography', 'Screenplay', 'Original Music Composer']

for sheet, usrs in users.iteritems():
    for usr in usrs:
        for feature in test_features:
            graph_top_discrete_features(sheet, usr, feature, 10, bad_films = False)
            graph_top_discrete_features(sheet, usr, feature, 10, bad_films = True)

**Linear regression on film attributes - per new user**

In [None]:
# Create Linear Regression Models for each user, using only continuous variables
# that are film attributes
# For each user:
# Try a model with all feaures, and a model with each feature individually
import sklearn
from sklearn.linear_model import LinearRegression
test_features = ['year','num_ratings','rating','runtime', 'budget', 'revenue']
results = {}
for sheet, usrs in users.iteritems():
    for usr in usrs:
        
        # Results table
        sheet_user = '%s %s' % (sheet, usr)
        results[sheet_user] = {}
        
        # Create dataframe, and show number of films rated
        df = users[sheet][usr]
        results[sheet_user]['Films_rated'] = len(df)

        df = add_features(df, test_features)
        # Make sure each feature has a value in every row
        for feat in test_features:
            df = df[df[feat].notnull()]
            
        # Linear regression on all features
        X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(
            df[test_features], df['out of 10'])
        lm = LinearRegression()
        lm.fit(X_train, Y_train)

        results[sheet_user]['All_feats_R^2_Train'] = lm.score(X_train, Y_train)
        results[sheet_user]['All_feats_R^2_Test'] = lm.score(X_test, Y_test)
            
        # Linear regression on features individually
        for feature in test_features:
            X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(
            df[[feature]], df['out of 10'])
            lm = LinearRegression()
            lm.fit(X_train, Y_train)
            
            results[sheet_user]['%s_R^2_Train' % feature] = lm.score(X_train, Y_train)
            results[sheet_user]['%s_R^2_Test' % feature] = lm.score(X_test, Y_test)
            
df = pd.DataFrame(results).transpose().drop(['Films_rated'], axis = 1).plot(kind='bar', legend = False)
pd.DataFrame(results)

**Linear regression on old user ratings - per new user**

In [None]:
# For new user defined by [sheet_name][user_name], find all historial users, who have rated the same films
# Add these users and their ratings as new columns to Dataframe df returned (indentified as UserId)  
# Count the number of films, historical users and new user have both rated.
# Optional parameters:
# only_match_good: only find historial users that match on good films
# max_num_users: the maximum number of users to return
# min_common_films: the minimum number of shared rated films, 
#    from new user to old user to include in df
def find_hist_users(sheet_name, user_name, only_match_good = False, max_num_users = pd.np.Inf, min_common_films = -pd.np.Inf):
    global old_rats
    usr = users[sheet_name][user_name] # new user ratings
    
    # Retain only good films from new and old users (if applicable)
    if only_match_good:
        usr = usr[ usr['out of 10'] >= good_films_threshold ]
        old_rats = old_rats[old_rats['user_rating'] >= good_films_threshold]
    
    # Merge new user ratings, with old
    df = old_rats.merge(usr, left_on = 'index', right_on = 'index')
    df.drop_duplicates(inplace = True)
    
    # Count the number common films reviwed between new user, and old users
    gb = df[['userId', 'index']].groupby('userId').count()
    gb.rename(columns={'index':'common_films_count'}, inplace = True)
    
    # Remove old users that did not hit the min_common_films threshold
    gb = gb[gb['common_films_count'] >= min_common_films]
    gb.reset_index(inplace = True)
    gb.sort_values(by = 'common_films_count', ascending = False, inplace = True)
    
    # Create final dataframe, and cut off at the maximum number of users to find
    gb = gb.head(min(len(gb), max_num_users))
    df = gb.merge(df)

    return df

In [None]:
# Convert hist_users_with_new into a user features dataframe. (used after find_hist_users)
# hist_users_with_new must have columns:
# 'index' - film indentifer
# 'userId' - historial user indentifier
# 'user_rating' - historal user_rating
# 'out of 10' - new user rating
# 'common_films_count' - the  number of shared rated films, from new user to old user
# New columns names added will be 'userId', for all userIds
# The number of new columns added will be capped by:
# max_features - the maximum number of userId_d columns to add
def create_hist_user_features(hist_users_with_new, max_features = pd.np.Inf):
    df = hist_users_with_new
    df = df[['userId', 'common_films_count']]
    df.drop_duplicates(inplace = True)
    df.sort_values(by = 'common_films_count', ascending = False, inplace = True)
    feats = df.head(min(len(df),max_features))[['userId']]
    
    df = hist_users_with_new.merge(feats, left_on = 'userId', right_on = 'userId')
    
    df = df[['index', 'userId','user_rating','out of 10']]
    return df
    

In [None]:
# Perform pivot table, with normal pandas fields: 'values', 'index', 'column'
# Remove na values in pivot table, then..
# Add columns 'keep_cols' in the final table, whose values correspond with 'index'
#   Note: the 'keep_cols' are NOT pivoted, and can't be part of 'columns'
# If dropna == True, will make sure all features have a value 
#   Note: to be used when running directly in a regression model
def pivot_and_keep_cols(df, values, index, columns, keep_cols = [], dropna = True):
    piv = df.pivot_table(values = 'user_rating', index = 'index', columns = 'userId')
    if dropna:
        piv.dropna(inplace = True)
    df = piv.merge(df[keep_cols+[index]], left_index = True, right_on = index)
    return df.drop_duplicates()

In [None]:
# Run multiple linear regression models for each new user, on old users ratings as features
# Same field descriptions as above, store results in 'results'
def run_linreg_hist_users(sheet_name, user_name, max_num_users, only_match_good, results):
    # Transform the data
    df = find_hist_users(sheet_name = sheet_name, user_name = user_name, 
                         max_num_users = max_num_users, only_match_good = only_match_good)
    df = create_hist_user_features(df)
    df = pivot_and_keep_cols(df, values = 'user_rating', index = 'index', columns = 'userId', keep_cols = ['out of 10'])
    
    # Run the linear regression
    feats = df.drop(['out of 10', 'index'], axis = 1)
    test_score = pd.np.nan
    train_score = pd.np.nan
    if len(feats) >= 2: # Only run linear regression if there is enough data to do so
        X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(
                feats, df['out of 10'])
        lm = LinearRegression()
        lm.fit(X_train, Y_train)
        test_score = lm.score(X_test, Y_test)
        train_score = lm.score(X_train, Y_train)
    
    # Store the results
    results.append({'User':'%s %s'%(sheet_name, user_name), 
                   'Amount rated by user':len(users[sheet_name][user_name]),
                   'Amount of old users':feats.shape[1],
                   'Amount of films in common':feats.shape[0],
                   'Good films only':only_match_good, 
                   'R^2_Train':train_score,
                   'R^2_Test':test_score})
    
    return results


In [None]:
# Run linear regression models for all users, against previous users, tweaking both:
#   number of users to use as features (dimensions, d), 
#   and only finding users that match on good films or not
%time
results = []
result_fields = ('User', 'Amount rated by user', 'Amount of old users', 'Amount of films in common',
                 'Good films only', 'R^2_Train', 'R^2_Test')
dimensions = [1, 2, 5, 10, 20, 50, 100, 200, 500]
for sheet, usrs in users.iteritems():
    for usr in usrs:
        for d in dimensions:
            run_linreg_hist_users(sheet, usr, max_num_users = d, only_match_good = True, results = results)
            run_linreg_hist_users(sheet, usr, max_num_users = d, only_match_good = False, results = results)
df = pd.DataFrame(results, columns = result_fields)
mult_lin_reg_old_users = df
df

In [None]:
# Show the R^2 scores distribution for each users
mult_lin_reg_old_users[['User', 'Amount rated by user', 'R^2_Train', 'R^2_Test']].groupby(['User', 'Amount rated by user']).describe()

In [None]:
# Plot the heaviest and the lightest film raters, to see how many old users have also rated these films
heaviest = df[df['User'] == 'Films_Justin Reviewed']
heaviest.plot(kind = 'scatter', x = 'Amount of old users', 
    y = 'Amount of films in common', title = 'Heaviest film rater: %s films' % len(users['Films_Justin']['Reviewed']),
    xlim = (0,max(heaviest['Amount of old users'])), ylim = (0,max(heaviest['Amount of films in common'])))

lightest = df[df['User'] == 'Films_Barker Andreas']
lightest.plot(kind = 'scatter', x = 'Amount of old users', 
    y = 'Amount of films in common', title = 'Lighest film rater: %s films' % len(users['Films_Barker']['Andreas']),
    xlim = (0,max(lightest['Amount of old users'])), ylim = (0,max(lightest['Amount of films in common'])))

In [None]:
# Plot the relationship between number of 'Amount of old users' vs 'Amount of films in common' across the board
df.plot(kind = 'scatter', x = 'Amount of old users', 
    y = 'Amount of films in common', title = "'Amount of old users' vs 'Amount of films in common'",
    xlim = (0,max(df['Amount of old users'])), ylim = (0,max(df['Amount of films in common'])))
df.plot(kind = 'scatter', x = 'Amount of old users', 
    y = 'Amount of films in common', title = "'Amount of old users' vs 'Amount of films in common' (zoomed in on corner)",
    xlim = (0,100), ylim = (0,250))

**Finding old users with similar ratings to new users**

In [None]:
# For each new user, find similar users from on old users ratings
# Same field descriptions as above
# Tolerance - how close as ratio difference, must both the mean and std of new user ratings be to be included
# results dictionary 'similiar_users' of dataframes indexed by 'sheet_name user_name'
def find_similar_hist_users(sheet_name, user_name, max_num_users, only_match_good, tolerance, similiar_users):
    
    df = find_hist_users(sheet_name, user_name, max_num_users = max_num_users, only_match_good = only_match_good)
    df = create_hist_user_features(df)
    
    df = pivot_and_keep_cols(df, values = 'user_rating', index = 'index', columns = 'userId', 
                          keep_cols = ['out of 10'], dropna = False)

    # List to store the results
    results = pd.DataFrame()

    # Loop through all old users, and record how similar their scores are based on: mean, median, std etc
    old_users = list(df.columns)[:-2]
    for u in old_users:
        df1 = df[df['out of 10'].notnull()&df[u].notnull()][['out of 10', u]].describe().transpose()
        df1.rename(index = {u:'userId'}, inplace = True)
        df1 = pd.DataFrame(df1.unstack()).transpose()
        df1.rename(index = {0:u}, inplace = True)
        results = results.append(df1)
    
    # Add 'diff' values to compare the results - (linking back to the tolerance above)
    results['mean', 'diff'] = abs((results['mean', 'out of 10'] - results['mean', 'userId'])/results['mean', 'out of 10'])
    results['std', 'diff'] = abs((results['std', 'out of 10'] - results['std', 'userId'])/results['std', 'out of 10'])
    
    # Remove those outside of the tolerance
    results = results[ results['mean', 'diff'] <= tolerance ]
    results = results[ results['std', 'diff'] <= tolerance ]
    
    # Sort results and save
    results.sort_values(by = [('std', 'diff'), ('mean', 'diff')])
    similiar_users['%s %s' % (sheet_name, user_name)] = results

In [None]:
# Find the most similar raters for each user
# This takes ages and/or locks up the computer, so it won't be run all the time.
have_good_computer = False
similiar_users = {}
if have_good_computer:
    for sheet, usrs in users.iteritems():
        for usr in usrs:
            find_similar_hist_users(sheet, usr, max_num_users = pd.np.Inf, only_match_good = True, 
                tolerance = 0.05, similiar_users = similiar_users)
            
# Instead do this for a single user:
find_similar_hist_users('Films_Justin', 'Reviewed', max_num_users = 200, only_match_good = True, 
                tolerance = 0.10, similiar_users = similiar_users)
similiar_users['Films_Justin Reviewed']

In [None]:
# From Justin's similar users above, find films these users have rating as good films
# These will form the basis for the first recommendations

LAF_threshold = 8 # Life altering film threshold (score out of 10)

# Get just the 'userId's of all of the old users who are similar
df = similiar_users['Films_Justin Reviewed']
df.drop(list(df.columns), axis = 1, inplace = True)

# Find LAFs from these similar users
df = old_rats.merge(df, left_on = 'userId', right_index = True)
df = df[df['user_rating'] >= LAF_threshold]
df = df[['index']]
df.drop_duplicates(inplace = True)
df = films_all.merge(df, left_on = 'index', right_on = 'index')[['title', 'year', 'genres', 'lead actor','index']]

# Remove films I've already seen, and display results
tmp = df[['index']].merge(users['Films_Justin']['Reviewed'][['title','index']], left_on = 'index', right_on = 'index', how='left')
unwatched_films = tmp[tmp['title'].isnull()][['index']]
recommendations = unwatched_films.merge(df, left_on = 'index', right_on = 'index')
recommendations.drop_duplicates(subset = ['title', 'year'])