# Content Based Movie Recommendation

In [1]:
import numpy as np
import pandas as pd
import re
pd.set_option('display.max_columns', None)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
from scipy.sparse import csr_matrix

### Import Dataset

In [2]:
# Import movie dataset (combined metadata), subset the data to 10k rows for computational conviencies
whole_df = pd.read_csv('combined_metadata_table.csv')

### Pick A Movie (Fake Search Engine)

In [3]:
def identify_movie(your_pick, whole_df):
    return whole_df[whole_df['title'].str.contains(your_pick, flags=re.IGNORECASE, regex=True)]

In [4]:
your_pick = 'Skyfall'
identify_movie(your_pick, whole_df)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics,id,overview,popularity,revenue,tagline
28682,tt1074638,Skyfall,Skyfall,2012,10/26/12,"Action, Adventure, Thriller",143,"UK, USA","English, Turkish, Shanghainese, Portuguese, Ja...",Sam Mendes,"Neal Purvis, Robert Wade",Eon Productions,"Daniel Craig, Judi Dench, Javier Bardem, Ralph...",Bond's loyalty to M is tested when her past co...,7.7,600350,81.0,1549.0,726.0,37724,When Bond's latest assignment goes gravely wro...,20.309585,1108561000.0,Think on your sins.


### Subset Dataset

In [5]:
def subset_by_genre(your_pick, whole_df):
    genres = whole_df[whole_df['title'] == your_pick]['genre']
    genres = list(genres)[0].split(', ')
    masks = []
    for genre in genres:
        mask = whole_df['genre'].str.contains(genre)
        masks.append(mask)
    mask = masks[0]
    for i in range(1,len(masks)):
        mask = mask | masks[i]
    return whole_df[mask]

In [6]:
def subset_by_year(your_pick, whole_df, year_range=30):
    year = whole_df[whole_df['title'] == your_pick]['year']
    year = list(year)[0]
    window = [year-year_range, year+year_range]
    mask = (whole_df['year'] >= window[0]) & (whole_df['year'] <= window[1])
    return whole_df[mask]

In [7]:
def subset_by_language(your_pick, whole_df):
    languages = whole_df[whole_df['title'] == your_pick]['language']
    languages = list(languages)[0].split(', ')
    masks = []
    for language in languages:
        mask = whole_df['language'].str.contains(language)
        masks.append(mask)
    mask = masks[0]
    for i in range(1,len(masks)):
        mask = mask | masks[i]
    return whole_df[mask]

In [8]:
subset_df = subset_by_genre(your_pick, whole_df)
source = subset_by_year(your_pick, subset_df)
#source = subset_by_language(your_pick, source)

In [9]:
print('Subset Size:', source.shape[0])
source.head(3)

Subset Size: 8359


Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics,id,overview,popularity,revenue,tagline
10542,tt0083693,Brimstone & Treacle,Brimstone & Treacle,1982,2/2/83,"Drama, Thriller",87,UK,English,Richard Loncraine,"Dennis Potter, Dennis Potter",Namara Films,"Sting, Denholm Elliott, Joan Plowright, Suzann...",A strange young man has a sinister effect on t...,6.4,1291,,18.0,8.0,73116,A strange young man has a sinister effect on t...,0.620534,0.0,
10549,tt0084786,They Call Me Bruce?,They Call Me Bruce?,1982,11/12/82,"Comedy, Action",87,USA,English,Elliott Hong,"David Randolph, Johnny Yune",Goldpine Productions,"Johnny Yune, Margaux Hemingway, Raf Mauro, Pam...",A goofy Korean finds his life hopelessly compl...,5.7,1308,,26.0,7.0,24830,Get set for all the zany fun in this hilarious...,1.223424,16894678.0,A chopstick comedy!
10553,tt0083511,48 Hrs.,48 Hrs.,1982,6/24/83,"Action, Comedy, Crime",96,USA,English,Walter Hill,"Roger Spottiswoode, Walter Hill",Paramount Pictures,"Nick Nolte, Eddie Murphy, Annette O'Toole, Fra...",A hard-nosed cop reluctantly teams up with a w...,6.9,64318,71.0,131.0,88.0,150,A hard-nosed cop reluctantly teams up with a w...,15.297121,78868508.0,One Cop. One Con. No Mercy.


### Data Processing

In [10]:
# Delete spaces in names to make names more unique
# eg. 'firstname lastname' -> 'firstnamelastname'
column_with_names = ['director', 'writer', 'production_company', 'actors']
source = source.copy()
for col in column_with_names:
    source[str(col)] = source[str(col)].str.replace(' ', '', regex=True)
    source[str(col)] = source[str(col)].str.replace(',', ' ', regex=True)

### Compute TFIDF

In [11]:
# Suggestion: remove the spaces for the names, to solving clustering ambiguity
# Issue: how to deal with numbers ???
columns = ['country', 'director','writer', 
           'production_company', 'actors',
           'description','overview', 'tagline']

In [12]:
# Converts the source dataframe into single string for tfidf computation
df = {}
movies = []
titles = []
imdbid = []
for i in range(source.shape[0]):
    row = source.iloc[i]
    row_str = ''
    titles.append(row['title'])
    imdbid.append(row['imdb_title_id'])
    for column in columns:
        row_str += str(row[column])
        row_str += ' '
    movies.append(row_str)

df['IMDBid'] = imdbid
df['Title'] = titles
df['Content'] = movies
df = pd.DataFrame(df)

In [13]:
df.head()

Unnamed: 0,IMDBid,Title,Content
0,tt0083693,Brimstone & Treacle,UK RichardLoncraine DennisPotter DennisPotter ...
1,tt0084786,They Call Me Bruce?,USA ElliottHong DavidRandolph JohnnyYune Goldp...
2,tt0083511,48 Hrs.,USA WalterHill RogerSpottiswoode WalterHill Pa...
3,tt0083741,Cold River,USA FredG.Sullivan WilliamJudson FredG.Sulliva...
4,tt0084704,The Soldier,USA JamesGlickenhaus JamesGlickenhaus JamesGli...


In [14]:
# Tranform to tfidf space
V = TfidfVectorizer()
X = V.fit_transform(df['Content'])
print('X shape:', X.shape)

X shape: (8359, 113241)


### Compute Similarity

In [15]:
'''
# Dimensionality reduction
svd = TruncatedSVD(n_components=10)
X = svd.fit_transform(X)
print('X shape after SVD:', X.shape)


# Compute similarity of movie for SVD
index = df[df['Title'] == your_pick].index[0]
d1 = X[index]
mag_d1 = np.linalg.norm(d1)
dist = []
for i in range(X.shape[0]):
    row = X[i]
    dot_product_xy = np.dot(d1, row)
    mag_row = np.linalg.norm(row)
    x_time_y = mag_d1 * mag_row
    dist.append(dot_product_xy/x_time_y) 
dist_series = pd.Series(dist)
dist_series = dist_series.sort_values(ascending=False)
dist_series.iloc[1:6]
dist_series = pd.DataFrame(dist_series)
'''

"\n# Dimensionality reduction\nsvd = TruncatedSVD(n_components=10)\nX = svd.fit_transform(X)\nprint('X shape after SVD:', X.shape)\n\n\n# Compute similarity of movie for SVD\nindex = df[df['Title'] == your_pick].index[0]\nd1 = X[index]\nmag_d1 = np.linalg.norm(d1)\ndist = []\nfor i in range(X.shape[0]):\n    row = X[i]\n    dot_product_xy = np.dot(d1, row)\n    mag_row = np.linalg.norm(row)\n    x_time_y = mag_d1 * mag_row\n    dist.append(dot_product_xy/x_time_y) \ndist_series = pd.Series(dist)\ndist_series = dist_series.sort_values(ascending=False)\ndist_series.iloc[1:6]\ndist_series = pd.DataFrame(dist_series)\n"

In [16]:
# Compute similarity of movie: Melvin and Howard
index = df[df['Title'] == your_pick].index[0]
d1 = list(csr_matrix.toarray(X[index]))
mag_d1 = np.linalg.norm(d1)
dist = []
for i in range(X.shape[0]):
    row = list(csr_matrix.toarray(X[i]))
    dot_product_xy = np.multiply(d1, row).sum(1)
    mag_row = np.linalg.norm(row)
    x_time_y = mag_d1 * mag_row
    dist.append(dot_product_xy/x_time_y) 
dist_series = pd.Series(dist)
dist_series = dist_series.sort_values(ascending=False)
dist_series.iloc[1:6]
dist_series = pd.DataFrame(dist_series)

### Recommend Top 5 Movies

In [17]:
# Merge similarity scores with original dataframe to visualize
result = pd.merge(dist_series, df, how='inner', left_index=True, right_index=True)
result = result.rename({0: 'Cosine Similarity Score'}, axis='columns')

In [18]:
# Show closest results
result.head(6)

Unnamed: 0,Cosine Similarity Score,IMDBid,Title,Content
6069,[1.0],tt1074638,Skyfall,"UK, USA SamMendes NealPurvis RobertWade EonPro..."
7589,[0.23768025689893485],tt2379713,Spectre,"UK, USA SamMendes JohnLogan NealPurvis B24 Dan..."
190,[0.20872375889872066],tt0086006,Never Say Never Again,"UK, USA, West Germany IrvinKershner KevinMcClo..."
4519,[0.1858382296061858],tt0830515,Quantum of Solace,"UK, USA MarcForster PaulHaggis NealPurvis Metr..."
4024,[0.16730066255531495],tt0381061,Casino Royale,"UK, Czech Republic, USA, Germany, Bahamas Mart..."
2924,[0.14985118325489172],tt0246460,Die Another Day,"UK, USA LeeTamahori IanFleming NealPurvis EonP..."


### Some Ideas for Future

In [19]:
# make a window of year range
# take the difference between year to see the closest
# make it fast: compute the tfidf once per day, put in cache
# Use PCA to speed up cosine similarity computation