# Recommender system tutorial

In [1]:
import pandas as pd

In [2]:
metadata = pd.read_csv("/Users/zacharyargentin/Programming/datasets/movies_dataset/movies_metadata.csv", low_memory=False)
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


# Simple Recommender

wr = (vR / (v + m)) + mC / (v + m).  

v = vote_count     
R = vote_average   
C = mean rating across all movies   
m = minimum vote count to be considered   


In [3]:
C = metadata["vote_average"].mean()
C

5.618207215133889

In [4]:
m = metadata["vote_count"].quantile(0.90)
m

160.0

In [5]:
q_movies = metadata.copy().loc[metadata["vote_count"].ge(m)]
q_movies.shape

(4555, 24)

In [6]:
# Function that computes the weighted rating of each movie
# wr = (vR / (v + m)) + mC / (v + m).
def weighted_rating(x, m=m, C=C):
    v = x["vote_count"]
    R = x["vote_average"]
    return (v/(v+m) * R) + (m/(m+v) * C)

In [7]:
q_movies["score"] = q_movies.apply(weighted_rating, axis=1)

In [8]:
#Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


# Content-Based Recommender

In [9]:
metadata["overview"].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
tfidf = TfidfVectorizer(stop_words="english")
metadata["overview"] = metadata["overview"].fillna("")
tfidf_matrix = tfidf.fit_transform(metadata["overview"])

In [12]:
tfidf_matrix.shape

(45466, 75827)

In [13]:
tfidf.get_feature_names()[5000:5010]

['avails',
 'avaks',
 'avalanche',
 'avalanches',
 'avallone',
 'avalon',
 'avant',
 'avanthika',
 'avanti',
 'avaracious']

With this matrix in hand, you can now compute a similarity score. There are several similarity metrics that you can use for this, such as the manhattan, euclidean, the Pearson, and the cosine similarity scores.

since you have used the TF-IDF vectorizer, calculating the dot product between each vector will directly give you the cosine similarity score. Therefore, you will use sklearn's linear_kernel() instead of cosine_similarities() since it is faster.

In [14]:
from sklearn.metrics.pairwise import linear_kernel
# from sklearn.metrics.pairwise import cosine_similarity

In [15]:
%%time
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

CPU times: user 9.51 s, sys: 7.59 s, total: 17.1 s
Wall time: 30.7 s


In [16]:
cosine_sim.shape

(45466, 45466)

In [17]:
cosine_sim[351]

array([0.        , 0.        , 0.        , ..., 0.        , 0.02055491,
       0.        ])

In [18]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(metadata.index, index=metadata["title"]).drop_duplicates()
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [19]:
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

#### get recommendation function explained

In [20]:
indices["Forrest Gump"]

351

In [21]:
list(enumerate(cosine_sim[351]))

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.014145921296366692),
 (4, 0.0),
 (5, 0.0422384868359441),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.008864279982673402),
 (13, 0.015389306851069958),
 (14, 0.0),
 (15, 0.011634020150998178),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.009362928840514588),
 (21, 0.0),
 (22, 0.015457887641372696),
 (23, 0.07882762348082893),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.011208425312457705),
 (28, 0.0),
 (29, 0.0),
 (30, 0.030570217050889796),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.004358263239701963),
 (35, 0.0365573686370032),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.03004625428175299),
 (43, 0.0),
 (44, 0.0),
 (45, 0.02386073123752926),
 (46, 0.0170794449269769),
 (47, 0.022522186271504627),
 (48, 0.0),
 (49, 0.005971417123408101),
 (50, 0.009180916775185366),
 (51, 0.0),
 (52, 0.0),
 (53, 0.00531155991969874),
 (54, 0.0),
 (55, 0.0),
 (56, 0.0),
 (57, 0.02225

In [22]:
sorted(list(enumerate(cosine_sim[351])), key=lambda x: x[1], reverse=True)

[(351, 1.0),
 (10966, 0.1587729274540996),
 (13513, 0.14806757717874905),
 (22372, 0.14476753358734779),
 (31468, 0.14250391641554108),
 (9214, 0.139174484273161),
 (19554, 0.1361922271789111),
 (29006, 0.13253494066771437),
 (39672, 0.13021824065329346),
 (40018, 0.12920084493151907),
 (30256, 0.12523231833420576),
 (16579, 0.12493384304379111),
 (33082, 0.12239676293677659),
 (18559, 0.11924679245064776),
 (15688, 0.1179546896830849),
 (11633, 0.11702260055250119),
 (30916, 0.11653513511562857),
 (27481, 0.11596154853582294),
 (23239, 0.1149255898105854),
 (2823, 0.10675918229176526),
 (2073, 0.10649665407746314),
 (18153, 0.10610768497865841),
 (45181, 0.10497683761054849),
 (31371, 0.1042966073086643),
 (12479, 0.1036382859624619),
 (31250, 0.1032371963367764),
 (33726, 0.10270912339165636),
 (27068, 0.10270470887419289),
 (386, 0.10179301557024092),
 (5225, 0.10158317734243281),
 (32144, 0.10027796980791764),
 (43707, 0.09915431532806568),
 (22979, 0.0977420914907939),
 (32238, 0.

In [23]:
sorted(list(enumerate(cosine_sim[351])), key=lambda x: x[1], reverse=True)[1:11]

[(10966, 0.1587729274540996),
 (13513, 0.14806757717874905),
 (22372, 0.14476753358734779),
 (31468, 0.14250391641554108),
 (9214, 0.139174484273161),
 (19554, 0.1361922271789111),
 (29006, 0.13253494066771437),
 (39672, 0.13021824065329346),
 (40018, 0.12920084493151907),
 (30256, 0.12523231833420576)]

In [24]:
[i[0] for i in sorted(list(enumerate(cosine_sim[351])), key=lambda x: x[1], reverse=True)[1:11]]

[10966, 13513, 22372, 31468, 9214, 19554, 29006, 39672, 40018, 30256]

In [25]:
metadata['title'].iloc[[i[0] for i in sorted(list(enumerate(cosine_sim[351])), key=lambda x: x[1], reverse=True)[1:11]]]

10966       An American Haunting
13513             Nigdy w życiu!
22372             The Lost Thing
31468                     Carver
9214                 Crane World
19554     The Marshal of Finland
29006    For a Handful of Kisses
39672                  U Be Dead
40018               Down by Love
30256                   Lucky 13
Name: title, dtype: object

In [26]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    titles = metadata["title"].iloc[movie_indices]
    return titles

In [27]:
get_recommendations("Toy Story")

15348                                     Toy Story 3
2997                                      Toy Story 2
10301                          The 40 Year Old Virgin
24523                                       Small Fry
23843                     Andy Hardy's Blonde Trouble
29202                                      Hot Splash
43427                Andy Kaufman Plays Carnegie Hall
38476    Superstar: The Life and Times of Andy Warhol
42721    Andy Peters: Exclamation Mark Question Point
8327                                        The Champ
Name: title, dtype: object

> The recommendations aren't very good using only the plot.   
> let's add more features

In [28]:
credits = pd.read_csv("/Users/zacharyargentin/Programming/datasets/movies_dataset/credits.csv")
keywords = pd.read_csv("/Users/zacharyargentin/Programming/datasets/movies_dataset/keywords.csv")

In [29]:
# Remove rows with bad IDs.
metadata = metadata.drop([19730, 29503, 35587])

In [30]:
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [31]:
credits.head(3)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602


In [32]:
keywords.head(3)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [33]:
# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

In [34]:
# Merge keywords and credits into your main metadata dataframe
metadata = metadata.merge(credits, on="id")
metadata = metadata.merge(keywords, on="id")

In [35]:
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [36]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

In [37]:
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [38]:
metadata["crew"][0][0]

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [39]:
import numpy as np

In [40]:
def get_director(x):
    for i in x:
        if i["job"] == ("Director"):
            return i["name"]
    return np.nan

In [41]:
def get_list(x):
    if isinstance(x, list):
        names = [i["name"] for i in x]
        if len(names) > 3:
            return names[:3]
        return names
    return[]

In [42]:
metadata["director"] = metadata["crew"].apply(get_director)

In [43]:
features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)

In [44]:
metadata[["title", "cast", "genres", "keywords", "director"]].head(3)

Unnamed: 0,title,cast,genres,keywords,director
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]","[Animation, Comedy, Family]","[jealousy, toy, boy]",John Lasseter
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...",Joe Johnston
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[Romance, Comedy]","[fishing, best friend, duringcreditsstinger]",Howard Deutch


In [45]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [46]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

In [47]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [48]:
metadata['soup'] = metadata.apply(create_soup, axis=1)

In [49]:
metadata[['soup']].head(2)

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...


In [50]:
from sklearn.feature_extraction.text import CountVectorizer

In [51]:
count = CountVectorizer(stop_words="english")
count_matrix = count.fit_transform(metadata["soup"])

In [52]:
type(count_matrix)

scipy.sparse.csr.csr_matrix

In [53]:
count_matrix[:4000].shape

(4000, 73881)

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

### kernal dies here for some reason   
I think stack overflow. when I slice a smaller sample it works fine

In [55]:
%%time
cosine_sim2 = cosine_similarity(count_matrix[:200000], count_matrix[:20000])

CPU times: user 3.43 s, sys: 3.82 s, total: 7.25 s
Wall time: 12.6 s


In [56]:
# Reset index of your main DataFrame and construct reverse mapping as before
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

In [57]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

12589      The Dark Knight
10210        Batman Begins
9311                Shiner
9874       Amongst Friends
7772              Mitchell
516      Romeo Is Bleeding
11463         The Prestige
10853       Helter Skelter
18940            Last Exit
4520       An Innocent Man
Name: title, dtype: object

In [58]:
get_recommendations('The Godfather', cosine_sim2)

1934            The Godfather: Part III
1199             The Godfather: Part II
15609                   The Rain People
18940                         Last Exit
8001     The Night of the Following Day
18261                 The Son of No One
7772                           Mitchell
1186                     Apocalypse Now
1648                   Ill Gotten Gains
3487         Jails, Hospitals & Hip-Hop
Name: title, dtype: object