In [30]:
import pandas as pd

In [31]:
movies = pd.read_csv("movies.csv")

In [32]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [33]:
import re
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [34]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [35]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2)) 
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [37]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5) [-5:]
    results = movies.iloc[indices][::-1]
    return results

In [38]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disable=False
)

movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) >5:
            display(search(title))

movie_input.observe(on_type, names = 'value')

display(movie_input, movie_list)
            

Text(value='Toy Story', description='Movie Title:')

Output()

In [39]:
ratings = pd.read_csv("ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [40]:
movie_id = 89745

In [41]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [42]:
similar_users

array([    21,    187,    208, ..., 162469, 162485, 162532], shape=(6036,))

In [43]:
similar_user_recs = ratings[(ratings["movieId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [44]:
similar_user_recs

33           5767
56           8154
76            260
149          2194
159          2501
            ...  
24999891    56941
24999893    58293
24999923      260
24999935      541
24999971     1259
Name: movieId, Length: 269882, dtype: int64

In [45]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [46]:
similar_user_recs

movieId
260       5.559145
589       3.371272
2762      2.959576
541       2.877071
924       2.107853
            ...   
906       0.114148
1694      0.113817
111360    0.112326
52435     0.106362
71464     0.103711
Name: count, Length: 63, dtype: float64

In [47]:
all_users =  ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [48]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [49]:
all_users_recs

movieId
260       0.355497
589       0.215587
2762      0.189259
541       0.183983
924       0.134793
            ...   
906       0.007300
1694      0.007278
111360    0.007183
52435     0.006802
71464     0.006632
Name: count, Length: 63, dtype: float64

In [50]:
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [51]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
260,5.559145,0.355497
589,3.371272,0.215587
2762,2.959576,0.189259
541,2.877071,0.183983
924,2.107853,0.134793
...,...,...
906,0.114148,0.007300
1694,0.113817,0.007278
111360,0.112326,0.007183
52435,0.106362,0.006802


In [52]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending=True)

In [53]:
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8228,0.196985,0.012597,15.637674
2968,0.279490,0.017873,15.637674
1101,0.561299,0.035894,15.637674
8641,0.363154,0.023223,15.637674
3409,0.119781,0.007660,15.637674
...,...,...,...
61132,0.198144,0.012671,15.637674
21,0.604871,0.038680,15.637674
628,0.412028,0.026348,15.637674
647,0.154738,0.009895,15.637674


In [54]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
7683,0.196985,0.012597,15.637674,8228,"Maltese Falcon, The (a.k.a. Dangerous Female) ...",Mystery,Maltese Falcon The aka Dangerous Female 1931
2876,0.27949,0.017873,15.637674,2968,Time Bandits (1981),Adventure|Comedy|Fantasy|Sci-Fi,Time Bandits 1981
1074,0.561299,0.035894,15.637674,1101,Top Gun (1986),Action|Romance,Top Gun 1986
7928,0.363154,0.023223,15.637674,8641,Anchorman: The Legend of Ron Burgundy (2004),Comedy,Anchorman The Legend of Ron Burgundy 2004
3314,0.119781,0.00766,15.637674,3409,Final Destination (2000),Drama|Thriller,Final Destination 2000
19678,0.299536,0.019155,15.637674,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013
1631,0.113817,0.007278,15.637674,1694,"Apostle, The (1997)",Drama,Apostle The 1997
11527,0.106362,0.006802,15.637674,52435,How the Grinch Stole Christmas! (1966),Animation|Comedy|Fantasy|Musical,How the Grinch Stole Christmas 1966
1917,0.245527,0.015701,15.637674,2006,"Mask of Zorro, The (1998)",Action|Comedy|Romance,Mask of Zorro The 1998
1210,0.245527,0.015701,15.637674,1243,Rosencrantz and Guildenstern Are Dead (1990),Comedy|Drama,Rosencrantz and Guildenstern Are Dead 1990


In [58]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [59]:
movie_name_input = widgets.Text(
    value="Toy Story",
    description = "Movie Title:",
    disable=False
)    

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title =data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names = "value")

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()