In [87]:
import pandas as pd
# Load the movie metadata (movieId, title, genres) from movies.csv
movies = pd.read_csv('movies.csv')

In [88]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [89]:
import re

def cleantitle(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter, a digit, or a space (e.g. parentheses, commas, apostrophes)."""
    disallowed_chars = "[^a-zA-Z0-9 ]"
    cleaned = re.sub(disallowed_chars, "", title)
    return cleaned


In [90]:
# Add a search-friendly copy of every title with punctuation stripped by cleantitle
movies["clean_title"] = movies["title"].apply(cleantitle)

In [91]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2): use both single words and pairs of consecutive words as features
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from the cleaned titles and turn them into a TF-IDF matrix
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [93]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the rows of ``movies`` whose cleaned titles best match ``title``.

    The query is cleaned with ``cleantitle``, vectorized with the fitted
    ``vectorizer`` and compared to every row of ``tfidf`` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least
        similar, so row 0 is the best match.
    """
    title = cleantitle(title)
    query_vec = vectorizer.transform([title])  # turn the query into a TF-IDF vector
    similarity = cosine_similarity(query_vec, tfidf).flatten()  # one score per movie

    # Never ask for more rows than exist; argpartition raises if kth is out of bounds.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the k largest scores end up in the last
    # k slots; their relative order is arbitrary, so they are ranked explicitly.
    top = np.argpartition(similarity, -k)[-k:]
    ranked = top[np.argsort(-similarity[top], kind="stable")]
    return movies.iloc[ranked]

In [94]:
search("toy story")

Unnamed: 0,movieId,title,genres,clean_title
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [95]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the query plus an output area that shows the matching titles
movie_input = widgets.Text(
    value='Toy Story',
    description='Enter Title:',
    disabled=False,
)
movie_list = widgets.Output()


def on_type(data):
    """Refresh ``movie_list`` with search results for the newly typed value.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry holds the current text. Nothing is searched until the text is
    longer than 5 characters.
    """
    with movie_list:
        # always start from an empty output area
        movie_list.clear_output()
        title = data["new"]
        if len(title) <= 5:
            return
        display(search(title))


# run on_type every time the text box value changes
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Toy Story', description='Enter Title:')

Output()

In [96]:
# Load the user ratings (userId, movieId, rating, timestamp) from ratings.csv
ratings = pd.read_csv("ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [97]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [120]:
# Example movie to build recommendations for; movieId 1 is "Toy Story (1995)" in movies
movie_id = 1

In [121]:
# Unique users who rated the movie `movie_id` strictly above 4 stars
# (a rating of exactly 4.0 does not count); these are the "similar users".
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [122]:
# movieIds of every movie the similar users rated above 4 stars, i.e. the other
# movies enjoyed by people who enjoyed `movie_id`. There is one entry per rating,
# so a movieId appears once for each similar user who liked it.
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [123]:
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [124]:
# Turn the per-movie counts into the fraction of similar users who liked each movie,
# then keep only the movies liked by more than 10 percent of the similar users.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [125]:
similar_user_recs

movieId
1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: count, Length: 113, dtype: float64

In [129]:
# Every rating above 4 stars, from any user, for the movies kept in similar_user_recs
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [131]:
# Fraction of the users in all_users (anyone who liked at least one candidate movie)
# who liked each candidate movie.
# We want this fraction to be clearly lower than the fraction of similar users who liked the movie:
# if nearly all users like a movie, recommending it only reflects that the movie is popular.
# We would rather recommend a movie that 30% of all users like but that a much larger share of similar users like.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [132]:
# Put the two fractions side by side, aligned on movieId, and give the columns readable names
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis = 1)
rec_percentages.columns = ["similar user recs" , "all user recs"]

In [133]:
rec_percentages

Unnamed: 0_level_0,similar user recs,all user recs
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [137]:
# Score = share among similar users divided by share among all users;
# a higher score means the movie is liked disproportionately often by similar users.
rec_percentages["score"] = rec_percentages["similar user recs"] / rec_percentages["all user recs"]

In [138]:
# Highest scores first
rec_percentages = rec_percentages.sort_values("score", ascending = False)

In [139]:
rec_percentages

Unnamed: 0_level_0,similar user recs,all user recs,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [140]:
# Take the ten highest-scoring movies and join in their title and genres from movies
rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")

Unnamed: 0,similar user recs,all user recs,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [147]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies that fans of ``movie_id`` like disproportionately often.

    Reads the notebook-level ``ratings`` and ``movies`` frames.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to find recommendations for.
    min_rating : float, default 4
        A rating counts as "liked" only when it is strictly greater than this.
    min_share : float, default 0.10
        A candidate is kept only when more than this fraction of the similar
        users liked it.
    top_n : int, default 10
        Number of highest-scoring candidates to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for the ``top_n``
        candidates, highest score first.
    """
    # Evaluate the "liked" condition once; every filter below reuses it.
    liked = ratings["rating"] > min_rating

    # Similar users: everyone who liked the given movie.
    similar_users = ratings.loc[liked & (ratings["movieId"] == movie_id), "userId"].unique()
    # Every movie those users liked (one entry per user/movie pair).
    similar_user_recs = ratings.loc[liked & ratings["userId"].isin(similar_users), "movieId"]

    # Fraction of similar users who liked each movie; drop the rarely liked ones.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Fraction of all users (anyone who liked at least one candidate) who liked
    # each candidate movie.
    all_users = ratings[liked & ratings["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Side-by-side table of both fractions, aligned on movieId.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # The score is the ratio of the two fractions: the higher it is, the more
    # specific the movie is to people who liked ``movie_id`` rather than being
    # popular with everyone.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Keep the best candidates and attach their title and genres.
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [156]:
# Text box for the movie title plus an output area for the recommendations
movie_name_input = widgets.Text(value='Jurassic Park', description='Enter Title:', disabled=False)
recommendation_list = widgets.Output()


def on_type(data):
    """Refresh ``recommendation_list`` for the newly typed title.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry holds the current text. Once the text is longer than 5 characters,
    the first row returned by ``search`` supplies the ``movieId`` handed to
    ``find_similar_movies``.
    """
    with recommendation_list:
        # always start from an empty output area
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) <= 5:
            return
        first_match = search(title).iloc[0]
        display(find_similar_movies(first_match["movieId"]))


# run on_type every time the text box value changes
movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Jurassic Park', description='Enter Title:')

Output()