In [1]:
import pandas as pd
import numpy as np
import regex as re
import ipywidgets as widgets
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Load the movie catalogue from the working directory; later cells use its
# movieId, title and genres columns.
movies = pd.read_csv("movies.csv")

In [2]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
def clean_title(title):
    """Return ``title`` reduced to ASCII letters, digits and single spaces.

    Every character other than a-z, A-Z or 0-9 is treated as a separator.
    Runs of separators collapse to one space and leading/trailing separators
    are dropped, so "Toy Story (1995)" becomes "Toy Story 1995" rather than
    "Toy Story  1995 ".

    Args:
        title: Raw movie title or search query (a string).

    Returns:
        The cleaned string; empty when ``title`` contains no ASCII
        alphanumeric character.
    """
    # Substitute first, then let str.split() squeeze the whitespace that the
    # substitution leaves behind (doubled, leading and trailing spaces).
    return " ".join(re.sub("[^a-zA-Z0-9]", " ", title).split())

In [4]:
movies["clean_title"] = movies["title"].apply(clean_title) # title column without special characters; the search index is built from it

In [5]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [6]:
vectorizer = TfidfVectorizer(ngram_range=(1,2)) # vectorize titles using single words and pairs of adjacent words

tfidf = vectorizer.fit_transform(movies["clean_title"]) # fit on the cleaned titles and build their TF-IDF matrix

In [7]:
def search(title, top_n=5):
    """Return the rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Args:
        title: Free-text title typed by the user.
        top_n: Maximum number of rows to return (default 5).

    Returns:
        A slice of ``movies`` ordered from most to least similar, so the best
        match is the FIRST row. The order among equal scores is unspecified.

    Raises:
        ValueError: If ``top_n`` is negative.

    NOTE: results are not filtered by a minimum similarity, so ``top_n`` rows
    come back even when nothing matches the query well.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    title = clean_title(title)  # same normalization as the indexed titles
    query_vec = vectorizer.transform([title])
    similarity_scores = cosine_similarity(query_vec, tfidf).flatten()

    # argsort is ascending: reverse it to get best-first, then keep the head.
    # Slicing after the reversal also makes top_n=0 yield an empty result.
    indices = np.argsort(similarity_scores)[::-1][:top_n]
    return movies.iloc[indices]

In [8]:
# Sanity check: the exact title should come back as the first row.
example = search("Fight Club")
example

Unnamed: 0,movieId,title,genres,clean_title
2867,2959,Fight Club (1999),Action|Crime|Drama|Thriller,Fight Club 1999
45149,169736,Zombie Fight Club (2014),Action|Horror,Zombie Fight Club 2014
45572,170661,Female Fight Club (2017),Action,Female Fight Club 2017
45023,169468,Lure: Teen Fight Club (2010),Action|Crime|Drama,Lure Teen Fight Club 2010
37860,153548,Fight Club: Members Only (2006),(no genres listed),Fight Club Members Only 2006


In [9]:
# --- Interactive title search ---
# Text box the user types a movie title into.
title_input = widgets.Text(value="Fight Club", description="Movie Title", disabled=False)

# Output area that holds the current search results.
movie_list = widgets.Output()


def on_type(data: dict):
    """Refresh ``movie_list`` each time the text box value changes.

    ``data`` is the change notification handed over by ``observe``; its
    "new" entry is the text currently in the box.
    """
    with movie_list:
        # Previous results are wiped on every change, even when the text is
        # too short to trigger a new search.
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) < 4:
            return
        display(search(typed))


# Call on_type whenever the widget's value changes.
title_input.observe(on_type, names='value')

display(title_input, movie_list)

Text(value='Fight Club', description='Movie Title')

Output()

In [10]:
# Load the user ratings from the working directory; later cells rely on the
# userId, movieId and rating columns.
ratings = pd.read_csv("ratings.csv")

In [11]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [12]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [13]:
# movie_id = 1 is a placeholder used to check that the code works.
# Any movieId would do; it becomes a function parameter further down.
movie_id = 1
# Building the recommendation step by step
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >4)]["userId"].unique() 
# users who liked (rating > 4) the movie being searched for

similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
# every movie those similar users also liked

similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > .1] # keep movies liked by more than 10% of similar users

# Measure how popular those same movies are among users in general, so that
# broadly popular movies can be told apart from the ones specific to similar users.
# The denominator below is the number of users who liked at least one of the candidate movies.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [14]:
all_users_recs

movieId
318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: count, Length: 113, dtype: float64

In [15]:
# Creating a recommendation score: put both shares side by side, aligned on the movieId index
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
# "similar": share of similar users who liked the movie; "all": share among the wider pool of users

In [16]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [17]:
# Score is the ratio of the two shares: the higher it is, the more a movie is
# liked by similar users relative to users in general
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"] 

In [18]:
# Highest score first, so the strongest recommendations sit at the top.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [19]:
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [20]:
# Attach title and genres to the ten best-scoring movies by matching the index against movieId.
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bug s Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [30]:
#Building a recommendation system with everything we've done so far
def find_similar_movies(movie_id, min_rating=4, min_share=0.1, top_n=10):
    """Recommend movies liked by the users who liked ``movie_id``.

    A rating strictly above ``min_rating`` counts as a "like". Users who
    liked ``movie_id`` are the "similar users". A candidate movie's score is
    the share of similar users who liked it divided by the share of the wider
    pool of users (everyone who liked at least one candidate) who liked it,
    so broadly popular movies are pushed down.

    Args:
        movie_id: ``movieId`` of the movie to base recommendations on.
        min_rating: A rating must be strictly greater than this to count as a
            like (default 4).
        min_share: A candidate must be liked by strictly more than this
            fraction of similar users (default 0.1).
        top_n: Maximum number of recommendations returned (default 10).

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres``, best score
        first. ``movie_id`` itself is never part of the result. The frame is
        empty when nobody liked ``movie_id``.
    """
    # Apply the "liked" filter once instead of once per step.
    liked = ratings[ratings["rating"] > min_rating]

    # Finding "similar users"
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nothing to base a recommendation on; also avoids dividing by zero.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of similar users who liked each movie, keeping the common ones only
    similar_user_recs = (
        liked.loc[liked["userId"].isin(similar_users), "movieId"].value_counts()
        / len(similar_users)
    )
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # How common the same likes are among the wider pool of users
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    # Creating a "score" to recommend the movie
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    # Every similar user liked the query movie by construction, so it would
    # always rank near the top; recommending it back is not useful.
    rec_percentages = rec_percentages.drop(index=movie_id, errors="ignore")

    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    recs = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]
    return recs
    

In [31]:
# --- Interactive recommendation widget ---
# Text box the user types a movie title into.
movie_name_input = widgets.Text(value="Fight Club", description="Movie Title:", disabled=False)

# Output area that holds the current recommendations.
recommendation_list = widgets.Output()


def on_type(data):
    """Refresh ``recommendation_list`` each time the text box value changes.

    ``data`` is the change notification handed over by ``observe``; its
    "new" entry is the text currently in the box. The best search hit for
    that text is used as the seed movie for the recommendations.

    NOTE: this reuses the name ``on_type`` from the search-widget cell, so
    the module-level name refers to this callback once the cell has run.
    """
    with recommendation_list:
        # Previous recommendations are wiped on every change, even when the
        # text is too short to trigger a new lookup.
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 3:
            return
        best_match = search(typed).iloc[0]
        display(find_similar_movies(best_match["movieId"]))


# Call on_type whenever the widget's value changes.
movie_name_input.observe(on_type, names="value")

display(movie_name_input, recommendation_list)
            

Text(value='Fight Club', description='Movie Title:')

Output()