In [10]:
import pandas as pd

# Load the movie catalogue from movies.csv in the working directory.
movies = pd.read_csv("movies.csv")

In [11]:
# Preview the loaded frame (pandas elides the middle rows when rendering).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [12]:
import re

def clean_title(title):
    """Return ``title`` with every non-alphanumeric character turned into a space.

    Anything outside ``a-z``, ``A-Z`` and ``0-9`` -- including existing spaces,
    punctuation and accented letters -- is replaced by a single space
    character. Characters are substituted one-for-one, so the result has the
    same length as the input and may contain consecutive or trailing spaces
    (e.g. ``"Toy Story (1995)"`` becomes ``"Toy Story  1995 "``).
    """
    non_alphanumeric = re.compile("[^a-zA-Z0-9]")
    return non_alphanumeric.sub(" ", title)

In [13]:
# Add a "clean_title" column holding clean_title() applied to every raw title.
# Note: this mutates the `movies` frame in place.
movies["clean_title"] = movies["title"].apply(clean_title)

In [14]:
# Display the frame again to confirm the new clean_title column is present.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over single words and adjacent word pairs (ngram_range=(1,2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit the vocabulary on the cleaned titles and encode them as a TF-IDF matrix;
# search() compares query vectors against this matrix.
tfidf = vectorizer.fit_transform(movies["clean_title"])


In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, encoded with the fitted
    module-level ``vectorizer`` and compared against every row of the
    module-level ``tfidf`` matrix by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.

    Returns
    -------
    pandas.DataFrame
        Up to five rows of ``movies`` ordered by descending similarity, so
        ``results.iloc[0]`` is the closest match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])  # encode the query like the corpus
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more results than there are titles.
    top_k = min(5, len(similarity))

    # argpartition only guarantees that the top_k largest scores occupy the
    # last top_k slots; it does NOT order them. Sort that small slice
    # explicitly so the best match really comes first.
    indices = np.argpartition(similarity, -top_k)[-top_k:]
    indices = indices[np.argsort(similarity[indices])[::-1]]

    return movies.iloc[indices]

In [17]:
# Load the user ratings from ratings.csv in the working directory.
ratings = pd.read_csv("ratings.csv")

In [18]:
# Preview the ratings frame (pandas elides the middle rows when rendering).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [19]:
# Inspect the dtype of each ratings column.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [20]:
#the recommendation function
def find_similar_movies(movie_id):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    Reads the module-level ``ratings`` and ``movies`` frames.

    Steps:

    1. "Similar users" are the users who rated ``movie_id`` above 4.
    2. Candidate movies are those that more than 10% of the similar users
       also rated above 4 (``similar`` = that fraction).
    3. For each candidate, ``all`` is the number of ratings above 4 it got
       from anyone, divided by the number of distinct users who rated at
       least one candidate above 4.
    4. ``score = similar / all``; a high score means the movie is liked
       far more by similar users than by the wider audience.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.

    Returns
    -------
    pandas.DataFrame
        Up to ten rows with columns ``score``, ``title`` and ``genres``,
        highest score first.
    """
    liked = ratings["rating"] > 4

    # Users who liked the given movie.
    similar_users = ratings[(ratings["movieId"] == movie_id) & liked]["userId"].unique()

    # Every other movie those users liked (one entry per user/movie rating).
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & liked]["movieId"]

    # Fraction of similar users who liked each movie; keep those above 10%.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .1]

    # All "liked" ratings, from any user, for the candidate movies.
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & liked]

    # Fraction of those users who liked each candidate movie. This is the
    # baseline popularity used to discount movies that everybody likes.
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Put both fractions side by side, indexed by movieId.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # Relative preference of similar users over the baseline, best first.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Join the top 10 (indexed by movieId) onto movies to get titles and genres.
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score","title","genres"]]


In [23]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the movie title, pre-filled with "Toy Story".
movie_input_name = widgets.Text(
    value = "Toy Story",
    description="Movie Title:",
    disabled=False
)

# Output area that on_type() renders the recommendation table into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendation output when the text box value changes.

    ``data`` is the change dictionary passed by the widget observer; its
    ``"new"`` entry is the current text. The output area is always cleared.
    For text longer than five characters, the first row returned by
    ``search`` is taken as the matched movie and its recommendations from
    ``find_similar_movies`` are displayed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        # Too short to search meaningfully: leave the output empty.
        if not len(query) > 5:
            return
        best_match = search(query).iloc[0]
        display(find_similar_movies(best_match["movieId"]))

# Call on_type whenever the text box's "value" trait changes.
movie_input_name.observe(on_type, names="value")

# Render the text box followed by the recommendation output area.
display(movie_input_name,recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()