In [4]:
import pandas as pd
import numpy as np

# Load the movie metadata table (movieId, title, genres) from the local ml-25m folder.
movies = pd.read_csv("ml-25m/movies.csv")

In [5]:
# Preview the first rows to check which columns were loaded.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
import re

def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit or a space.

    Parameters
    ----------
    title : str
        Raw movie title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The title with all other characters removed (nothing is substituted
        for them), e.g. ``"Toy Story 1995"``. Non-ASCII letters are removed
        as well.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    cleaned = disallowed.sub("", title)
    return cleaned


In [7]:
# Add a punctuation-free copy of every title; the TF-IDF search is fitted on this column.
movies["cleaned_title"] = movies["title"].apply(clean_title)

In [8]:
# Show the frame again to confirm the new cleaned_title column.
movies

Unnamed: 0,movieId,title,genres,cleaned_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1, 2) indexes single words and pairs of consecutive words.
vectorizer = TfidfVectorizer(ngram_range = (1,2))

# fit_transform learns the vocabulary from the cleaned titles and returns their TF-IDF matrix in one step.
tfidf = vectorizer.fit_transform(movies["cleaned_title"])

<62423x170073 sparse matrix of type '<class 'numpy.float64'>'
	with 446566 stored elements in Compressed Sparse Row format>

### Creating a search function 

In [10]:
# cosine similarity

# Finding the similarity
from sklearn.metrics.pairwise import cosine_similarity

def search(title, top_n=5):
    """Return the movies whose cleaned titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared with every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text title typed by the user.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from the most similar
        title to the least similar, so ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition would raise otherwise.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[:0]

    # argpartition only guarantees that the last top_n positions hold the
    # top_n largest scores; it does not order them. Rank the candidates
    # explicitly so that the first returned row really is the best match.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranking = np.argsort(-similarity[candidates], kind="stable")
    results = movies.iloc[candidates[ranking]]

    return results

In [11]:
# Prompt for a title and show its closest matches.
# NOTE(review): input() blocks an unattended Restart & Run All. In the recorded run
# the prompt was submitted empty, so the rows shown are presumably not real matches.
myinput = input('Enter the name of the movie : ')

search(myinput)

Enter the name of the movie : 


Unnamed: 0,movieId,title,genres,cleaned_title
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama,Women of Devils Island 1962
20808,107573,Apnea (Apnoia) (2010),Drama,Apnea Apnoia 2010
20806,107563,"Princess for Christmas, A (2011)",Children|Comedy,Princess for Christmas A 2011
20804,107557,Fun Size (2012),Comedy,Fun Size 2012
20807,107565,"Fuck You, Goethe (Fack Ju Göhte) (2013)",Comedy,Fuck You Goethe Fack Ju Ghte 2013


### Display the results

In [12]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into.
movie_input = widgets.Text(
    description='Movie Title:',
    disabled=False
)
# Output area that the search results are rendered into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results whenever the text box value changes.

    ``data`` is the change notification passed in by ``observe``; only its
    ``"new"`` entry (the current text) is read. The output area is always
    cleared, and results are shown only when more than three characters
    have been typed.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 3:
            return
        display(search(query))

# Call on_type every time the text box's value changes.
movie_input.observe(on_type, names='value')


# Render the text box followed by the results area.
display(movie_input, movie_list)



Text(value='', description='Movie Title:')

Output()

## Ratings Data 

In [13]:
# Load the ratings table (userId, movieId, rating, timestamp) from the local ml-25m folder.
ratings = pd.read_csv("ml-25m/ratings.csv")

In [14]:
# Preview the first rows to check which columns were loaded.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


##### Finding users who like the same movie

In [15]:
# movieId of Toy Story (1995), used as the worked example
movie_id = 1

# Users who rated that movie strictly above 4 (a rating of exactly 4.0 does not count as a like).
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533])

In [16]:
# Every movie that those similar users rated above 4 (this includes the example movie itself).
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [17]:
# One entry per rating row that passed the filter, so popular movieIds repeat.
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [18]:
# Count how many similar users liked each movie; movieId 1 appears 18835 times, once per similar user.
similar_user_recs.value_counts()


1         18835
318        8393
260        7605
356        6973
296        6918
          ...  
128478        1
125125        1
119701        1
107563        1
7625          1
Name: movieId, Length: 19282, dtype: int64

In [19]:
# Number of similar users; this is the denominator used to turn the counts into shares.
len(similar_users)

18835

In [20]:
# Turn the counts into the share of similar users who liked each movie,
# then keep only the movies liked by more than 10% of them.
# NOTE(review): this cell overwrites similar_user_recs, so re-running it without first
# re-running the cell that builds similar_user_recs gives a different result.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > .10]
similar_user_recs

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

##### Finding how much all users like these movies

In [21]:
# Ratings above 4, from any user, for the candidate movies kept in similar_user_recs.
all_users = ratings[ratings["movieId"].isin(similar_user_recs.index) & (ratings['rating'] > 4)]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [22]:
# Share of users who liked each candidate movie.
# The denominator counts only the users present in all_users, i.e. users with
# at least one rating above 4 for a candidate movie.
all_users_recs = all_users['movieId'].value_counts() / len(all_users["userId"].unique())

In [23]:
# For example, movieId 318 is liked by about 34% of these users.
all_users_recs # share of the users in all_users who liked each movie

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

##### Creating a recommendation score 

In [24]:
# Put the two shares side by side, one row per movieId.
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [25]:
# Score = share among similar users divided by share among all users.
# A high score means the movie is liked disproportionately by the similar users.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124728,8.017414
318,0.445607,0.342220,1.302105
260,0.403770,0.222207,1.817089
356,0.370215,0.235266,1.573604
296,0.367295,0.284674,1.290232
...,...,...,...
953,0.103053,0.045792,2.250441
551,0.101195,0.040918,2.473085
1222,0.100876,0.066877,1.508376
745,0.100345,0.037031,2.709748


In [26]:
# Rank the candidates from the highest score to the lowest.
rec_percentages = rec_percentages.sort_values("score", ascending=False)
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [27]:
# Top ten recommendations, joined to the movies table so the cleaned titles can be shown.
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")["cleaned_title"]

0                   Toy Story 1995
3021              Toy Story 2 1999
2264              Bugs Life A 1998
14813             Toy Story 3 2010
4780             Monsters Inc 2001
580                   Aladdin 1992
6258             Finding Nemo 2003
587      Beauty and the Beast 1991
8246          Incredibles The 2004
359             Lion King The 1994
Name: cleaned_title, dtype: object

In [28]:
def recommendation(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    Reads the module-level ``ratings`` and ``movies`` frames.

    Parameters
    ----------
    movie_id : int
        movieId of the movie the recommendations are based on.
    min_rating : float, default 4
        A rating counts as a "like" only when it is strictly greater than
        this value.
    min_share : float, default 0.10
        A movie is kept as a candidate only when more than this share of the
        similar users liked it.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``cleaned_title`` of up to ``top_n`` movies, highest score first.
        The queried movie is not excluded, so it normally appears in the
        result.
    """
    # Build the "liked" mask once; it is reused by all three filters below
    # instead of being recomputed over the full ratings table each time.
    liked = ratings["rating"] > min_rating

    # Users similar to us: everyone who liked the given movie.
    similar_users = ratings[(ratings["movieId"] == movie_id) & liked]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & liked]["movieId"]

    # Share of the similar users who liked each movie; keep the common ones.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of users who liked each candidate, among users who liked at
    # least one candidate movie.
    all_users = ratings[ratings["movieId"].isin(similar_user_recs.index) & liked]
    all_users_recs = all_users['movieId'].value_counts() / len(all_users["userId"].unique())

    # Score = how much more the similar users like a movie than users overall.
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")["cleaned_title"]
    

In [29]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into.
movie_name_input = widgets.Text(
    description='Movie Title:',
    disabled=False
)
# Output area that the recommendations are rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendations whenever the text box value changes.

    ``data`` is the change notification passed in by ``observe``; only its
    ``"new"`` entry (the current text) is read. The output area is always
    cleared. When more than three characters have been typed, the first row
    returned by ``search`` supplies the movieId handed to ``recommendation``.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 3:
            return
        matches = search(typed)
        first_match_id = matches.iloc[0]["movieId"]
        display(recommendation(first_match_id))

# Call on_type every time the text box's value changes.
movie_name_input.observe(on_type, names='value')

# Render the text box followed by the recommendations area.
display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()

In [31]:
import pickle

In [32]:
# Save the movies frame (including cleaned_title) for reuse outside the notebook.
# The with-block closes the file once the dump has finished; the bare open()
# call used before left the handle open.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)

In [33]:
# Raw (uncleaned) titles as a NumPy array.
movies['title'].values

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       ..., 'Bad Poems (2018)', 'A Girl Thing (2001)',
       "Women of Devil's Island (1962)"], dtype=object)

In [34]:
# Save the ratings frame for reuse outside the notebook.
# The with-block closes the file once the dump has finished; the bare open()
# call used before left the handle open.
with open('ratings.pkl', 'wb') as ratings_file:
    pickle.dump(ratings, ratings_file)