In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the movie catalogue; columns: movieId, title, genres.
movie_df = pd.read_csv(filepath_or_buffer='movies.csv')
movie_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
# Load the user ratings; columns: userId, movieId, rating, timestamp.
ratings_df = pd.read_csv(filepath_or_buffer='ratings.csv')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


#### Merging both the dataframes on movieId

In [4]:
# Inner-join the catalogue with the ratings on movieId so that every rating row
# also carries the movie's title and genres.
df = movie_df.merge(ratings_df, how='inner', on='movieId')
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [5]:
# Remove the 'timestamp' column from the merged frame.
# errors='ignore' keeps this cell idempotent: `df` is reassigned from itself, so
# without it a second run of the cell would raise KeyError on the column that
# the first run already removed.
df = df.drop(columns='timestamp', errors='ignore')
df

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5
...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5
100833,193585,Flint (2017),Drama,184,3.5
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5


In [6]:
# Keep only the rows that have a title; rows whose 'title' is missing are removed.
movie_rating_merge = df.dropna(subset=['title'])
movie_rating_merge


Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5
...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5
100833,193585,Flint (2017),Drama,184,3.5
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5


In [15]:
# Count how many (non-null) ratings each title received, as a two-column frame:
# 'title' and 'total_rating_movie'. Displayed from most to least rated.
movie_rating_count = (
    movie_rating_merge
    .groupby('title')['rating']
    .count()
    .reset_index(name='total_rating_movie')
)
movie_rating_count.sort_values('total_rating_movie', ascending=False)


Unnamed: 0,title,total_rating_movie
3158,Forrest Gump (1994),329
7593,"Shawshank Redemption, The (1994)",317
6865,Pulp Fiction (1994),307
7680,"Silence of the Lambs, The (1991)",279
5512,"Matrix, The (1999)",278
...,...,...
4773,King Solomon's Mines (1950),1
4772,King Solomon's Mines (1937),1
4771,King Ralph (1991),1
4769,King Kong Lives (1986),1


In [16]:
# Attach each title's total rating count to every individual rating row
# (left join, so every row of movie_rating_merge is kept).
rating_total_count = movie_rating_merge.merge(
    movie_rating_count,
    how='left',
    on='title',
)
rating_total_count

Unnamed: 0,movieId,title,genres,userId,rating,total_rating_movie
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,215
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,215
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,215
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,215
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1
100833,193585,Flint (2017),Drama,184,3.5,1
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-title rating counts.
movie_rating_count['total_rating_movie'].describe()

count    9719.000000
mean       10.375141
std        22.406220
min         1.000000
25%         1.000000
50%         3.000000
75%         9.000000
max       329.000000
Name: total_rating_movie, dtype: float64

In [18]:

# Keep only the rating rows of titles that received at least
# `popularity_threshold` ratings; displayed from most to least rated.
popularity_threshold = 60
popular_movie_as_per_rating = rating_total_count.loc[
    rating_total_count['total_rating_movie'] >= popularity_threshold
]
popular_movie_as_per_rating.sort_values(by='total_rating_movie', ascending=False)

Unnamed: 0,movieId,title,genres,userId,rating,total_rating_movie
10024,356,Forrest Gump (1994),Comedy|Drama|Romance|War,11,5.0,329
10106,356,Forrest Gump (1994),Comedy|Drama|Romance|War,160,3.0,329
10128,356,Forrest Gump (1994),Comedy|Drama|Romance|War,200,4.0,329
10129,356,Forrest Gump (1994),Comedy|Drama|Romance|War,201,5.0,329
10130,356,Forrest Gump (1994),Comedy|Drama|Romance|War,202,5.0,329
...,...,...,...,...,...,...
12826,466,Hot Shots! Part Deux (1993),Action|Comedy|War,7,5.0,60
12825,466,Hot Shots! Part Deux (1993),Action|Comedy|War,6,2.0,60
30848,1377,Batman Returns (1992),Action|Crime,244,4.0,60
30849,1377,Batman Returns (1992),Action|Crime,268,3.0,60


In [20]:
# Build the title-by-user rating matrix: one row per title, one column per
# userId, each cell holding that user's rating of the title (pivot_table's
# default aggregation, the mean, applies if several rows share a title/user pair).
# Title/user pairs without a rating are filled with 0.
movie_features = (
    popular_movie_as_per_rating
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
movie_features

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0
"40-Year-Old Virgin, The (2005)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
"Abyss, The (1989)",4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0
Ace Ventura: Pet Detective (1994),0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,2.0,0.0,0.0,0.0,3.5,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Willy Wonka & the Chocolate Factory (1971),5.0,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,4.0,0.0,0.0,3.5,0.0,0.0
"Wizard of Oz, The (1939)",5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,2.0,0.0,5.0,2.5,0.0,3.5
X-Men (2000),5.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,0.0,3.5
X2: X-Men United (2003),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0


In [21]:
# Convert the title-by-user rating table to a SciPy CSR sparse matrix and fit a
# cosine-distance nearest-neighbours model on it (one sample per title).
movie_features_matrix = csr_matrix(movie_features.values)
knn_model = NearestNeighbors(metric='cosine').fit(movie_features_matrix)
knn_model

NearestNeighbors(metric='cosine')

In [36]:
# Pick one movie (a row position in `movie_features`) at random and query the
# fitted model for its nearest neighbours.
# A seeded generator makes the pick reproducible on Restart & Run All; change
# RANDOM_SEED to sample a different movie.
RANDOM_SEED=42
N_RECOMMENDATIONS=5
rng=np.random.default_rng(RANDOM_SEED)
index_=int(rng.integers(movie_features.shape[0]))
# Request one neighbour more than the number of recommendations wanted: the
# query movie is itself part of the fitted data and is expected to come back
# among its own nearest neighbours.
distance,movies=knn_model.kneighbors(
    movie_features.iloc[index_,:].values.reshape(1,-1),
    n_neighbors=N_RECOMMENDATIONS+1,
)
print(index_)

235


In [37]:
# Print the recommendations for the movie selected by `index_`.
# kneighbors was called with a single query row, so flatten the 2-D results
# once instead of on every loop iteration.
neighbor_distances=distance.flatten()
neighbor_positions=movies.flatten()

print('Recommendations for {0}:\n'.format(movie_features.index[index_]))

# Exclude the query movie by its row position rather than assuming it is always
# the first neighbour: if another movie ties with it, the query could appear
# at a later position and would otherwise be listed as its own recommendation.
recommendations=[
    (position,dist)
    for position,dist in zip(neighbor_positions,neighbor_distances)
    if position!=index_
]
# Never print more than (neighbours requested - 1) entries, so the number of
# recommendations follows the n_neighbors value passed to kneighbors.
recommendations=recommendations[:max(len(neighbor_positions)-1,0)]

for rank,(position,dist) in enumerate(recommendations,start=1):
    print('{0}:{1}, with distance of {2}'.format(rank,movie_features.index[position],dist))

Recommendations for Prestige, The (2006):

1:Departed, The (2006), with distance of 0.3903629416827662
2:Batman Begins (2005), with distance of 0.39379522842943715
3:V for Vendetta (2006), with distance of 0.42925628643389735
4:Memento (2000), with distance of 0.44936564847932414
5:Dark Knight, The (2008), with distance of 0.45372708576971377
