In [1]:
# Imported Lib
import numpy as np
import pandas as pd
import matplotlib as plt

In [2]:
# Imported movies.csv file
movies_df = pd.read_csv('movies.csv',usecols=['movieId','title'],dtype={'movieId':'int32','title':'str'})
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [3]:
# Imported Ratings.csv
ratings_df = pd.read_csv('ratings.csv',usecols=['userId','movieId','rating'],dtype={'userId':'int32','movieId':'int32','rating':'float'})
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
# merging 'rating_df' and 'movies_df' and naming it as 'df'
df = pd.merge(ratings_df,movies_df,on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int32  
 1   movieId  100836 non-null  int32  
 2   rating   100836 non-null  float64
 3   title    100836 non-null  object 
dtypes: float64(1), int32(2), object(1)
memory usage: 3.1+ MB


In [6]:
# Removing all NaN values from 'df' titles column
df.dropna(axis=0,subset=['title']) # to drop NaN vslues if present in title col.

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)
...,...,...,...,...
100831,610,160341,2.5,Bloodmoon (1997)
100832,610,160527,4.5,Sympathy for the Underdog (1971)
100833,610,160836,3.0,Hazard (2005)
100834,610,163937,3.5,Blair Witch (2016)


In [7]:
# arranging the datafarme by number of rating given to each movie (stored in 'df_ratingCount')
df_ratingCount = df.groupby(by=['title'])['rating'].count().reset_index().rename(columns = {'rating': 'totalRatingCount'})
df_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [8]:
# merging 'df_ratingCount' with 'df' and storing in 'df_totRatingCount' 
df_totRatingCount = df.merge(df_ratingCount, left_on = 'title', right_on = 'title', how = 'left')
df_totRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [9]:
df_totRatingCount.shape

(100836, 5)

In [10]:
# Setting the Threshold for the number of rating so that the movies above the threshold will only be considered.(Fell free to change the thershold can be changed)
threshold = 30
# selecting only thoes movies that are above threshold in 'df_popularMovies'
df_popularMovies = df_totRatingCount.query('totalRatingCount >= @threshold')
df_popularMovies.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [11]:
df_popularMovies.shape

(58021, 5)

In [12]:
# Converting 'df_popularMovies' in PIVOT table and storing in 'df_popMoviePivot'
df_popMoviePivot = df_popularMovies.pivot_table(index='title',columns='userId',values='rating').fillna(0)
df_popMoviePivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Young Frankenstein (1974),5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
Zodiac (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
Zombieland (2009),0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
Zoolander (2001),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0


In [13]:
# Converting 'df_popMoviePivot' in array(this array is sparse matrix) and transforming CSR fromat for fast computaion
from scipy.sparse import csr_matrix
df_popMoviePivot_matrix = csr_matrix(df_popMoviePivot.values)
print(df_popMoviePivot_matrix)
df_popMoviePivot_matrix.shape

  (0, 14)	4.0
  (0, 17)	4.0
  (0, 21)	0.5
  (0, 40)	3.5
  (0, 60)	4.5
  (0, 66)	4.0
  (0, 68)	4.0
  (0, 71)	4.0
  (0, 87)	2.5
  (0, 90)	4.0
  (0, 103)	5.0
  (0, 123)	1.0
  (0, 139)	3.0
  (0, 141)	5.0
  (0, 146)	3.5
  (0, 151)	3.5
  (0, 157)	5.0
  (0, 175)	3.5
  (0, 211)	4.5
  (0, 230)	5.0
  (0, 247)	4.0
  (0, 278)	4.0
  (0, 296)	2.0
  (0, 315)	3.0
  (0, 316)	4.0
  :	:
  (881, 103)	4.5
  (881, 109)	2.5
  (881, 112)	3.5
  (881, 120)	4.0
  (881, 123)	4.0
  (881, 146)	4.0
  (881, 152)	4.5
  (881, 210)	3.0
  (881, 245)	4.0
  (881, 247)	4.5
  (881, 250)	4.0
  (881, 304)	4.0
  (881, 317)	4.5
  (881, 326)	2.0
  (881, 378)	5.0
  (881, 399)	4.0
  (881, 412)	4.0
  (881, 473)	4.5
  (881, 521)	4.5
  (881, 523)	3.0
  (881, 565)	2.5
  (881, 584)	5.0
  (881, 594)	4.0
  (881, 599)	4.5
  (881, 608)	4.0


(882, 609)

In [14]:
# Using KNN Algo to find the Nearest (or similar) Movie . Here cosine_similarity is used.
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(df_popMoviePivot_matrix)



NearestNeighbors(algorithm='brute', metric='cosine')

In [15]:
# Randomly selecting any Movies form 'df_popMoviePivot' Talble
query_index = np.random.choice(df_popMoviePivot.shape[0])
print(query_index)

509


In [16]:
# Storing the Recommended movies index in 'indices' and cosine_distance in 'distances'
distances, indices = model_knn.kneighbors(df_popMoviePivot.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 7)

In [17]:
# Printing the Recommendation ( Frist value in 'index' is the target movie for which movies are recommended)
for i in range (0,len(distances.flatten())):
    if i==0:
        print('Recommended movies after watching: {0} \n'.format(df_popMoviePivot.index[query_index]))
    else:
        print('{0}. {1}   with coisne distance {2}\n'.format(i,df_popMoviePivot.index[indices.flatten()[i]],distances.flatten()[i]))
    

Recommended movies after watching: Mask, The (1994) 

1. Lion King, The (1994)   with coisne distance 0.3610848282154827

2. Jurassic Park (1993)   with coisne distance 0.38863028374700537

3. Batman (1989)   with coisne distance 0.402173598828077

4. Mrs. Doubtfire (1993)   with coisne distance 0.40785938737270566

5. True Lies (1994)   with coisne distance 0.41102441291849745

6. Ace Ventura: Pet Detective (1994)   with coisne distance 0.4271724861834877

