In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [4]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [5]:
data = movies.merge(ratings, on = 'movieId')
data.head(2)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962


In [6]:
rating_count = data.groupby(['title'])['rating'].count().sort_values(ascending = False).reset_index()
rating_count = rating_count.rename(columns = {'rating': 'Total_Rating_Count'})
rating_count.head()

Unnamed: 0,title,Total_Rating_Count
0,Forrest Gump (1994),329
1,"Shawshank Redemption, The (1994)",317
2,Pulp Fiction (1994),307
3,"Silence of the Lambs, The (1991)",279
4,"Matrix, The (1999)",278


In [7]:
final_data = data.merge(rating_count,on = 'title')
final_data.head(2)


Unnamed: 0,movieId,title,genres,userId,rating,timestamp,Total_Rating_Count
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,215
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,215


In [8]:
final_data['Total_Rating_Count'].describe()

count    100836.000000
mean         58.758777
std          61.965384
min           1.000000
25%          13.000000
50%          39.000000
75%          84.000000
max         329.000000
Name: Total_Rating_Count, dtype: float64

In [9]:
threshold = 50
final_data_filter = final_data[final_data['Total_Rating_Count'] >= threshold]
final_data_filter.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,Total_Rating_Count
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,215
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,215
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,215
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,215
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,215


In [10]:
final_data_filter_pivot = final_data_filter.pivot_table(columns = 'userId',index = 'title', values = 'rating').fillna(0)
final_data_filter_pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [11]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
final_data_matrix = csr_matrix(final_data_filter_pivot.values)
final_data_matrix

<450x606 sparse matrix of type '<class 'numpy.float64'>'
	with 41360 stored elements in Compressed Sparse Row format>

In [12]:
model_knn = NearestNeighbors(metric = 'cosine',algorithm = 'brute')
model_knn.fit(final_data_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [13]:
final_data_filter_pivot.shape

(450, 606)

In [14]:
query_index = np.random.choice(final_data_filter_pivot.shape[0])
print(query_index)

distances, indices = model_knn.kneighbors(final_data_filter_pivot.iloc[query_index,:].values.reshape(1,-1),n_neighbors = 6)

35


In [15]:
distances

array([[0.        , 0.24814306, 0.36623888, 0.43402268, 0.43838415,
        0.44469217]])

In [16]:
indices[0]

array([ 35,  34,  21, 368, 216, 268], dtype=int64)

In [17]:
ls = []
for i in indices[0][1:]:
    ls.append(i)
rec = final_data_filter_pivot.reset_index()
rec.iloc[ls,0]

34     Austin Powers: International Man of Mystery (1...
21                                   American Pie (1999)
368          South Park: Bigger, Longer and Uncut (1999)
216          Indiana Jones and the Temple of Doom (1984)
268                     Men in Black (a.k.a. MIB) (1997)
Name: title, dtype: object