In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
%matplotlib inline

In [2]:
# Load the two raw inputs from the working directory: one table of movies
# and one table of user ratings.
movie_csv, rating_csv = 'movie.csv', 'rating.csv'
movie = pd.read_csv(movie_csv)
rating = pd.read_csv(rating_csv)

In [3]:
# Discard the columns the recommender never reads.
# Reassigning (instead of dropping in place) together with errors='ignore'
# keeps this cell idempotent: the in-place version raised KeyError as soon as
# the cell was executed a second time, because the column was already gone.
movie = movie.drop(columns=['genres'], errors='ignore')
rating = rating.drop(columns=['timestamp'], errors='ignore')

In [4]:
# Work on a subset of the data: only movies and users whose IDs fall below
# these (exclusive) bounds are kept.
MAX_MOVIE_ID = 10000
MAX_USER_ID = 10000

# Filter both inputs *before* joining so the merge only materialises rows that
# are actually kept; merging everything first and filtering afterwards builds
# a far larger intermediate frame for the same final rows. Row content and
# order are unchanged; the only difference is that the result carries a fresh
# 0..n-1 index instead of the gapped index left behind by post-merge filtering.
movie_subset = movie[movie['movieId'] < MAX_MOVIE_ID]
rating_subset = rating[
    (rating['movieId'] < MAX_MOVIE_ID) & (rating['userId'] < MAX_USER_ID)
]

# Name the join key explicitly rather than relying on pandas inferring it from
# whichever column names the two frames happen to share.
df = pd.merge(movie_subset, rating_subset, on='movieId')
df.head(1)

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),3,4.0


In [5]:
# Count how many ratings every title received, most-rated titles first.
# Rows without a title are excluded from the count.
combine_rating = df.dropna(subset=['title'])
ratings_per_title = combine_rating.groupby('title')['rating'].count()
rating_count = (
    ratings_per_title
    .sort_values(ascending=False)
    .rename('totalrating')   # name the count column before it becomes a frame
    .reset_index()
)
rating_count.head()

Unnamed: 0,title,totalrating
0,Pulp Fiction (1994),3388
1,Forrest Gump (1994),3375
2,"Silence of the Lambs, The (1991)",3144
3,"Shawshank Redemption, The (1994)",3125
4,Jurassic Park (1993),3040


In [6]:
# Attach each title's total rating count to every individual rating row.
# 'title' is the only column the two frames share, so it is the join key.
join_df = df.merge(rating_count, on='title', how='inner')
join_df.head()

Unnamed: 0,movieId,title,userId,rating,totalrating
0,1,Toy Story (1995),3,4.0,2499
1,1,Toy Story (1995),6,5.0,2499
2,1,Toy Story (1995),8,4.0,2499
3,1,Toy Story (1995),10,4.0,2499
4,1,Toy Story (1995),11,4.5,2499


In [7]:
# Sanity check on the merged frame: row count, per-column non-null counts,
# dtypes and memory footprint.
join_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 904716 entries, 0 to 904715
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   movieId      904716 non-null  int64  
 1   title        904716 non-null  object 
 2   userId       904716 non-null  int64  
 3   rating       904716 non-null  float64
 4   totalrating  904716 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 41.4+ MB


In [8]:
# Keep only ratings that belong to titles with at least 50 ratings in total.
has_enough_ratings = join_df['totalrating'] >= 50
hit_movies = join_df[has_enough_ratings]
hit_movies.head()

Unnamed: 0,movieId,title,userId,rating,totalrating
0,1,Toy Story (1995),3,4.0,2499
1,1,Toy Story (1995),6,5.0,2499
2,1,Toy Story (1995),8,4.0,2499
3,1,Toy Story (1995),10,4.0,2499
4,1,Toy Story (1995),11,4.5,2499


In [9]:
# Show how many rows survive the minimum-rating-count filter, along with
# dtypes and memory footprint.
hit_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 838691 entries, 0 to 904528
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   movieId      838691 non-null  int64  
 1   title        838691 non-null  object 
 2   userId       838691 non-null  int64  
 3   rating       838691 non-null  float64
 4   totalrating  838691 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 38.4+ MB


In [10]:
# Reshape the long rating table into a title-by-user grid: one row per title,
# one column per userId, cell = rating. If a (title, user) pair occurs more
# than once, pivot_table's default aggregation (mean) combines the values.
rating_grid = pd.pivot_table(
    hit_movies,
    values='rating',
    index=['title'],
    columns=['userId'],
)
# A missing cell means the user never rated the title; encode that as 0.
ds = rating_grid.fillna(0)
ds.head(2)

userId,1,2,3,4,5,6,7,8,9,10,...,6888,6889,6890,6891,6892,6893,6894,6895,6896,6897
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*batteries not included (1987),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Convert the title-by-user grid to Compressed Sparse Row form so that the
# zero-filled (unrated) cells are not stored explicitly.
# csr_matrix is imported in the notebook's top import cell.
df_matrix = csr_matrix(ds.values)

In [12]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
df_matrix

<2713x6896 sparse matrix of type '<class 'numpy.float64'>'
	with 838691 stored elements in Compressed Sparse Row format>

In [13]:
# Fit a nearest-neighbour index over the title rows of the sparse rating
# matrix, comparing titles by the cosine distance between their rating
# vectors. 'brute' computes distances to every row at query time.
# NearestNeighbors is imported in the notebook's top import cell.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [14]:
# Choose the query movie with a seeded generator so that Restart & Run All
# reproduces the same recommendation list. The unseeded np.random.choice used
# before picked a different title on every run, so saved outputs could never
# be regenerated.
SEED = 42
N_RECOMMENDATIONS = 5

rng = np.random.default_rng(SEED)
query_index = int(rng.integers(ds.shape[0]))  # row position of a title in ds

# Request one neighbour more than we intend to show: the query movie is part
# of the fitted data, so it normally comes back as its own closest match.
# The query row is reshaped to (1, n_users) because kneighbors expects a
# 2-D array of query points.
distances, indices = model_knn.kneighbors(
    ds.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors=N_RECOMMENDATIONS + 1,
)

In [15]:
# Cosine distances from the query movie to each returned neighbour
# (a single row, because a single query point was passed to kneighbors).
distances

array([[7.77156117e-16, 5.38396623e-01, 5.52203994e-01, 5.65822206e-01,
        5.79725032e-01, 5.89845360e-01]])

In [16]:
# Print the query title followed by its nearest neighbours.
print('Recommendation for {0}:\n'.format(ds.index[query_index]))

# Exclude the query movie by *index*, not by position. The previous loop
# assumed the query is always the first neighbour; if another title has an
# identical rating vector (distance 0 tie), the query can land elsewhere in
# the list and would then be recommended to itself.
neighbours = [
    (title_pos, dist)
    for title_pos, dist in zip(indices[0], distances[0])
    if title_pos != query_index
]

# Show the same number of entries as before (all returned neighbours minus one).
n_to_show = max(len(indices[0]) - 1, 0)
for rank, (title_pos, dist) in enumerate(neighbours[:n_to_show], start=1):
    print('{0}:{1}, with distance of {2}:'.format(rank, ds.index[title_pos], dist))

Recommendation for Last Action Hero (1993):

1:Demolition Man (1993), with distance of 0.5383966232073861:
2:Judge Dredd (1995), with distance of 0.5522039943518783:
3:Hot Shots! Part Deux (1993), with distance of 0.56582220558167:
4:Naked Gun 33 1/3: The Final Insult (1994), with distance of 0.5797250324032786:
5:Maverick (1994), with distance of 0.5898453596050698:


In [17]:
# NOTE(review): this cell repeats the previous cell's report verbatim;
# consider deleting one of the two.
# Print the query title followed by its nearest neighbours.
print('Recommendation for {0}:\n'.format(ds.index[query_index]))

# Exclude the query movie by *index*, not by position: on a distance-0 tie
# with another title the query is not guaranteed to be the first neighbour,
# and skipping position 0 blindly would then recommend the query to itself.
neighbours = [
    (title_pos, dist)
    for title_pos, dist in zip(indices[0], distances[0])
    if title_pos != query_index
]

# Show the same number of entries as before (all returned neighbours minus one).
n_to_show = max(len(indices[0]) - 1, 0)
for rank, (title_pos, dist) in enumerate(neighbours[:n_to_show], start=1):
    print('{0}:{1}, with distance of {2}:'.format(rank, ds.index[title_pos], dist))

Recommendation for Last Action Hero (1993):

1:Demolition Man (1993), with distance of 0.5383966232073861:
2:Judge Dredd (1995), with distance of 0.5522039943518783:
3:Hot Shots! Part Deux (1993), with distance of 0.56582220558167:
4:Naked Gun 33 1/3: The Final Insult (1994), with distance of 0.5797250324032786:
5:Maverick (1994), with distance of 0.5898453596050698:
