In [None]:
# Item-based collaborative filtering: a movie recommendation system that finds each movie's k nearest neighbours from user ratings

In [1]:
import pandas as pd
import numpy as np

In [6]:
# Load only the columns used by the recommender, with explicit 32-bit numeric dtypes.
dataframe_movies = pd.read_csv(
    'movies.csv',
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)
dataframe_ratings = pd.read_csv(
    'ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)


In [7]:
# Preview the first rows of the movies table (columns loaded: movieId, title).
dataframe_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [8]:
# Preview the first rows of the ratings table (columns loaded: userId, movieId, rating).
dataframe_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [10]:
# Join movies to their ratings on the shared 'movieId' column.
# Inner join: only rows whose movieId appears in both tables are kept.
dataframe = pd.merge(
    left=dataframe_movies,
    right=dataframe_ratings,
    how='inner',
    on='movieId',
)
dataframe.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [11]:
# Drop every row whose 'title' is missing (Null/NaN).
# dropna works on rows by default (axis=0; axis=1 would drop columns instead);
# `subset` restricts the null check to the listed columns.
combine_movie_rating = dataframe.dropna(subset=['title'])


In [46]:
# Count how many ratings each title received.
# Result: one row per title with columns ['title', 'totalRatingCount'].
movie_ratingCount = (
    combine_movie_rating
    .groupby(by=['title'])['rating']
    .count()
    .reset_index()
    .rename(columns={'rating': 'totalRatingCount'})
    .loc[:, ['title', 'totalRatingCount']]
)
movie_ratingCount.head()


Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [50]:
# Attach each title's total rating count to every individual rating row.
# Left join: every row of combine_movie_rating is kept.
rating_with_totalRatingCount = combine_movie_rating.merge(
    movie_ratingCount,
    on='title',
    how='left',
)
rating_with_totalRatingCount.head()

Unnamed: 0,movieId,title,userId,rating,totalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [55]:
# Render floats with four decimal places (this is a global pandas display option),
# then summarise how the number of ratings per movie is distributed.
pd.set_option('display.float_format', '{:.4f}'.format)
print(movie_ratingCount['totalRatingCount'].describe())

count   9719.0000
mean      10.3751
std       22.4062
min        1.0000
25%        1.0000
50%        3.0000
75%        9.0000
max      329.0000
Name: totalRatingCount, dtype: float64


In [57]:
# Keep only the ratings of movies that received at least `threshold` ratings.
threshold = 50
rating_popular_movie = rating_with_totalRatingCount[
    rating_with_totalRatingCount['totalRatingCount'] >= threshold
]
rating_popular_movie.head()

Unnamed: 0,movieId,title,userId,rating,totalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [58]:
# (rows, columns) of the rating table that remains after the popularity filter.
rating_popular_movie.shape

(41362, 5)

In [64]:
# Build the movie x user rating matrix used by the nearest-neighbour model:
# one row per title, one column per userId, cells hold the rating
# (pivot_table's default aggregation applies if a title/user pair repeats).
# Title/user pairs without a rating are filled with 0.
movie_features_dataframe = (
    rating_popular_movie
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
movie_features_dataframe.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [67]:
from scipy.sparse import csr_matrix

# Convert the dense title x userId rating table into a Compressed Sparse Row matrix;
# row order matches movie_features_dataframe, so row positions can be mapped back to titles.
movie_features_dataframe_matrix = csr_matrix(movie_features_dataframe.values)
# Bare expression so the notebook displays the matrix summary.
movie_features_dataframe_matrix

<450x606 sparse matrix of type '<class 'numpy.float32'>'
	with 41360 stored elements in Compressed Sparse Row format>

In [74]:
from sklearn.neighbors import NearestNeighbors

# Brute-force neighbour search using cosine distance between movie rating vectors.
# fit() returns the estimator itself, so construction and fitting can be chained.
knn_model = NearestNeighbors(algorithm='brute', metric='cosine').fit(movie_features_dataframe_matrix)
knn_model

NearestNeighbors(algorithm='brute', metric='cosine')

In [75]:
# (number of titles, number of users) in the pivoted rating table.
movie_features_dataframe.shape

(450, 606)

In [91]:
# Pick a row position of movie_features_dataframe to use as the query movie.
# The generator is seeded so that "Restart Kernel -> Run All" draws the same
# position every time; change the seed to sample a different movie.
query_index = int(np.random.default_rng(seed=42).integers(movie_features_dataframe.shape[0]))
print(query_index)

336


In [99]:
# Replace the randomly drawn index with a fixed row position of movie_features_dataframe.
# The value must be smaller than the number of rows in that table.
query_index=400

In [100]:
# kneighbors expects a 2-D array with one row per query sample. Selecting the row
# with a list (`iloc[[query_index]]`) keeps the result 2-D, i.e. a single sample of
# shape (1, n_features), so no reshape(1, -1) is needed.
# (reshape(-1, 1) would instead describe many samples with a single feature each.)
distances, indices = knn_model.kneighbors(
    movie_features_dataframe.iloc[[query_index]].values,
    n_neighbors=10,
)


In [101]:
# Re-display the first rows of the title x userId rating table for reference.
movie_features_dataframe.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


In [107]:
# Print the query movie followed by its nearest neighbours, closest first.
#
# kneighbors() was given an explicit query vector, so the query movie itself is
# part of the result (normally as the first hit, at distance ~0). It is skipped
# by comparing row positions rather than by assuming it sits at position 0:
# if another movie has an identical rating vector, the tie may put that movie
# first, and skipping "position 0" would then drop a real neighbour and list the
# query movie as its own recommendation.
neighbor_positions = indices.flatten()
neighbor_distances = distances.flatten()

# Header is only printed when there is at least one result.
if len(neighbor_positions) > 0:
    print('Movie Recommendations for {0}:\n'.format(movie_features_dataframe.index[query_index]))

rank = 0
for position, distance in zip(neighbor_positions, neighbor_distances):
    if position == query_index:
        # Never recommend the movie that was used as the query.
        continue
    rank += 1
    print('{0}:{1}, with distances of {2}:'.format(rank, movie_features_dataframe.index[position], distance))

Movie Recommendations for This Is Spinal Tap (1984):

1:Monty Python and the Holy Grail (1975), with distances of 0.4848027229309082:
2:Fish Called Wanda, A (1988), with distances of 0.4880514144897461:
3:Raising Arizona (1987), with distances of 0.49566829204559326:
4:Ferris Bueller's Day Off (1986), with distances of 0.514601469039917:
5:Beetlejuice (1988), with distances of 0.5236431360244751:
6:Blues Brothers, The (1980), with distances of 0.5366696119308472:
7:High Fidelity (2000), with distances of 0.5432546138763428:
8:Breakfast Club, The (1985), with distances of 0.5523796081542969:
9:Young Frankenstein (1974), with distances of 0.5531401038169861:
