In [1]:
import pandas as pd

In [4]:
# Load the two input tables, keeping only the columns used further down.
movies_df = pd.read_csv(
    '../Data/movies.txt',
    usecols=['movieId', 'title'],
)
ratings_df = pd.read_csv(
    '../Data/ratings.txt',
    usecols=['userId', 'movieId', 'rating'],
)

In [5]:
# Preview the first rows of the movie table (movieId, title).
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [6]:
# Preview the first rows of the ratings table (userId, movieId, rating).
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [7]:
# Attach the movie title to every rating row. The join is an inner join on
# movieId (the pandas default, spelled out here for the reader).
merged_df = ratings_df.merge(movies_df, how='inner', on='movieId')
merged_df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [8]:
# Keep only the rating rows that have a title; rows whose title is missing
# are dropped.
sub_df = merged_df.loc[merged_df['title'].notna()]
sub_df.shape

(100836, 4)

In [9]:
# Shape of the unfiltered merge, for comparison with sub_df.shape: equal
# shapes mean the missing-title filter removed no rows.
merged_df.shape

(100836, 4)

In [12]:
# Number of (non-null) ratings each title received, as a two-column frame:
# title, totalRatingCount.
movieRatingCount = (
    sub_df.groupby('title')['rating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)
movieRatingCount

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


In [14]:
# Attach each title's total rating count to every individual rating row.
# The counts were computed from sub_df (the rows with a title), so join back
# onto that same frame rather than the unfiltered merged_df; this keeps the
# count column and the rows it annotates on the same lineage.
rating_count_merged_df = movieRatingCount.merge(sub_df, how='inner', on='title')
rating_count_merged_df.head()

Unnamed: 0,title,totalRatingCount,userId,movieId,rating
0,'71 (2014),1,610,117867,4.0
1,'Hellboy': The Seeds of Creation (2004),1,332,97757,4.0
2,'Round Midnight (1986),2,332,26564,3.5
3,'Round Midnight (1986),2,377,26564,3.5
4,'Salem's Lot (2004),1,345,27751,5.0


In [17]:
# Minimum popularity: a title must have strictly more than this many ratings
# for its rows to be kept.
count_threshold_value = 100
df_threshold_value = rating_count_merged_df.loc[
    lambda frame: frame['totalRatingCount'] > count_threshold_value
]
print(df_threshold_value.shape)
df_threshold_value.head()

(19788, 5)


Unnamed: 0,title,totalRatingCount,userId,movieId,rating
610,2001: A Space Odyssey (1968),109,7,924,4.0
611,2001: A Space Odyssey (1968),109,18,924,4.0
612,2001: A Space Odyssey (1968),109,19,924,3.0
613,2001: A Space Odyssey (1968),109,23,924,4.0
614,2001: A Space Odyssey (1968),109,27,924,2.0


In [18]:
# Build the title x user rating table: one row per title, one column per
# userId, cell = rating (pivot_table's default aggregation, the mean, is used
# if a title/user pair occurs more than once). Pairs with no rating are
# filled with 0.0.
movie_features_df = (
    df_threshold_value
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0.0)
)
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
Ace Ventura: Pet Detective (1994),0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,2.0,0.0,0.0,0.0,3.5,0.0,3.0
Aladdin (1992),0.0,0.0,0.0,4.0,4.0,5.0,3.0,0.0,0.0,4.0,...,0.0,0.0,0.0,3.0,3.5,0.0,0.0,3.0,0.0,0.0
Alien (1979),4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,4.0,3.0,4.0,0.0,4.5
Aliens (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,3.5,0.0,4.5,0.0,5.0


In [23]:
# Convert the pivoted ratings table above into a sparse (CSR) matrix before fitting k-NN
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [20]:
# Store the title x user rating table as a compressed-sparse-row matrix.
# The DataFrame is converted to a NumPy array explicitly and passed
# positionally, instead of going through the constructor's `arg1` keyword and
# relying on implicit DataFrame-to-array conversion.
movie_features_matrix = csr_matrix(movie_features_df.to_numpy())

In [22]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
movie_features_matrix

<134x597 sparse matrix of type '<class 'numpy.float64'>'
	with 19788 stored elements in Compressed Sparse Row format>

In [24]:
# Brute-force nearest-neighbour index over the title rows, using cosine
# distance. fit() returns the estimator itself, so construction and fitting
# are chained.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine').fit(movie_features_matrix)
model_knn

NearestNeighbors(algorithm='brute', metric='cosine')