## Recommender: Collaborative Filtering

This notebook shows the construction of a recommender system purely built from collaborative filtering.

In [27]:
###############
### IMPORTS ###
###############

# Calculating SVD matrix is too large so use Dask
import dask
import dask.dataframe as dd
import dask.array as da

import numpy as np
import pandas as pd

In [28]:
df_all = dd.read_csv('data/dataframe_merged.csv')

In [29]:
print('Shape of dataframe: ', df_all.shape)
print('Columns of dataframe: ', df_all.columns)

Shape of dataframe:  (Delayed('int-59a84c75-673f-4425-854a-8a16d2d2ea99'), 28)
Columns of dataframe:  Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director'],
      dtype='object')


I am interested in looking at the ratings dataset, where users and their ratings are mapped to movieIds. I will also load in the titles, so that I can refer to this to map the index to movie title.

In [30]:
df_titles = pd.read_csv('data/dataframe_merged.csv', usecols=['title', 'id'])

In [31]:
df_titles

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II
...,...,...
46623,439050,Subdue
46624,111109,Century of Birthing
46625,67758,Betrayal
46626,227506,Satan Triumphant


In [32]:
df_ratings = pd.read_csv('data/ratings_small.csv')

In [33]:
df_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [34]:
# Create an empty matrix that takes the shape of movieId by userId
# Explicitly state that the datatype will be integers
ratings_mat = np.ndarray(
    shape=(max(df_ratings['movieId'].values), max(df_ratings['userId'].values)),
    dtype=np.int64)

In [35]:
pd.DataFrame(ratings_mat)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,661,662,663,664,665,666,667,668,669,670
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163944,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
163945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
163946,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
163947,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# This will fill in the matrix with the user ratings
ratings_mat[df_ratings['movieId'].values-1, df_ratings['userId'].values-1] = df_ratings['rating'].values

In [37]:
df_ratings_mat = pd.DataFrame(ratings_mat)
df_ratings_mat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,661,662,663,664,665,666,667,668,669,670
0,0,0,0,0,0,0,3,0,4,0,...,0,4,3,0,0,0,0,0,4,5
1,0,0,0,0,0,0,0,0,0,0,...,5,0,0,3,0,0,0,0,0,0
2,0,0,0,0,4,0,0,0,0,0,...,0,0,0,3,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163944,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
163945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
163946,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
163947,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Normalize the matrix
normalized_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T

In [39]:
pd.DataFrame(normalized_mat)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,661,662,663,664,665,666,667,668,669,670
0,-1.388972,-1.388972,-1.388972,-1.388972,-1.388972,-1.388972,1.611028,-1.388972,2.611028,-1.388972,...,-1.388972,2.611028,1.611028,-1.388972,-1.388972,-1.388972,-1.388972,-1.388972,2.611028,3.611028
1,-0.529061,-0.529061,-0.529061,-0.529061,-0.529061,-0.529061,-0.529061,-0.529061,-0.529061,-0.529061,...,4.470939,-0.529061,-0.529061,2.470939,-0.529061,-0.529061,-0.529061,-0.529061,-0.529061,-0.529061
2,-0.271237,-0.271237,-0.271237,-0.271237,3.728763,-0.271237,-0.271237,-0.271237,-0.271237,-0.271237,...,-0.271237,-0.271237,-0.271237,2.728763,-0.271237,-0.271237,-0.271237,-0.271237,-0.271237,-0.271237
3,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709,...,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709,-0.044709
4,-0.265276,-0.265276,-0.265276,-0.265276,-0.265276,-0.265276,-0.265276,-0.265276,-0.265276,-0.265276,...,-0.265276,-0.265276,-0.265276,2.734724,-0.265276,-0.265276,-0.265276,-0.265276,-0.265276,-0.265276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163944,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
163945,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
163946,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
163947,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [16]:
svd_food = normalized_mat.T / np.sqrt(ratings_mat.shape[0] - 1)

In [17]:
df_svd_food = pd.DataFrame(svd_food)
df_svd_food

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,163939,163940,163941,163942,163943,163944,163945,163946,163947,163948
0,-0.003430,-0.001307,-0.000670,-0.00011,-0.000655,-0.001461,-0.000629,-0.00007,-0.000232,-0.001502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000018
1,-0.003430,-0.001307,-0.000670,-0.00011,-0.000655,-0.001461,-0.000629,-0.00007,-0.000232,0.008377,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000018
2,-0.003430,-0.001307,-0.000670,-0.00011,-0.000655,-0.001461,-0.000629,-0.00007,-0.000232,-0.001502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000018
3,-0.003430,-0.001307,-0.000670,-0.00011,-0.000655,-0.001461,-0.000629,-0.00007,-0.000232,0.008377,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000018
4,-0.003430,-0.001307,0.009209,-0.00011,-0.000655,-0.001461,-0.000629,-0.00007,-0.000232,-0.001502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,-0.003430,-0.001307,-0.000670,-0.00011,-0.000655,0.008418,-0.000629,-0.00007,-0.000232,-0.001502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000018
667,-0.003430,-0.001307,-0.000670,-0.00011,-0.000655,-0.001461,-0.000629,-0.00007,-0.000232,-0.001502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000018
668,-0.003430,-0.001307,-0.000670,-0.00011,-0.000655,-0.001461,-0.000629,-0.00007,-0.000232,-0.001502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000018
669,0.006448,-0.001307,-0.000670,-0.00011,-0.000655,-0.001461,-0.000629,-0.00007,-0.000232,-0.001502,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.000018


In [18]:
# Store max value of movieId as variable 'max_movieId'
max_movieId = max(df_ratings['movieId'])
print('Max movieId value: ', max_movieId)
print('Number of movies in MOVIE database (df_movies): ', len(df_titles))
print('Note that the movieIds are not strictly from 0-100; empty rows will need to be filtered out.')

Max movieId value:  163949
Number of movies in MOVIE database (df_movies):  46628
Note that the movieIds are not strictly from 0-100; empty rows will need to be filtered out.


In [19]:
# Store max value of userId as variable 'max_userId'
max_userId = max(df_ratings['userId'])
print('Max userId value: ', max_userId)

Max userId value:  671


In [20]:
# Convert the pandas dataframe 'df_ratings_mat' to dask dataframe: 'dd_ratings_mat'
# Create 500 partitions so that dask can manage each partition

dd_svd_food = dd.from_pandas(df_svd_food, npartitions=500)

In [21]:
# Convert the dask dataframe to a dask array: 'da_ratings_mat'
da_svd_food = dd_svd_food.to_dask_array()
print(da_svd_food.compute())
da_svd_food

[[-3.43036498e-03 -1.30663044e-03 -6.69878141e-04 ...  0.00000000e+00
   0.00000000e+00 -1.84032456e-05]
 [-3.43036498e-03 -1.30663044e-03 -6.69878141e-04 ...  0.00000000e+00
   0.00000000e+00 -1.84032456e-05]
 [-3.43036498e-03 -1.30663044e-03 -6.69878141e-04 ...  0.00000000e+00
   0.00000000e+00 -1.84032456e-05]
 ...
 [-3.43036498e-03 -1.30663044e-03 -6.69878141e-04 ...  0.00000000e+00
   0.00000000e+00 -1.84032456e-05]
 [ 6.44849727e-03 -1.30663044e-03 -6.69878141e-04 ...  0.00000000e+00
   0.00000000e+00 -1.84032456e-05]
 [ 8.91821283e-03 -1.30663044e-03 -6.69878141e-04 ...  0.00000000e+00
   0.00000000e+00 -1.84032456e-05]]


Unnamed: 0,Array,Chunk
Bytes,unknown,unknown
Shape,"(nan, 163949)","(nan, 163949)"
Count,670 Tasks,335 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes unknown unknown Shape (nan, 163949) (nan, 163949) Count 670 Tasks 335 Chunks Type float64 numpy.ndarray",,

Unnamed: 0,Array,Chunk
Bytes,unknown,unknown
Shape,"(nan, 163949)","(nan, 163949)"
Count,670 Tasks,335 Chunks
Type,float64,numpy.ndarray


In [22]:
# Check how many partitions were actually made; for some reason it's less than the designated amount
da_svd_food.npartitions

335

In [23]:
# Decide on the threads and number of works
# Client dashboard link lets you check the status of Dask

from dask.distributed import Client, progress

client = Client(processes=False, threads_per_worker=2, n_workers=4)

client

0,1
Client  Scheduler: inproc://192.168.149.120/39137/1  Dashboard: http://192.168.149.120:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 17.18 GB






In [24]:
# Perform SVD (through numpy method)
U, S, Vt = np.linalg.svd(da_svd_food)





In [25]:
Vt.compute()









































































































































































































































































































































































































































































































































































































































































KeyboardInterrupt: 

In [27]:
# Calculate cosine similarity and return top_n movies
def top_cosine_similarity(data, movie_id, top_n=10):
    
    # Movie id starts from 1 in the dataset
    index = movie_id - 1
    
    # Movie row is the index of the
    movie_row = data[index, :]
    
    
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

In [26]:
movie_row = df_ratings[index, :]

NameError: name 'index' is not defined







































































































































































































































































































































































































































































































In [28]:
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
    movie_data[movie_data.movie_id == movie_id].title.values[0]))
    for id in top_indexes + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

In [26]:
k = 50
movie_id = 10 # (getting an id from movies.dat)
top_n = 10
sliced = Vt.T[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_id, top_n)



KeyboardInterrupt: 

In [None]:
print_similar_movies(movie_data, movie_id, indexes)