## Recommender: Collaborative Filtering

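This notebook walks through building a recommender system based purely on collaborative filtering: user ratings are arranged into a user-by-movie matrix, which is then factorised with SVD to find similar movies.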

In [1]:
###############
### IMPORTS ###
###############

# The matrix is too large to compute its SVD comfortably in memory, so use Dask
import dask
import dask.dataframe as dd
import dask.array as da

import numpy as np
import pandas as pd

from scipy.linalg import svd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Open the merged movie metadata lazily with Dask; rows are only read when a
# computation is actually requested.
df_all = dd.read_csv('data/dataframe_merged.csv')

In [3]:
# Quick sanity check of the metadata. Because df_all is a lazy Dask dataframe,
# the row count inside .shape is shown as an unevaluated Delayed object rather
# than a number; only the column count is known up front.
print('Shape of dataframe: ', df_all.shape)
print('Columns of dataframe: ', df_all.columns)

Shape of dataframe:  (Delayed('int-5af51dda-987f-4721-bbff-ee1ca145e34f'), 28)
Columns of dataframe:  Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director'],
      dtype='object')


I am interested in the ratings dataset, which maps each user and their rating to a movieId. I will also load the movie titles, so that I can use them to map an index back to a movie title.

In [4]:
# Load only the 'id' and 'title' columns with pandas to serve as a small
# id -> title lookup table.
df_titles = pd.read_csv('data/dataframe_merged.csv', usecols=['title', 'id'])

In [5]:
df_titles

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II
...,...,...
46623,439050,Subdue
46624,111109,Century of Birthing
46625,67758,Betrayal
46626,227506,Satan Triumphant


In [6]:
# Load the user ratings (columns: userId, movieId, rating, timestamp).
df_ratings = pd.read_csv('data/ratings_small.csv')

In [7]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [8]:
# Build the rating matrix: one row per userId, one column per movieId, with
# NaN wherever a user has not rated a movie.
df_movie_user = pd.pivot_table(
    df_ratings,
    values='rating',
    index='userId',
    columns='movieId',
)
print('Shape of df_movie_user', df_movie_user.shape)

Shape of df_movie_user (671, 9066)


In [9]:
# Impute every missing rating with the mean rating of the user concerned.
# fillna matches a Series against column labels, so the matrix is transposed
# to put userIds on the columns, filled with the per-user means, and then
# transposed back to the original users-by-movies layout.
df_movie_user_imp = (
    df_movie_user
    .transpose()
    .fillna(df_movie_user.mean(axis='columns'))
    .transpose()
)

In [10]:
df_movie_user_imp

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,...,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000
2,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,4.000000,...,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842
3,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,...,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627
4,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.000000,...,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039
5,3.910000,3.910000,4.000000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,...,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,3.647059,3.647059,3.647059,3.647059,3.647059,4.000000,3.647059,3.647059,3.647059,3.647059,...,3.647059,3.647059,3.647059,3.647059,3.647059,3.647059,3.647059,3.647059,3.647059,3.647059
668,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,...,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000
669,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,...,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351
670,4.000000,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,...,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452


In [11]:
# Factorise the imputed users-by-movies matrix into U, the singular values
# (sigma) and Vt, whose columns correspond to movies.
# NOTE(review): svd() is called with its default full_matrices=True, so Vt is
# the full square movies-by-movies matrix. With only 671 users, the rows of Vt
# beyond the first 671 carry no information from the ratings, and because a
# square Vt is orthogonal, dot products between two *different* movie columns
# are ~0 when every row is used. Consider full_matrices=False or keeping only
# the leading factors -- confirm against downstream cells before changing.
U, sigma, Vt = svd(df_movie_user_imp)

In [12]:
# Wrap Vt in a DataFrame purely to get a readable, truncated rendering.
pd.DataFrame(Vt)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9056,9057,9058,9059,9060,9061,9062,9063,9064,9065
0,-0.010738,-0.010436,-0.010404,-0.010434,-0.010427,-0.010601,-0.010457,-0.010500,-0.010463,-0.010445,...,-0.010499,-0.010496,-0.010501,-0.010496,-0.010498,-0.010505,-0.010507,-0.010508,-0.010500,-0.010509
1,0.034905,-0.015285,-0.025924,-0.004659,-0.002879,0.014648,-0.018206,-0.000655,0.000495,-0.008200,...,-0.004200,-0.014484,-0.001421,-0.007050,-0.005198,-0.000105,0.004143,-0.000336,0.000120,0.007819
2,-0.020820,0.004546,-0.002218,0.003341,-0.006226,-0.021318,-0.017634,-0.000221,0.000479,-0.000838,...,-0.014827,0.010724,0.000858,0.005611,0.004131,-0.000072,-0.002567,0.000033,-0.000011,0.027948
3,-0.008446,-0.015257,0.013588,0.001231,0.028957,0.059991,0.009326,-0.000007,-0.000235,0.013173,...,-0.000378,-0.017132,-0.000560,-0.002392,-0.001761,-0.000010,0.001679,-0.000035,0.000015,0.000708
4,-0.070465,0.004190,0.013218,0.007852,0.036395,-0.011458,-0.003034,-0.001038,-0.000208,-0.000503,...,-0.001339,-0.032670,0.001845,0.013424,0.009884,-0.000011,-0.005500,0.000201,-0.000083,0.002559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9061,-0.001143,-0.000373,-0.000796,-0.000382,-0.000309,-0.000122,0.000548,-0.000540,-0.000256,-0.000566,...,-0.000111,-0.000135,-0.000119,-0.000172,-0.000157,0.997102,-0.000094,0.000597,-0.000471,-0.000116
9062,-0.000329,0.000348,-0.000912,0.000192,-0.000648,-0.000974,0.001308,0.000245,-0.000561,0.000096,...,-0.000118,-0.000168,0.000323,-0.000211,-0.000183,-0.000079,0.998601,-0.000111,-0.000104,-0.000085
9063,0.004474,0.000291,0.005545,0.002347,-0.006414,-0.004463,-0.006450,-0.002698,-0.011364,0.001351,...,0.000054,-0.000209,-0.000139,-0.000101,-0.000099,0.000566,0.000050,0.937341,0.031528,-0.000367
9064,-0.003442,0.000265,-0.003219,-0.001002,0.003074,0.002193,0.003261,0.000901,0.005286,-0.000692,...,-0.000187,-0.000051,-0.000087,-0.000109,-0.000110,-0.000433,-0.000189,0.031469,0.983927,0.000028


In [15]:
def get_recommends(itemID, Vt, num_recom=2, n_factors=None):
    """Return the indices of the items most similar to ``itemID``.

    Every item is represented by its column of ``Vt`` (equivalently, a row of
    ``Vt.T``) and similarity is the plain dot product between those vectors.
    Items are ranked from most to least similar; ties keep ascending index
    order. The query item itself is never part of the result.

    Parameters
    ----------
    itemID : int
        Positional index of the query item (a column of ``Vt``). Negative
        values count from the end, as in ordinary NumPy indexing.
    Vt : array-like, shape (n_components, n_items)
        Matrix with one column per item, e.g. the right singular vectors
        returned by an SVD.
    num_recom : int, default 2
        Number of item indices to return.
    n_factors : int or None, default None
        When given, only the first ``n_factors`` rows of ``Vt`` (the leading
        latent factors) are used to compare items. ``None`` uses every row.
        Note that if ``Vt`` is a full square orthogonal matrix, using every
        row makes the dot product between two different items ~0, so a
        truncation is needed for the ranking to be meaningful.

    Returns
    -------
    list of int
        Up to ``num_recom`` item indices, most similar first. Empty when
        ``Vt`` contains no items.

    Raises
    ------
    IndexError
        If ``itemID`` is outside the range of available items.
    """
    factors = np.asarray(Vt)
    if n_factors is not None:
        factors = factors[:n_factors]
    item_vectors = factors.T  # one row per item
    n_items = item_vectors.shape[0]

    if n_items == 0:
        return []
    if not -n_items <= itemID < n_items:
        raise IndexError(
            "itemID %r is out of range for %d items" % (itemID, n_items)
        )
    # Normalise a negative index so the query item can be reliably excluded.
    query = itemID % n_items

    # One matrix-vector product yields the similarity to every item at once.
    scores = item_vectors @ item_vectors[query]
    # A stable sort on the negated scores gives descending similarity while
    # keeping ties in ascending index order.
    ranking = np.argsort(-scores, kind='stable')
    ranked_items = [int(idx) for idx in ranking if idx != query]
    return ranked_items[:num_recom]

#### Example: `get_recommends(2, Vt, num_recom=10)`