## Recommender: Collaborative Filtering

This notebook shows the construction of a recommender system purely built from collaborative filtering.

In [21]:
###############
### IMPORTS ###
###############

# Calculating SVD matrix is too large so use Dask
import dask
import dask.dataframe as dd
import dask.array as da

import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

In [125]:
df_all = dd.read_csv('data/dataframe_merged.csv')

In [126]:
print('Shape of dataframe: ', df_all.shape)
print('Columns of dataframe: ', df_all.columns)

Shape of dataframe:  (Delayed('int-6f7d8303-6a4b-43b3-b28a-426454297a08'), 28)
Columns of dataframe:  Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director'],
      dtype='object')


I am interested in looking at the ratings dataset, where users and their ratings are mapped to movieIds. I will also load in the titles, so that I can refer to this to map the index to movie title.

In [127]:
df_titles = pd.read_csv('data/dataframe_merged.csv', usecols=['title', 'id'])

In [128]:
df_titles

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II
...,...,...
46623,439050,Subdue
46624,111109,Century of Birthing
46625,67758,Betrayal
46626,227506,Satan Triumphant


In [81]:
df_ratings = pd.read_csv('data/ratings_small.csv')

In [82]:
df_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [83]:
df_movie_user = df_ratings.pivot_table(index='userId', columns='movieId', values='rating')
print('Shape of df_movie_user', df_movie_user.shape)

Shape of df_movie_user (671, 9066)


In [84]:
df_movie_user

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,,,,,,4.0,,,,,...,,,,,,,,,,
668,,,,,,,,,,,...,,,,,,,,,,
669,,,,,,,,,,,...,,,,,,,,,,
670,4.0,,,,,,,,,,...,,,,,,,,,,


In [85]:
# Transpose matrix, fill in NaN with means, then transpose again
df_movie_user_imp = df_movie_user.T.fillna(df_movie_user.mean(axis=1)).T

In [86]:
df_movie_user_imp

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,...,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000,2.550000
2,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,4.000000,...,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842,3.486842
3,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,...,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627,3.568627
4,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.000000,...,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039,4.348039
5,3.910000,3.910000,4.000000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,...,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000,3.910000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,3.647059,3.647059,3.647059,3.647059,3.647059,4.000000,3.647059,3.647059,3.647059,3.647059,...,3.647059,3.647059,3.647059,3.647059,3.647059,3.647059,3.647059,3.647059,3.647059,3.647059
668,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,...,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000,3.750000
669,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,...,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351,3.351351
670,4.000000,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,...,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452,3.806452


In [87]:
# Compute similarity between all users
cos_similar = cosine_similarity(df_movie_user_imp.values)

In [88]:
# Cosine similarity matrix between all users
pd.DataFrame(cos_similar)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,661,662,663,664,665,666,667,668,669,670
0,1.000000,0.999597,0.999754,0.999344,0.999758,0.999530,0.999495,0.999605,0.999750,0.999721,...,0.999664,0.999853,0.999216,0.997895,0.999608,0.999676,0.999749,0.999736,0.999703,0.999630
1,0.999597,1.000000,0.999604,0.999189,0.999614,0.999380,0.999402,0.999467,0.999594,0.999568,...,0.999505,0.999700,0.999072,0.997658,0.999474,0.999477,0.999596,0.999557,0.999514,0.999514
2,0.999754,0.999604,1.000000,0.999359,0.999761,0.999512,0.999555,0.999622,0.999738,0.999739,...,0.999685,0.999863,0.999253,0.997932,0.999667,0.999714,0.999763,0.999755,0.999707,0.999663
3,0.999344,0.999189,0.999359,1.000000,0.999360,0.999142,0.999170,0.999243,0.999329,0.999319,...,0.999267,0.999450,0.998834,0.997679,0.999219,0.999298,0.999354,0.999335,0.999310,0.999253
4,0.999758,0.999614,0.999761,0.999360,1.000000,0.999540,0.999531,0.999604,0.999755,0.999741,...,0.999690,0.999866,0.999219,0.997942,0.999607,0.999683,0.999744,0.999737,0.999707,0.999635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,0.999676,0.999477,0.999714,0.999298,0.999683,0.999460,0.999480,0.999546,0.999683,0.999663,...,0.999594,0.999783,0.999162,0.997849,0.999581,1.000000,0.999706,0.999652,0.999630,0.999580
667,0.999749,0.999596,0.999763,0.999354,0.999744,0.999529,0.999521,0.999612,0.999755,0.999731,...,0.999665,0.999858,0.999233,0.997923,0.999636,0.999706,1.000000,0.999726,0.999730,0.999647
668,0.999736,0.999557,0.999755,0.999335,0.999737,0.999505,0.999519,0.999579,0.999718,0.999695,...,0.999642,0.999831,0.999211,0.997958,0.999581,0.999652,0.999726,1.000000,0.999687,0.999629
669,0.999703,0.999514,0.999707,0.999310,0.999707,0.999486,0.999474,0.999602,0.999720,0.999676,...,0.999646,0.999812,0.999180,0.997893,0.999520,0.999630,0.999730,0.999687,1.000000,0.999608


In [89]:
# Create a matrix of dimensions cos_similar x cos_similar filled with diagonal 1's
diagonal_one_mat = np.zeros((len(cos_similar), len(cos_similar)), int)
np.fill_diagonal(diagonal_one_mat, 1)
diagonal_one_mat

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [90]:
# Subtract cos_similar by the diagonal matrix
cos_similar = cos_similar - diagonal_one_mat

In [91]:
pd.DataFrame(cos_similar)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,661,662,663,664,665,666,667,668,669,670
0,7.815970e-14,9.995967e-01,9.997541e-01,9.993438e-01,9.997584e-01,0.999530,0.999495,0.999605,0.999750,0.999721,...,0.999664,0.999853,0.999216,0.997895,0.999608,9.996762e-01,9.997487e-01,9.997360e-01,9.997026e-01,9.996299e-01
1,9.995967e-01,-1.310063e-14,9.996037e-01,9.991885e-01,9.996137e-01,0.999380,0.999402,0.999467,0.999594,0.999568,...,0.999505,0.999700,0.999072,0.997658,0.999474,9.994766e-01,9.995960e-01,9.995572e-01,9.995137e-01,9.995137e-01
2,9.997541e-01,9.996037e-01,1.061373e-13,9.993590e-01,9.997606e-01,0.999512,0.999555,0.999622,0.999738,0.999739,...,0.999685,0.999863,0.999253,0.997932,0.999667,9.997135e-01,9.997631e-01,9.997553e-01,9.997075e-01,9.996631e-01
3,9.993438e-01,9.991885e-01,9.993590e-01,2.775558e-14,9.993597e-01,0.999142,0.999170,0.999243,0.999329,0.999319,...,0.999267,0.999450,0.998834,0.997679,0.999219,9.992982e-01,9.993537e-01,9.993355e-01,9.993101e-01,9.992533e-01
4,9.997584e-01,9.996137e-01,9.997606e-01,9.993597e-01,1.998401e-14,0.999540,0.999531,0.999604,0.999755,0.999741,...,0.999690,0.999866,0.999219,0.997942,0.999607,9.996828e-01,9.997441e-01,9.997373e-01,9.997069e-01,9.996348e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,9.996762e-01,9.994766e-01,9.997135e-01,9.992982e-01,9.996828e-01,0.999460,0.999480,0.999546,0.999683,0.999663,...,0.999594,0.999783,0.999162,0.997849,0.999581,-1.216804e-13,9.997058e-01,9.996525e-01,9.996304e-01,9.995797e-01
667,9.997487e-01,9.995960e-01,9.997631e-01,9.993537e-01,9.997441e-01,0.999529,0.999521,0.999612,0.999755,0.999731,...,0.999665,0.999858,0.999233,0.997923,0.999636,9.997058e-01,4.884981e-15,9.997261e-01,9.997303e-01,9.996466e-01
668,9.997360e-01,9.995572e-01,9.997553e-01,9.993355e-01,9.997373e-01,0.999505,0.999519,0.999579,0.999718,0.999695,...,0.999642,0.999831,0.999211,0.997958,0.999581,9.996525e-01,9.997261e-01,-6.517009e-14,9.996867e-01,9.996289e-01
669,9.997026e-01,9.995137e-01,9.997075e-01,9.993101e-01,9.997069e-01,0.999486,0.999474,0.999602,0.999720,0.999676,...,0.999646,0.999812,0.999180,0.997893,0.999520,9.996304e-01,9.997303e-01,9.996867e-01,1.099121e-13,9.996078e-01


In [92]:
# Number of similar users for recommendation
n_recommendation = 100

# Plot top n recommendations
n_plot = 10


In [93]:
# User index for recommendation
user_index = 0

In [99]:
# Create an array of lowest to highest cosine similarity by UserID
similar_user_index = np.argsort(cos_similar[user_index])[::-1]

# Create an array of lowest to highest cosine similarity by cosine similarity score
similar_user_score = np.sort(cos_similar[user_index])[::-1]

In [115]:
similar_user_index

array([ 45, 445, 442, 437,  39, 447, 538, 525, 662, 279, 112,  15, 582,
       228, 565, 180, 126, 108,  70, 448,  75,  23, 621, 434, 412, 555,
       511, 453, 297, 318, 402, 505, 520, 494, 487, 313, 631, 217, 346,
       458, 103, 153, 232, 632, 328, 367, 356, 248, 224,  63, 220, 629,
       319, 171, 636,  99, 650, 322, 656, 257, 615, 502, 514, 172, 566,
        30,  88, 512, 463, 230, 484, 245, 301, 414, 588, 266, 634, 333,
       306, 542, 299, 510, 483, 332,  61, 215,  36, 209, 210, 202, 350,
        37, 334,  46, 114, 488, 444, 193, 599, 454, 397, 208, 113, 144,
       166, 531, 376,  43,  12, 348,  64, 371, 223,  26, 111, 477,  10,
       635, 251, 273, 207, 489, 539,  83, 316, 192, 418, 128, 516, 135,
       255, 559, 351, 146, 161, 179, 657, 205, 436, 368, 590, 157, 473,
       214,  13, 644,  49, 263, 272, 337, 326, 537, 567,  59, 265, 605,
       630, 570, 659,   4,  89, 617, 575, 100, 229, 641, 600, 170, 564,
         2, 295, 339, 271, 396,   8, 611, 667, 468, 189, 288, 35

In [116]:
similar_user_score

array([9.99870079e-01, 9.99863821e-01, 9.99863614e-01, 9.99861882e-01,
       9.99857664e-01, 9.99854284e-01, 9.99853494e-01, 9.99853106e-01,
       9.99852959e-01, 9.99851270e-01, 9.99846736e-01, 9.99846182e-01,
       9.99845850e-01, 9.99845529e-01, 9.99845376e-01, 9.99844379e-01,
       9.99843094e-01, 9.99842265e-01, 9.99841532e-01, 9.99840418e-01,
       9.99838467e-01, 9.99837680e-01, 9.99837624e-01, 9.99836293e-01,
       9.99834104e-01, 9.99833351e-01, 9.99832798e-01, 9.99832380e-01,
       9.99831343e-01, 9.99831250e-01, 9.99831146e-01, 9.99830906e-01,
       9.99827509e-01, 9.99827381e-01, 9.99826736e-01, 9.99824199e-01,
       9.99823467e-01, 9.99823130e-01, 9.99821176e-01, 9.99820497e-01,
       9.99820215e-01, 9.99820007e-01, 9.99819957e-01, 9.99819585e-01,
       9.99819330e-01, 9.99819220e-01, 9.99819109e-01, 9.99818766e-01,
       9.99818707e-01, 9.99818086e-01, 9.99817822e-01, 9.99817665e-01,
       9.99816807e-01, 9.99816370e-01, 9.99816090e-01, 9.99815987e-01,
      

In [118]:
# Get unrated movies that user_index did not rate
unrated_movies = df_movie_user.iloc[user_index][df_movie_user.iloc[user_index].isna()].index

In [119]:
pd.DataFrame(unrated_movies)

Unnamed: 0,movieId
0,1
1,2
2,3
3,4
4,5
...,...
9041,161944
9042,162376
9043,162542
9044,162672


In [121]:
# Weight ratings of the top n most similar users with their rating and compute the mean for each movie
mean_movie_recommendations = (df_movie_user_imp.iloc[similar_user_index[:n_recommendation]].T * similar_user_score[:n_recommendation]).T.mean(axis=0)

In [122]:
mean_movie_recommendations

movieId
1         4.037653
2         4.020091
3         4.016888
4         3.994663
5         3.982460
            ...   
161944    4.010957
162376    4.010957
162542    4.010957
162672    4.010957
163949    4.010957
Length: 9066, dtype: float64

In [129]:
# Filter for unrated movies and sort results
best_movie_recommendations = mean_movie_recommendations[unrated_movies].sort_values(ascending=False).to_frame().join(df_titles['title'])

In [130]:
best_movie_recommendations

Unnamed: 0_level_0,0,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
318,4.208256,Swimming with Sharks
296,4.146551,Quiz Show
858,4.122750,Gone Fishin'
608,4.101979,The Aristocats
260,4.101575,L'Enfer
...,...,...
208,3.964207,The Browning Version
65,3.964003,Lawnmower Man 2: Beyond Cyberspace
344,3.943062,Clear and Present Danger
231,3.934611,Ed Wood


In [132]:
# Create user-id mapping
user_id_mapping = {id:i for i, id in enumerate(df_movie_user_imp.index)}

In [133]:
user_id_mapping

{1: 0,
 2: 1,
 3: 2,
 4: 3,
 5: 4,
 6: 5,
 7: 6,
 8: 7,
 9: 8,
 10: 9,
 11: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 17: 16,
 18: 17,
 19: 18,
 20: 19,
 21: 20,
 22: 21,
 23: 22,
 24: 23,
 25: 24,
 26: 25,
 27: 26,
 28: 27,
 29: 28,
 30: 29,
 31: 30,
 32: 31,
 33: 32,
 34: 33,
 35: 34,
 36: 35,
 37: 36,
 38: 37,
 39: 38,
 40: 39,
 41: 40,
 42: 41,
 43: 42,
 44: 43,
 45: 44,
 46: 45,
 47: 46,
 48: 47,
 49: 48,
 50: 49,
 51: 50,
 52: 51,
 53: 52,
 54: 53,
 55: 54,
 56: 55,
 57: 56,
 58: 57,
 59: 58,
 60: 59,
 61: 60,
 62: 61,
 63: 62,
 64: 63,
 65: 64,
 66: 65,
 67: 66,
 68: 67,
 69: 68,
 70: 69,
 71: 70,
 72: 71,
 73: 72,
 74: 73,
 75: 74,
 76: 75,
 77: 76,
 78: 77,
 79: 78,
 80: 79,
 81: 80,
 82: 81,
 83: 82,
 84: 83,
 85: 84,
 86: 85,
 87: 86,
 88: 87,
 89: 88,
 90: 89,
 91: 90,
 92: 91,
 93: 92,
 94: 93,
 95: 94,
 96: 95,
 97: 96,
 98: 97,
 99: 98,
 100: 99,
 101: 100,
 102: 101,
 103: 102,
 104: 103,
 105: 104,
 106: 105,
 107: 106,
 108: 107,
 109: 108,
 110: 109,
 111: 11

In [138]:
prediction = []
# Iterate over all testset items
for user_id in df_ratings['userId'].unique():
    
    # Sort similar users by index
    similar_user_index = np.argsort(cos_similar[user_id_mapping[user_id]])[::-1]
    # Sort similar users by score
    similar_user_score = np.sort(cos_similar[user_id_mapping[user_id]])[::-1]
    
    for movie_id in df_ratings[df_ratings['userId']==user_id]['movieId'].values:

        # Compute predicted score
        score = (df_movie_user_imp.iloc[similar_user_index[:n_recommendation]][movie_id] * similar_user_score[:n_recommendation]).values.sum() / similar_user_score[:n_recommendation].sum()
        prediction.append([user_id, movie_id, score])

In [139]:
prediction

[[1, 31, 4.012361484663676],
 [1, 1029, 4.011681355578426],
 [1, 1061, 4.009736910165526],
 [1, 1129, 3.9999316775625067],
 [1, 1172, 4.054756618892538],
 [1, 1263, 4.011681355578426],
 [1, 1287, 4.010682546964908],
 [1, 1293, 4.0194591815346135],
 [1, 1339, 4.011530219087025],
 [1, 1343, 4.0120980271155435],
 [1, 1371, 4.011681355578426],
 [1, 1405, 4.012065971174006],
 [1, 1953, 4.011681355578426],
 [1, 2105, 4.011681355578426],
 [1, 2150, 4.016931286286931],
 [1, 2193, 4.011681355578426],
 [1, 2294, 4.011681355578426],
 [1, 2455, 4.011681355578426],
 [1, 2968, 4.010923760067469],
 [1, 3671, 4.008681298545642],
 [2, 10, 3.9880694236485663],
 [2, 17, 4.013837879415157],
 [2, 39, 4.007761268244154],
 [2, 47, 4.0428827704686805],
 [2, 50, 4.046110977376314],
 [2, 52, 4.002597949344528],
 [2, 62, 4.003168329356673],
 [2, 110, 4.077847602260313],
 [2, 144, 3.9979407806311467],
 [2, 150, 4.025262990298183],
 [2, 153, 3.9768233681124503],
 [2, 161, 4.006041838794559],
 [2, 165, 3.9554160499

In [144]:
# Create prediction DataFrame
df_pred = pd.DataFrame(prediction, columns=['User', 'Movie', 'Prediction']).set_index(['User', 'Movie'])
df_pred = df_ratings.set_index(['userId', 'movieId'])

In [145]:
df_pred

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1
1,31,2.5,1260759144
1,1029,3.0,1260759179
1,1061,3.0,1260759182
1,1129,2.0,1260759185
1,1172,4.0,1260759205
...,...,...,...
671,6268,2.5,1065579370
671,6269,4.0,1065149201
671,6365,4.0,1070940363
671,6385,2.5,1070979663
