In [1]:
import pandas as pd
import numpy as np

from surprise import accuracy
from surprise import SVD, NMF
from surprise import Dataset
from surprise import Reader
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV

In [2]:
# Read in the ratings_small (user-item) dataframe
df_ratings = pd.read_csv('data/ratings_10kusers.csv')

In [3]:
df_all = pd.read_csv('data/dataframe_merged.csv', usecols=['id', 'title'])

In [4]:
df_ratings = df_ratings[df_ratings['movieId'].isin(df_all['id'])]

In [5]:
df_ratings.tail(10)

Unnamed: 0,userId,movieId,rating,timestamp
999979,10183,348,3.0,833460122
999980,10183,349,5.0,833459462
999981,10183,350,4.0,833460139
999987,10183,364,5.0,833459740
999993,10183,377,5.0,833460368
999994,10183,378,5.0,833461211
999995,10183,380,4.0,833459432
999996,10183,381,4.0,833460156
999997,10183,410,3.0,833459559
999998,10183,415,4.0,833460564


In [6]:
# Set the reader to have a rating_scale from 1-5 (default)
reader = Reader(rating_scale=(1, 5))

In [7]:
# The data only consists of userId, movieId, and rating
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)

In [8]:
# Use surprise package for a train-test split of 80-20
# Note that the train-test split will split by general rows, not specific users
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=0.20)

In [9]:
# trainsetfull = data.build_full_trainset()

In [10]:
print('Number of users: ', trainset.n_users)
print('Number of movies (items): ', trainset.n_items, '\n')

Number of users:  9913
Number of movies (items):  4819 



In [11]:
trainset_iids = list(trainset.all_items())
iid_converter = lambda x: trainset.to_raw_iid(x)
trainset_raw_iids = list(map(iid_converter, trainset_iids))

In [12]:
trainset_raw_iids

[2097,
 1584,
 150778,
 2294,
 1954,
 1931,
 778,
 4226,
 111,
 253,
 4886,
 2019,
 41566,
 344,
 1831,
 948,
 256,
 454,
 2004,
 505,
 4612,
 1373,
 410,
 547,
 1393,
 47099,
 1278,
 296,
 435,
 867,
 4011,
 260,
 480,
 597,
 27391,
 318,
 104374,
 7450,
 2028,
 339,
 62,
 1391,
 56801,
 527,
 393,
 588,
 1272,
 25,
 1722,
 216,
 5902,
 2657,
 593,
 68954,
 1250,
 1682,
 575,
 49530,
 36529,
 1358,
 315,
 349,
 111759,
 70,
 33166,
 587,
 3146,
 44191,
 222,
 2018,
 153,
 1375,
 1986,
 1923,
 34334,
 3549,
 1996,
 273,
 1283,
 1088,
 55207,
 196,
 6934,
 162,
 171,
 745,
 551,
 1997,
 40629,
 2108,
 594,
 312,
 3052,
 912,
 165,
 345,
 1580,
 1566,
 89904,
 5608,
 150,
 1073,
 586,
 2048,
 31658,
 539,
 1089,
 488,
 3489,
 4639,
 2762,
 858,
 2694,
 266,
 913,
 22,
 379,
 306,
 4975,
 1597,
 923,
 40819,
 2959,
 1721,
 455,
 1629,
 193,
 117,
 231,
 708,
 8368,
 928,
 1378,
 1918,
 3160,
 898,
 110,
 2699,
 2861,
 84954,
 46976,
 1573,
 2291,
 535,
 431,
 69278,
 3527,
 799,
 5,
 471,

In [13]:
from surprise import KNNWithMeans
my_k = 15
my_min_k = 5
my_sim_option = {
    'name':'cosine', 'user_based':False, 'verbose': False
    }
algo = KNNWithMeans(sim_options = my_sim_option)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fa8ae432190>

In [14]:
# Dataframe of all the movies voted for
pd.DataFrame(algo.sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4809,4810,4811,4812,4813,4814,4815,4816,4817,4818
0,1.000000,0.976028,0.0,0.955760,0.962836,0.947046,0.973704,0.960718,0.974666,0.956293,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
1,0.976028,1.000000,1.0,0.941341,0.951855,0.949505,0.947732,0.963034,0.952646,0.951597,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,0.000000,1.000000,1.0,1.000000,0.000000,0.000000,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.955760,0.941341,1.0,1.000000,0.949118,0.985153,0.952087,0.966832,0.938414,0.952920,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
4,0.962836,0.951855,0.0,0.949118,1.000000,0.967832,0.956871,0.957338,0.960400,0.947764,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4814,1.000000,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
4815,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4816,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4817,1.000000,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0


In [15]:
# Same dataframe as algo.sim but the indices/columns are now movieId
df_cos_surprise = pd.DataFrame(algo.sim, index=trainset_raw_iids, columns=trainset_raw_iids)

df_cos_surprise

Unnamed: 0,2097,1584,150778,2294,1954,1931,778,4226,111,253,...,5857,77852,34482,5842,31598,44124,34729,113432,60309,8691
2097,1.000000,0.976028,0.0,0.955760,0.962836,0.947046,0.973704,0.960718,0.974666,0.956293,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
1584,0.976028,1.000000,1.0,0.941341,0.951855,0.949505,0.947732,0.963034,0.952646,0.951597,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
150778,0.000000,1.000000,1.0,1.000000,0.000000,0.000000,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2294,0.955760,0.941341,1.0,1.000000,0.949118,0.985153,0.952087,0.966832,0.938414,0.952920,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
1954,0.962836,0.951855,0.0,0.949118,1.000000,0.967832,0.956871,0.957338,0.960400,0.947764,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44124,1.000000,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
34729,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
113432,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
60309,1.000000,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0


In [16]:
df_all

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II
...,...,...
46623,439050,Subdue
46624,111109,Century of Birthing
46625,67758,Betrayal
46626,227506,Satan Triumphant


In [17]:
df_all = df_all.reset_index()
df_all.index = df_all.id

In [18]:
df_all

Unnamed: 0_level_0,index,id,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
862,0,862,Toy Story
8844,1,8844,Jumanji
15602,2,15602,Grumpier Old Men
31357,3,31357,Waiting to Exhale
11862,4,11862,Father of the Bride Part II
...,...,...,...
439050,46623,439050,Subdue
111109,46624,111109,Century of Birthing
67758,46625,67758,Betrayal
227506,46626,227506,Satan Triumphant


In [19]:
movieIdtoindex = df_all['index'].to_dict()

In [20]:
df_cos_surprise = df_cos_surprise.rename(index=movieIdtoindex, columns=movieIdtoindex)

In [21]:
df_cos_surprise

Unnamed: 0,8694,6783,15644,4645,7170,11607,7125,26195,4169,2902,...,12069,31674,13926,7448,6111,43839,34333,18078,17817,5635
8694,1.000000,0.976028,0.0,0.955760,0.962836,0.947046,0.973704,0.960718,0.974666,0.956293,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
6783,0.976028,1.000000,1.0,0.941341,0.951855,0.949505,0.947732,0.963034,0.952646,0.951597,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
15644,0.000000,1.000000,1.0,1.000000,0.000000,0.000000,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4645,0.955760,0.941341,1.0,1.000000,0.949118,0.985153,0.952087,0.966832,0.938414,0.952920,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
7170,0.962836,0.951855,0.0,0.949118,1.000000,0.967832,0.956871,0.957338,0.960400,0.947764,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43839,1.000000,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
34333,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
18078,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
17817,1.000000,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0


In [31]:
# Make a pandas dataframe of movie x movie length from df_all
# Fill in the values from matrix 'algo.sim'
# Set the diagonal to "1"
df_blank = pd.DataFrame(np.nan, range(1,len(df_all)), range(1,len(df_all)))
df_blank

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,46618,46619,46620,46621,46622,46623,46624,46625,46626,46627
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46623,,,,,,,,,,,...,,,,,,,,,,
46624,,,,,,,,,,,...,,,,,,,,,,
46625,,,,,,,,,,,...,,,,,,,,,,
46626,,,,,,,,,,,...,,,,,,,,,,


In [32]:
df_blank = df_cos_surprise.combine_first(df_blank)

In [33]:
df_blank

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46618,46619,46620,46621,46622,46623,46624,46625,46626,46627
0,1.0,0.0,,,,0.989949,,,,1.0,...,,,,,,,,,,
1,0.0,1.0,,,,0.998274,,,,1.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46623,,,,,,,,,,,...,,,,,,,,,,
46624,,,,,,,,,,,...,,,,,,,,,,
46625,,,,,,,,,,,...,,,,,,,,,,
46626,,,,,,,,,,,...,,,,,,,,,,


In [34]:
# # Assign the diagonals as 1
# df_blank.values[[np.arange(df_blank.shape[0])]*2] = 1

In [36]:
np.fill_diagonal(df_blank.values, 1)
df_blank

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46618,46619,46620,46621,46622,46623,46624,46625,46626,46627
0,1.0,0.0,,,,0.989949,,,,1.0,...,,,,,,,,,,
1,0.0,1.0,,,,0.998274,,,,1.0,...,,,,,,,,,,
2,,,1.0,,,,,,,,...,,,,,,,,,,
3,,,,1.0,,,,,,,...,,,,,,,,,,
4,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46623,,,,,,,,,,,...,,,,,,1.0,,,,
46624,,,,,,,,,,,...,,,,,,,1.0,,,
46625,,,,,,,,,,,...,,,,,,,,1.0,,
46626,,,,,,,,,,,...,,,,,,,,,1.0,


In [37]:
df_bank = df_blank.fillna(0)

In [38]:
df_blank

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46618,46619,46620,46621,46622,46623,46624,46625,46626,46627
0,1.0,0.0,,,,0.989949,,,,1.0,...,,,,,,,,,,
1,0.0,1.0,,,,0.998274,,,,1.0,...,,,,,,,,,,
2,,,1.0,,,,,,,,...,,,,,,,,,,
3,,,,1.0,,,,,,,...,,,,,,,,,,
4,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46623,,,,,,,,,,,...,,,,,,1.0,,,,
46624,,,,,,,,,,,...,,,,,,,1.0,,,
46625,,,,,,,,,,,...,,,,,,,,1.0,,
46626,,,,,,,,,,,...,,,,,,,,,1.0,


In [39]:
np.save('cosine_similarity/cos_ratings.npy', df_blank)