In [21]:
import pandas as pd
import numpy as np
from datetime import date
from scipy.sparse import coo_matrix

data_df = pd.read_csv('./ratings.csv', sep=',', names=["UserID", "MovieID", "Rating", "Timestamp"],dtype={'Rating':'float64'})
 #dtype={'UserID':'int64','MovieID':'int64','Rating':'float64','Timestamp':'float'}
#dataframe object with movie information
movie_data_df = pd.read_csv('./movies_metadata.csv', sep=',', names=["movieId", "title", "genres"])

#data_df['Timestamp'] = pd.to_datetime(data_df['Timestamp'])
data_df['Timestamp'] = pd.to_datetime(data_df['Timestamp'], unit='s')
cutoff_date = "2017-06-01" # this was the earliest i could do without a memory error
data_df = data_df[data_df['Timestamp'] >= cutoff_date]
            

data_df.to_csv('filtered_ratings.csv', index=False)
# First, generate dictionaries for mapping old id to new id for users and movies
unique_MovieID = data_df['MovieID'].unique()
unique_UserID = data_df['UserID'].unique()
j = 0
user_old2new_id_dict = dict()
for u in unique_UserID:
    user_old2new_id_dict[u] = j
    j += 1
j = 0
movie_old2new_id_dict = dict()
for i in unique_MovieID:
    movie_old2new_id_dict[i] = j
    j += 1
    
# Then, use the generated dictionaries to reindex UserID and MovieID in the data_df
user_list = data_df['UserID'].values
movie_list = data_df['MovieID'].values
for j in range(len(data_df)):
    user_list[j] = user_old2new_id_dict[user_list[j]]
    movie_list[j] = movie_old2new_id_dict[movie_list[j]]
data_df['UserID'] = user_list
data_df['movieID'] = movie_list

# generate train_df with 70% samples and test_df with 30% samples, and there should have no overlap between them.
train_index = np.random.random(len(data_df)) <= 0.7
train_df = data_df[train_index]
test_df = data_df[~train_index]

# generate train_mat and test_mat
num_user = len(data_df['UserID'].unique())
num_movie = len(data_df['MovieID'].unique())
num_ratings = len(data_df['Rating'])
print(num_ratings)

print(train_df.info())
# train_mat = train_df[['Rating', 'UserID']].copy()
# test_mat = test_df[['Rating', 'UserID']].copy()
train_mat = coo_matrix((train_df['Rating'].values, (train_df['UserID'].values, train_df['MovieID'].values)), shape=(num_user, num_movie)).astype(float).toarray()
test_mat = coo_matrix((test_df['Rating'].values, (test_df['UserID'].values, test_df['MovieID'].values)), shape=(num_user, num_movie)).astype(float).toarray()

356861
<class 'pandas.core.frame.DataFrame'>
Int64Index: 250042 entries, 2806 to 26023517
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   UserID     250042 non-null  int64         
 1   MovieID    250042 non-null  int64         
 2   Rating     250042 non-null  float64       
 3   Timestamp  250042 non-null  datetime64[ns]
 4   movieID    250042 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 11.4 MB
None


In [2]:
train_mat = (train_mat > 0).astype(float)
test_mat = (test_mat > 0).astype(float)

print(train_mat)

[[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 1. 1.]]


user-user Collaborative Filtering with implicit feedback (from hw2) SWITCH TO ITEM-ITEM

In [4]:
#implicit user user collab filtering

# numer = np.matmul(train_mat, train_mat.T)
# denom = np.sum(train_mat ** 2, axis=1, keepdims=True) ** 0.5
# Cosine = numer / np.matmul(denom, denom.T)

# calculating cosinne similarity between items
num_rating_items = np.sum(train_mat, axis=0, keepdims=True)
numer = np.matmul(train_mat.T, train_mat)  # num_item * num_item
denom = num_rating_items.T + num_rating_items - numer  # num_item * num_item
denom[denom == 0] = 1
cosine_sim_mat = numer / denom  # num_item * num_item
print(cosine_sim_mat)

[[1.         0.13129103 0.08258929 ... 0.         0.         0.        ]
 [0.13129103 1.         0.09465021 ... 0.         0.         0.        ]
 [0.08258929 0.09465021 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]
 [0.         0.         0.         ... 1.         1.         1.        ]]


In [16]:
#use cosine sim mat to give recommendations

#track which items the user likes so we can find similar items
user_train_like = []
for u in range(num_user):
    user_train_like.append(np.where(train_mat[u,:] > 0)[0])

# use equation to get a predicted preference score based on similarity between items 
prediction_mat = train_mat.copy()
for i in range(num_movie):
    similarities = cosine_sim_mat[i, :]
    similarities[i] = -1
    N_idx = np.argpartition(similarities, -10)[-10:]
    N_sim = similarities[N_idx]
    prediction_mat[:, i] = np.sum(N_sim.reshape((1, -1)) * train_mat[:, N_idx], axis=1) / (np.sum(N_sim) + 1e-10)
    
print(prediction_mat[0])

recommendation = [] # will append each user's top 50 movies here
for u in range(num_user):
    train_like = user_train_like[u]
    prediction_mat[u][train_like] = -9999 # don't recommend what user already liked
    top50_iid = np.argpartition(prediction_mat[u], -50)[-50:]
    top50_iid = top50_iid[np.argsort(prediction_mat[u][top50_iid])[-1::-1]]
    recommendation.append(top50_iid)
    
print(recommendation[0]) #top 50 movies for user 0

[0.40739744 0.43044576 0.40008909 ... 0.         0.         0.        ]
[17711 13992 14343   393   178    23   330   501   145    71   351    37
    66    34   187   159   184   156   269   127     7   551   790   461
   439  1106   311   173   907    70   472   179   144  2209   241   506
   835   541   441   227    40   727    48  2132   225   546   962    52
    80   181]


In [28]:
# use the prediction mat to recommend movies to the group

user_group = [750, 50, 50] # FIXME - change this to ID of whoever is in group instead of hardcode

# get the groups average predicted preference for each movie
total_preference = []

for i in range(num_movie):
    total_preference = 0.0
    for u in user_group:
        rating_sum+=prediction_mat[u][i]
    
    average_ratings.append(rating_sum)

top_50 = np.argsort(average_ratings)[-50:] # get top 50 movies
print(top_50)
    

[  559   132   127   204   526   536   202   210   250   606   714   727
     5   688   255  2026    34   790   476   134  2017    35   837   258
   209   506  2132  1096 18228   632    44   118   603   246   812   104
   786    81   574  1830   218   652    99   224   344 11077   245   177
   386    37]


In [17]:
# Calculate recall@k, precision@k with k=5, 20, 50 and print out the average over all users for these 6 metrics.
# Your Code Here...

user_test_like = []
for u in range(num_user):
    user_test_like.append(np.where(test_mat[u, :] > 0)[0])
    
recalls = np.zeros(3)
precisions = np.zeros(3)
user_count = 0.

for u in range(num_user):
    test_like = user_test_like[u]
    test_like_num = len(test_like)
    if test_like_num == 0:
        continue
    rec = recommendation[u]
    hits = np.zeros(3)
    for k in range(50):
        if rec[k] in test_like:
            if k < 50:
                hits[2] += 1
                if k < 20:
                    hits[1] += 1
                    if k < 5:
                        hits[0] += 1
    recalls[0] += (hits[0] / test_like_num)
    recalls[1] += (hits[1] / test_like_num)
    recalls[2] += (hits[2] / test_like_num)
    precisions[0] += (hits[0] / 5.)
    precisions[1] += (hits[1] / 20.)
    precisions[2] += (hits[2] / 50.)
    user_count += 1

recalls /= user_count
precisions /= user_count

print('recall@5\t[%.6f],\t||\t recall@20\t[%.6f],\t||\t recall@50\t[%.6f]' % (recalls[0], recalls[1], recalls[2]))
print('precision@5\t[%.6f],\t||\t precision@20\t[%.6f],\t||\t precision@50\t[%.6f]' % (precisions[0], precisions[1], precisions[2]))

recall@5	[0.073838],	||	 recall@20	[0.178107],	||	 recall@50	[0.264495]
precision@5	[0.137508],	||	 precision@20	[0.114519],	||	 precision@50	[0.085970]
