# Import libraries

In [1]:
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import correlation, cosine
from scipy.sparse import csr_matrix
import random
import warnings
warnings.filterwarnings('ignore')
# import surprise

# Import the dataset

In [2]:
r_cols = ['user_id', 'item_id', 'rating', 'timestamp']
df0 = pd.read_csv('u.data',  sep='\t', names=r_cols, encoding='latin-1')

# creating a new copy so that any changes to original data 'u.item' will not affect the train and test set below
full_data = df0.copy()

full_data[['user_id', 'item_id', 'rating']] = full_data[['user_id', 'item_id', 'rating']].astype(dtype = 'int64')

# sort by timestamp for splitting into training and testing data set later
full_data = full_data.sort_values(by = 'timestamp')

# Splitting the datasets

Splitting the dataset in the way that users' latest ratings are in the testing set. 

In [3]:
for i in set(list(full_data.user_id)):
    sub_df = full_data[full_data.user_id == i]
    len_df = sub_df.shape[0]
    if i == 1:        
        movieTrain = sub_df.head(round(len_df*0.7))
        movieTest = sub_df.tail(len_df - round(len_df*0.7))
    else:
        movieTrain = pd.concat([movieTrain, sub_df.head(round(len_df*0.7))])
        movieTest = pd.concat([movieTest, sub_df.tail(len_df - round(len_df*0.7))])

In [4]:
# build the user-item matrix based on training dataset

# item-based
UIM_train = movieTrain.pivot_table(index='user_id', columns='item_id', values='rating')

# user_based
IUM_train = UIM_train.T

# Question 1Build a simple version of the collaborative filter in such a way that outputs the mean rating for the movie by all the users who have rated it. Note that here the ratings of each user is assigned an equal weight. If some movies are available only in the test set and not in the training set, assign the default rating of 3.0. What is the RMSE score obtained by this model?

In [5]:
# minimum number of ratings of each movie
t = 20

# getting the mean rating of each movie in training set
train_mean = movieTrain.groupby('item_id', sort=True).agg(rating_mean=('rating', 'mean'), 
                                                          frequency=('rating', 'count')).reset_index()
train_mean_filtered = train_mean[train_mean['frequency'] > t]


# setting the default rating
def_rating = 3.0

# left join testing set with training set mean rating
merge_test = pd.merge(movieTest, train_mean, on="item_id", how='left')

# replace the NaN values in mean rating column due to left join (certain movies in testing set are NOT in training set)
merge_test = merge_test.fillna(value={'rating_mean': def_rating})

# left join testing set with training set mean rating
merge_test0 = pd.merge(movieTest, train_mean_filtered, on="item_id", how='left')

# replace the NaN values in mean rating column due to left join (certain movies in testing set are NOT in training set)
merge_test0 = merge_test0.fillna(value={'rating_mean': def_rating})

y_true1 = merge_test['rating']
y_hat1 = merge_test['rating_mean']

y_true01 = merge_test0['rating']
y_hat01 = merge_test0['rating_mean']

rmse1 = sqrt(mean_squared_error(y_true1, y_hat1))
print("RMSE using mean as predictor (before filter):%f" %rmse1)

rmse01 = sqrt(mean_squared_error(y_true01, y_hat01))
print("RMSE using mean as predictor (after filter):%f" %rmse01)

RMSE using mean as predictor (before filter):1.068078
RMSE using mean as predictor (after filter):1.064576


# Question 2

Defining functions for prediction, finding k nearest neighbors, RSME

In [6]:
def prediction(userid, itemid, IUM, mean, PCC, k=10):
    if itemid in IUM.index:
        # users in training set who rated movie 'itemid' and the item's ratings
        rated_users = pd.DataFrame(IUM.loc[itemid].dropna())
        rated_users.columns = ['rating']
        
        # min possible k nearest neighbors, adjustment if list of rated users less than input k
        k = min(k, len(rated_users))
        
        # pass only the similarities of the users who rated target item into next function
        pearson_mat_all = PCC[userid].loc[rated_users.index]
        # filter out negative similarities
        pearson_mat_all = pearson_mat_all[pearson_mat_all > 0]
        
        # getting sum of similarities and list of k nearest neighbors
        list_of_sim, sum_sim, knn_users_list = findksimilarusers(userid, itemid, IUM, pearson_mat_all, k)
        
        # target user mean rating
        target_user_mean_rating = mean.loc[userid]
        
        # nearest neigbors mean ratings
        knn_user_mean_rating = mean.loc[knn_users_list]
        
        # weighted ratings of the selected item for nearest neigbors
        centered_ratings = (rated_users.loc[knn_users_list]['rating'] - knn_user_mean_rating['rating_mean']).values
        
        # numerator
        numer = np.dot(list_of_sim.values, centered_ratings)
        
        y_hat2 = target_user_mean_rating + (numer / sum_sim)
        
        return y_hat2[0]
              
    else:
        return 3.0
    
    
def findksimilarusers(userid, itemid, IUM, corr, k=10): 
    # sort descending without 'userid' input
    pearson_mat = corr.sort_values(ascending=False)
    
    # nearest k neighbors
    pearson_mat = pearson_mat.head(k)
    
    # sum of the similarities
    sum_sim = pearson_mat.sum()
    
    # userid of the k nearest neighbors
    knn_users = pearson_mat.index.values
    
    return pearson_mat, sum_sim, knn_users

In [7]:
def rmse_filtered(y_hat_arr, y_true_arr):
    gate = {'predicted_rating': y_hat_arr, 'actual_rating': y_true_arr}
    df_yy = pd.DataFrame(data=gate)
    df_yy = df_yy.dropna()
    
    rmse = sqrt(mean_squared_error(df_yy['actual_rating'], df_yy['predicted_rating']))
    
    return rmse

In [None]:
Prediction starts here

In [8]:
%%time

# training dataset for checking
trainingset = movieTrain['item_id']

# mean ratings of user
train_user_mean = movieTrain.groupby('user_id', sort=True).agg(rating_mean=('rating', 'mean'))

# compute the Pearson Correlation Coefficient
pearson_mat_all = IUM_train.corr(method='pearson')

UI_pairs = zip(movieTest['user_id'].values, movieTest['item_id'].values)
y_hat2 = []

for j,k in UI_pairs:
    y_hat2.append(prediction(j, k, IUM_train, train_user_mean, pearson_mat_all))

y_hat2 = np.array(y_hat2)
y_true2 = np.array(movieTest['rating'])

Wall time: 3min 37s


Calculate the RSME

In [9]:
rmse2 = rmse_filtered(y_hat2, y_true2)
print("RMSE for PCC user-based collaborative:%f" %rmse2)

RMSE for PCC user-based collaborative:1.038146


# Question 3

Import data and join table to get the movie name

In [None]:
m_cols = ['item_id', 'movie_title', 'release_date', 'video_release_date', 'URL', 'unknown', 'Action', 'Adventure', 'Animation',
         'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
          'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
df_movieinfo = pd.read_csv('u.item',  sep='|', names=m_cols, encoding='latin-1')

# creating a new copy so that any changes to original data 'u.item' will not affect the train and test set below
full_movie = df_movieinfo.loc[:, ['item_id', 'movie_title']]
full_movie.columns = ['item_id', 'movie_title']
full_movie['item_id'] = pd.to_numeric(full_movie['item_id'])

movie_mergedf = pd.merge(movieTest, full_movie, on='item_id', how='left')

Generate a list of 10 random users

In [None]:
user_pool = random.sample(set(movieTest.user_id), 10)
print(user_pool)

In [None]:
train_user_mean = movieTrain.groupby('user_id', sort=True).agg(rating_mean=('rating', 'mean'))
pearson_mat_all = IUM_train.corr(method='pearson')

In [None]:
%%time
x = int(input("User ID: "))

# indexing
idx = movie_mergedf[movie_mergedf['user_id'] == x]

# pairs of userid and movieid of the target user in the testing set
UI_pairs_target = zip(idx.user_id.values, idx.item_id.values, idx.movie_title.values, idx.rating.values)

y_hat3 = []
movie_title = []
movie_ID = []
y_true3 = []

for j,k,l,m in UI_pairs_target:
    y_hat3.append(round(prediction(j, k, IUM_train, train_user_mean, pearson_mat_all)))
    y_true3.append(float(m))
    movie_ID.append(k)
    movie_title.append(l)
    

d = {'movie_ID':movie_ID, 'movie_title': movie_title, 'predicted_rating': y_hat3, 'actual_rating': y_true3}

# df of matching movie_title
tabular = pd.DataFrame(data=d).sort_values(by='predicted_rating', ascending=False).head(3)


print("\nTop 3 movies of user with User ID = {0}:\n".format(x))
print(tabular)


rmse3 = rmse_filtered(y_hat3, y_true3)
print("\nRMSE for User {0}: {1}".format(x, rmse3))

# Question 4

Using the same training and testing set as above.
Like Question 1, we set the minimum number of ratings = 20 for each movie.

In [None]:
IUM_train_new = movieTrain.pivot_table(index='item_id', columns='user_id', values='rating')

# conditional indexing
idx1 = train_mean_filtered['item_id'].values

# filtered user-item matrix, with item_id as index
IUM_train_new = IUM_train_new.loc[idx1]

# UIM_train_new = IUM_train_new.T

Creating cosine similarity matrix

In [None]:
sparse_IUM = csr_matrix(IUM_train_new.fillna(0))
item_sim = cosine_similarity(sparse_IUM)
item_sim_df = pd.DataFrame(item_sim, columns=IUM_train_new.index, index=IUM_train_new.index)

In [None]:
def prediction1(userid, itemid, IUM, w=20):
    if itemid in IUM.index:
        # items rated by user in the training data set
        idxx = IUM.loc[:, userid].dropna().index
        cosine_df_0 = item_sim_df.loc[item_sim_df.index.isin(idxx)]
        
        
        cosine_df = pd.DataFrame(cosine_df_0[itemid].sort_values(ascending=False)) 
        cosine_df.reset_index(level=0, inplace=True)
        cosine_df.columns = ['item_id','similarity']

        # getting similarities of w nearest items, excluding the target item itself
        cosine_df = cosine_df[cosine_df['item_id'] != itemid].head(w)
        
        cosine_sim = cosine_df['similarity'].values
        sum_cosine_sim = cosine_sim.sum()
        
        # getting actual ratings of w nearest items
        cosine_rating = IUM.loc[list(cosine_df['item_id']), userid]
        
        #predicted ratings
        y_hat4 = np.dot(cosine_sim, cosine_rating) / sum_cosine_sim
        
        return y_hat4
    else:
        return 3.0

In [None]:
%%time

# w nearest items
w=20


UI_pairs_new = zip(movieTest['user_id'].values, movieTest['item_id'].values)
y_hat4 = []

for j,k in UI_pairs_new:
    y_hat4.append(prediction1(j, k, IUM_train_new, w))

y_hat4 = np.array(y_hat4)
y_true4 = np.array(movieTest['rating'])

Calculate the RMSE

In [None]:
rmse4 = rmse_filtered(y_hat4, y_true4)
print("RMSE for item-based (cosine similarity) collaborative filtering:%f" %rmse4)