# Lab 10: Recommender System

In this assignment, we will study how to do user-based collaborative filtering and item-based collaborative filtering. 

## 1. Dataset

In this assignment, we will use MovieLens-100K dataset. It includes about 100,000 ratings from 1000 users on 1700 movies.  

In [1]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors


# 1. load data
user_ratings_train = pd.read_csv('./ml-100k/u1.base',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

user_ratings_test = pd.read_csv('./ml-100k/u1.test',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

movie_info =  pd.read_csv('./ml-100k/u.item', 
                          sep='|', names=['movie_id','title'], usecols=[0,1],
                          encoding="ISO-8859-1")

user_ratings_train = pd.merge(movie_info, user_ratings_train)
user_ratings_test = pd.merge(movie_info, user_ratings_test)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
user_ratings_train = user_ratings_train.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')

user_ratings_test = user_ratings_test.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')




user_ratings_train = user_ratings_train.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

user_ratings_test = user_ratings_test.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

print(user_ratings_train.shape)
print(user_ratings_test.shape)

(943, 1664)
(943, 1664)


## Task 1. User-based CF

* Use pearson correlation to get the similarity between different users.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
import sys, os
from contextlib import contextmanager

#pearson_sim = 1-pairwise_distances(user_ratings_train, metric="correlation")
#my_rho = np.corrcoef(user_ratings_train)

#print(my_rho)

user_correlations = user_ratings_train.transpose().corr()
user_correlations.fillna(0,inplace=True)
print(user_correlations.head())
closest_neighbors = pd.DataFrame()

for row in range(1, 944):
    closest_neighbors = closest_neighbors.append(user_correlations[row].argsort()[:10])
print(closest_neighbors)

user_id       1         2         3         4         5         6         7    \
user_id                                                                         
1        1.000000  0.060231  0.021759 -0.002824  0.146565  0.233708  0.121845   
2        0.060231  1.000000  0.034066  0.072088 -0.016823  0.157917  0.024915   
3        0.021759  0.034066  1.000000  0.136499  0.012301  0.057590 -0.027068   
4       -0.002824  0.072088  0.136499  1.000000 -0.001700 -0.022823  0.007401   
5        0.146565 -0.016823  0.012301 -0.001700  1.000000  0.112891  0.107598   

user_id       8         9         10   ...       934       935       936  \
user_id                                ...                                 
1        0.067226  0.040734  0.148499  ...  0.186978  0.053743  0.143976   
2        0.004220  0.097244  0.044572  ...  0.038910  0.282706  0.281954   
3        0.072306 -0.009917  0.002926  ...  0.006376  0.019473  0.107743   
4        0.035109 -0.007420  0.002225  ... -0.014407

In [10]:
from collections import Counter
import math

def knn(data, query, k, distance_fn, choice_fn):
    neighbor_distances_and_indices = []
    
    # 3. For each example in the data
    for index, example in enumerate(data):
        # 3.1 Calculate the distance between the query example and the current
        # example from the data.
        distance = distance_fn(example[:-1], query)
        
        # 3.2 Add the distance and the index of the example to an ordered collection
        neighbor_distances_and_indices.append((distance, index))
    
    # 4. Sort the ordered collection of distances and indices from
    # smallest to largest (in ascending order) by the distances
    sorted_neighbor_distances_and_indices = sorted(neighbor_distances_and_indices)
    
    # 5. Pick the first K entries from the sorted collection
    k_nearest_distances_and_indices = sorted_neighbor_distances_and_indices[:k]
    
    # 6. Get the labels of the selected K entries
    k_nearest_labels = [data[i][-1] for distance, i in k_nearest_distances_and_indices]

    # 7. If regression (choice_fn = mean), return the average of the K labels
    # 8. If classification (choice_fn = mode), return the mode of the K labels
    return k_nearest_distances_and_indices , choice_fn(k_nearest_labels)

def mean(labels):
    return sum(labels) / len(labels)

def mode(labels):
    return Counter(labels).most_common(1)[0][0]

def euclidean_distance(point1, point2):
    sum_squared_distance = 0
    for i in range(len(point1)):
        sum_squared_distance += math.pow(point1[i] - point2[i], 2)
    return math.sqrt(sum_squared_distance)

In [14]:
test_updated = user_ratings_test.transpose()
test_updated.fillna(0,inplace=True)
reg_k_nearest_neighbors, reg_prediction = knn(
        test_updated, user_correlations, k=5, distance_fn=euclidean_distance, choice_fn=lambda x: None)

TypeError: 'int' object is not subscriptable

In [22]:
from sklearn.neighbors import KNeighborsClassifier
myKNN = KNeighborsClassifier(n_neighbors = 5)
myKNN.fit(user_correlations)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Task 2. Item-based CF
* Use cosine similarity to get the similarity between different items.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [5]:
# cosine similarity
item_corrs = user_ratings_train.transpose()
item_corrs.fillna(0,inplace=True)
print(item_corrs)
temp_item_row = [];

for row in item_corrs:
    cos_sim=np.dot(row,temp_item_row)/(np.linalg.norm(row)*np.linalg.norm(temp_item_row))
    temp_item_row = row
    print(cos_sim)

user_id                                1    2    3    4    5    6    7    8    \
title                                                                           
'Til There Was You (1997)              0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
1-900 (1994)                           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
101 Dalmatians (1996)                  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
12 Angry Men (1957)                    5.0  0.0  0.0  0.0  0.0  4.0  4.0  0.0   
187 (1997)                             0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
...                                    ...  ...  ...  ...  ...  ...  ...  ...   
Young Guns II (1990)                   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
Young Poisoner's Handbook, The (1995)  0.0  0.0  0.0  0.0  0.0  0.0  3.0  0.0   
Zeus and Roxanne (1997)                0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
unknown                                0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
Á köldum klaka (Cold Fever) 

In [25]:
importing surprise package and builtin data
from surprise import Dataset, evaluate
from suprise import KNNBasic
from collections import defaultdict


# loading data
trainingSet = user_ratings_train
trainingSet

# cosine similarity between 2 vectors
sim_options = {
    'name': 'cosine',
    'user_based': False
}
knn = KNNBasic(sim_options=sim_options)

# training the model
knn.train(trainingSet)

# movie recommendations for users
testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)

#top three movie recommendations for each user.
 
def get_top5_recommendations(predictions, topN = 5):
     
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
     
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
     
    return top_recs

# in data we have movieid's.read_item_names maps each movie’s ID to its name.
import os, io
 
def read_item_names():
    """Read the u.item file from MovieLens 100-k dataset and returns a
    mapping to convert raw ids into movie names.
    """
 
    file_name = (os.path.expanduser('~') +
                 '/.surprise_data/ml-100k/ml-100k/u.item')
    rid_to_name = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
 
    return rid_to_name

# top 3 recommendations
top5_recommendations = get_top5_recommendations(predictions)
rid_to_name = read_item_names()
for uid, user_ratings in top5_recommendations.items():
    print(uid, [rid_to_name[iid] for (iid, _) in user_ratings])

ModuleNotFoundError: No module named 'suprise'