# Lab 8: Recommender System

In this assignment, we will study how to do user-based collaborative filtering and item-based collaborative filtering. 

## 1. Dataset

In this assignment, we will use MovieLens-100K dataset. It includes about 100,000 ratings from 1000 users on 1700 movies.  

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from typing import Tuple


# 1. load data
user_ratings_train = pd.read_csv('./ml-100k/u1.base',
                                 sep='\t', names=['user_id', 'movie_id', 'rating'], usecols=[0, 1, 2])

user_ratings_test = pd.read_csv('./ml-100k/u1.test',
                                sep='\t', names=['user_id', 'movie_id', 'rating'], usecols=[0, 1, 2])

movie_info = pd.read_csv('./ml-100k/u.item',
                         sep='|', names=['movie_id', 'title'], usecols=[0, 1],
                         encoding="ISO-8859-1")

user_ratings_train = pd.merge(movie_info, user_ratings_train)
user_ratings_test = pd.merge(movie_info, user_ratings_test)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
user_ratings_train = user_ratings_train.pivot_table(index=['user_id'],
                                                    columns=['title'],
                                                    values='rating')

user_ratings_test = user_ratings_test.pivot_table(index=['user_id'],
                                                  columns=['title'],
                                                  values='rating')


user_ratings_train = user_ratings_train.reindex(
    index=user_ratings_train.index.union(user_ratings_test.index),
    columns=user_ratings_train.columns.union(user_ratings_test.columns))

user_ratings_test = user_ratings_test.reindex(
    index=user_ratings_train.index.union(user_ratings_test.index),
    columns=user_ratings_train.columns.union(user_ratings_test.columns))

print(user_ratings_train.shape)
print(user_ratings_test.shape)


(943, 1664)
(943, 1664)


## Define shared methods

In [2]:
def calculate_distances(minperiods: int, nneighbors: int, user_ratings: pd.DataFrame) -> pd.DataFrame:
    distances = user_ratings.transpose().corr(method='pearson', min_periods=minperiods)
    distances = distances.dropna(axis=0, thresh=nneighbors+1).dropna(axis=1, thresh=nneighbors+1)
    distances = distances.loc[distances.columns]
    distances = 1 - distances
    distances[distances < 0] = 0
    return distances


In [3]:
def create_model(nneighbors: int, metric = 'precomputed') -> NearestNeighbors:
    model = NearestNeighbors(metric=metric,
                             algorithm='brute',
                             n_neighbors=nneighbors,
                             n_jobs=-1)
    
    return model

In [4]:
def train_model(model: NearestNeighbors, distances: pd.DataFrame) -> NearestNeighbors:
    return model.fit(csr_matrix(distances.fillna(0).values))

In [5]:
def get_kneighbors(model: NearestNeighbors, distances: pd.DataFrame, nneighbors: int) -> Tuple[np.ndarray, np.ndarray]:
    similarity, indices = model.kneighbors(csr_matrix(distances.fillna(0).values),
                                           n_neighbors=nneighbors)

    return similarity, indices


In [6]:
def get_neighborhoods(distances: pd.DataFrame, similarity: np.ndarray, indices: np.ndarray) -> pd.DataFrame:
    neighborhoods = pd.DataFrame({'neighborhood_ids': [distances.iloc[neighbors].index.to_list() for neighbors in indices],
                                  'distance': similarity.tolist()},
                                 index=distances.index)

    return neighborhoods


In [7]:
def mae(user_ratings: pd.DataFrame, predicted_ratings: pd.DataFrame) -> float:
    abs_dif = abs(user_ratings - predicted_ratings)
    n = np.sum(user_ratings.count())
    return abs_dif.sum().sum() / n


## Task 1. User-based CF

* Use pearson correlation to get the similarity between different users.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [8]:
def usercf_predict_rating_closure(user_ratings: pd.DataFrame, neighborhoods: pd.DataFrame):
    def usercf_predict_rating_using_nmean(row):
        neighbors_ids = neighborhoods['neighborhood_ids'].loc[row.name]
        pred_ratings = user_ratings.loc[neighbors_ids].mean()
        return round(pred_ratings)
    
    return usercf_predict_rating_using_nmean

In [9]:
def usercf_make_predictions(user_ratings: pd.DataFrame, neighborhoods: int) -> pd.DataFrame:
    pred = user_ratings.reindex(neighborhoods.index).apply(usercf_predict_rating_closure(user_ratings,
                                                                                         neighborhoods),
                                                           axis='columns')

    return pred


In [10]:
def recommendation_system_user_model(minperiods: int, nneighbors: int, user_ratings: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    user_distances = calculate_distances(minperiods, nneighbors, user_ratings)
    model = create_model(nneighbors)
    train_model(model, user_distances)
    similarity, indices = get_kneighbors(model, user_distances, nneighbors)
    neighborhoods = get_neighborhoods(user_distances, similarity, indices)
    pred = usercf_make_predictions(user_ratings, neighborhoods)

    user_ratings.merge(pred, how='right')

    return pred, neighborhoods


Calculate MAE on test set using user-based CF with 10 neighbors:

In [11]:
pred, neighborhoods = recommendation_system_user_model(minperiods=5,
                                                       nneighbors=10,
                                                       user_ratings=user_ratings_train)
pred = pred.reindex(index=user_ratings_train.index)

mae(user_ratings_test, pred)



0.3987031195233088

## Task 2. Item-based CF
* Use cosine similarity to get the similarity between different items.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [12]:
def itemcf_predict_rating_closure(user_ratings: pd.DataFrame, neighborhoods: pd.DataFrame):
    def itemcf_predict_rating_using_nmean(column):
        neighbors_titles = neighborhoods.loc[column.name][0]
        return round(user_ratings[neighbors_titles].mean(axis=1))

    return itemcf_predict_rating_using_nmean


In [13]:
def itemcf_make_predictions(user_ratings: pd.DataFrame, neighborhoods: int) -> pd.DataFrame:
    pred = (user_ratings
            .reindex(columns=neighborhoods.index)
            .apply(itemcf_predict_rating_closure(user_ratings,
                                                 neighborhoods)))

    return pred


In [14]:
def recommendation_system_item_model(minperiods: int, nneighbors: int, user_ratings: pd.DataFrame):
    movie_distances = calculate_distances(minperiods, nneighbors, user_ratings.transpose())
    model = create_model(nneighbors, metric='cosine')
    train_model(model, movie_distances)
    similarity, indices = get_kneighbors(model, movie_distances, nneighbors)
    neighborhoods = get_neighborhoods(movie_distances, similarity, indices)
    pred = itemcf_make_predictions(user_ratings, neighborhoods)
    return pred, neighborhoods

Calculate MAE on test set using item-based CF with 10 neighbors:

In [15]:
pred, neighborhoods = recommendation_system_item_model(minperiods=10,
                                                       nneighbors=10,
                                                       user_ratings=user_ratings_train)
pred = pred.reindex(index=user_ratings_train.index)

mae(user_ratings_test, pred)

0.6202493615742827

## Acknowledgements

I heavily relied on these notebooks to complete this lab:

- https://www.kaggle.com/code/marfritz/user-neighborhood-based-cf
- https://www.kaggle.com/code/marfritz/item-neighborhood-based-cf