## 4.2 - Building a Movie Recommendation System

#### Get the data

In [None]:
# Relative paths to the ml-100k dataset files.
# u.data holds the ratings (tab-separated); u.item holds the movie metadata (pipe-separated).
ratings_data = "../../../data/ml-100k/u.data"
movies_data = "../../../data/ml-100k/u.item"

In [None]:
from collections import defaultdict

# Two views of the same ratings, keyed by the string IDs exactly as read from the file:
#   user_ratings[user][movie]  -> rating (float)
#   movie_ratings[movie][user] -> rating (float)
user_ratings = defaultdict(dict)
movie_ratings = defaultdict(dict)

with open(ratings_data, 'r') as f:
    for line in f:
        # Each line must have exactly four tab-separated fields; the fourth is discarded
        user, movie, stars, _ = line.split('\t')
        user_ratings[user][movie] = float(stars)
        movie_ratings[movie][user] = float(stars)

In [None]:
len(user_ratings)

In [None]:
len(movie_ratings)

In [None]:
user_ratings["1"]  # userID = 1

In [None]:
# Map movie ID (string) -> title, taken from the first two pipe-separated fields of each line
movies = {}
with open(movies_data, 'r', encoding="latin-1") as f:  # decoded as latin-1 rather than the platform default
    for line in f:
        movie_id, title, *_ = line.split('|')  # any remaining fields are ignored
        movies[movie_id] = title
        
len(movies)

In [None]:
movies["127"], movies["187"], movies["29"]  # movie ID = 127, 187, 29

In [None]:
movie_ratings["127"]

In [None]:
sum(movie_ratings["127"].values()) / len(movie_ratings["127"])

In [None]:
import pandas as pd
import numpy as np

# The ratings file has no header row, so the column names are supplied here
ratings = pd.read_csv(ratings_data, sep='\t', names=['user', 'movie', 'rating', 'timestamp'])

ratings.head()

In [None]:
ratings.shape

In [None]:
# NOTE: .shape of the 1-D array returned by unique() is a 1-tuple,
# so n_movies is (count,), not a plain int
n_movies = ratings["movie"].unique().shape
n_movies

In [None]:
# NOTE: .shape of the 1-D array returned by unique() is a 1-tuple,
# so n_users is (count,), not a plain int
n_users = ratings["user"].unique().shape
n_users

In [None]:
# Dense user x movie matrix, initialised to 0.0 (a zero entry means "no rating").
# Sized by the largest user and movie IDs, so an ID of k maps to row/column k-1.
data_matrix = np.zeros((ratings.user.max(), ratings.movie.max()))

In [None]:
# Write every observed rating in a single vectorized assignment instead of a
# Python-level loop over all rows of `ratings`. IDs are 1-based, hence the -1
# shift to 0-based row/column positions.
# Assumes each (user, movie) pair occurs at most once in `ratings`.
data_matrix[ratings["user"].values - 1, ratings["movie"].values - 1] = ratings["rating"].values

In [None]:
data_matrix

In [None]:
data_matrix.shape

#### Distance / Similarity

https://en.wikipedia.org/wiki/Euclidean_distance

$\mbox{euclidean}(x, y) = \big{|}\big{|} x - y \big{|}\big{|}_{2} = \sqrt{\sum_{i=1}^{n} (x_{i} - y_{i})^{2}}$

https://en.wikipedia.org/wiki/Cosine_similarity

$\mbox{cosine}(x, y) = 1 - \frac{x \cdot y}{|| x ||_{2} || y ||_{2}}$, i.e. the cosine distance (one minus the cosine similarity)

In [None]:
from scipy.spatial.distance import cosine

# scipy's `cosine` is a distance (1 - cosine similarity): smaller means more alike.
# Columns of data_matrix are movies, so this compares two movies' rating vectors.
cosine(data_matrix[:, 126], data_matrix[:, 186])  # Godfather vs Godfather II

In [None]:
cosine(data_matrix[:, 126], data_matrix[:, 28])  # Godfather vs Batman Forever

In [None]:
cosine(data_matrix[0, :], data_matrix[2, :])  # user 1 vs user 3

In [None]:
cosine(data_matrix[0, :], data_matrix[915, :])  # user 1 vs user 916

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): rows of data_matrix are users, so this holds out 20% of the
# users (entire rows), not 20% of the individual ratings. No random_state is
# passed, so the split differs from run to run.
train_data, test_data = train_test_split(data_matrix, test_size=0.2)

In [None]:
train_data.shape, test_data.shape

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# Pairwise cosine distances between rows: users as-is, movies via the transpose
user_distance = pairwise_distances(train_data, metric='cosine')
item_distance = pairwise_distances(train_data.T, metric='cosine')

In [None]:
user_distance

In [None]:
# Convert cosine distance back into cosine similarity
user_similarity = 1 - user_distance
item_similarity = 1 - item_distance

In [None]:
user_similarity.shape, item_similarity.shape

In [None]:
train_data.shape

#### Prediction

$r_{u,i}$ = rating user u gave to item i

$\hat{r}_{u,i}$ = rating prediction for user u and item i

$\mbox{sim}(u, v)$ = similarity between user u and user v

$\hat{r}_{u,i} = \frac{\sum_{v} \mbox{sim}(u, v)r_{v,i}}{\sum_{v} \big{|}\mbox{sim}(u, v)\big{|}}$

In [None]:
def make_user_prediction(data, u_similarity):
    """Predict ratings as a similarity-weighted average over users.

    Args:
        data: 2-D array of ratings, one row per user and one column per item.
        u_similarity: square 2-D array whose entry [u, v] is sim(u, v).

    Returns:
        Array shaped like ``data`` whose entry [u, i] is
        sum_v sim(u, v) * r[v, i] / sum_v |sim(u, v)|.
    """
    weighted_ratings = u_similarity.dot(data)
    # One normaliser per user, kept as a column so it divides row-wise.
    total_weight = np.abs(u_similarity).sum(axis=1, keepdims=True)
    return weighted_ratings / total_weight

def make_item_prediction(data, i_similarity):
    """Predict ratings as a similarity-weighted average over items.

    Args:
        data: 2-D array of ratings, one row per user and one column per item.
        i_similarity: square 2-D array whose entry [j, i] is sim(j, i).

    Returns:
        Array shaped like ``data`` whose entry [u, i] is
        sum_j r[u, j] * sim(j, i) / sum_j |sim(j, i)|.
    """
    weighted_ratings = data.dot(i_similarity)
    # Column i of the product is weighted by column i of the similarity
    # matrix, so the matching normaliser is the column sum. For a symmetric
    # matrix this equals the row sum; for an asymmetric one only the column
    # sum yields a true weighted average.
    total_weight = np.abs(i_similarity).sum(axis=0, keepdims=True)
    return weighted_ratings / total_weight

user_pred = make_user_prediction(train_data, user_similarity)
item_pred = make_item_prediction(train_data, item_similarity)

In [None]:
user_pred.shape

In [None]:
item_pred.shape

In [None]:
from sklearn.metrics import mean_squared_error

def matrix_mse(prediction, actual):
    """Mean squared error over the observed (non-zero) entries of ``actual``.

    Args:
        prediction: 2-D array of predicted ratings, indexed at the same
            positions as ``actual``.
        actual: 2-D array of true ratings in which 0 means "not rated".

    Returns:
        The MSE computed only at positions where ``actual`` is non-zero.
    """
    # Compute the index of rated entries once and reuse it for both arrays;
    # indexing with it already yields 1-D arrays.
    observed = actual.nonzero()
    # scikit-learn's signature is (y_true, y_pred): ground truth goes first.
    return mean_squared_error(actual[observed], prediction[observed])

# NOTE(review): user_pred was computed from train_data, so this is the error on
# the same ratings the predictions were built from, not held-out error.
matrix_mse(user_pred, train_data)

In [None]:
# NOTE(review): item_pred was computed from train_data, so this is the error on
# the same ratings the predictions were built from, not held-out error.
matrix_mse(item_pred, train_data)