# Data Engineering - Collaborative Filtering

I did not have any success with my previous approach, so now I will try Collaborative Filtering approach.

In [1]:
import pandas as pd
import sys
import os
sys.path.append("../")
from definitions import ROOT_DIR
import numpy as np

In [2]:
data_folder = os.path.join(ROOT_DIR, 'data/interim')

In [3]:
data = pd.read_csv(os.path.join(data_folder, 'data.csv'))
films = pd.read_csv(os.path.join(data_folder, 'films.csv'))
user = pd.read_csv(os.path.join(data_folder, 'user.csv'))

## Basic User-User and Item-Item

In [46]:
data_no_ts = data.drop('timestamp', axis=1)
sampled = data_no_ts.sample(n=int(data.shape[0] * 0.7))

# 0.1 runs 0.3 seconds
# 0.2 runs 1.3 seconds
# 0.3 runs 3.2 seconds
# 0.4 runs 12.2 seconds
# 0.5 runs 45.5 seconds
# 0.6 runs 138.3 seconds ~ 2 minutes
# 0.7 runs 511.7 seconds ~ 8 minutes
# 0.8 runs 1893.2 seconds ~ 30 minutes
# 0.9 runs 7005.3 seconds ~ 116 minutes
# 1.0 runs 21015.9 seconds ~ 350 minutes

In [47]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(sampled, test_size=0.2)

train_data_matrix = train_data.values
test_data_matrix = test_data

In [48]:
from sklearn.metrics.pairwise import pairwise_distances

user_correlation = 1 - pairwise_distances(train_data, metric='correlation', n_jobs=1)
user_correlation[np.isnan(user_correlation)] = 0

item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation', n_jobs=1)
item_correlation[np.isnan(item_correlation)] = 0

In [49]:
# Function to predict ratings
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # Use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [50]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))
# Predict ratings on the training data with both similarity score
user_prediction = predict(train_data_matrix, user_correlation, type='user')
item_prediction = predict(train_data_matrix, item_correlation, type='item')
# RMSE on the train data
print('User-based CF RMSE: ' + str(rmse(user_prediction, train_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, train_data_matrix)))

User-based CF RMSE: 140.19990292628052
Item-based CF RMSE: 71.1057482132197


In [55]:
train_data_matrix

array([[ 42, 999,   4],
       [592, 482,   4],
       [181,   3,   2],
       ...,
       [846,  12,   5],
       [320, 368,   3],
       [897, 429,   5]])

In [51]:
user_prediction

array([[ 288.72855902,  634.18943222,  122.08200876],
       [ 546.88854744,  471.01776824,   60.09368432],
       [ 343.9601261 ,    1.81789451, -159.77802061],
       ...,
       [ 569.60240417,  228.12700244,   65.27059339],
       [ 378.82982138,  385.92406849,  -73.75388986],
       [ 683.53256722,  496.09511678,  151.372316  ]])

## SVD

Now I need to create a new rating matrix, where rows represent different films and cols represent users. Basically this is a matrix, which maps users ratings to movies.

In [17]:
ratings_mat = np.ndarray(
    shape=(np.max(data.item_id.values), np.max(data.user_id.values)),
    dtype=np.uint8
)
ratings_mat[data.item_id.values-1, data.user_id.values-1] = data.rating.values
ratings_mat.shape

(1682, 943)

Below I am normalizing this matrix and computing its Singular Value Decomposition (SVD).

In [9]:
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
U, S, V = np.linalg.svd(A)

Function to calculate cosine similarity of films.

In [10]:
def top_cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1 # Movie id starts from 1 in the dataset
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

Function to print top N similar movies.

In [14]:
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
    movie_data[movie_data.movie_id == movie_id]['movie title'].values[0]))
    for id in top_indexes + 1:
        print(movie_data[movie_data.movie_id == id]['movie title'].values[0])

In [15]:
k = 50
movie_id = 10 # (getting an id from movies.dat)
top_n = 10
sliced = V.T[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_id, top_n)

print_similar_movies(films, movie_id, indexes)

Recommendations for Richard III (1995): 

Richard III (1995)
Twelfth Night (1996)
Losing Chase (1996)
Convent, The (Convento, O) (1995)
Angels and Insects (1995)
Othello (1995)
Restoration (1995)
In the Bleak Midwinter (1995)
Kansas City (1996)
Haunted World of Edward D. Wood Jr., The (1995)
