In [1]:
import numpy as np
import pandas as pd
import pickle
import time

In [2]:
# load the  data

with open('../data/user2movie.pkl', 'rb') as f:
  user2movie = pickle.load(f)

with open('../data/movie2user.pkl', 'rb') as f:
  movie2user = pickle.load(f)

with open('../data/usermovie2rating.pkl', 'rb') as f:
  usermovie2rating = pickle.load(f)


# test data

with open('../data/user2movie_test.pkl', 'rb') as f:
  user2movie_test = pickle.load(f)

with open('../data/movie2user_test.pkl', 'rb') as f:
  movie2user_test = pickle.load(f)


with open('../data/usermovie2rating_test.pkl', 'rb') as f:
  usermovie2rating_test = pickle.load(f)

## average rating of each user

In [3]:
%%time
# create dictonary with user as key and average rating as value
avg_rating_for_user = {}
for i in user2movie:
    
    list_movies = user2movie[i]
    ratings = []
    
    for j in list_movies:
        ratings.append(usermovie2rating[(i , j)])
        
    avg_rating = sum(ratings) / len(ratings)
    avg_rating_for_user[i] = avg_rating

CPU times: user 402 ms, sys: 1.31 ms, total: 403 ms
Wall time: 402 ms


## The baseline model

In [4]:
# use mse as the metric

# calculate mse from 2 dictonarys
def calc_mse(pred_dict , actual_dict):
    pred_keys = list(pred_dict.keys())
    pred_list = []
    actual_list = []
    
    for i in pred_keys:
        actual_list.append(actual_dict[i])
        pred_list.append(pred_dict[i])
    pred_array = np.asarray(pred_list)
    actual_array = np.asarray(actual_list)
    
    mse = np.sum((pred_array - actual_array)**2) / len(pred_array)
    return mse

In [5]:
# the baseline model will be using the average rating as the rating for each user

# store items in dictionary

def baseline(user2movie = user2movie , usermovie2rating = usermovie2rating):
    baseline_predictions = {}
    num_of_user = len(user2movie.keys())
    for i in range(num_of_user):
        movies = user2movie[i]
        for j in movies:
            baseline_predictions[(i , j)] = avg_rating_for_user[i]
    return baseline_predictions

In [6]:
train_baseline_predictions = baseline(user2movie= user2movie , usermovie2rating = usermovie2rating)
test_baseline_predictions = baseline(user2movie= user2movie_test , usermovie2rating = usermovie2rating_test)

In [7]:
# mse on the train set
calc_mse(pred_dict=train_baseline_predictions , actual_dict=usermovie2rating)

0.8679885735783192

In [8]:
# mse on the test set
calc_mse(pred_dict= test_baseline_predictions , actual_dict= usermovie2rating_test)

0.8732110245400387

## The weights for colabarative filtering

The weights are given by:

$$ w_{ii'} = \frac{\sum_{j \in \psi_{ii'}} (r_{ij} -\bar{r_{i}})(r_{i'j} -\bar{r_{i'}}) }{\sqrt{\sum_{j \in \psi_{ii'}}(r_{ij} - \bar{r}_{i})^{2}}   \sqrt{\sum_{j \in \psi_{ii'}} (r_{i'j} - \bar{r}_{i'})^{2} } } $$

$w_{ii'}$ - the weights

$\psi_{ii'}$ - set of movies rated by both i and i'

$r_{i'j}$ - rating user i' gave rating j

$r_{ij}$ - rating user i gave rating j

$\bar{r_{i}}$ - average rating of user i

$\bar{r_{i'}}$ - average rating of user i'

In [9]:
# some helper functions

# generates an array of ratigns
def array_of_ratings (user , shared_movies):
    my_list = []
    for i in shared_movies:
        my_list.append(usermovie2rating[user , i])
    return np.asarray(my_list)

# formula for calculating weights
def calculate_weights(dev_0 , dev_1):
    return np.dot(dev_0 , dev_1) / (np.sqrt(np.sum(dev_0**2)) * np.sqrt(np.sum(dev_1**2)))

# returns only the 25 most relavant key value pairs for the dictionary
def filter_dictonary(my_dict , num = 25):
    my_series = pd.Series(my_dict)
    my_series2 = my_series.copy()    
    # decided by larged modulo value as negative values have equal use to positive values
    index = my_series.apply(lambda x : np.abs(x)).sort_values(ascending = False).head(num).index.to_list()
    return my_series.loc[index].to_dict()

In [10]:
#total complexity O(N^2 M) where N is number of users and M is number of movies
start = time.time()

#num of users
users = len(user2movie.keys())
#keep all values stored inside a dictionary
all_user_weights = {}
#length of common movies required
l = 5
for u in range(users): #O(N)

    # weights dictionary
    weights_dict = {}
    movies_u = set(user2movie[u])
    for i in user2movie: #O(N)
        if i is not u:
            movies_i = set(user2movie[i]) #O(M)
            shared_movies = movies_i & movies_u

            if len(shared_movies) >= l:
            
                # ratings of shared movies by both users
                arr_0_ratings = array_of_ratings(u , shared_movies)
                arr_1_ratings = array_of_ratings(i , shared_movies)
            
                # average ratings by both users
                avg_r0 = avg_rating_for_user[u]
                avg_r1 = avg_rating_for_user[i]

                weights_dict[i] = calculate_weights(arr_1_ratings - avg_r1 , arr_0_ratings - avg_r0)
    sorted_dict = filter_dictonary(weights_dict)
    all_user_weights[u] = sorted_dict
    
    # keep track of where we are at during while generating the weights
    if (u+1) % 200 == 0:
        print('number of cycles passed:' , u + 1)
        print('time taken so far is ',  time.time() - start , 'seconds')
    
    
end = time.time()
print('total time' , end - start , 'in seconds')

number of cycles passed: 200
time taken so far is  63.65066123008728 seconds
number of cycles passed: 400
time taken so far is  132.45628023147583 seconds
number of cycles passed: 600
time taken so far is  203.9902799129486 seconds
number of cycles passed: 800
time taken so far is  270.407968044281 seconds
number of cycles passed: 1000
time taken so far is  336.7224073410034 seconds
total time 336.7225227355957 in seconds


In [11]:
# save the weights dictionary since data takes long time to generate

with open('../data/user_user_weights_saved_data.pkl', 'wb') as f:
    pickle.dump(all_user_weights, f)

## User-User Colabarative Filtering

$$ \hat{dev}(i , j) = \frac{ \sum_{i' \in \Omega_{j}}  w_{ii'}( r(i' , j) - \bar{r_{i'}})}{\sum_{i' \in \Omega_{i'}}|w_{ii'}| } $$

$$ s(i , j) = \bar{r_{i}} + \hat{dev}( i , j ) $$

s(i , j) - rating user i gives to item j

$\bar{r_{i}}$ - average rating of user i

$\hat{dev(i , j)}$ - weighted deviation of movie j other users i'

$\hat{r(i' , j)}$ - rating user i' gave to item j

$\bar{r_{i'}}$ - average rating of user i'

$w_{ii'}$ - weight between user i and i'

$\Omega_{i'}$ - all relavant users with relavant shared movie

In [12]:
# now we need to make predictions on the data we have

def make_predictions(all_user_weights = all_user_weights , user2movie = user2movie 
                                , movie2user = movie2user, usermovie2rating = usermovie2rating):
    # users
    users = len(user2movie.keys())
    
    # keep all the predictions in a dictionary
    pred_user_movie2rating = {}
    
    for u in range(users):
        
        #weights of each user
        user_weights = all_user_weights[u]
        
        # all movies watched by our relavant user
        movies = user2movie[u]
        
        for m in movies:

            # find common users
            common_users = set(user_weights.keys())
            
            # set of all users who have rated move m
            all_users_movie_m = set(movie2user[m])


            # set of users who have rated moive m and have a weight in dictionary
            relavant_users = all_users_movie_m & set(common_users)
            
            denominator = 0
            numerator = 0
            for i in relavant_users:
                numerator += all_user_weights[u][i] * (usermovie2rating[(i , m)] - avg_rating_for_user[i])
                denominator += float(np.abs(all_user_weights[u][i]))
            
            if int(denominator) != 0:
                pred_user_movie2rating[(u , m)] = avg_rating_for_user[u] +  numerator/denominator
            else:
                pred_user_movie2rating[( u , m)] = avg_rating_for_user[u]
    
    return pred_user_movie2rating

## Evaluating the model

In [13]:
%%time
# make predictions from train data
train_data = make_predictions()

CPU times: user 22.2 s, sys: 63.5 ms, total: 22.3 s
Wall time: 22.3 s


In [14]:
# test data 2
test_data = make_predictions(user2movie = user2movie_test)

In [15]:
# evaluate train by mse
calc_mse(pred_dict= train_data  , actual_dict= usermovie2rating)

0.4410769437772073

In [16]:
# evaluate test by mse
calc_mse(pred_dict= test_data  , actual_dict= usermovie2rating_test)

0.6300383317556393

## Predicting on new data

In [17]:
# now we need to predict a set of ratings for each user for movies they have not watched

#number of movies
set_of_movies = set(movie2user.keys())

In [18]:
unrated_user2movie = {}
for key in user2movie:
    movies_watched_train = set(user2movie[key])
    movies_watched_test = set(user2movie_test[key])
    movies_not_watched = set_of_movies - (movies_watched_train | movies_watched_test)
    for i in movies_not_watched:
        if key not in unrated_user2movie:
            unrated_user2movie[key] = [i]
        else:
            unrated_user2movie[key].append(i)

In [19]:
# predictions on unwatched moives
predictions = make_predictions(user2movie = unrated_user2movie)

In [20]:
len(predictions)

372887