In [1]:
import numpy as np
import pandas as pd

# load the data
import pickle

# import time
import time

In [2]:
# load the  data

with open('../data/user2movie.pkl', 'rb') as f:
  user2movie = pickle.load(f)

with open('../data/movie2user.pkl', 'rb') as f:
  movie2user = pickle.load(f)

with open('../data/usermovie2rating.pkl', 'rb') as f:
  usermovie2rating = pickle.load(f)


# test data

with open('../data/user2movie_test.pkl', 'rb') as f:
  user2movie_test = pickle.load(f)

with open('../data/movie2user_test.pkl', 'rb') as f:
  movie2user_test = pickle.load(f)


with open('../data/usermovie2rating_test.pkl', 'rb') as f:
  usermovie2rating_test = pickle.load(f)

## average rating of each Item

In [3]:
%%time
# create dictonary with user as key and average rating as value
avg_rating_for_movie = {}
for i in movie2user:
    
    list_users = movie2user[i]
    ratings = []
    
    for j in list_users:
        ratings.append(usermovie2rating[(j , i)])
        
    avg_rating = sum(ratings) / len(ratings)
    avg_rating_for_movie[i] = avg_rating

CPU times: user 44.5 ms, sys: 477 µs, total: 45 ms
Wall time: 45.5 ms


## The baseline model

In [4]:
# use mse as the metric

# calculate mse from 2 dictonarys
def calc_mse(pred_dict , actual_dict):
    pred_keys = list(pred_dict.keys())
    pred_list = []
    actual_list = []
    
    for i in pred_keys:
        actual_list.append(actual_dict[i])
        pred_list.append(pred_dict[i])
    pred_array = np.asarray(pred_list)
    actual_array = np.asarray(actual_list)
    
    mse = np.sum((pred_array - actual_array)**2) / len(pred_array)
    return mse

In [5]:
# the baseline model will be using the average rating as the rating for each user

# store items in dictionary

def baseline(movie2user = movie2user , usermovie2rating = usermovie2rating):
    baseline_predictions = {}
    num_of_movies = len(user2movie.keys())
    for i in range(num_of_movies):
        users = movie2user[i]
        for j in users:
            baseline_predictions[(j , i)] = avg_rating_for_movie[i]
    return baseline_predictions

In [6]:
train_baseline_predictions = baseline(movie2user= movie2user , usermovie2rating = usermovie2rating)
test_baseline_predictions = baseline(movie2user= movie2user_test , usermovie2rating = usermovie2rating_test)

In [7]:
calc_mse(pred_dict=train_baseline_predictions , actual_dict=usermovie2rating)

0.8159394158738645

In [8]:
calc_mse(pred_dict= test_baseline_predictions , actual_dict= usermovie2rating_test)

0.8316568077899196

## The weights for item - item colabarative filtering

The weights are given by:

$$ w_{jj'} = \frac{\sum_{i \in \psi_{jj'}} (r_{ij} -\bar{r_{j}})(r_{ij'} -\bar{r_{j'}}) }{\sqrt{\sum_{i \in \psi_{jj'}}(r_{ij} - \bar{r}_{j})^{2}}   \sqrt{\sum_{i \in \psi_{jj'}} (r_{ij'} - \bar{r}_{j'})^{2} } } $$

$w_{jj'}$ - the weights between 2 movies

$\psi_{jj'}$ - set of users who watched by both j and j'

$r_{j'i}$ - rating user i gave rating j'

$r_{ij}$ - rating user i gave rating moive j

$\bar{r_{i}}$ - average rating of movie j

$\bar{r_{i'}}$ - average rating of movie j'

In [9]:
# some helper functions

# generates an array of ratigns
def array_of_ratings (movies , shared_users):
    my_list = []
    for i in shared_users:
        my_list.append(usermovie2rating[i , movies])
    return np.asarray(my_list)

# formula for calculating weights
def calculate_weights(dev_0 , dev_1):
    return np.dot(dev_0 , dev_1) / (np.sqrt(np.sum(dev_0**2)) * np.sqrt(np.sum(dev_1**2)))

# returns only the 25 most relavant key value pairs for the dictionary
def filter_dictonary(my_dict , num = 25):
    my_series = pd.Series(my_dict)
    my_series2 = my_series.copy()    
    # decided by larged modulo value as negative values have equal use to positive values
    index = my_series.apply(lambda x : np.abs(x)).sort_values(ascending = False).head(num).index.to_list()
    return my_series.loc[index].to_dict()

In [10]:
#total complexity O(M^2 * N) where N is number of users and M is number of movies
start = time.time()

#num of movies
movies = len(movie2user.keys())
#keep all values stored inside a dictionary
all_movie_weights = {}

#number of common users required
l = 5
for m in range(movies): #O(M)

    # weights dictionary
    weights_dict = {}
    users_m = set(movie2user[m])
    for j in movie2user: #O(M)
        if j is not m:
            users_j = set(movie2user[j]) #O(N)
            shared_users = users_j & users_m

            if len(shared_users) >= l:
            
                # ratings of shared movies by both users
                arr_0_ratings = array_of_ratings(m , shared_users)
                arr_1_ratings = array_of_ratings(j , shared_users)
            
                # average ratings by both users
                avg_r0 = avg_rating_for_movie[m]
                avg_r1 = avg_rating_for_movie[j]

                weights_dict[j] = calculate_weights(arr_1_ratings - avg_r1 , arr_0_ratings - avg_r0)
    sorted_dict = filter_dictonary(weights_dict)
    all_movie_weights[m] = sorted_dict
    
    # keep track of where we are at during while generating the weights
#     if (m+1) % 200 == 0:
#         print('number of cycles passed:' , m + 1)
#         print('time taken so far is ',  time.time() - start , 'seconds')
    
    
end = time.time()
print('total time' , end - start , 'in seconds')

total time 13.275274276733398 in seconds


In [11]:
# save dictionary as it takes a long time to generate again

with open('../data/item_item_weights.pickle', 'wb') as f:
    pickle.dump(all_movie_weights, f)

## Item-Item Colabarative Filtering

$$ \hat{dev}(i , j) = \frac{ \sum_{i' \in \Omega_{j}}  w_{ii'}( r(i' , j) - \bar{r_{i'}})}{\sum_{i' \in \Omega_{i'}}|w_{ii'}| } $$

$$ s(i , j) = \bar{r_{i}} + \hat{dev}( i , j ) $$

s(i , j) - rating user i gives to item j

$\bar{r_{j}}$ - average rating of movie j

$\hat{dev(i , j)}$ - weighted deviation of movie j other users i'

$\hat{r(i , j')}$ - rating user i gave to item j'

$\bar{r_{j'}}$ - average rating of movie j'

$w_{jj'}$ - weight between movie j and j'

$\Omega_{j'}$ - all relavant movies with relavant shared user

In [12]:
# now we need to make predictions on the data we have

def make_predictions(all_movie_weights = all_movie_weights , user2movie = user2movie 
                                , movie2user = movie2user, usermovie2rating = usermovie2rating):
    # movies
    movies = len(movie2user.keys())
    
    # keep all the predictions in a dictionary
    pred_movie_user2rating = {}
    
    for m in range(movies):
        
        #weights of each user
        movies_weights = all_movie_weights[m]
        
        # all movies watched by our relavant user
        users = movie2user[m]
        
        for u in users:

            # find common movies
            common_movies = set(movies_weights.keys())
            
            # set of all users who have rated move m
            all_movies_user_u = set(user2movie[u])


            # set of users who have rated moive m and have a weight in dictionary
            relavant_movies = all_movies_user_u & common_movies
            
            denominator = 0
            numerator = 0
            for i in relavant_movies:
                numerator += all_movie_weights[m][i] * (usermovie2rating[(u , i)] - avg_rating_for_movie[i])
                denominator += float(np.abs(all_movie_weights[m][i]))
            
            if int(denominator) != 0:
                pred_movie_user2rating[(u , m)] = avg_rating_for_movie[m] +  numerator/denominator
            else:
                pred_movie_user2rating[( u , m)] = avg_rating_for_movie[m]
    
    return pred_movie_user2rating

In [13]:
%%time
# make predictions from train data
train_data = make_predictions()

CPU times: user 2.21 s, sys: 0 ns, total: 2.21 s
Wall time: 2.21 s


In [14]:
# test data 2
test_data = make_predictions(movie2user = movie2user_test)
len(test_data)

15406

In [15]:
# evaluate train by mse
calc_mse(pred_dict= train_data  , actual_dict= usermovie2rating)

0.4388296147947832

In [16]:
# evaluate test by mse
calc_mse(pred_dict= test_data  , actual_dict= usermovie2rating_test)

0.5829560450628802

## Predicting on new data

In [17]:
# now we need to predict a set of ratings for each user for movies they have not watched

#number of movies
set_of_movies = set(movie2user.keys())

In [18]:
unrated_movie2user = {}
for key in movie2user:
    movies_watched_train = set(movie2user[key])
    movies_watched_test = set(movie2user_test[key])
    movies_not_watched = set_of_movies - (movies_watched_train | movies_watched_test)
    for i in movies_not_watched:
        if key not in unrated_movie2user:
            unrated_movie2user[key] = [i]
        else:
            unrated_movie2user[key].append(i)

In [19]:
# predictions on unwatched moives
predictions = make_predictions(movie2user = unrated_movie2user)

In [20]:
len(predictions)

12970