In [217]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ratings2/ratings.csv


In [218]:
# installing `surprise` package

! pip install scikit-surprise



In [219]:
# imports

import pandas as pd
import numpy as np
import surprise
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD, accuracy
from collections import defaultdict

# Reading the data

In [220]:
file_path = '/kaggle/working/ratings_edit.csv'

# creating a Reader object
reader = Reader(line_format= 'user item rating timestamp', sep = ',', rating_scale = (0,5))

# importing the data
dat = Dataset.load_from_file(file_path, reader = reader)

# Performing train-test split

In [221]:
np.random.seed(10)
trainset, testset = train_test_split(dat,test_size = 0.2)

In [222]:
print('No. of users',trainset.n_users)
print('No. of items', trainset.n_items)
print('No. of ratings', trainset.n_ratings)

No. of users 7045
No. of items 20760
No. of ratings 838860


# Performing `Model Based Collaborative Filtering` now.

The procedure as we know is to compute matrices  **U (m x k)** and **M (n x k)** such that, utility matrix **R (m x n)** is as close to $U*M^t$ as possible. The MSE loss function can be used, with added L2 penalty on the size of vectors in M, U to avoid overfitting. We can use Stochastic Gradient Descent as the optimizer. This procedure of computing a matrix factorization of the utility matrix R in terms of **user matrix U** and **item matrix M** as a method to find the **missing ratings of R** is termed as `SVD Recommendation procedure`. In essence, it's not actually applying an SVD algorithm to the utility matrix (because SVD doesn't work with missing entries), but it's an **SVD inspired** algorithm.   

In [224]:
# initializing
SVD_model = SVD()

# fitting to trainset
SVD_model.fit(trainset)

# predicting on test set
predictions = SVD_model.test(testset)

In [225]:
# calculating RMSE
RMSE = accuracy.rmse(predictions)

RMSE: 0.8315


## RMSE of 0.8315 on test set

# To compute precision and recall
The problem is currently **not** in the framework of a **classification problem**. We are trying to **predict values** and **not classes**.
For computing **precision** and **recall**, we need to **convert** the problem into a **classification problem**.
<br><br>
Let's assume that of all the ratings in the original **utility matrix R**, a **rating > 4** (out of 5) indicates that the user **liked** the movie. And thus, **rating < 4** indicates that the user **didn't enjoy** the movie a lot. This **converts** our **utility matrix** into a matrix of **0s** and **1s**, 0s being ratings < 4 and 1s being ratings > 4. After computation of **R^hat** (using the procedure described earlier), we can similarly **convert** our **R_hat** into a matrix of **0s** and **1s** on the basis of **threshold of 4** (just for the purpose of computation of precision and recall). Then, the problem being a **typical clasification problem now**, precision and recall can be computed as follows: 

**Precison = True Positives / (False Postives + True Positives) <br>
Recall = True Positives/ (True Postives + False negatives)**

**Note**: This will be calculated only for the **top k estimated ratings** of every user (i.e, among the top k estimated ratings for a user how many **relevant ones out of retreived (precision)** and how many **relevant ones out of total relevant ones (recall)**).

In [226]:
def precision_and_recall_at_k(predictions, threshold, k):
    
    
    # map predictions to true values for every user
    user_est_true = defaultdict(list)
    
    for uid,_, true_r, est_r, _ in predictions:
        user_est_true[uid].append((est_r,true_r))
        
    precisions = dict()
    recalls = dict()
    
    for uid, user_ratings in user_est_true.items():
        
        
        # sort user ratings and look at top k estimated ratings:
        user_ratings.sort(key = lambda x: x[0], reverse = True)
        
        # number of relevant items i.e., ratings > threshold in utility matrix R:
        # this refers to true positives + false negatives
        tp_fn = sum([true_r > threshold for (_,true_r) in user_ratings[:k]])
        
        # number of recommended items i.e., ratings > threshold in R_hat:
        # this refers to true positives + false positives
        tp_fp = sum([est_r > threshold for (est_r,_) in user_ratings[:k]])
        
        # number of relevant and recommended items i.e, ratings > threshold in both i.e,
        # utility matrix R and R_hat: Refers to true positves
        tp = sum([(true_r > threshold and est_r > threshold) 
                     for (est_r,true_r) in user_ratings[:k]])   

        if tp_fp == 0:
            precisions[uid] = 0
        else:
            precisions[uid] = tp/tp_fp
        
        if tp_fn == 0:
            recalls[uid] = 0
        else:
            recalls[uid] = tp/tp_fn
        
    return precisions, recalls
        
    
    

In [227]:
precisions, recalls = precision_and_recall_at_k(predictions, threshold = 3, k = 100)

In [228]:
precision_list = np.array([t for _,t in precisions.items()])
recall_list = np.array([t for _,t in recalls.items()])

In [229]:
print('Average precision:', round(np.mean(precision_list),2))
print('Average recall:',round(np.mean(recall_list),2))

Average precision: 0.71
Average recall: 0.92


In [230]:
print('Median precision:', round(np.median(precision_list),2))
print('Median recall:', round(np.median(recall_list),2))

Median precision: 0.75
Median recall: 1.0


## Average & median precision of 0.7 & 0.75 respectively.<br> Average & median recall of 0.92 and 1.0 respectively   

### These values seem to be decent in the context of a recommender system, because, recall ~ 1 is always good and should be there. But model with precision 0.7 or 0.75 gives user more useful/ diverse information than model with precision 1. 

### Lastly, to give the user actual movie predictions, we can just output the movies corresponding to top k ratings for a user from R_hat (k can be 20 or so).<br><br> Following is the function for the same.

In [231]:
def get_k_predictions(predictions, k = 10, user_id = None):
    
    # map movies to corresponding estimated ratings for each user
    user_est_true = defaultdict(list)
    
    for uid,iid, _, est_r, _ in predictions:
        user_est_true[uid].append((iid,est_r))
        
    # sort the predictions for each user and retrieve the `k` highest ones
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key = lambda x :x[1],reverse = True)
        
        user_est_true[uid] = user_ratings[:k]
    
    if user_id == None:
        # return for all users
        return user_est_true
    else:   
        return user_est_true[user_id]

In [232]:
# top 20 estimated ratings for user id = 12 (user ids : 1 to 7045 valid)
get_k_predictions(predictions=predictions, k = 20, user_id = '12')

[('1213', 4.369919441138452),
 ('1259', 4.319582773494192),
 ('910', 4.303041096248849),
 ('1617', 4.283768147057547),
 ('1210', 4.212587677233288),
 ('1394', 4.209719977920252),
 ('1188', 4.193736172633765),
 ('8827', 4.186390936705928),
 ('36', 4.146636889811388),
 ('2973', 4.090057029586237),
 ('69757', 4.069130275236678),
 ('898', 4.036136249201578),
 ('6331', 4.035731515521555),
 ('56367', 4.000549844901921),
 ('1673', 3.9901378855132683),
 ('68237', 3.965321541602196),
 ('2997', 3.948230976731987),
 ('6385', 3.9333614679067868),
 ('27815', 3.9175213991837814),
 ('2692', 3.9077057463945333)]