In [5]:
%%html
<style> 
/* Load every web font referenced below; 'Poppins' is used by the headings, so it must be imported too. */
@import url('https://fonts.googleapis.com/css?family=Orbitron|Poppins|Roboto');
/* Links and h4 use Roboto; h1-h3 use Poppins. A generic fallback is given in case the web font cannot be fetched. */
a {color: #37c9e1; font-family: 'Roboto', sans-serif;} 
h1 {color: #C20E69; font-family: 'Poppins', sans-serif;} 
h2, h3 {color: #25B89B; font-family: 'Poppins', sans-serif;}
h4 {color: #818286; font-family: 'Roboto', sans-serif;}
                                      
</style>

# **PROJECT OBJECTIVE**: 
We will build a recommendation system using popularity-based and collaborative filtering methods to recommend
mobile phones to a user that are, respectively, the most popular and personalised.

# **CONTEXT:** 
India is the second largest market globally for smartphones after China. About 134 million smartphones were sold across India
in the year 2017, and sales are estimated to increase to about 442 million in 2022. India ranked second in the average time spent on mobile web by
smartphone users across Asia Pacific. The combination of very high sales volumes and the average smartphone consumer behaviour has
made India a very attractive market for foreign vendors. As per Consumer behaviour, 97% of consumers turn to a search engine when they
are buying a product vs. 15% who turn to social media. If a seller succeeds to publish smartphones based on user’s behaviour/choice at the
right place, there are 90% chances that user will enquire for the same.

`This Case Study is targeted to build a recommendation system based on individual consumer’s behaviour or choice.`

# **DATA DESCRIPTION:**
- author : name of the person who gave the rating
- country : country the person who gave the rating belongs to
- date : date of the rating
- domain: website from which the rating was taken from
- extract: rating content
- language: language in which the rating was given
- product: name of the product/mobile phone for which the rating was given
- score: average rating for the phone
- score_max: highest rating given for the phone
- source: source from where the rating was taken

# <a id='importing'>Importing the necessary libraries📗</a> 

## **Packages used**
- Numpy: 1.19.3
- Pandas: 1.1.4
- matplotlib: 3.2.1
- Seaborn: 0.10.1
- MissingNo: 0.4.2
- Sklearn: 0.22.2.post1
- Pandas_profiling: 2.9.0

In [6]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import os
import glob
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from zipfile import ZipFile
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import math
import json
import time
# import sklearn.external.joblib as extjoblib
import joblib
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
#import the reqired libraries
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings; warnings.simplefilter('ignore')
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy

from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
import sys, os
from contextlib import contextmanager
%matplotlib inline

# filterwarnings to ignore all unnecessary warnings and logs
import warnings
warnings.filterwarnings('ignore')
# Notebook-level similarity metric; the user can reassign it in a later cell.
metric = 'cosine'

# <a id='reading'>Reading the dataset 📚</a>

In [None]:
# Import the dataset: six CSV exports, each read with ISO-8859-1 encoding.
# columns=['phone_url','date','lang','country','source','domain','score','score_max','extract','author','product']
phone_user_review1_df=pd.read_csv('phone_user_review_file_1.csv',encoding = "ISO-8859-1")
phone_user_review2_df=pd.read_csv('phone_user_review_file_2.csv',encoding = "ISO-8859-1")
phone_user_review3_df=pd.read_csv('phone_user_review_file_3.csv',encoding = "ISO-8859-1")
phone_user_review4_df=pd.read_csv('phone_user_review_file_4.csv',encoding = "ISO-8859-1")
phone_user_review5_df=pd.read_csv('phone_user_review_file_5.csv',encoding = "ISO-8859-1")
phone_user_review6_df=pd.read_csv('phone_user_review_file_6.csv',encoding = "ISO-8859-1")
# Concatenate the frames, drop exact duplicate rows and rebuild a unique 0..n-1 index.
# Every file is read with its own default index, so without ignore_index / reset_index
# the combined frame would carry repeated index labels.
combined_phone_user_review_df = (
    pd.concat(
        [
            phone_user_review1_df,
            phone_user_review2_df,
            phone_user_review3_df,
            phone_user_review4_df,
            phone_user_review5_df,
            phone_user_review6_df,
        ],
        ignore_index=True,
    )
    .drop_duplicates()
    .reset_index(drop=True)
)



In [None]:
combined_phone_user_review_df.head()

In [None]:
from colorama import Fore, Back, Style
rows, columns = combined_phone_user_review_df.shape
print(Fore.YELLOW + "No of rows: ", Style.RESET_ALL,rows) 
print(Fore.YELLOW + "No of columns: ", Style.RESET_ALL,columns) 

In [None]:
sample_df = combined_phone_user_review_df.sample(n = 100000, random_state = 612) 

In [None]:
sample_df.head()

In [None]:
rows, columns = sample_df.shape
print(Fore.YELLOW + "No of rows: ", Style.RESET_ALL,rows) 
print(Fore.YELLOW + "No of columns: ", Style.RESET_ALL,columns) 

In [None]:
# It is very important to check and remove data duplicates. 
# Else our model may break or report overly optimistic / pessimistic performance results
dupes=sample_df.duplicated()
print(' The number of duplicates in the dataset are:',sum(dupes), '\n')
dupes_record=pd.DataFrame(sample_df[dupes])
print(' The duplicate observations are:') 
dupes_record

In [None]:
# After dropping duplicates
# drop_duplicates returns a new frame, so the result must be assigned back;
# otherwise sample_df (and the shape printed below) is unchanged.
sample_df = sample_df.drop_duplicates(keep="first")
rows, columns = sample_df.shape
print(Fore.YELLOW + "No of rows: ", Style.RESET_ALL,rows) 
print(Fore.YELLOW + "No of columns: ", Style.RESET_ALL,columns)

# <a id='basic'>Basic Data Exploration 🏕️</a> 

1. Check for Data type of columns
2. Check for null values.
3. Check for outliers
4. Look for the category distribution in categorical columns
5. Plot for correlation
6. Look for new variables

In [None]:
#Check Data types
sample_df.dtypes

In [None]:
# Datetime conversion using astype for date column:
sample_df['date'] = sample_df['date'].astype('datetime64[ns]')

In [None]:
# Round 'score' to the nearest whole number and store it as pandas' nullable Int64 dtype:
# sample_df['score'] = pd.to_numeric(sample_df['score'])
sample_df['score'] = sample_df['score'].round().astype('Int64')

In [None]:
# Round 'score_max' to the nearest whole number and store it as pandas' nullable Int64 dtype:
# sample_df['score_max'] = pd.to_numeric(sample_df['score_max'])
sample_df['score_max'] = sample_df['score_max'].round().astype('Int64')

In [None]:
#Again check Data types
sample_df.dtypes

## &#9703; Categorical & Numerical Columns

In [None]:
def cols(data=None):
    """Print the categorical (object-dtype) and the remaining columns of a dataframe.

    Parameters
    ----------
    data: dataframe, optional
        Frame to inspect. When omitted, the notebook-level ``sample_df`` is
        used, so a plain ``cols()`` call behaves as before.
    """
    frame = sample_df if data is None else data

    cat_cols = [col for col in frame.columns if frame[col].dtypes == "O"]
    if len(cat_cols) == 0:
        print("There is no Categorical Column")
    else:
        print("Number of Categorical Column: ", len(cat_cols),"\n",cat_cols)

    # Every column that is not object dtype is reported as numerical.
    num_cols = [col for col in frame.columns if frame[col].dtypes != "O"]
    if len(num_cols) == 0:
        print("There is no Numerical Column")
    else:
        # Only report the count when there is something to report.
        print("Number of Numerical Columns: ", len(num_cols),"\n",num_cols)
cols()

In [None]:
# Dropping less relevant columns (multiple columns, selected by name).
# This note is a comment rather than a bare string literal: with
# ast_node_interactivity = "all" a bare string is echoed as cell output.
sample_df = sample_df.drop(columns=['phone_url', 'extract', 'domain', 'date','lang', 'country', 'source'])


In [None]:
sample_df.info()

## Checking missing values

In [None]:
# Number of null values in each column
sample_df.isnull().sum()

In [None]:
# determine the threshold for missing values
def assess_NA(data):
    """
    Summarise missing values per column.

    Builds a dataframe indexed by column name with two columns:
    'Number of NA' (count of nulls) and 'Percent NA' (share of rows that are
    null, in percent, rounded to 2 decimals). Rows where both values are zero
    are left out.

    Parameters
    ----------
    data: dataframe
    """
    missing_counts = data.isnull().sum()
    n_rows = len(data.index)

    # Both views are ordered from most to fewest missing values
    by_count = missing_counts.sort_values(ascending=False)
    by_percent = ((missing_counts / n_rows) * 100).round(2).sort_values(ascending=False)

    # Side-by-side table: one row per column of `data`
    summary = pd.concat([by_count, by_percent], axis=1, keys=['Number of NA', 'Percent NA'])

    # Keep only the rows that hold at least one non-zero entry
    has_missing = (summary.T != 0).any()
    return summary[has_missing]

In [None]:
df_NA = assess_NA(sample_df)
df_NA

In [None]:
fig , ax = plt.subplots(figsize = (14,10))
sns.heatmap(sample_df.isnull() , cbar = False, cmap = "YlGnBu_r")

In [None]:
# Code to get number of categories in missing value columns
print("Number of Categories in: ")
for ColName in sample_df[['author','product']]:
    print("{} = {}".format(ColName,       len(sample_df[ColName].unique())))

### Impute / Replace Missing Values with Median

In [None]:
# imputing missing data with median value can only be done with numerical data.
# The filled column is assigned back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to update sample_df under pandas Copy-on-Write.
sample_df['score'] = sample_df['score'].fillna(sample_df['score'].median())

In [None]:
# imputing missing data with median value can only be done with numerical data.
# The filled column is assigned back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to update sample_df under pandas Copy-on-Write.
sample_df['score_max'] = sample_df['score_max'].fillna(sample_df['score_max'].median())

In [None]:
# Fill missing authors with the most frequent author (first mode); assigned back
# rather than using fillna(inplace=True) on a column selection.
sample_df['author'] = sample_df['author'].fillna(sample_df['author'].mode()[0])

In [None]:
# Fill missing products with the most frequent product (first mode); assigned back
# rather than using fillna(inplace=True) on a column selection.
sample_df['product'] = sample_df['product'].fillna(sample_df['product'].mode()[0])

In [None]:
df_NA = assess_NA(sample_df)
df_NA

In [None]:
sample_df.info()

## Univariate Analysis

**As rule of thumb, skewness can be interpreted like this:**

`Skewness`
- Fairly Symmetrical	-0.5 to 0.5
- Moderate Skewed	-0.5 to -1.0 and 0.5 to 1.0
- Highly Skewed	< -1.0 and > 1.0

In [None]:
# skewness along the index axis 
sample_df.skew(axis = 0, skipna = True) 

In [None]:
# kurtosis along the index axis 
sample_df.kurt(axis = 0, skipna = True) 

In [None]:
# One density curve per panel: 'score' on the left axis, 'score_max' on the right axis.
fig,ax=plt.subplots(1,2,figsize=(15,8))
sns.distplot(sample_df['score'],ax=ax[0],kde=True,hist=False)
sns.distplot(sample_df['score_max'],ax=ax[1],kde=True,hist=False)
plt.show()
print(sample_df.skew())

In [None]:
print(sample_df['score'].astype(float).skew())
print(sample_df['score'].astype(float).kurt())

In [None]:
print(sample_df['score_max'].astype(float).skew())
print(sample_df['score_max'].astype(float).kurt())

In [None]:
#display 5 point summary of dataframe
sample_df.describe([0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]).transpose()

In [None]:
sample_df.score.value_counts()

In [None]:
# Summary statistics of 'rating' variable
sample_df['score'].describe([0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]).transpose()

In [None]:
# find minimum and maximum ratings
print('The minimum score is: %d' %(sample_df['score'].min()))
print('The maximum score is: %d' %(sample_df['score'].max()))

<p>Scores are on a scale of 0-10</p>

In [None]:
# Check the distribution of ratings 
# catplot is the current name of the deprecated factorplot; the column is passed
# by keyword because positional data arguments are deprecated as well.
with sns.axes_style('white'):
    g = sns.catplot(x="score", data=sample_df, aspect=2.0, kind='count')
    g.set_ylabels("Total number of score") 

## Ratings Distribution

In [None]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

data = sample_df['score'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / sample_df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Distribution Of {} Phone-ratings'.format(sample_df.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)


We can see that more than 80% of all scores in the data are 6, 7, 8, 9 or 10, and only around 15%-18% of the ratings fall in the lower range.

## Ratings Distribution By Product (Phone)

In [None]:
# Number of ratings per phone
data = sample_df.groupby('product')['score'].count().clip(upper=50)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Scores',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Scores Per Phone (Clipped at 50)',
                   xaxis = dict(title = 'Number of Scores Per Phone'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)


In [None]:
sample_df.groupby('product')['score'].count().reset_index().sort_values('score', ascending=False)[:10]

## Ratings Distribution By Authors (Users)

In [None]:
# Number of ratings per author (user), capped at 50 by clip(upper=50)
data = sample_df.groupby('author')['score'].count().clip(upper=50)

# Create trace
trace = go.Histogram(x = data.values,
                     name = 'Scores',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Scores Per User (Clipped at 50)',
                   xaxis = dict(title = 'Number of Scores Per User'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [None]:
sample_df.groupby('author')['score'].count().reset_index().sort_values('score', ascending=False)[:10]

Most of the users gave more than 5 ratings, and very few users gave many ratings, although the most productive user has given 9827 ratings.

To reduce the dimensionality of the dataset, we will filter out rarely rated products and rarely rating authors.

In [None]:
# Keep only products with strictly more than min_phone_ratings ratings in sample_df.
min_phone_ratings = 50
filter_products = sample_df['product'].value_counts() > min_phone_ratings
# Boolean Series -> list of the product names whose flag is True
filter_products = filter_products[filter_products].index.tolist()

# Same rule for authors: strictly more than min_user_ratings ratings.
min_user_ratings = 50
filter_users = sample_df['author'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

# A row survives only if both its product and its author passed the filters above.
df_new = sample_df[(sample_df['product'].isin(filter_products)) & (sample_df['author'].isin(filter_users))]
print('The original data frame shape:\t{}'.format(sample_df.shape))
print('The new data frame shape:\t{}'.format(df_new.shape))

In [None]:
# Get a series object containing the count of unique elements
# in each column of dataframe
uniqueValues = df_new.nunique()
print('Count of unique values in each column :')
print(uniqueValues)

In [None]:
print(Fore.YELLOW + "The total phones are",Style.RESET_ALL,f"{df_new['product'].count()},", Fore.BLUE + "from those the unique types are", Style.RESET_ALL, f"{df_new['product'].value_counts().shape[0]}.")

In [None]:
print(Fore.YELLOW + "The total users are",Style.RESET_ALL,f"{df_new['author'].count()},", Fore.BLUE + "from those the unique types are", Style.RESET_ALL, f"{df_new['author'].value_counts().shape[0]}.")

### Consider only ratings from 1-10 and leave 0s in column `Score`

In [None]:
df_new['score'].unique()

In [None]:
#Hence segragating implicit and explict ratings datasets
ratings_explicit = df_new[df_new.score != 0]
ratings_implicit = df_new[df_new.score == 0]

In [None]:
print(df_new.shape)
print(ratings_explicit.shape)
print(ratings_implicit.shape)

In [None]:
# Top 10 users based on rating
most_rated = ratings_explicit.groupby('author').size().sort_values(ascending=False)[:10]
most_rated

In [None]:
# print(sample_df)
counts = ratings_explicit['author'].value_counts()
df_final = ratings_explicit[ratings_explicit['author'].isin(counts[counts >= 50].index)]

In [None]:
# Ratings per author in df_final; Series.value_counts replaces the deprecated
# top-level pd.value_counts and returns the same counts.
counts1 = df_final['author'].value_counts()
counts1

In [None]:
df_final = df_final[df_final['author'].isin(counts1[counts1 >= 50].index)]
df_final

In [None]:
df_final.head()

In [None]:
# ratings_explicit

In [None]:
from surprise import Dataset,Reader
from surprise.model_selection import cross_validate
from surprise import NormalPredictor


reader = Reader(rating_scale=(1, 10))

In [None]:
df_final.head(2)

In [None]:
df_final.shape

In [None]:
data = Dataset.load_from_df(df_final[['author', 'product', 'score']], reader)

In [None]:
data.df.head()

# SVD Based Recommendation System

In [None]:
from surprise import Dataset,Reader

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_final[['author', 'product', 'score']], reader)

In [None]:
# Split data to train and test
# NOTE: this import rebinds the name `train_test_split` to Surprise's version,
# replacing the scikit-learn function of the same name imported in the imports cell.
from surprise.model_selection import train_test_split
# 25% of the ratings go to the test set; random_state fixes the split.
trainset, testset = train_test_split(data, test_size=.25,random_state=123)

In [None]:
trainset.all_ratings()

In [None]:
# However the ids are the inner ids and not the raw ids
# raw ids can be obatined as follows

print(trainset.to_raw_uid(0))
#print(trainset.to_raw_iid(1066))

In [None]:
from surprise import SVD, KNNWithMeans
from surprise import accuracy

In [None]:
svd_model = SVD(n_factors=5,biased=False)
svd_model.fit(trainset)

In [None]:
testset[0]

In [None]:
test_pred = svd_model.test(testset)

In [None]:
# compute RMSE
accuracy.rmse(test_pred)

In [None]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

With the Surprise library, we will benchmark the following algorithms

### Basic algorithms

#### NormalPredictor

* NormalPredictor algorithm predicts a random rating based on the distribution of the training set, which is assumed to be normal. This is one of the most basic algorithms that do not do much work.

#### BaselineOnly

* BaselineOnly algorithm predicts the baseline estimate for given user and item.

### k-NN algorithms

#### KNNBasic

* KNNBasic is a basic collaborative filtering algorithm.

#### KNNWithMeans

* KNNWithMeans is basic collaborative filtering algorithm, taking into account the mean ratings of each user.

#### KNNWithZScore

* KNNWithZScore is a basic collaborative filtering algorithm, taking into account the z-score normalization of each user.

#### KNNBaseline

* KNNBaseline is a basic collaborative filtering algorithm taking into account a baseline rating.

### Matrix Factorization-based algorithms

#### SVD

* SVD algorithm is equivalent to Probabilistic Matrix Factorization (http://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf)

#### SVDpp

* The SVDpp algorithm is an extension of SVD that takes into account implicit ratings.

#### NMF

* NMF is a collaborative filtering algorithm based on Non-negative Matrix Factorization. It is very similar to SVD.

### Slope One

* Slope One is a straightforward implementation of the SlopeOne algorithm. (https://arxiv.org/abs/cs/0702144)

### Co-clustering

* Co-clustering is a collaborative filtering algorithm based on co-clustering (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.6458&rep=rep1&type=pdf)


We use rmse as our accuracy metric for the predictions.

In [None]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE','MAE'], cv=3, verbose=False)
    
    # Average every entry returned by cross_validate over the 3 folds
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    # Label the row with the algorithm's class name. pd.concat is used because
    # Series.append no longer exists in pandas 2.x.
    label = pd.Series([algorithm.__class__.__name__], index=['Algorithm'])
    tmp = pd.concat([tmp, label])
    benchmark.append(tmp)

In [None]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

In [None]:
surprise_results

BaselineOnly algorithm gave us the best rmse, therefore, we will proceed further with BaselineOnly and use Alternating Least Squares (ALS).

In [None]:
print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE','MAE'], cv=3, verbose=False)

We use the train_test_split() to sample a trainset and a testset with given sizes, and use the accuracy metric of rmse. We’ll then use the fit() method which will train the algorithm on the trainset, and the test() method which will return the predictions made from the testset

In [None]:
# Surprise's splitter is imported here explicitly: the imports cell also imports a
# scikit-learn function called train_test_split, which cannot split a Surprise dataset.
from surprise.model_selection import train_test_split

# Fixed seed (same value as the earlier SVD split) so that the predictions
# inspected in the following cells are reproducible across runs.
trainset, testset = train_test_split(data, test_size=0.25, random_state=123)
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

In [None]:
trainset = algo.trainset
print(algo.__class__.__name__)

In [None]:
def get_Iu(uid, ts=None):
    """ return the number of items rated by given user
    args: 
      uid: the raw id of the user
      ts: optional trainset to look the user up in. When omitted, the
          notebook-level ``trainset`` is used as it is bound at call time
          (other cells rebind that name, so pass ``ts`` to pin a specific one).
    returns: 
      the number of items rated by the user, or 0 when the trainset does not
      know the user
    """
    if ts is None:
        ts = trainset
    try:
        return len(ts.ur[ts.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid, ts=None):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
      ts: optional trainset to look the item up in. When omitted, the
          notebook-level ``trainset`` is used as it is bound at call time
          (other cells rebind that name, so pass ``ts`` to pin a specific one).
    returns:
      the number of users that have rated the item, or 0 when the trainset
      does not know the item
    """
    if ts is None:
        ts = trainset
    try: 
        return len(ts.ir[ts.to_inner_iid(iid)])
    except ValueError: # item was not part of the trainset
        return 0
    
# Tabulate the predictions, then add per-row support counts and the absolute error.
prediction_columns = ['uid', 'iid', 'rui', 'est', 'details']
df = pd.DataFrame(predictions, columns=prediction_columns)
df['Iu'] = df['uid'].apply(get_Iu)   # items rated by the row's user
df['Ui'] = df['iid'].apply(get_Ui)   # users who rated the row's item
df['err'] = (df['est'] - df['rui']).abs()

In [None]:
df.head()

In [None]:
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [None]:
best_predictions

The above are the best predictions, and they are not lucky guesses: Ui ranges from 25 to 148, which is not small, meaning that a significant number of users have rated the target product.

In [None]:
worst_predictions

The worst predictions look pretty surprising. Let's look in more detail at the first product, "OnePlus X (Onyx, 16GB)": it was rated by 28 users, user "Amazon Customer" rated it 2, and our BaselineOnly algorithm predicted 8.30.

In [None]:
df_new.loc[df_new['product'] == 'OnePlus X (Onyx, 16GB)']['score'].describe()

In [None]:
import matplotlib.pyplot as plt
%matplotlib notebook

df_new.loc[df_new['product'] == 'OnePlus 3 (Soft Gold, 64 GB)']['score'].hist()
plt.xlabel('score')
plt.ylabel('Number of scores')
plt.title('Number of scores OnePlus 3T has received')
plt.show();

It turns out that most of the ratings this product received were between 6 and 10; in other words, most of the users in the data rated this phone [6-10], and only very few users rated it 2. The same holds for the other predictions in the "worst predictions" list.

## Checking the Phone relevance and recommendation

**Precision@k = Recommended items that are relevant/Recommended items**

**Recall@k= Recommended items that are relevant/Relevant items**

_Precision at k is the proportion of recommended items in the top-k set that are relevant_

Its interpretation is as follows. Suppose that my precision at 10 in a top-10 recommendation problem is 80%. This means that 80% of the recommendation I make are relevant to the user.


_Recall at k is the proportion of relevant items found in the top-k recommendations_

Suppose that we computed recall at 10 and found it is 40% in our top-10 recommendation system. This means that 40% of the total number of the relevant items appear in the top-k results.

An item is considered relevant if its true rating $r_{ui}$ is greater than a given threshold. An item is considered recommended if its estimated rating $\hat{r}_{ui}$ is greater than the threshold, and if it is among the k highest estimated ratings.

In [None]:
from collections import defaultdict

from surprise.model_selection import KFold


def precision_recall_at_k(predictions, k=20, threshold=3.5):
    """Compute precision@k and recall@k for every user.

    Each prediction is unpacked as a 5-tuple
    (user id, item id, true rating, estimated rating, details).
    An item is relevant when its true rating is >= threshold; it is
    recommended when it is among the user's k highest estimates and its
    estimate is >= threshold.

    Returns a pair of dicts (precisions, recalls), both keyed by user id.
    """
    # Collect (estimate, truth) pairs per user.
    by_user = defaultdict(list)
    for prediction in predictions:
        uid, _, true_r, est, _ = prediction
        by_user[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, pairs in by_user.items():
        # Highest estimates first; only the first k can count as recommended.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        relevant_total = sum((truth >= threshold) for (_, truth) in ranked)
        recommended = sum((guess >= threshold) for (guess, _) in top_k)
        hits = sum(((truth >= threshold) and (guess >= threshold))
                   for (guess, truth) in top_k)

        # Precision@K: share of recommended phones that are relevant.
        # Recall@K: share of relevant phones that are recommended.
        # A ratio with a zero denominator is undefined and reported as 0.
        if recommended != 0:
            precisions[uid] = hits / recommended
        else:
            precisions[uid] = 0

        if relevant_total != 0:
            recalls[uid] = hits / relevant_total
        else:
            recalls[uid] = 0

    return precisions, recalls



# 20-fold evaluation of precision@20 / recall@20 with a relevance threshold of 4.
# The folds are seeded so the printed values are reproducible, and fold-local
# names keep the loop from overwriting the notebook-level trainset / testset /
# predictions that other cells read.
kf = KFold(n_splits=20, random_state=123, shuffle=True)
for fold_trainset, fold_testset in kf.split(data):
    algo.fit(fold_trainset)
    fold_predictions = algo.test(fold_testset)
    precisions, recalls = precision_recall_at_k(fold_predictions, k=20, threshold=4)

    # Precision and recall can then be averaged over all users
    # (first line printed per fold: mean precision, second: mean recall)
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

In [None]:

from surprise import SVD
from surprise import Dataset


def get_top_n(predictions, n=20):
    """Collect the n best-estimated items for each user.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): How many recommendations to keep per user. Default is 20.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples
        [(raw item id, rating estimation), ...], ordered from the highest
        estimate down and holding at most n entries.
    """

    # Bucket (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank every bucket by estimate and keep its first n entries.
    for uid in list(per_user):
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user


# First train the BaselineOnly algorithm (algo) on the full smartphone dataset.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Keep the 20 highest-estimated items per user
top_n = get_top_n(predictions, n=20)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

Summarise the insights.

Model-based Collaborative Filtering is a personalised recommender system, the recommendations are based on the past behavior of the user and it is not dependent on any additional information.