In [1]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
#Loading Libraries
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans

In [3]:
#Data from about 60,000 users who have rated 140 or more jokes.
jester = Dataset.load_builtin('jester')

In [4]:
pd.DataFrame(jester.raw_ratings).head()

Unnamed: 0,0,1,2,3
0,1,5,0.219,
1,1,7,-9.281,
2,1,8,-9.281,
3,1,13,-6.781,
4,1,15,0.875,


In [5]:
#Naming Columns
jester_df = pd.DataFrame(jester.raw_ratings)[[0,1,2]]
jester_df.columns = ['user_id', 'item_id','rating']
jester_df.head()

Unnamed: 0,user_id,item_id,rating
0,1,5,0.219
1,1,7,-9.281
2,1,8,-9.281
3,1,13,-6.781
4,1,15,0.875


## Memory Based Collaborative Filtering

In [6]:
#Split into train and test
import numpy as np
msk = np.random.rand(len(jester_df)) < 0.8
train = jester_df[msk]
test = jester_df[~msk]

In [7]:
#Specifying a scale at which jokes are rated i.e., -10 to 10.
reader = Reader(rating_scale = (-10,10))
data = Dataset.load_from_df(train, reader)

In [8]:
trainingSet = data.build_full_trainset()

In [9]:
sim_options = {
    'name': 'cosine',
    'user_based': False #Compute similarities between items
}

algo = KNNWithMeans(sim_options=sim_options)

In [10]:
algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f12b87b2750>

In [11]:
#Predicting rating for a particular user and item
prediction = algo.predict('500','1640')
prediction.est

1.6170391132614796

In [12]:
#Predicting on whole test data
predicted = []
for _, row in test.iterrows():
  predicted.append(algo.predict(row.user_id, row.item_id))

In [13]:
test['predicted'] = [i.est for i in predicted]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
test.head()

Unnamed: 0,user_id,item_id,rating,predicted
1,1,7,-9.281,-5.302374
2,1,8,-9.281,-2.481679
10,1,21,-7.188,4.571029
17,1,102,0.75,2.429287
22,1,107,2.031,4.060082


In [15]:
test['error'] = test['rating'] - test['predicted']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
test.head()

Unnamed: 0,user_id,item_id,rating,predicted,error
1,1,7,-9.281,-5.302374,-3.978626
2,1,8,-9.281,-2.481679,-6.799321
10,1,21,-7.188,4.571029,-11.759029
17,1,102,0.75,2.429287,-1.679287
22,1,107,2.031,4.060082,-2.029082


In [17]:
mean_absolute_error = np.mean(test['error'])

In [18]:
mean_absolute_error

-0.0021963126308773974

In [19]:
test['squared_error'] = test['error'] * test['error']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
test.head()

Unnamed: 0,user_id,item_id,rating,predicted,error,squared_error
1,1,7,-9.281,-5.302374,-3.978626,15.829468
2,1,8,-9.281,-2.481679,-6.799321,46.230768
10,1,21,-7.188,4.571029,-11.759029,138.274771
17,1,102,0.75,2.429287,-1.679287,2.820006
22,1,107,2.031,4.060082,-2.029082,4.117173


In [21]:
np.sqrt(np.mean(test['squared_error']))

4.146160781363914

In [22]:
rmse = np.sqrt(np.mean(np.square(test['rating']-test['predicted'])))

In [23]:
rmse

4.146160781363914

## Matrix Factorization Collaborative Filtering

In [25]:
#Reloading Dataset and naming columns
jester = Dataset.load_builtin('jester')
pd.DataFrame(jester.raw_ratings).head()

jester_df = pd.DataFrame(jester.raw_ratings)[[0,1,2]]
jester_df.columns = ['user_id', 'item_id','rating']
jester_df.head()

reader = Reader(rating_scale = (-10,10))
data = Dataset.load_from_df(train, reader)

In [26]:
#Train Test Split
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size = 0.25)

In [27]:
#Using SVD Algorithm for matrix factorization

from surprise import SVD, accuracy
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f12c7b6edd0>

In [28]:
predictions = algo.test(testset)

In [29]:
#Evaluation

from surprise import accuracy
accuracy.rmse(predictions)

RMSE: 4.4006


4.400562017788305

In [30]:
accuracy.mae(predictions)

MAE:  3.3178


3.31779988252886

In [31]:
accuracy.mse(predictions)

MSE: 19.3649


19.36494607240108

Hence, the root mean squared error for Memory Based Collaborative Filtering is 4.1462 and the RMSE for Matrix Factorization Based Collaborative Filtering is 4.4006.

Thus, the memory based filtering method provides better recommendations with lesser error.