# Collaborative Filtering : Model Based

> ## Library

In [31]:
!pip install scikit-surprise

[0m

In [85]:
import sklearn
print(sklearn.__version__)

1.4.1.post1


In [86]:
import pandas as pd
import seaborn as sns

from surprise import Reader, Dataset, SVD, BaselineOnly, accuracy

from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

> ## Data

In [87]:
df=pd.read_csv('u.data', sep='\t')
df.columns=['user_id','item_id','rating','timestamp']

In [88]:
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,172,5,881250949
1,0,133,1,881250949
2,196,242,3,881250949
3,186,302,3,891717742
4,22,377,1,878887116
...,...,...,...,...
99997,880,476,3,880175444
99998,716,204,5,879795543
99999,276,1090,1,874795795
100000,13,225,2,882399156


In [89]:
#user_item_rating_matrix
user_item_rating_matrix=df.pivot_table(values='rating',
    index='user_id',
    columns='item_id')

In [90]:
user_item_rating_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [91]:
reader=Reader(rating_scale=(0,5))
data=Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)

In [94]:
df.head(10)

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,172,5,881250949
1,0,133,1,881250949
2,196,242,3,881250949
3,186,302,3,891717742
4,22,377,1,878887116
5,244,51,2,880606923
6,166,346,1,886397596
7,298,474,4,884182806
8,115,265,2,881171488
9,253,465,5,891628467


In [55]:
df['rating'].value_counts()

rating
4    34174
3    27145
5    21202
2    11370
1     6111
Name: count, dtype: int64

> ## Validation

Data Splitting

In [107]:
trainset, testset = train_test_split(data, test_size=0.00000000001, random_state=101)

1. SVD

In [108]:
#define model
algo=SVD()

#fitting model
algo.fit(trainset)

#prediction
predictions=algo.test(testset)

In [109]:
accuracy.rmse(predictions)

RMSE: 0.1770


0.17701641879082475

In [110]:
accuracy.mse(predictions)

MSE: 0.0313


0.03133481252152865

In [111]:
accuracy.mae(predictions)

MAE:  0.1770


0.17701641879082475

In [119]:
svd=SVD()
bsl_options={'method' : 'als',
            'n_epochs' :5,
            'reg_u' : 12,
            'reg_i' : 5
            }
#define model
als=BaselineOnly(bsl_options=bsl_options)


list_model=[svd, als]

a=[]
for i in list_model:
    #define model
    algo=SVD()

    #fitting model
    algo.fit(trainset)

    #prediction
    predictions=algo.test(testset)
    
    mae=accuracy.mae(predictions)
    a.append(mae)
b=pd.DataFrame({'Nama Model':list_model, 'MAE':a})
b

MAE:  1.2921
MAE:  0.5037


Unnamed: 0,Nama Model,MAE
0,<surprise.prediction_algorithms.matrix_factori...,1.292134
1,<surprise.prediction_algorithms.baseline_only....,0.503657


In [117]:
a

[0.4441125601430338, 0.3527039888758168]

2. ALS

In [105]:
bsl_options={'method' : 'als',
            'n_epochs' :5,
            'reg_u' : 12,
            'reg_i' : 5
            }
#define model
algo=BaselineOnly(bsl_options=bsl_options)
#fitting model
algo.fit(data)
#prediction
predictions=algo.test(testset)

Estimating biases using als...


AttributeError: 'DatasetAutoFolds' object has no attribute 'n_users'

In [64]:
accuracy.rmse(predictions)

RMSE: 0.9452


0.9451928986263317

In [65]:
accuracy.mse(predictions)

MSE: 0.8934


0.8933896156136469

In [66]:
accuracy.mae(predictions)

MAE:  0.7486


0.7486372537318952

In [67]:
pd.DataFrame(predictions)

Unnamed: 0,uid,iid,r_ui,est,details
0,911,7,4.0,3.761990,{'was_impossible': False}
1,207,98,4.0,3.928419,{'was_impossible': False}
2,617,427,4.0,3.419328,{'was_impossible': False}
3,577,40,4.0,3.325578,{'was_impossible': False}
4,57,411,4.0,3.030139,{'was_impossible': False}
...,...,...,...,...,...
24996,156,205,3.0,3.898150,{'was_impossible': False}
24997,642,173,5.0,4.521901,{'was_impossible': False}
24998,605,1,4.0,3.921866,{'was_impossible': False}
24999,886,127,4.0,4.162618,{'was_impossible': False}


> ## Cross Validation

SVD

In [68]:
algo=SVD()

cv_svd=cross_validate(algo,
    data,
    measures=['rmse', 'mae'],
    cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9367  0.9358  0.9274  0.9356  0.9431  0.9357  0.0050  
MAE (testset)     0.7401  0.7377  0.7315  0.7391  0.7414  0.7380  0.0034  
Fit time          0.53    0.52    0.51    0.51    0.51    0.52    0.01    
Test time         0.03    0.09    0.03    0.03    0.09    0.05    0.03    


In [69]:
print('rmse cv mean :', cv_svd['test_rmse'].mean())

rmse cv mean : 0.9357212300845305


ALS

In [70]:
bsl_options={'method' : 'als',
            'n_epochs' :5,
            'reg_u' : 12,
            'reg_i' : 5
            }
#define model
algo=BaselineOnly(bsl_options=bsl_options)

cv_als=cross_validate(algo,
    data,
    measures=['rmse', 'mae'],
    cv=5,verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9378  0.9393  0.9430  0.9449  0.9429  0.9415  0.0026  
MAE (testset)     0.7434  0.7449  0.7482  0.7470  0.7442  0.7456  0.0018  
Fit time          0.05    0.05    0.05    0.05    0.05    0.05    0.00    
Test time         0.02    0.02    0.02    0.07    0.02    0.03    0.02    


In [71]:
print('rmse cv mean :', cv_als['test_rmse'].mean())

rmse cv mean : 0.9415482962738245


SVD memiliki performa yang lebih baik dibandingkan dengan ALS

> ## Hyperparameter Tuning

In [72]:
param_grid={'n_epochs' : [5, 10, 20], 
           'lr_all' : [0.002, 0.005, 0.01],
           'reg_all' : [0.02, 0,1, 0.5]
           }
gscv=GridSearchCV(SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=5)

gscv.fit(data)

In [73]:
gscv.best_score

{'rmse': 0.9346761907990707, 'mae': 0.7366982302560287}

In [74]:
gscv.best_params

{'rmse': {'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.02},
 'mae': {'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.02}}

Before

In [75]:
algo=SVD()

cv_svd=cross_validate(algo,
    data,
    measures=['rmse', 'mae'],
    cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9448  0.9258  0.9316  0.9388  0.9393  0.9361  0.0066  
MAE (testset)     0.7433  0.7310  0.7356  0.7413  0.7390  0.7380  0.0043  
Fit time          0.51    0.52    0.51    0.53    0.57    0.53    0.03    
Test time         0.03    0.03    0.03    0.09    0.03    0.04    0.02    


In [76]:
print('rmse cv mean :', cv_svd['test_rmse'].mean())

rmse cv mean : 0.9360550812577892


After

In [77]:
algo=SVD(n_epochs=10,
    lr_all=0.01,
    reg_all=0.02)

cv_svd=cross_validate(algo,
    data,
    measures=['rmse', 'mae'],
    cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9400  0.9415  0.9394  0.9326  0.9287  0.9364  0.0049  
MAE (testset)     0.7397  0.7425  0.7393  0.7366  0.7324  0.7381  0.0034  
Fit time          0.30    0.27    0.26    0.26    0.27    0.27    0.01    
Test time         0.03    0.09    0.03    0.03    0.03    0.04    0.02    


In [78]:
print('rmse cv mean :', cv_svd['test_rmse'].mean())

rmse cv mean : 0.9364379083257998


> ## Prediction Result

Kita akan merekomendasikan item 565, 647,665, dan 677 ke user 0, 111, dan 212

In [24]:
# df[df['user_id'].isin([0,111,212])]['item_id'].unique()

In [79]:
df_test=pd.DataFrame(columns=['user_id','item_id'])

user=[0,111,212]
item=[565, 647, 665, 677]

for i in user:
    for j in item:
        data_concat=pd.DataFrame({'user_id' : [i],'item_id' : [j]})
        df_test=pd.concat([df_test, data_concat])

In [80]:
df_test=df_test.reset_index(drop=True)
df_test

Unnamed: 0,user_id,item_id
0,0,565
1,0,647
2,0,665
3,0,677
4,111,565
5,111,647
6,111,665
7,111,677
8,212,565
9,212,647


In [81]:
algo=SVD(n_epochs=20,
    lr_all=0.005,
    reg_all=0.02)

algo.fit(trainset)

y=[]

for _, row in df_test.iterrows():
    est=algo.predict(row.user_id, row.item_id)
    y.append(est[3])

df_test['rating']=y

df_test.sort_values(by=['user_id','rating'],ascending=[1,0], inplace= True)
df_test

Unnamed: 0,user_id,item_id,rating
1,0,647,4.185188
3,0,677,3.474935
0,0,565,2.981455
2,0,665,2.847154
5,111,647,4.119125
7,111,677,3.365999
4,111,565,2.987509
6,111,665,2.840387
9,212,647,4.206117
11,212,677,3.632134


USER ID - 0

In [82]:
df_test[df_test['user_id']==0]

Unnamed: 0,user_id,item_id,rating
1,0,647,4.185188
3,0,677,3.474935
0,0,565,2.981455
2,0,665,2.847154


rekomendasi item untuk user id 0 adalah : 647, 677, 565, 665

USER ID - 111

In [83]:
df_test[df_test['user_id']==111]

Unnamed: 0,user_id,item_id,rating
5,111,647,4.119125
7,111,677,3.365999
4,111,565,2.987509
6,111,665,2.840387


rekomendasi item untuk user id 111 adalah : 647, 677, 565, 665

USER ID - 212

In [84]:
df_test[df_test['user_id']==212]

Unnamed: 0,user_id,item_id,rating
9,212,647,4.206117
11,212,677,3.632134
8,212,565,3.094075
10,212,665,2.94062


rekomendasi item untuk user id 212 adalah : 647, 677, 665, 565