# Colaborative Filtering: Model Based

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns

from surprise import Reader
from surprise import Dataset

from surprise import SVD
from surprise import BaselineOnly

from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [3]:
column_names= ['user_id','item_id','rating','timestamp']
df= pd.read_csv('u.data', sep='\t', names= column_names)

## Data

In [4]:
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742
...,...,...,...,...
99998,880,476,3,880175444
99999,716,204,5,879795543
100000,276,1090,1,874795795
100001,13,225,2,882399156


In [5]:
df.pivot_table(values='rating', index='user_id', columns='item_id')

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [6]:
reader=Reader(rating_scale=(0,5))
data=Dataset.load_from_df(df[['user_id','item_id','rating']],reader)

In [7]:
data.df.head(10)

Unnamed: 0,user_id,item_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3
5,22,377,1
6,244,51,2
7,166,346,1
8,298,474,4
9,115,265,2


## Validation

In [9]:
#data splitting
trainset, testset= train_test_split(data, test_size=0.25, random_state=101)

In [10]:
# SVD
algo=SVD()
algo.fit(trainset)
prediction=algo.test(testset)

In [11]:
accuracy.rmse(prediction)

RMSE: 0.9393


0.9393282188529801

In [13]:
# ALS

bsl_option= {'method':'als',
            'n_epoch':5,
            'reg_u':12,
            'reg_i':5}
algo=BaselineOnly(bsl_options=bsl_option)
algo.fit(trainset)
predictions= algo.test(testset)

Estimating biases using als...


In [14]:
accuracy.rmse(predictions)

RMSE: 0.9407


0.9407128231972565

Berdasarkan hasil validasi, untuk dataset ini, algoritma SVD labih baik hasil evaluasinya dari ALS

## Cross Validasi

In [15]:
#SVD
algo=SVD()
cv_svd=cross_validate(algo,data, measures=['RMSE','MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9356  0.9360  0.9276  0.9331  0.9434  0.9351  0.0051  
MAE (testset)     0.7357  0.7366  0.7327  0.7366  0.7412  0.7366  0.0027  
Fit time          4.05    4.10    4.04    4.05    4.09    4.07    0.02    
Test time         0.27    0.10    0.11    0.11    0.10    0.14    0.07    


In [16]:
print('rmse cv mean', cv_svd['test_rmse'].mean())

rmse cv mean 0.9351312909511409


In [17]:
#ALS
bsl_option={'method':'als',
           'n_epoch':5,
           'reg_u':12,
           'reg_i':5}
algo= BaselineOnly(bsl_options=bsl_option)
cv_als=cross_validate(algo,data,measures=['RMSE','MAE'], cv=5, verbose=False)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


In [18]:
print('rmse cv mean', cv_als['test_rmse'].mean())

rmse cv mean 0.9411202201853971


berdasarkan hasil cross validasi, SVD memiliki performa yang lebih dari dari ALS 

## Hyperparameter Tunning 

In [22]:
param_grid={'n_epochs':[5,10,20],
           'lr_all':[0.002,0.005],
           'reg_all':[0.2,0.4,0.6]}
#n_epochs – The number of iteration of the SGD procedure. Default is 20.
#r_all – The learning rate for all parameters. Default is 0.005
#reg_all – The regularization term for all parameters. Default is 0.02
#https://surprise.readthedocs.io/en/stable/matrix_factorization.html

In [23]:
gs= GridSearchCV(SVD,param_grid, measures=[u'rmse',u'mae'],cv=5)
gs.fit(data)

In [24]:
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.9469432893674743
{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}


In [25]:
print(gs.best_score['mae'])
print(gs.best_params['mae'])

0.7539078920024173
{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.2}


In [26]:
# performance on testset

algo=SVD(n_epochs=20,lr_all=0.005, reg_all=0.2)
algo.fit(trainset)
prediction=algo.test(testset)
accuracy.rmse(prediction)

RMSE: 0.9454


0.9454186191226462

In [27]:
# before and after tunning

In [28]:
#before
algo=SVD()
algo.fit(trainset)
prediction=algo.test(testset)
accuracy.rmse(prediction)

RMSE: 0.9357


0.9356614092167571

In [29]:
#after
algo=SVD(n_epochs=20,lr_all=0.005, reg_all=0.2)
algo.fit(trainset)
prediction=algo.test(testset)
accuracy.rmse(prediction)

RMSE: 0.9454


0.9453797333231155

before tunning lebih baik dari after tunning

## Prediction result

Prediksi rating yang akan diberikan oleh user 0,111,212 terhadap item 565,647,665 dan 677

In [30]:
df_test=pd.DataFrame(columns=['user_id','item_id'])
for i in [0,111,212]:
    for j in [565,647,665,677]:
        df_test=df_test.append({'user_id':i,'item_id':j},ignore_index=True)

df_test

Unnamed: 0,user_id,item_id
0,0,565
1,0,647
2,0,665
3,0,677
4,111,565
5,111,647
6,111,665
7,111,677
8,212,565
9,212,647


In [31]:
algo=SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1fae9f52880>

In [38]:
algo.predict(0, 565)

Prediction(uid=0, iid=565, r_ui=None, est=2.7980095390064634, details={'was_impossible': False})

In [39]:
y=[]

for _, row in df_test.iterrows():
    est=algo.predict(row['user_id'], row['item_id'])
    y.append(est[3])

In [41]:
df_test['rating']=y

In [45]:
df_test.sort_values(by=['user_id','rating'],ascending=[True,False], inplace=True)

In [47]:
df_test

Unnamed: 0,user_id,item_id,rating
1,0,647,4.158179
3,0,677,3.206192
0,0,565,2.79801
2,0,665,2.694923
5,111,647,4.203241
7,111,677,3.131092
4,111,565,2.791478
6,111,665,2.601333
9,212,647,4.222451
11,212,677,3.152053


In [46]:
df_test[df_test['user_id']==0]

Unnamed: 0,user_id,item_id,rating
1,0,647,4.158179
3,0,677,3.206192
0,0,565,2.79801
2,0,665,2.694923


In [57]:
df[df['item_id']==647]['rating'].mean()

4.1

In [50]:
df[df['user_id'].isin([0,111,212])]

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
476,111,328,4,891679939
886,111,301,4,891680028
1451,212,180,1,879303974
2450,111,313,4,891679901
3308,111,887,3,891679692
3571,111,340,4,891679692
3947,212,515,4,879303571


In [58]:
# latihan

In [60]:
#dataset
pd.read_csv('rating3.csv').drop(columns='Unnamed: 0')

Unnamed: 0,user_id,anime_id,rating
0,1,8074,10.0
1,1,11617,10.0
2,1,11757,10.0
3,1,15451,10.0
4,2,11771,10.0
...,...,...,...
77863,999,11757,6.0
77864,999,16498,9.0
77865,999,21881,5.0
77866,999,22319,8.0


dengan menggunakan data anime dan rating 3, buatlah recommendation system dengan skema berikut:
    * bandingkan algoritma SVD dan ALS
    * tunning algoritma yang menurut kalian lebih baik dengan skema:
        * n_epoch: 5,10
        * reg_i: 1,2,5
        * reg_u: 10,12
       
Setelah mendapatkan model terbaik coba prediksi rating anime berikut:
    * 1 Fullmetal Alchemist: Brotherhood , anime_id 5114
    * 1179 Detective Conan OVA 09, anime_id 2514
    * 1577 Ranma ½, anime_id 1010
    * 249 Saint Seiya: Meiou Hades Juuni Kyuu-hen, anime_id 1257 
    
Oleh user:
    * 60
    * 300
    * 600
    * 900
    
Bagaimana urutan rekomendasi yang akan kalian berikan untuk masing-masing user?