In [67]:
import boto3
import pandas as pd
import numpy as np
import sys
from io import StringIO
import surprise
from surprise import accuracy
from sklearn.model_selection import train_test_split
from surprise import SVD, NMF, KNNBaseline

In [4]:
sys.path.insert(1, '../src')
from Collaborative_recommender import CollaborativeRecommender

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
%reload_ext autoreload

In [7]:
s3 = boto3.client('s3')

In [8]:
df_all_15 = pd.read_csv("s3://recommender-system-amazon/df_all_15_text.csv")

In [9]:
df_all_15.head(5)

Unnamed: 0.1,Unnamed: 0,fit,title,also_buy,image_x,price,asin,overall,reviewerID,reviewText,item_text,title_nlp
0,0,,Black Mountain Products Resistance Band Set wi...,"['1612431712', 'B01AVDVHTI', 'B002YQUP7Q', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,$17.32,7245456313,5.0,A1J5VYG8O0YEMB,work advertis work advertis,black mountain,black mountain product resist band set door an...
1,1,,Black Mountain Products Resistance Band Set wi...,"['1612431712', 'B01AVDVHTI', 'B002YQUP7Q', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,$17.32,7245456313,4.0,AYMDRRCOPKTCF,four star product arriv prompt describ,black mountain,black mountain product resist band set door an...
2,2,,Black Mountain Products Resistance Band Set wi...,"['1612431712', 'B01AVDVHTI', 'B002YQUP7Q', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,$17.32,7245456313,4.0,A3AX86YSUHLELL,great resist band top qualiti okay resist band...,black mountain,black mountain product resist band set door an...
3,3,,Black Mountain Products Resistance Band Set wi...,"['1612431712', 'B01AVDVHTI', 'B002YQUP7Q', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,$17.32,7245456313,5.0,A12MJJT1RL2S7K,perfect transport resist band perfect cross fi...,black mountain,black mountain product resist band set door an...
4,4,,Black Mountain Products Resistance Band Set wi...,"['1612431712', 'B01AVDVHTI', 'B002YQUP7Q', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,$17.32,7245456313,5.0,A33BKACRGR3H38,awesom great band crosstrain dont like lift gy...,black mountain,black mountain product resist band set door an...


In [10]:
df_all_15.shape

(227509, 12)

In [11]:
df_all_15.columns

Index(['Unnamed: 0', 'fit', 'title', 'also_buy', 'image_x', 'price', 'asin',
       'overall', 'reviewerID', 'reviewText', 'item_text', 'title_nlp'],
      dtype='object')

In [12]:
df_collab = df_all_15[['reviewerID',"asin","overall"]]

In [13]:
df_collab.head()

Unnamed: 0,reviewerID,asin,overall
0,A1J5VYG8O0YEMB,7245456313,5.0
1,AYMDRRCOPKTCF,7245456313,4.0
2,A3AX86YSUHLELL,7245456313,4.0
3,A12MJJT1RL2S7K,7245456313,5.0
4,A33BKACRGR3H38,7245456313,5.0


In [14]:
df_collab.shape

(227509, 3)

In [22]:
UM = pd.pivot_table(df_collab,index='reviewerID', columns='asin', values='overall')

In [None]:
UM

In [44]:
b = (~np.isnan(UM.values)).sum()

In [47]:
b/(b+a)*100

0.034218338461255854

In [18]:
df_collab.describe()

Unnamed: 0,overall
count,227509.0
mean,4.474311
std,0.955971
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


## Using surprise to develop a collaborative filter (NMF based method)

In [73]:
reader = surprise.Reader(rating_scale=(1,5))
recom = CollaborativeRecommender(df_collab,reader,NMF)

Use grid search to find the best paramaters for the model

In [74]:
# param_grid = {'n_factors': [4,8,12,16,20,24,32], 'lr_all': [0.002, 0.005], 'n_epochs': [5, 10]}
param_grid = {'n_factors': [8,15,23,30,37,42], 'n_epochs': [5, 10]}
recom.grid(param_grid,cv=3)
recom.fit()


<Collaborative_recommender.CollaborativeRecommender at 0x7f9a40b498d0>

Accuracy for the best model

In [38]:
pred, acc = recom.pred()

print(f'accuracy of the best model: {acc}')

RMSE: 0.6899
accuracy of the best model: 0.6899440357584086


In [39]:
df_pred = pd.DataFrame.from_dict(pred)
df_pred['Iu'] = df_pred.uid.apply(recom.get_Iu)
df_pred['Ui'] = df_pred.iid.apply(recom.get_Ui)
df_pred['err'] = abs(df_pred.est - df_pred.r_ui)



In [40]:
df_pred.sort_values(by='err',ascending=False)

Unnamed: 0,uid,iid,r_ui,est,details,Iu,Ui,err
199801,A3SJFAYA81NIJB,B0072T3X38,1.0,5.0,{'was_impossible': False},10,198,4.0
220094,A3BLY5DYMNWARB,B013XADM6W,1.0,5.0,{'was_impossible': False},8,26,4.0
87673,A26KNV0HTA4U5D,B0030WK1BS,1.0,5.0,{'was_impossible': False},50,72,4.0
121546,A2XWG40J6EUAM0,B003SHBL00,1.0,5.0,{'was_impossible': False},10,86,4.0
173845,A1CT6MLYNBG3L4,B00Z8X3BL4,1.0,5.0,{'was_impossible': False},15,34,4.0
...,...,...,...,...,...,...,...,...
106013,A1WZI4N6V5F0EC,B016APS3CU,5.0,5.0,{'was_impossible': False},15,33,0.0
106011,A1WZI4N6V5F0EC,B00SAY9LLC,5.0,5.0,{'was_impossible': False},15,18,0.0
106009,A1WZI4N6V5F0EC,B00JO2BWFC,5.0,5.0,{'was_impossible': False},15,93,0.0
106008,A1WZI4N6V5F0EC,B00I7Z0FDM,5.0,5.0,{'was_impossible': False},15,8,0.0


In [121]:
df_pred['rank'] = (df_pred["est"]+df_pred["err"]).astype(int).rank(method='dense', ascending=False).astype(int)
df_pred.sort_values(by = 'rank',ascending=False)

Unnamed: 0,uid,iid,r_ui,est,details,Iu,Ui,err,rank
134804,A1B7E7KVYWCEQS,B00FPQQEJ0,1.0,1.046207,{'was_impossible': False},4,1,0.046207,9
162442,A1AWB9CEPLNBFD,B0013G8OMG,1.0,1.022248,{'was_impossible': False},50,219,0.022248,9
165923,A1M3F4G45OXRDI,B000ZKSWGY,1.0,1.114857,{'was_impossible': False},3,62,0.114857,9
165924,A1M3F4G45OXRDI,B00JH02RW8,1.0,1.041763,{'was_impossible': False},3,11,0.041763,9
165925,A1M3F4G45OXRDI,B00TSLYU5I,1.0,1.102381,{'was_impossible': False},3,23,0.102381,9
...,...,...,...,...,...,...,...,...,...
59449,AB48ATLRVT1BW,B000MF63M2,1.0,5.000000,{'was_impossible': False},14,345,4.000000,1
10852,AV0B6NGGOAISO,B0051F8PSI,1.0,5.000000,{'was_impossible': False},81,151,4.000000,1
101085,A26DG05WFGH2V0,B0078ZTWP4,1.0,5.000000,{'was_impossible': False},23,35,4.000000,1
11278,A3AR1T1QJWRBYM,B00009V2YO,1.0,5.000000,{'was_impossible': False},26,47,4.000000,1


In [122]:
cf_model = df_pred.pivot_table(index='uid', 
                            columns='iid', values='est')

In [123]:
cf_model

iid,7245456275,7245456313,B00003G4JR,B00004NKIQ,B00004SYN6,B00004T11T,B00004TBLW,B00004YTPE,B000051ZHS,B0000568SY,...,B01HAGT6G0,B01HBACO4G,B01HBPL78K,B01HCI0I6S,B01HCKR9ZE,B01HCU3NTU,B01HD17S3A,B01HE0PMFQ,B01HEYYKLO,B01HHB2HK0
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0029274J35Q1MYNKUWO,,,,,,,,,,,...,,,,,,,,,,
A0098869QS7I6WOAODWI,,,,,,,,,,,...,,,,,,,,,,
A0289048PRWFY7ZXQKCD,,,,,,,,,,,...,,,,,,,,,,
A0418008MEWDDIUPHXP7,,,,,,,,,,,...,,,5.0,,,,,,,
A0464351OZXPUPKGI6HO,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZZ2W9JD10R3H,,,,,,,,,,,...,,,,,,,,,,
AZZCSO2MC725N,,,,,,,,,,,...,,,,,,,,,,
AZZG39P6RDCHL,,,,,,,,,,,...,,,,,,,,,,
AZZKYP9254H32,,,,,,,,,,,...,,,,,,,,,,


In [132]:
recommended_items = pd.DataFrame(cf_model.loc['A0464351OZXPUPKGI6HO'])
recommended_items.columns = ["predicted_rating"]

In [133]:
recommended_items.sort_values('predicted_rating', ascending=False).head(10)

Unnamed: 0_level_0,predicted_rating
iid,Unnamed: 1_level_1
B000JFKGZU,5.0
B001PR0MP0,5.0
B001PR0MS2,5.0
B00EJRKIMA,5.0
B00HH6VBWG,5.0
7245456275,
7245456313,
B00003G4JR,
B00004NKIQ,
B00004SYN6,


Top n prediction 

In [57]:
top_n = recom.get_top_n()
top_n["A3SJFAYA81NIJB"]

[('B001U0HJN0', 5),
 ('B0051V68DQ', 5),
 ('B0072T3X38', 5),
 ('B00J01N3O0', 5),
 ('B005CGLUVU', 4.94235579583709),
 ('B01A4ZXWJ6', 4.845179640835079),
 ('B00E221A88', 4.817236828575972),
 ('B006OU4ERA', 4.797538209512706),
 ('B01H71AZ36', 4.739778908724118),
 ('B01DIXWNZ0', 4.297563334551882)]

## Using KNN methods to get the k nearest neighbors of an item  

In [110]:
reader = surprise.Reader(rating_scale=(1,5))
sim_options = {'name': 'pearson_baseline', 'user_based': False}
recom2 = CollaborativeRecommender(df_collab,reader,KNNBaseline(sim_options=sim_options),simil=True)
recom2.fit()

this is correct
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [112]:
recom2.K_neighbors_items('B01DIXWNZ0')


['B0016BNDXI',
 'B000YYYP3K',
 'B00AK8E6J8',
 'B0019MM47U',
 '7245456313',
 '7245456275',
 'B00003G4JR',
 'B00004NKIQ',
 'B00004SYN6',
 'B00004T11T']

In [113]:
pred2, acc2 = recom2.pred()

RMSE: 0.0485


In [None]:
bucket = 'recommender-system-amazon' 
csv_buffer = StringIO()
df_collab.sm.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'df_colab_sm.csv').put(Body=csv_buffer.getvalue())


In [33]:
df_collab['asin'].values

array(['0899332757', '0899332757', '0899332757', ..., 'B01HJDGJ1E',
       'B01HJDGJ1E', 'B01HJGAJ9O'], dtype=object)

In [27]:
UM.columns

Index(['0899332757', '0899333257', '0971100764', '3843518912', '7245456275',
       '7245456313', '7245456453', '8804850086', '9641766805', 'B00000IURU',
       ...
       'B01HJ234E0', 'B01HJ4DN08', 'B01HJ4EYS8', 'B01HJ8YC0Y', 'B01HJA241G',
       'B01HJA7KWE', 'B01HJDGJ1E', 'B01HJDHNX2', 'B01HJDZ34I', 'B01HJGAJ9O'],
      dtype='object', name='asin', length=56351)