In [3]:
import random
import sys
from io import StringIO

import boto3
import numpy as np
import pandas as pd
import surprise
from sklearn.model_selection import train_test_split
from surprise import accuracy
from surprise import SVD, NMF

In [4]:
sys.path.insert(1, '../src')
from Collaborative_recommender import CollaborativeRecommender

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
%reload_ext autoreload

In [7]:
# Low-level S3 client. No other visible cell references `s3`; the upload cell
# further down builds its own `boto3.resource('s3')` instead.
s3 = boto3.client('s3')

In [8]:
# Load the review table straight from S3.
# The ID columns are pinned to `str`: some ASINs consist only of digits and
# start with a zero (e.g. '0899332757'), so leaving them to type inference
# risks losing the leading zero or ending up with a mixed int/str column.
df_all_15 = pd.read_csv(
    "s3://recommender-system-amazon/df_all_15_text.csv",
    dtype={"reviewerID": str, "asin": str},
)

In [9]:
# Preview the first rows of the loaded table (head() defaults to five rows).
df_all_15.head()

Unnamed: 0.1,Unnamed: 0,fit,title,also_buy,image_x,price,asin,overall,reviewerID,reviewText,item_text,title_nlp
0,0,,Black Mountain Products Resistance Band Set wi...,"['1612431712', 'B01AVDVHTI', 'B002YQUP7Q', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,$17.32,7245456313,5.0,A1J5VYG8O0YEMB,work advertis work advertis,black mountain,black mountain product resist band set door an...
1,1,,Black Mountain Products Resistance Band Set wi...,"['1612431712', 'B01AVDVHTI', 'B002YQUP7Q', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,$17.32,7245456313,4.0,AYMDRRCOPKTCF,four star product arriv prompt describ,black mountain,black mountain product resist band set door an...
2,2,,Black Mountain Products Resistance Band Set wi...,"['1612431712', 'B01AVDVHTI', 'B002YQUP7Q', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,$17.32,7245456313,4.0,A3AX86YSUHLELL,great resist band top qualiti okay resist band...,black mountain,black mountain product resist band set door an...
3,3,,Black Mountain Products Resistance Band Set wi...,"['1612431712', 'B01AVDVHTI', 'B002YQUP7Q', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,$17.32,7245456313,5.0,A12MJJT1RL2S7K,perfect transport resist band perfect cross fi...,black mountain,black mountain product resist band set door an...
4,4,,Black Mountain Products Resistance Band Set wi...,"['1612431712', 'B01AVDVHTI', 'B002YQUP7Q', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,$17.32,7245456313,5.0,A33BKACRGR3H38,awesom great band crosstrain dont like lift gy...,black mountain,black mountain product resist band set door an...


In [10]:
# (rows, columns) of the full review table.
df_all_15.shape

(227509, 12)

In [11]:
# List the available columns before selecting the ones used for collaborative filtering.
df_all_15.columns

Index(['Unnamed: 0', 'fit', 'title', 'also_buy', 'image_x', 'price', 'asin',
       'overall', 'reviewerID', 'reviewText', 'item_text', 'title_nlp'],
      dtype='object')

In [12]:
# Keep only the (user, item, rating) triple needed by the collaborative filter.
df_collab = df_all_15.loc[:, ['reviewerID', 'asin', 'overall']]

In [13]:
# Peek at the first five (user, item, rating) rows.
df_collab.head(n=5)

Unnamed: 0,reviewerID,asin,overall
0,A1J5VYG8O0YEMB,7245456313,5.0
1,AYMDRRCOPKTCF,7245456313,4.0
2,A3AX86YSUHLELL,7245456313,4.0
3,A12MJJT1RL2S7K,7245456313,5.0
4,A33BKACRGR3H38,7245456313,5.0


In [14]:
# (rows, columns) of the ratings frame; the row count should match df_all_15.
df_collab.shape

(227509, 3)

In [22]:
# User-item utility matrix: one row per reviewer, one column per ASIN, cells
# hold the rating. pivot_table's default aggregation (mean) is applied when a
# reviewer rated the same item more than once; unrated pairs are NaN.
UM = df_collab.pivot_table(
    values='overall',
    index='reviewerID',
    columns='asin',
)

In [None]:
# Display the utility matrix (Jupyter truncates the rendering of large frames).
UM

In [44]:
# Number of filled (non-NaN) cells in the utility matrix.
b = np.sum(~np.isnan(UM.values))

In [47]:
# Density of the utility matrix in percent: filled cells / all cells.
# The original denominator was `b + a`, but `a` is not defined in any cell of
# this notebook, so the expression fails on a fresh kernel.
# NOTE(review): assumes `a` was the count of NaN cells, making b + a == UM.size — confirm.
b / UM.size * 100

0.034218338461255854

In [18]:
# Summary statistics of the ratings; `overall` is the only numeric column in df_collab.
df_collab.describe()

Unnamed: 0,overall
count,227509.0
mean,4.474311
std,0.955971
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


## Using Surprise to build a collaborative filter (NMF-based method)

In [15]:
# Seed Python's and NumPy's global RNGs before any model or split is created,
# so that a Restart & Run All reproduces the same scores.
# NOTE(review): this only helps if CollaborativeRecommender does not pass its
# own random_state / reseed internally — confirm in ../src.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Ratings range from 1 to 5 (see the min/max of `overall` in describe()).
reader = surprise.Reader(rating_scale=(1, 5))
# Project wrapper around a Surprise algorithm class; NMF is passed as the class,
# not as an instance.
recom = CollaborativeRecommender(df_collab, reader, NMF)

Use grid search to find the best parameters for the model.

In [16]:
# Previously tried grid, kept for reference:
#   {'n_factors': [4,8,12,16,20,24,32], 'lr_all': [0.002, 0.005], 'n_epochs': [5, 10]}
# Current search space: number of latent factors and number of training epochs.
param_grid = {
    'n_factors': [8, 15, 23, 30, 37, 42],
    'n_epochs': [5, 10],
}
# Presumably a 3-fold cross-validated grid search followed by a refit of the
# best configuration — behaviour lives in CollaborativeRecommender (../src).
recom.grid(param_grid, cv=3)
recom.fit()


<Collaborative_recommender.CollaborativeRecommender at 0x7f9a865f7310>

RMSE of the best model (lower is better)

In [18]:
# predictions() returns the list of predictions together with a single error
# score. The call itself prints "RMSE: ...", and the returned score equals that
# value, so `acc` holds the RMSE (lower is better), not an accuracy.
pred, acc = recom.predictions()

print(f'RMSE of the best model: {acc}')

RMSE: 0.6899
accuracy of the best model: 0.6899440357584086


In [34]:
# One row per prediction, with columns uid, iid, r_ui (true rating), est
# (predicted rating) and details.
df_pred = pd.DataFrame.from_dict(pred)
# Per-user value looked up through the recommender
# (presumably the number of items rated by that user — confirm in ../src).
df_pred['Iu'] = df_pred.uid.apply(recom.get_Iu)
# The original called a bare `get_Ui`, which is not defined anywhere in this
# notebook and raises NameError on a fresh kernel.
# NOTE(review): assumes CollaborativeRecommender exposes `get_Ui` next to `get_Iu` — confirm.
df_pred['Ui'] = df_pred.iid.apply(recom.get_Ui)
# Absolute error between the predicted and the true rating.
df_pred['err'] = (df_pred.est - df_pred.r_ui).abs()



In [35]:
# Worst predictions first: order rows by absolute error, largest at the top.
df_pred.sort_values('err', ascending=False)

Unnamed: 0,uid,iid,r_ui,est,details,err,Iu
199801,A3SJFAYA81NIJB,B0072T3X38,1.0,5.0,{'was_impossible': False},4.0,10
220094,A3BLY5DYMNWARB,B013XADM6W,1.0,5.0,{'was_impossible': False},4.0,8
87673,A26KNV0HTA4U5D,B0030WK1BS,1.0,5.0,{'was_impossible': False},4.0,50
121546,A2XWG40J6EUAM0,B003SHBL00,1.0,5.0,{'was_impossible': False},4.0,10
173845,A1CT6MLYNBG3L4,B00Z8X3BL4,1.0,5.0,{'was_impossible': False},4.0,15
...,...,...,...,...,...,...,...
106013,A1WZI4N6V5F0EC,B016APS3CU,5.0,5.0,{'was_impossible': False},0.0,15
106011,A1WZI4N6V5F0EC,B00SAY9LLC,5.0,5.0,{'was_impossible': False},0.0,15
106009,A1WZI4N6V5F0EC,B00JO2BWFC,5.0,5.0,{'was_impossible': False},0.0,15
106008,A1WZI4N6V5F0EC,B00I7Z0FDM,5.0,5.0,{'was_impossible': False},0.0,15


Top-n predictions for a given user (TODO: not implemented yet — the next cell only exports the ratings data to S3)

In [None]:
# Serialise the ratings frame to CSV in memory and upload it to the project bucket.
bucket = 'recommender-system-amazon'
csv_buffer = StringIO()
# The original read `df_collab.sm.to_csv(...)`; a DataFrame has no `.sm`
# attribute, so the cell raised AttributeError.
# NOTE(review): the object key suggests a smaller sample (`df_collab_sm`) was
# intended, but no such frame is defined in this notebook. The full ratings
# frame is written here — swap in the sample if one exists.
df_collab.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'df_colab_sm.csv').put(Body=csv_buffer.getvalue())


In [33]:
# Raw array of item ids, one entry per rating row.
df_collab['asin'].to_numpy()

array(['0899332757', '0899332757', '0899332757', ..., 'B01HJDGJ1E',
       'B01HJDGJ1E', 'B01HJGAJ9O'], dtype=object)

In [27]:
# Column labels of the utility matrix, i.e. the distinct ASINs.
UM.columns

Index(['0899332757', '0899333257', '0971100764', '3843518912', '7245456275',
       '7245456313', '7245456453', '8804850086', '9641766805', 'B00000IURU',
       ...
       'B01HJ234E0', 'B01HJ4DN08', 'B01HJ4EYS8', 'B01HJ8YC0Y', 'B01HJA241G',
       'B01HJA7KWE', 'B01HJDGJ1E', 'B01HJDHNX2', 'B01HJDZ34I', 'B01HJGAJ9O'],
      dtype='object', name='asin', length=56351)