# Problem III

<b>Please create a simple machine learning pipeline</b> that gives user recommendations (at least 1 user must have 100 product recommendations) and saves the recommendations to PostgreSQL, using this <a href='https://nijianmo.github.io/amazon/index.html'>Amazon Dataset</a>, and upload it to your GitHub page. You will be graded not on the machine learning algorithm / model but on the pipeline performance (e.g. processing time, amount of data processed).

In [1]:
import numpy as np
import pandas as pd
import gzip
import json

In [2]:
def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Parameters
    ----------
    path : str or path-like
        Location of a gzipped file holding one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.
    """
    # The context manager closes the handle once the generator is exhausted
    # (or closed / garbage-collected); a bare gzip.open() never closes it.
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            # Skip blank lines (e.g. a stray newline at the end of the file),
            # which json.loads would reject.
            if not line.strip():
                continue
            yield json.loads(line)

def getDF(path):
    """Load a gzipped JSON-lines file into a DataFrame.

    Every record yielded by ``parse(path)`` becomes one row, labelled with its
    0-based position in the file.
    """
    rows_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(rows_by_position, orient='index')

In [3]:
# Load the All_Beauty review dump and show its size plus a short preview.
df = getDF('data/All_Beauty.json.gz')

print("Number of observations:", df.shape[0])
print('First 5 rows:')
df.head(5)

Number of observations: 371345
First 5 rows:


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,1.0,True,"02 19, 2015",A1V6B6TNIC10QE,143026860,theodore j bigham,great,One Star,1424304000,,,
1,4.0,True,"12 18, 2014",A2F5GHSXFQ0W6J,143026860,Mary K. Byke,My husband wanted to reading about the Negro ...,... to reading about the Negro Baseball and th...,1418860800,,,
2,4.0,True,"08 10, 2014",A1572GUYS7DGSR,143026860,David G,"This book was very informative, covering all a...",Worth the Read,1407628800,,,
3,5.0,True,"03 11, 2013",A1PSGLFK1NSVO,143026860,TamB,I am already a baseball fan and knew a bit abo...,Good Read,1362960000,,,
4,5.0,True,"12 25, 2011",A6IKXKZMTKGSC,143026860,shoecanary,This was a good story of the Black leagues. I ...,"More than facts, a good story read!",1324771200,5.0,,


In [4]:
# Summary statistics (count, mean, std, quartiles) for the numeric review columns.
df.describe()

Unnamed: 0,overall,unixReviewTime
count,371345.0,371345.0
mean,4.112093,1440742000.0
std,1.362099,69627430.0
min,1.0,947462400.0
25%,4.0,1416096000.0
50%,5.0,1456963000.0
75%,5.0,1485562000.0
max,5.0,1538438000.0


In [5]:
# Inspect the dtype pandas inferred for each review column.
df.dtypes

overall           float64
verified             bool
reviewTime         object
reviewerID         object
asin               object
reviewerName       object
reviewText         object
summary            object
unixReviewTime      int64
vote               object
style              object
image              object
dtype: object

In [6]:
# Drop the columns the recommender never reads *before* filling missing values,
# so fillna('') only has to touch the columns that are kept.
# errors='ignore' makes the cell safe to re-run: once the columns are gone a
# second run is a no-op instead of raising KeyError.
unused_columns = ['verified', 'reviewTime', 'unixReviewTime', 'vote', 'style', 'image']
df = df.drop(columns=unused_columns, errors='ignore').fillna('')

In [7]:
# Preview the first rows of df with only the remaining columns.
df.head()

Unnamed: 0,overall,reviewerID,asin,reviewerName,reviewText,summary
0,1.0,A1V6B6TNIC10QE,143026860,theodore j bigham,great,One Star
1,4.0,A2F5GHSXFQ0W6J,143026860,Mary K. Byke,My husband wanted to reading about the Negro ...,... to reading about the Negro Baseball and th...
2,4.0,A1572GUYS7DGSR,143026860,David G,"This book was very informative, covering all a...",Worth the Read
3,5.0,A1PSGLFK1NSVO,143026860,TamB,I am already a baseball fan and knew a bit abo...,Good Read
4,5.0,A6IKXKZMTKGSC,143026860,shoecanary,This was a good story of the Black leagues. I ...,"More than facts, a good story read!"


In [8]:
# Map each reviewer name to the first reviewerID it appears with.
users = {}
for reviewer_name, reviewer_id in zip(df['reviewerName'], df['reviewerID']):
    users.setdefault(reviewer_name, reviewer_id)

# Print the name -> id lookup for the first 20 review rows.
for reviewer_name in df['reviewerName'].head(20):
    print(reviewer_name, users[reviewer_name])
    

theodore j bigham A1V6B6TNIC10QE
Mary K. Byke A2F5GHSXFQ0W6J
David G A1572GUYS7DGSR
TamB A1PSGLFK1NSVO
shoecanary A6IKXKZMTKGSC
W. Powell A36NF437WZLQ9E
Robert S. Clay Jr. A10Q8NIFOVOHFV
Jacqueline Diaz A26PO1B2Q2G1CS
Khadijah Ali-Evans AQ812VYVTC2RJ
rabiyaa123 AMACNEW14ADMX
Lilly A5FS4FVWR77O6
igzotikvet A1NLF2JD7BVOB4
M. Morretti A3CKVF2ZR1CBO0
Amazon Customer A3IFT6SR16SAYU
Zury M. A2BQ7NB90SBVIA
Shenia Morris A92AYWTA52KXE
Amazon Customer A3IFT6SR16SAYU
Alex G A2V9BG2MDQVCYX
TSENG TSUNG-CHIN AIL9Q82L1G2YO
Nikolai G. J. Geier A1ROGN2QPKZGP7


In [9]:
# Uncomment and run code below to install scikit-surprise
#!pip install scikit-surprise

In [10]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection.validation import cross_validate

In [11]:
# Wrap the (user, item, rating) columns in a Surprise Dataset and create an
# SVD model with default settings; the model is not fitted in this cell.
rating_columns = ['reviewerID', 'asin', 'overall']
reader = Reader()
svd_model = SVD()
data = Dataset.load_from_df(df[rating_columns], reader)

In [13]:
# Run 5-fold cross-validation of svd_model on data, measuring RMSE and MAE;
# verbose=True prints the per-fold table and the cell displays the returned dict.
cross_validate(svd_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2584  1.2527  1.2485  1.2603  1.2508  1.2542  0.0045  
MAE (testset)     0.9730  0.9701  0.9671  0.9748  0.9708  0.9711  0.0026  
Fit time          12.96   13.11   13.02   13.02   13.12   13.04   0.06    
Test time         0.57    0.33    0.31    0.30    0.33    0.37    0.10    


{'test_rmse': array([1.25835997, 1.25273694, 1.24854844, 1.26027407, 1.25084689]),
 'test_mae': array([0.97298344, 0.97009708, 0.96707625, 0.97476023, 0.97076833]),
 'fit_time': (12.955358505249023,
  13.107915878295898,
  13.0161874294281,
  13.01918363571167,
  13.119885444641113),
 'test_time': (0.5724680423736572,
  0.3311145305633545,
  0.3101680278778076,
  0.30419135093688965,
  0.3251304626464844)}

In [14]:
# Build a trainset from the complete dataset and fit svd_model on it,
# so the predictions made later use a model trained on every rating in data.
new_training_set = data.build_full_trainset()
svd_model.fit(new_training_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2697f909550>

In [15]:
# Load the product metadata dump with the same getDF loader used for the reviews.
metadata = getDF('data/meta_All_Beauty.json.gz')

In [16]:
# Preview the first rows of the product metadata.
metadata.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,image,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",[],,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],[https://images-na.ssl-images-amazon.com/image...,,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],[https://images-na.ssl-images-amazon.com/image...,,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790


In [17]:
# Keep only the product id and its title from the metadata.
metadata = metadata.loc[:, ['asin', 'title']]
metadata.head(5)

Unnamed: 0,asin,title
0,6546546450,Loud 'N Clear&trade; Personal Sound Amplifier
1,7178680776,No7 Lift &amp; Luminate Triple Action Serum 50...
2,7250468162,No7 Stay Perfect Foundation Cool Vanilla by No7
3,7367905066,Wella Koleston Perfect Hair Colour 44/44 Mediu...
4,7414204790,Lacto Calamine Skin Balance Oil control 120 ml...


In [18]:
# Choose the user to recommend for and resolve their reviewerID by name.
USER_NAME = 'Shenia Morris'
USER_ID = users[USER_NAME]

# Score every product in the metadata with the model's estimated rating for this user.
metadata['recommendation_score'] = metadata['asin'].map(
    lambda product_id: svd_model.predict(USER_ID, product_id).est
)

In [19]:
# Rank all products by predicted score (highest first) and keep the top N.
num_of_recommendations_to_show = 100
metadata = metadata.sort_values(by='recommendation_score', ascending=False)

headline = 'Top {} Recommendations for {}'.format(num_of_recommendations_to_show, USER_NAME)
print(headline)
recommendations = metadata.iloc[:num_of_recommendations_to_show]
recommendations

Top 100 Recommendations for Shenia Morris


Unnamed: 0,asin,title,recommendation_score
2184,B000VV1YOY,"essie nail polish, cuticle care, primers and f...",5.000000
9269,B00D3M0CRS,Poppy Austin Pure Argan Oil for Hair &amp; Ski...,5.000000
1795,B000OV40LA,Thieves Spray - 1 oz by Young Living,5.000000
23557,B0153R4C44,Pantene Pro-v Curly Hair No Crunch Curls Whip ...,5.000000
686,B00020UR4C,Truefitt &amp; Hill Trafalgar After Shave Spla...,5.000000
...,...,...,...
32740,B01HBSH2EK,"Sage Hill Essential Oil Labels - Blanks, Blend...",4.892628
1716,B000NOT9GO,"Hydrolatum Cream for Dry Skin, 1 lb",4.892463
6415,B006YGCSKO,Finipil LAIT 50 Antiseptic Cream 4pk- 44 ml each,4.890265
23208,B014DH4FTC,True Glow Eye Cream - Intense Dark Circles Cor...,4.887478


In [None]:
# Uncomment and run this code below to install psycopg2 which can connect Python to Postgresql
#!pip install psycopg2

In [None]:
import psycopg2

def insert_recommendation_list(product_id_list, product_name_list):
    """Insert one row of recommendations into the ``recommendations`` table.

    Parameters
    ----------
    product_id_list : list
        Recommended product ids, bound to the ``product_id_list`` column.
    product_name_list : list
        Product titles, bound to the ``product_name_list`` column.

    Notes
    -----
    Connection settings are read from the ``PGHOST``, ``PGDATABASE``,
    ``PGUSER`` and ``PGPASSWORD`` environment variables, falling back to the
    values that used to be hard-coded when a variable is not set.

    Any error is printed rather than raised, and the connection is always
    closed before returning.
    """
    import os

    query = """INSERT INTO recommendations(product_id_list, product_name_list)
            VALUES(%s, %s)"""

    conn = None

    try:
        print('Connecting to database...')
        # NOTE(review): the fallback password is kept only for backward
        # compatibility; set PGPASSWORD and remove the literal from the notebook.
        conn = psycopg2.connect(host=os.environ.get('PGHOST', 'localhost'),
                                database=os.environ.get('PGDATABASE', 'kitabisa_db'),
                                user=os.environ.get('PGUSER', 'admin'),
                                password=os.environ.get('PGPASSWORD', 'k1t@B!$a'))
        # The context manager closes the cursor even if execute() raises.
        with conn.cursor() as cur:
            print('Inserting rows...')
            # Execute the statement held in `query`; the name `sql` was never
            # defined, so the call used to fail with NameError and insert nothing.
            cur.execute(query, (product_id_list, product_name_list))
        conn.commit()

    except (Exception, psycopg2.DatabaseError) as error:
        print(error)

    finally:
        if conn is not None:
            print('Transaction completed, closing database connection...')
            conn.close()

In [None]:
# Persist the top recommendations as two lists: product ids and their titles.
product_ids = recommendations['asin'].tolist()
product_titles = recommendations['title'].tolist()
insert_recommendation_list(product_ids, product_titles)