In [1]:
import json
import boto3
import psycopg2
from psycopg2.extras import execute_batch
import csv
import pandas as pd
import numpy as np
from datetime import datetime
!pip install scikit-surprise
from surprise import Reader, Dataset, SVD
from surprise.model_selection.validation import cross_validate

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=1178896 sha256=d020a9263e31c25554d277e3f32726161cf0f9833f73a188bdbd84fb97ba5b3b
  Stored in directory: /home/ec2-user/.cache/pip/wheels/df/e4/a6/7ad72453dd693f420b0c639bedeec34641738d11b55d8d9b84
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


### Part1: Modelling
> Data Flow: featues from S3 Glue Output Bucket to S3 SageMaker Output Bucket

In [2]:
# Get latest folder/filename
s3 = boto3.client("s3")
bucket = 'de-ers.imba-glue-output'
output_bucket = 'de-ers.imba-sagemaker-output'

# Read latest files in bucket
response = s3.list_objects_v2(Bucket=bucket)
sorted_obj = sorted(response['Contents'], key=lambda obj: obj['LastModified'], reverse=True)
folder = sorted_obj[0]['Key'].split('/')[0]
filename = sorted_obj[0]['Key'].split('/')[1]

filename

'part-00000-1b211f7a-b237-4eda-afc6-9b04ce77ea3a-c000.csv'

In [3]:
# Read csv into dataframe
df = pd.read_csv(f's3://{bucket}/{folder}/{filename}', nrows=2000000, usecols=['user_id', 'product_id', 'user_reorder_ratio', 'product_seq_time_1', 'product_seq_time_2'])

# Get reorder probability by dividing second time reorder by first time reorder
# Higher reorder probability means higher rank on the product
df['reorder_prob'] = df.product_seq_time_2 / df.product_seq_time_1
df.drop(['user_reorder_ratio', 'product_seq_time_1', 'product_seq_time_2'], axis=1, inplace=True)

# Categorize reorder probability (0 - 1) into 5 ranks
# 0   - 0.2 -> 1
# 0.2 - 0.4 -> 2
# 0.4 - 0.6 -> 3
# 0.6 - 0.8 -> 4
# 0.8 - 1   -> 5
df.loc[df['reorder_prob'] <= 0.2, 'reorder_prob'] = 1.0
df.loc[df['reorder_prob'] <= 0.4, 'reorder_prob'] = 2.0
df.loc[df['reorder_prob'] <= 0.6, 'reorder_prob'] = 3.0
df.loc[df['reorder_prob'] <= 0.8, 'reorder_prob'] = 4.0
df.loc[df['reorder_prob'] < 1.0, 'reorder_prob'] = 5.0

In [4]:
# Load dataframe into reader, prepare for running ML model
reader = Reader()
data = Dataset.load_from_df(df[['user_id', 'product_id', 'reorder_prob']], reader)

# Use SVD as ML model, and do cross validation to get  optimal set of parameters
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0898  0.0907  0.0906  0.0903  0.0914  0.0906  0.0005  
MAE (testset)     0.0398  0.0401  0.0400  0.0398  0.0401  0.0400  0.0001  
Fit time          25.70   27.15   26.84   27.07   27.12   26.78   0.55    
Test time         4.54    3.93    4.49    4.48    4.54    4.40    0.23    


{'test_rmse': array([0.08977942, 0.09067974, 0.09060268, 0.0903327 , 0.0914001 ]),
 'test_mae': array([0.03982427, 0.04006724, 0.03998801, 0.03982578, 0.04009008]),
 'fit_time': (25.70211100578308,
  27.154389142990112,
  26.844313621520996,
  27.065598487854004,
  27.115967988967896),
 'test_time': (4.535582542419434,
  3.9345719814300537,
  4.492355823516846,
  4.475348949432373,
  4.542288780212402)}

In [5]:
# Traning dataset
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fdf758590f0>

In [7]:
titles = df.copy()
titles['Estimate_Score'] = titles['product_id'].apply(lambda x: svd.predict(2712, x).est)
titles = titles.sort_values(by=['Estimate_Score'], ascending=False)
titles[titles['user_id'] == 2712]

Unnamed: 0,product_id,user_id,reorder_prob,Estimate_Score
450855,23909,2712,4.0,3.964805
1231078,24852,2712,4.0,3.955884
118870,36431,2712,3.0,3.007014
338257,44536,2712,3.0,3.004192
1636182,46347,2712,3.0,2.976843
1520297,20955,2712,3.0,2.974023
333302,41246,2712,3.0,2.894641
1369000,2475,2712,2.0,2.064329
767484,2934,2712,2.0,2.062262
928997,45764,2712,2.0,2.055053


In [8]:
# Get top-3 recommendations based on user_id
user_ids = df['user_id'].unique()
recommendations = {}
for user_id in user_ids:
    user_item_df = titles[titles['user_id'] == user_id].head(3)
    recommendations[str(user_id)] = user_item_df['product_id'].values.tolist()

In [9]:
# Save results as a json file and store into S3 bucket
output = 'recommendations-' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.json'

s3.put_object(Bucket=output_bucket, Key=output, Body=json.dumps(recommendations))

{'ResponseMetadata': {'RequestId': 'DVCB2Y319XHMBJ0C',
  'HostId': 'FD/2Mi0qCskFboA1GB2vVyn1FWS9NS2PnQbqX6nlf+eX6mEu1PHEVPdUMegmJ/tsEvPAPVTDPjU=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'FD/2Mi0qCskFboA1GB2vVyn1FWS9NS2PnQbqX6nlf+eX6mEu1PHEVPdUMegmJ/tsEvPAPVTDPjU=',
   'x-amz-request-id': 'DVCB2Y319XHMBJ0C',
   'date': 'Fri, 19 May 2023 12:49:24 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"50a266b83f3838a2976ec1329d6aa40d"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"50a266b83f3838a2976ec1329d6aa40d"',
 'ServerSideEncryption': 'AES256'}

In [10]:
# Define filenames, and prepare for loading json into RDS
input_bucket = 'de-ers.imba-sagemaker-output'
local_filename = 'recommendation.json'
output_csv = 'recommendation.csv'

# Get latest json file
s3 = boto3.client('s3')
response = s3.list_objects_v2(Bucket=input_bucket)
json_filename = sorted(response['Contents'], key=lambda obj: obj['LastModified'], reverse=True)[0]['Key']

# Download file
s3.download_file(input_bucket, json_filename, local_filename)

# Read json into a variable
with open(local_filename, encoding='utf-8') as read:
    json_data = json.load(read)

### Part2: Write Recommendation Back to RDS
> Data Flow: from S3 Sagemaker Output Bucket to RDS recommendation schema as recommendation table

In [11]:
# Write Json to csv
with open(output_csv, 'w', newline='') as csvfile:
    w = csv.writer(csvfile)
    w.writerow(['user_id', 'product_id', 'recommendation_count'])
    
    for k, v in json_data.items():
        for value in v:
            w.writerow([k, value, 0])

In [13]:
# postgresql connection
host = 'imba-rdsinstance-instance-1.cy4i5jen2oog.ap-southeast-2.rds.amazonaws.com'
port = 5432
database = 'postgres'
username = '...'
password = '...'

conn = psycopg2.connect(
    host = host,
    port = port,
    database = database,
    user = username,
    password = password
)

cur = conn.cursor()

In [14]:
# batch load into database
data = []

with open(output_csv, 'r') as f:
    reader = csv.reader(f)
    next(reader)
    
    for row in reader:
        user_id, product_id, recommendation_count = row
        data.append((user_id, product_id, recommendation_count))
        
query = f'insert into recommendation.recommendation values (%s, %s, %s) on conflict do nothing;'
batch_size = 10000

with conn.cursor() as cur:
    execute_batch(cur, query, data, page_size=batch_size)
    conn.commit()
    
# Close connection
cur.close()
conn.close()