In [1]:
import json
import boto3
import pandas as pd
import numpy as np
from datetime import datetime
!pip install scikit-surprise
from surprise import Reader, Dataset, SVD
from surprise.model_selection.validation import cross_validate

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=1178896 sha256=3b2506a5ad1c1d64d23eb20d7a29706eb7e40af0d7a11142d0c0dc07e170a811
  Stored in directory: /home/ec2-user/.cache/pip/wheels/df/e4/a6/7ad72453dd693f420b0c639bedeec34641738d11b55d8d9b84
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
# Get latest folder/filename
s3 = boto3.client("s3")
bucket = 'jrde-upload-from-glue'
output_bucket = 'jrde-recommendation-results'

response = s3.list_objects_v2(Bucket=bucket)
sorted_obj = sorted(response['Contents'], key=lambda obj: obj['LastModified'], reverse=True)
folder = sorted_obj[0]['Key'].split('/')[0]
filename = sorted_obj[0]['Key'].split('/')[1]

filename

'part-00000-fddc7612-2381-4eee-a1f0-5328fd1072e2-c000.csv'

In [3]:
df = pd.read_csv(f's3://{bucket}/{folder}/{filename}', nrows=2000000, usecols=['user_id', 'product_id', 'user_reorder_ratio', 'product_seq_time_1', 'product_seq_time_2'])
df['reorder_prob'] = df.product_seq_time_2 / df.product_seq_time_1
df.drop(['user_reorder_ratio', 'product_seq_time_1', 'product_seq_time_2'], axis=1, inplace=True)
df.loc[df['reorder_prob'] <= 0.2, 'reorder_prob'] = 1.0
df.loc[df['reorder_prob'] <= 0.4, 'reorder_prob'] = 2.0
df.loc[df['reorder_prob'] <= 0.6, 'reorder_prob'] = 3.0
df.loc[df['reorder_prob'] <= 0.8, 'reorder_prob'] = 4.0
df.loc[df['reorder_prob'] < 1.0, 'reorder_prob'] = 5.0

In [16]:
reader = Reader()
data = Dataset.load_from_df(df[['user_id', 'product_id', 'reorder_prob']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0850  0.0847  0.0842  0.0861  0.0854  0.0851  0.0006  
MAE (testset)     0.0374  0.0372  0.0372  0.0375  0.0373  0.0373  0.0001  
Fit time          27.88   28.76   28.60   28.02   28.43   28.34   0.34    
Test time         4.46    3.87    4.44    4.47    3.90    4.23    0.28    


{'test_rmse': array([0.08502632, 0.08468011, 0.08422991, 0.08605652, 0.08539579]),
 'test_mae': array([0.03744931, 0.03716287, 0.03717931, 0.03750396, 0.03726147]),
 'fit_time': (27.879758596420288,
  28.758902072906494,
  28.599554300308228,
  28.01816725730896,
  28.4322726726532),
 'test_time': (4.464025020599365,
  3.8684091567993164,
  4.443245887756348,
  4.465251684188843,
  3.89990234375)}

In [17]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2aff3c76d0>

In [19]:
titles = df.copy()
titles['Estimate_Score'] = titles['product_id'].apply(lambda x: svd.predict(2712, x).est)
titles = titles.sort_values(by=['Estimate_Score'], ascending=False)
titles[titles['user_id'] == 2712]

Unnamed: 0,product_id,user_id,reorder_prob,Estimate_Score
396332,23909,2712,4.0,3.911204
1092506,19643,2712,3.0,2.995804
1421423,38929,2712,3.0,2.965416
1854640,18811,2712,3.0,2.924098
0,12,2712,2.0,2.053515
1129736,21582,2712,2.0,1.981313
1006223,13423,2712,2.0,1.979889
1255651,29750,2712,2.0,1.972856
1754777,11121,2712,2.0,1.962968
379691,22788,2712,2.0,1.962166


In [21]:
user_ids = df['user_id'].unique()
recommendations = {}
for user_id in user_ids:
    user_item_df = titles[titles['user_id'] == user_id].head(3)
    recommendations[str(user_id)] = user_item_df['product_id'].values.tolist()

In [22]:
output = 'recommendations-' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.json'

s3.put_object(Bucket=output_bucket, Key=output, Body=json.dumps(recommendations))

{'ResponseMetadata': {'RequestId': 'VAQ77Z19X83DH7C0',
  'HostId': 'OwfNYj0XqXdKVZreAgim6P8y1XB4c4LPv2mSNRxu76tHhsagvfOdMtBCP6l9SPtNiaZ0jeX1nQBmCnnYE+xJNg==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'OwfNYj0XqXdKVZreAgim6P8y1XB4c4LPv2mSNRxu76tHhsagvfOdMtBCP6l9SPtNiaZ0jeX1nQBmCnnYE+xJNg==',
   'x-amz-request-id': 'VAQ77Z19X83DH7C0',
   'date': 'Tue, 28 Mar 2023 22:09:12 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"73e1398606b31a863cc599871e8f7ef9"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"73e1398606b31a863cc599871e8f7ef9"',
 'ServerSideEncryption': 'AES256'}