# Setup

In [2]:
%%capture
import random 
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
from numpy.linalg import *
from scipy.spatial import distance
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
np.random.seed(42)  # don't change this line

import base64
import datetime
!pip install recommenders

In [3]:

import sys
import os
import scrapbook as sb
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))

2023-04-25 13:00:06.608513: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


System version: 3.9.16 (main, Mar  8 2023, 04:29:44) 
[Clang 14.0.6 ]
Pandas version: 1.5.3
Tensorflow version: 2.10.0


# Helper Functions for Data Cleanup
- `save_and_compress_embeddings(file_name, first_col_name)`: put file of embedding output and the name of the first col (user / item)

In [75]:
def save_and_compress_embeddings(file_name, first_col_name):
    try:
        df = pd.read_csv(file_name, header = None, delimiter='\t')
        df.columns = [first_col_name, 'embeddings']
        embeddings = df.embeddings.str.split(' ', expand=True).add_prefix('embedding_')
        df = pd.concat([df[first_col_name], embeddings], axis=1)
        df.to_csv(f'{file_name}.bz2', compression='bz2', index=False)
        os.remove(file_name)
    except Exception as e:
        print(e)
        print(f'Failed to compress {file_name}')

# Load amazon data into `amazon_data`

In [5]:
path = "https://5190-hav-recommendation-data.s3.us-east-1.amazonaws.com/ratings_Electronics.csv?response-content-disposition=inline&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEA8aCXVzLWVhc3QtMSJIMEYCIQCyQI%2B9ZlyMfQ%2FlP9mnL%2Bzz9t5u5kF9fxbT%2BgEZSzRIvwIhAILSNkqmQNTrpk2svBCM5js7qY%2BAVrSj88Edv5i8Bj0VKqwCCBgQABoMNzk1MjAwNzY1OTc3Igx%2BIAsXPtRpO8bmVpMqiQJv8oAIQXcqxthZSWJx3G9wHJyStBg%2FR2I7hAbx3afzEd0BC3v%2BtCes%2FGgoYnGQoUoCgo8nEcKaCIsXl%2BNobf7W5N9%2F9uUzKrNovFrMB8TKc1Rn3Hkf3JCmFyYnBRylkN7fClxajvkzK1JaKpLDPsXib51wjLH0E4Ipncf4B8mXk0rBDiSJSgjwAwbzIWhxlWJc%2FFBa1ItrswdSz6HcoSucoDhMBmUONHCzoTjoNq7aQo3JFNia3pwoj3cr0xcFqtjhrHNz0KgL10pxGWAk0wbbLd4oMgC20ExrQZQkd6Cf9SfPydD%2BwUIB%2B6XGbqQ3xwy6ZGgzgkzGDBk4Z9bpLBtNXOc2ewWeb%2BP3MJfHn6IGOt4BTVr%2FC4rInMrAAU9rYNkUozM%2BW5M7clxsRxCJ9xB4APB5ICTEmmjnss9nGt35pe4ss9RyH7pb16JcsAa16%2B0dk1OLzpYnKcs33W1lVIw3GPdE4iulh7Qgr5OFkfvnwSJ6%2B%2FmB%2By1hsH9kBlIPhYMqB7P4U7jzT7PYoSpy%2BNMcfpEuEJBtUPzGZVPGopxoyorj8o0%2FvmfkQno97ls7dt0Wimk2NYJm1PYeP2S60GVQQB3PqMeQdbkmhRqWJAkzB1kP10kZIYqzb7v5iYMZCblyacS%2BT9UNzzuC7iUjq03w&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20230425T154859Z&X-Amz-SignedHeaders=host&X-Amz-Expires=43200&X-Amz-Credential=ASIA3SJNIYAM5PXI5Y25%2F20230425%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=7abc592f0dfeeac771bbf1f76a40db85d204f8babfc3f7c06264cd6afbef2048"
amazon_data = pd.read_csv(path, header=None)
amazon_data.columns = ['userId', 'productId', 'rating', 'timestamp']
amazon_data
#takes 10s or so

Unnamed: 0,userId,productId,rating,timestamp
0,AKM1MP6P0OYPR,0132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,0321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,0439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,0439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,0439886341,1.0,1334707200
...,...,...,...,...
7824477,A2YZI3C9MOHC0L,BT008UKTMW,5.0,1396569600
7824478,A322MDK0M89RHN,BT008UKTMW,5.0,1313366400
7824479,A1MH90R0ADMIK0,BT008UKTMW,4.0,1404172800
7824480,A10M2KEFPEQDHN,BT008UKTMW,4.0,1297555200


# Model 4: LightGCN (Microsoft) - With Original Movie Data

In [74]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# Model parameters
EPOCHS = 50
BATCH_SIZE = 1024

SEED = DEFAULT_SEED  # Set None for non-deterministic results

yaml_file = "lightgcn.yaml"
user_file = "embeddings/movie/user_embeddings.csv"
item_file = "embeddings/movie/item_embeddings.csv"

In [7]:
df = movielens.load_pandas_df(size=MOVIELENS_DATA_SIZE)

df.head()

100%|██████████| 4.81k/4.81k [00:00<00:00, 9.58kKB/s]


Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [8]:
train, test = python_stratified_split(df, ratio=0.75)

In [9]:
data = ImplicitCF(train=train, test=test, seed=SEED)


  df = train if test is None else train.append(test)


In [12]:
hparams = prepare_hparams(yaml_file,
                          n_layers=3,
                          batch_size=BATCH_SIZE,
                          epochs=EPOCHS,
                          learning_rate=0.005,
                          eval_epoch=5,
                          top_k=TOP_K,
                         )

In [14]:
model = LightGCN(hparams, data, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [15]:
with Timer() as train_time:
    model.fit()

print("Took {} seconds for training.".format(train_time.interval))

Epoch 1 (train)3.4s: train loss = 0.47207 = (mf)0.47183 + (embed)0.00024
Epoch 2 (train)3.2s: train loss = 0.28781 = (mf)0.28716 + (embed)0.00065
Epoch 3 (train)3.2s: train loss = 0.25237 = (mf)0.25155 + (embed)0.00082
Epoch 4 (train)3.2s: train loss = 0.23345 = (mf)0.23245 + (embed)0.00100
Epoch 5 (train)3.2s + (eval)0.3s: train loss = 0.22850 = (mf)0.22738 + (embed)0.00112, recall = 0.15910, ndcg = 0.34758, precision = 0.30244, map = 0.09196
Epoch 6 (train)3.2s: train loss = 0.22022 = (mf)0.21899 + (embed)0.00123
Epoch 7 (train)3.2s: train loss = 0.20932 = (mf)0.20798 + (embed)0.00135
Epoch 8 (train)3.2s: train loss = 0.19944 = (mf)0.19795 + (embed)0.00148
Epoch 9 (train)3.2s: train loss = 0.18843 = (mf)0.18681 + (embed)0.00162
Epoch 10 (train)3.2s + (eval)0.2s: train loss = 0.18596 = (mf)0.18419 + (embed)0.00177, recall = 0.17681, ndcg = 0.38348, precision = 0.33552, map = 0.10558
Epoch 11 (train)3.2s: train loss = 0.17630 = (mf)0.17440 + (embed)0.00190
Epoch 12 (train)3.3s: train l

In [16]:
topk_scores = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

topk_scores.head()

Unnamed: 0,userID,itemID,prediction
0,1,7,5.750921
1,1,89,5.375122
2,1,919,5.301207
3,1,475,5.167018
4,1,210,5.155258


In [17]:
eval_map = map_at_k(test, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.135623
NDCG:	0.455061
Precision@K:	0.399258
Recall@K:	0.213782


In [18]:
# Record results with papermill for tests
sb.glue("map", eval_map)
sb.glue("ndcg", eval_ndcg)
sb.glue("precision", eval_precision)
sb.glue("recall", eval_recall)

In [24]:
model.infer_embedding(user_file, item_file)

In [76]:
save_and_compress_embeddings(user_file, "userID")
save_and_compress_embeddings(item_file, "itemID")

# Model 4: LightGCN - Run on Our Data

In [48]:
# top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
AMAZON_DATA_SIZE = 100000

# Model parameters
EPOCHS = 50
BATCH_SIZE = 1024

SEED = DEFAULT_SEED  # Set None for non-deterministic results

yaml_file = "lightgcn.yaml"
user_file = "embeddings/amazon/100k_user_embeddings.csv"
item_file = "embeddings/amazon/100k_item_embeddings.csv"

In [39]:
df = amazon_data[:AMAZON_DATA_SIZE]
df = df.rename(columns={'userId': 'userID', 'productId': 'itemID'})
df.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [40]:
train, test = python_stratified_split(df, ratio=0.75)

In [41]:
data = ImplicitCF(train=train, test=test, seed=SEED)

  df = train if test is None else train.append(test)


In [42]:
hparams = prepare_hparams(yaml_file,
                          n_layers=3,
                          batch_size=BATCH_SIZE,
                          epochs=EPOCHS,
                          learning_rate=0.005,
                          eval_epoch=5,
                          top_k=TOP_K,
                         )

In [43]:
model = LightGCN(hparams, data, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [44]:
with Timer() as train_time:
    model.fit()

print("Took {} seconds for training.".format(train_time.interval))

Epoch 1 (train)13.9s: train loss = 0.29012 = (mf)0.29000 + (embed)0.00012
Epoch 2 (train)13.5s: train loss = 0.06081 = (mf)0.06050 + (embed)0.00032
Epoch 3 (train)14.0s: train loss = 0.02944 = (mf)0.02903 + (embed)0.00041
Epoch 4 (train)13.7s: train loss = 0.01747 = (mf)0.01700 + (embed)0.00048
Epoch 5 (train)14.1s + (eval)0.4s: train loss = 0.01200 = (mf)0.01147 + (embed)0.00053, recall = 0.06498, ndcg = 0.04170, precision = 0.00754, map = 0.03322
Epoch 6 (train)13.8s: train loss = 0.00866 = (mf)0.00809 + (embed)0.00056
Epoch 7 (train)13.8s: train loss = 0.00648 = (mf)0.00588 + (embed)0.00060
Epoch 8 (train)13.3s: train loss = 0.00527 = (mf)0.00465 + (embed)0.00062
Epoch 9 (train)13.5s: train loss = 0.00470 = (mf)0.00407 + (embed)0.00064
Epoch 10 (train)13.3s + (eval)0.3s: train loss = 0.00408 = (mf)0.00342 + (embed)0.00066, recall = 0.07278, ndcg = 0.04424, precision = 0.00846, map = 0.03400
Epoch 11 (train)13.3s: train loss = 0.00340 = (mf)0.00273 + (embed)0.00067
Epoch 12 (train)13

In [45]:
topk_scores = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

topk_scores.head()

Unnamed: 0,userID,itemID,prediction
0,A101OAAMZYWQ3U,B00004TWM6,15.307242
1,A101OAAMZYWQ3U,B00004T8R2,12.10918
2,A101OAAMZYWQ3U,B00004ZCJE,9.471293
3,A101OAAMZYWQ3U,9983891212,9.318825
4,A101OAAMZYWQ3U,B00004TBLW,8.481276


In [46]:
eval_map = map_at_k(test, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.026817
NDCG:	0.035155
Precision@K:	0.006733
Recall@K:	0.057623


In [77]:
model.infer_embedding(user_file, item_file)
save_and_compress_embeddings(user_file, "userID")
save_and_compress_embeddings(item_file, "itemID")

KeyboardInterrupt: 