# Setup

In [8]:
%%capture
# installations, if necessary
!pip install recommenders

In [9]:
%%capture
import random 
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
from numpy.linalg import *
from scipy.spatial import distance
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
np.random.seed(42)  # don't change this line

import base64
import datetime

In [10]:
import scrapbook as sb
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

DETAULT_SEED = 42 # this is what is is lol

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.9.16 (main, Mar  8 2023, 04:29:44) 
[Clang 14.0.6 ]
Pandas version: 1.5.3
Tensorflow version: 2.10.0


# Helper Functions for Data Cleanup
- `save_and_compress_embeddings(file_name, first_col_name)`: put file of embedding output and the name of the first col (user / item)

In [11]:
def save_and_compress_embeddings(file_name, first_col_name):
    try:
        df = pd.read_csv(file_name, header = None, delimiter='\t')
        df.columns = [first_col_name, 'embeddings']
        embeddings = df.embeddings.str.split(' ', expand=True).add_prefix('embedding_')
        df = pd.concat([df[first_col_name], embeddings], axis=1)
        df.to_csv(f'{file_name}.bz2', compression='bz2', index=False)
        os.remove(file_name)
    except Exception as e:
        print(e)
        print(f'Failed to compress {file_name}')

In [12]:
# given a yaml file name in the structure of 'embed_size_n_layers_batch_size_decay_epochs_learning_rate_eval_epoch_top_k', return the hyperparameters
def get_hyperparameters(yaml_file):
    embed_size = int(yaml_file.split('_')[0])
    n_layers = int(yaml_file.split('_')[1])
    batch_size = int(yaml_file.split('_')[2])
    # decay has a decimal that we changed to an underscore, so we need to change it back
    decay = float(yaml_file.split('_')[3] + '.' + yaml_file.split('_')[4])
    epochs = int(yaml_file.split('_')[5])
    # learning rate has a decimal that we changed to an underscore, so we need to change it back
    learning_rate = float(yaml_file.split('_')[6] + '.' + yaml_file.split('_')[7])
    eval_epoch = int(yaml_file.split('_')[8])
    top_k = int(yaml_file.split('_')[9])
    return embed_size, n_layers, batch_size, decay, epochs, learning_rate, eval_epoch, top_k

In [13]:
# create a drop duplicates function that keeps the row with the highest MAP
def drop_duplicates(df):
    df.sort_values(by=['MAP'], ascending=False, inplace=True)
    df.drop_duplicates(subset=['embed_size', 'n_layers', 'batch_size', 'decay', 'epochs', 'learning_rate', 'eval_epoch', 'top_k'], keep='first', inplace=True)
    df.reset_index(inplace=True)
    df.drop(columns=['index'], inplace=True)
    return df

# Admin Code so Things Don't Break

In [None]:
if not os.path.exists('yamls'):
    os.makedirs('yamls')
if not os.path.exists('embeddings'):
    os.makedirs('embeddings')
if not os.path.exists('models'):
    os.makedirs('models')

# Load amazon data into `amazon_data`

In [14]:
path = "https://5190-hav-recommendation-data.s3.us-east-1.amazonaws.com/ratings_Electronics.csv"
amazon_data = pd.read_csv(path, header=None)
amazon_data.columns = ['userId', 'productId', 'rating', 'timestamp']
amazon_data

Unnamed: 0,userId,productId,rating,timestamp
0,AKM1MP6P0OYPR,0132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,0321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,0439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,0439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,0439886341,1.0,1334707200
...,...,...,...,...
7824477,A2YZI3C9MOHC0L,BT008UKTMW,5.0,1396569600
7824478,A322MDK0M89RHN,BT008UKTMW,5.0,1313366400
7824479,A1MH90R0ADMIK0,BT008UKTMW,4.0,1404172800
7824480,A10M2KEFPEQDHN,BT008UKTMW,4.0,1297555200


# Preprocessing 
- subset to only users with 50+ ratings

In [15]:
# Preprocessing
user_rating_count = amazon_data['userId'].value_counts().rename('user_rating_count')
augmented_amazon_data = amazon_data.merge(user_rating_count.to_frame(), left_on='userId', right_index=True)
subset_df = augmented_amazon_data[augmented_amazon_data.user_rating_count >= 50]
print(subset_df.shape)
subset_df.head()

(125871, 5)


Unnamed: 0,userId,productId,rating,timestamp,user_rating_count
94,A3BY5KCNQZXV5U,0594451647,5.0,1390176000,50
14863,A3BY5KCNQZXV5U,B00000JD4V,4.0,1118016000,50
134213,A3BY5KCNQZXV5U,B000063574,5.0,1016668800,50
338368,A3BY5KCNQZXV5U,B0000CDJP8,5.0,1258761600,50
634048,A3BY5KCNQZXV5U,B0007Y794O,5.0,1369872000,50


In [32]:
subset_df.tail()

Unnamed: 0,userId,productId,rating,timestamp,user_rating_count
7811895,A328S9RN3U5M68,B00JGL37FO,5.0,1400976000,76
7817686,A328S9RN3U5M68,B00K00FN3O,5.0,1400544000,76
7824063,A328S9RN3U5M68,B00L21HC7A,5.0,1405123200,76
7824081,A328S9RN3U5M68,B00L2442H0,5.0,1405123200,76
7824103,A328S9RN3U5M68,B00L26YDA4,5.0,1405123200,76


In [56]:
product_rating_count = subset_df['productId'].value_counts().rename('product_rating_count')
product_rating_data   = subset_df.merge(product_rating_count.to_frame(),
                                left_on='productId',
                                right_index=True)
product_rating_data = product_rating_data[product_rating_data.product_rating_count >= 10]
product_rating_data.sort_values(by=['product_rating_count'])

Unnamed: 0,userId,productId,rating,timestamp,user_rating_count,product_rating_count
7667532,A2CWIYIETNBAK3,B00FZ9SMVU,5.0,1391644800,55,10
7259162,A2HMF8ZR67BNZS,B00CGWNK2U,5.0,1377129600,130,10
7259182,A3NEAETOSXDBOM,B00CGWNK2U,4.0,1377043200,177,10
3782500,A226VGZWOEBPGL,B0041OUA38,5.0,1318464000,80,10
3782463,A3BKNXX8QFIXIV,B0041OUA38,5.0,1316563200,92,10
...,...,...,...,...,...,...
6105828,AMUP8DYE7EAN2,B0088CJT4U,5.0,1367107200,85,206
6105930,A27GITTN6AVW5I,B0088CJT4U,3.0,1353974400,56,206
6105655,A2UEB48LAWFUCW,B0088CJT4U,3.0,1365897600,63,206
6106206,A30UP2KKD5IQEP,B0088CJT4U,4.0,1362787200,69,206


In [57]:
final_df = product_rating_data[['userId', 'productId', 'rating', 'timestamp']] # final df is the df we use
final_df

Unnamed: 0,userId,productId,rating,timestamp
634048,A3BY5KCNQZXV5U,B0007Y794O,5.0,1369872000
633970,AKT8TGIT6VVZ5,B0007Y794O,5.0,1147305600
633944,A1ILWPH1GHUXE2,B0007Y794O,4.0,1358553600
634073,A1ZM846Y7AUYD,B0007Y794O,4.0,1201132800
633998,A2ED50E3KWKUKW,B0007Y794O,5.0,1331856000
...,...,...,...,...
7667497,AWPN47SSWK1JV,B00FZ9SMVU,5.0,1392076800
7667517,A2MJ8OL2FYN7CW,B00FZ9SMVU,5.0,1389312000
7667443,A1GWG5CWLKJ7ET,B00FZ9SMVU,5.0,1392854400
7667539,A3VBZDYGHF4NK8,B00FZ9SMVU,5.0,1392854400


# Basically Gridsearch

In [62]:
import yaml 

yaml_dir = 'yamls'
yaml_files = []

# results filename - change this to whatever you want
results_filename = 'lightgcn_results_users_50_plus_products_10_plus.csv'

# read in results to see which yaml files have already been written
if os.path.exists(results_filename):
    old_results_df = pd.read_csv(results_filename)
    written_yaml_files = old_results_df['yaml_file'].tolist()

# write a bunch of different combinations of yaml files
embed_size_list = [64]
n_layers_list = [5, 7]
batch_size_list = [2048]
decay_list = [0.01, 0.1]
epochs_list = [50, 100, 500, 1000]
learning_rate_list = [0.001,0.01, 0.1]
eval_epoch_list = [-1]
top_k_list = [10]

# create a for loop that writes the yaml files for all combinations of the above hyperparameters
for embed_size in embed_size_list:
    for n_layers in n_layers_list:
        for batch_size in batch_size_list:
            for decay in decay_list:
                for epochs in epochs_list:
                    for learning_rate in learning_rate_list:
                        for eval_epoch in eval_epoch_list:
                            for top_k in top_k_list:
                                filename = f'{embed_size}_{n_layers}_{batch_size}_{decay}_{epochs}_{learning_rate}_{eval_epoch}_{top_k}'
                                filename = filename.replace('.','_')

                                if filename in written_yaml_files:
                                    continue
                                # write yaml file
                                data = {
                                    'model': {
                                        'model_type': 'lightgcn',
                                        'embed_size': embed_size,
                                        'n_layers': n_layers
                                    },
                                    'train': {
                                        'batch_size': batch_size,
                                        'decay': decay,
                                        'epochs': epochs,
                                        'learning_rate': learning_rate,
                                        'eval_epoch': eval_epoch,
                                        'top_k': top_k
                                    },
                                    'info': {
                                        'save_model' : True, # whether to save model
                                        'save_epoch' : 100, # if save_model is set to True, save the model every save_epoch
                                        'metrics' : ["recall", "ndcg", "precision", 'map'], # metrics for evaluation
                                        'MODEL_DIR' : './models/' # directory of saved models
                                    }
                                }

                                with open(f'{yaml_dir}/{filename}.yaml', 'w') as outfile:
                                    yaml.dump(data, outfile, default_flow_style=False)
                                outfile.close()

                                yaml_files.append(filename)

yaml_files

['64_5_2048_0_01_50_0_1_-1_10',
 '64_5_2048_0_01_100_0_1_-1_10',
 '64_5_2048_0_01_500_0_1_-1_10',
 '64_5_2048_0_01_1000_0_001_-1_10',
 '64_5_2048_0_01_1000_0_01_-1_10',
 '64_5_2048_0_01_1000_0_1_-1_10',
 '64_5_2048_0_1_50_0_1_-1_10',
 '64_5_2048_0_1_100_0_1_-1_10',
 '64_5_2048_0_1_500_0_1_-1_10',
 '64_5_2048_0_1_1000_0_001_-1_10',
 '64_5_2048_0_1_1000_0_01_-1_10',
 '64_5_2048_0_1_1000_0_1_-1_10',
 '64_7_2048_0_01_50_0_1_-1_10',
 '64_7_2048_0_01_100_0_1_-1_10',
 '64_7_2048_0_01_500_0_1_-1_10',
 '64_7_2048_0_01_1000_0_001_-1_10',
 '64_7_2048_0_01_1000_0_01_-1_10',
 '64_7_2048_0_01_1000_0_1_-1_10',
 '64_7_2048_0_1_50_0_1_-1_10',
 '64_7_2048_0_1_100_0_1_-1_10',
 '64_7_2048_0_1_500_0_1_-1_10',
 '64_7_2048_0_1_1000_0_001_-1_10',
 '64_7_2048_0_1_1000_0_01_-1_10',
 '64_7_2048_0_1_1000_0_1_-1_10']

In [63]:
%%capture
# Select Amazon data size: 100k, 1m, 10m, or 20m
AMAZON_DATA_SIZE = len(final_df)

SEED = DEFAULT_SEED  # Set None for non-deterministic results

TOP_K = 10

results = {}

for filename in yaml_files: 
    yaml_file = f'yamls/{filename}.yaml'
    user_file = f"embeddings/{filename}_user_embeddings.csv"
    item_file = f"embeddings/{filename}_item_embeddings.csv"
    df = final_df[:AMAZON_DATA_SIZE]
    df = df.rename(columns={'userId': 'userID', 'productId': 'itemID'})
    df.head()
    train, test = python_stratified_split(df, ratio=0.75)
    data = ImplicitCF(train=train, test=test, seed=SEED)
    hparams = prepare_hparams(yaml_file)
    model = LightGCN(hparams, data, seed=SEED)
    with Timer() as train_time:
        model.fit()

    print("Took {} seconds for training.".format(train_time.interval))
    topk_scores = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

    topk_scores.head()
    eval_map = map_at_k(test, topk_scores, k=TOP_K)
    eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
    eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
    eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

    print("MAP:\t%f" % eval_map,
        "NDCG:\t%f" % eval_ndcg,
        "Precision@K:\t%f" % eval_precision,
        "Recall@K:\t%f" % eval_recall, sep='\n')

    results[filename] = [eval_map, eval_ndcg, eval_precision, eval_recall, train_time.interval]
    model.infer_embedding(user_file, item_file)
    save_and_compress_embeddings(user_file, "userID")
    save_and_compress_embeddings(item_file, "itemID")

    # write results each iteration in case things crash
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=['MAP', 'NDCG', 'Precision', 'Recall', 'Train Time'])
    results_df['embed_size'], results_df['n_layers'], results_df['batch_size'], results_df['decay'], results_df['epochs'], results_df['learning_rate'], results_df['eval_epoch'], results_df['top_k'] = zip(*results_df.index.map(get_hyperparameters))
    results_df.reset_index(inplace=True)
    results_df.rename(columns={'index': 'yaml_file'}, inplace=True)
    if os.path.exists(results_filename):
        # read in previous results and append new results
        old_results_df = pd.read_csv(results_filename)
        old_results_df = old_results_df.append(results_df)
        old_results_df.reset_index(inplace=True)
        old_results_df.drop(columns=['index'], inplace=True)
        drop_duplicates(old_results_df)
        old_results_df.to_csv(results_filename, index=False)
    else:
        results_df.to_csv(results_filename, index=False)
    

# pass results dict into a dataframe


KeyboardInterrupt: 

# Run on Dataset Shifts

In [68]:
# create dataset shifts
sorted_df = final_df.sort_values(by=['timestamp'])
sorted_df

Unnamed: 0,userId,productId,rating,timestamp
40702,A1RPTVW5VEOSI,B00004SB92,5.0,956966400
40387,AUITG1DJ3QUGK,B00004SB92,5.0,961200000
21343,A3A15L96IYUO6V,B00001P4XA,4.0,962236800
40261,ALUNVOQRXOZIA,B00004SB92,1.0,965347200
21317,ARXU3FESTWMJJ,B00001P4XA,3.0,966988800
...,...,...,...,...
7820216,A3G5MOHY1U635N,B00K91DB7Y,4.0,1405987200
7427171,A26BDXG9KVH7SU,B00DQZSIW8,4.0,1405987200
7801613,A3MFORLOKIOEQY,B00IVPU5BK,5.0,1405987200
7657417,A2R1HUYHXV7H18,B00FSA8VQ2,5.0,1405987200


In [69]:
ind=pd.DataFrame(sorted_df.userId.unique())
ind['index']=range(1, len(ind) + 1)
ind.rename(columns={0:'userId'},inplace=True)
ind

Unnamed: 0,userId,index
0,A1RPTVW5VEOSI,1
1,AUITG1DJ3QUGK,2
2,A3A15L96IYUO6V,3
3,ALUNVOQRXOZIA,4
4,ARXU3FESTWMJJ,5
...,...,...
1527,AC9QEWWN4W9MC,1528
1528,AGILRZOT49R0V,1529
1529,A3TR3KLL5PXSZ8,1530
1530,A3SU7JSTPH9CC9,1531


In [71]:
merge_df=sorted_df.merge(ind,left_on='userId',right_on='userId')
merge_df.drop(columns=['userId'],inplace=True)
merge_df.rename(columns={'index':'userId'},inplace=True)
merge_df = merge_df[['userId', 'productId', 'rating', 'timestamp']]
merge_df

Unnamed: 0,userId,productId,rating,timestamp
0,1,B00004SB92,5.0,956966400
1,1,B000NK8EWI,5.0,1195862400
2,1,B001QTXL82,5.0,1241049600
3,1,B001TZWNF0,3.0,1257724800
4,1,B002M3SOBU,5.0,1260403200
...,...,...,...,...
42936,1532,B002HU629E,3.0,1400025600
42937,1532,B00004T8R2,5.0,1400198400
42938,1532,B000VDCT3C,5.0,1400198400
42939,1532,B002FYL7PG,4.0,1400198400


In [72]:
# fixme: this isn't actually splitting the data into user tranches because we're not splitting by user id
# HOWEVER keep this because we're doing this for the other models too - consistency is key for comparison
import math
early_df=merge_df[0:int(len(merge_df)/2)]
late_df=merge_df[int(len(merge_df)/2):]

In [79]:
# read in top hyperparameters from results file
results_df = pd.read_csv(results_filename)

# get top hyperparameters
best_hyperparameters = results_df.sort_values(by=['MAP'], ascending=False).head(1)
best_yaml_file = best_hyperparameters['yaml_file'].values[0]
best_yaml_file


In [82]:
# run method on early_df and late_df using top hyperparameters
def run_lightgcn(df, filename, results_filename, seed=DEFAULT_SEED, top_k=10, prepend=""):
    # Select Amazon data size: 100k, 1m, 10m, or 20m
    AMAZON_DATA_SIZE = len(df)
    SEED = seed
    TOP_K = top_k

    results = {}

    yaml_file = f'yamls/{filename}.yaml'
    user_file = f"embeddings/{prepend}{filename}_user_embeddings.csv"
    item_file = f"embeddings/{prepend}{filename}_item_embeddings.csv"
    df = final_df[:AMAZON_DATA_SIZE]
    df = df.rename(columns={'userId': 'userID', 'productId': 'itemID'})
    train, test = python_stratified_split(df, ratio=0.75)
    data = ImplicitCF(train=train, test=test, seed=SEED)
    hparams = prepare_hparams(yaml_file)
    model = LightGCN(hparams, data, seed=SEED)
    with Timer() as train_time:
        model.fit()

    print("Took {} seconds for training.".format(train_time.interval))
    topk_scores = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

    topk_scores.head()
    eval_map = map_at_k(test, topk_scores, k=TOP_K)
    eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
    eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
    eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

    print("MAP:\t%f" % eval_map,
        "NDCG:\t%f" % eval_ndcg,
        "Precision@K:\t%f" % eval_precision,
        "Recall@K:\t%f" % eval_recall, sep='\n')

    results[filename] = [eval_map, eval_ndcg, eval_precision, eval_recall, train_time.interval]
    model.infer_embedding(user_file, item_file)
    save_and_compress_embeddings(user_file, "userID")
    save_and_compress_embeddings(item_file, "itemID")

    # write results each iteration in case things crash
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=['MAP', 'NDCG', 'Precision', 'Recall', 'Train Time'])
    results_df['embed_size'], results_df['n_layers'], results_df['batch_size'], results_df['decay'], results_df['epochs'], results_df['learning_rate'], results_df['eval_epoch'], results_df['top_k'] = zip(*results_df.index.map(get_hyperparameters))
    results_df.reset_index(inplace=True)
    results_df.rename(columns={'index': 'yaml_file'}, inplace=True)
    if os.path.exists(results_filename):
        # read in previous results and append new results
        old_results_df = pd.read_csv(results_filename)
        old_results_df = old_results_df.append(results_df)
        old_results_df.reset_index(inplace=True)
        old_results_df.drop(columns=['index'], inplace=True)
        drop_duplicates(old_results_df)
        old_results_df.to_csv(results_filename, index=False)
    else:
        results_df.to_csv(results_filename, index=False)



In [83]:
run_lightgcn(early_df, best_yaml_file, 'early_lightgcn_results.csv', seed=DEFAULT_SEED, top_k=10, prepend="early_")
run_lightgcn(late_df, best_yaml_file, 'late_lightgcn_results.csv', seed=DEFAULT_SEED, top_k=10, prepend="late_")

  df = train if test is None else train.append(test)


Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)0.6s: train loss = 0.69249 = (mf)0.69121 + (embed)0.00128
Epoch 2 (train)0.2s: train loss = 0.67220 = (mf)0.66867 + (embed)0.00353
Epoch 3 (train)0.2s: train loss = 0.61566 = (mf)0.60293 + (embed)0.01273
Epoch 4 (train)0.2s: train loss = 0.55798 = (mf)0.52925 + (embed)0.02873
Epoch 5 (train)0.2s: train loss = 0.52483 = (mf)0.47947 + (embed)0.04536
Epoch 6 (train)0.2s: train loss = 0.50129 = (mf)0.44490 + (embed)0.05639
Epoch 7 (train)0.2s: train loss = 0.48965 = (mf)0.42629 + (embed)0.06336
Epoch 8 (train)0.2s: train loss = 0.47830 = (mf)0.41026 + (embed)0.06803
Epoch 9 (train)0.2s: train loss = 0.47692 = (mf)0.40629 + (embed)0.07063
Epoch 10 (train)0.2s: train loss = 0.47217 = (mf)0.39967 + (embed)0.07250
Epoch 11 (train)0.2s: train loss = 0.47325 = (mf)0.39905 + (embed)0.07420
Epoch 12 (train)0.2s: train loss = 0.47084 = (mf)0.39540 + (embed)0.07544
Epoch 13 (train)0.2s: t

  df = train if test is None else train.append(test)


Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.
Epoch 1 (train)0.6s: train loss = 0.69237 = (mf)0.69108 + (embed)0.00129
Epoch 2 (train)0.2s: train loss = 0.67051 = (mf)0.66684 + (embed)0.00367
Epoch 3 (train)0.2s: train loss = 0.61400 = (mf)0.60089 + (embed)0.01311
Epoch 4 (train)0.2s: train loss = 0.56604 = (mf)0.53770 + (embed)0.02834
Epoch 5 (train)0.2s: train loss = 0.54026 = (mf)0.49860 + (embed)0.04166
Epoch 6 (train)0.2s: train loss = 0.51286 = (mf)0.46380 + (embed)0.04906
Epoch 7 (train)0.2s: train loss = 0.49645 = (mf)0.44130 + (embed)0.05515
Epoch 8 (train)0.2s: train loss = 0.48127 = (mf)0.41953 + (embed)0.06173
Epoch 9 (train)0.2s: train loss = 0.47903 = (mf)0.41166 + (embed)0.06738
Epoch 10 (train)0.2s: train loss = 0.47456 = (mf)0.40293 + (embed)0.07163
Epoch 11 (train)0.2s: train loss = 0.47496 = (mf)0.40053 + (embed)0.07443
Epoch 12 (train)0.2s: train loss = 0.47234 = (mf)0.39647 + (embed)0.07587
Epoch 13 (train)0.2s: t