# Setup

In [8]:
%%capture
# installations, if necessary
!pip install recommenders

In [9]:
%%capture
import random 
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
from numpy.linalg import *
from scipy.spatial import distance
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
np.random.seed(42)  # don't change this line

import base64
import datetime

In [10]:
import scrapbook as sb
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

# Seed used for the data loader and the model in the cells below (via SEED).
# Set explicitly here so it does not silently depend on the value imported from
# recommenders.utils.constants.
DEFAULT_SEED = 42

print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.9.16 (main, Mar  8 2023, 04:29:44) 
[Clang 14.0.6 ]
Pandas version: 1.5.3
Tensorflow version: 2.10.0


# Helper Functions for Data Cleanup
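- `save_and_compress_embeddings(file_name, first_col_name)`: takes the path of an embedding output file and the name of its first column (user / item ID), rewrites it as a bz2-compressed CSV with one column per embedding dimension, and deletes the original file.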

In [11]:
def save_and_compress_embeddings(file_name, first_col_name):
    """Rewrite a raw embedding file as a bz2-compressed CSV and delete the raw file.

    The input is read as a header-less, tab-separated file with two columns:
    an ID and a space-separated embedding string. The embedding string is
    expanded into one column per value ('embedding_0', 'embedding_1', ...).

    Args:
        file_name: Path of the raw embedding file. The output is written next
            to it as '<file_name>.bz2'.
        first_col_name: Column name to give the ID column in the output.

    Returns:
        None. Any failure is reported with print() instead of being raised, so
        a failed compression does not interrupt the caller.
    """
    try:
        raw = pd.read_csv(file_name, header=None, sep='\t')
        raw.columns = [first_col_name, 'embeddings']

        # One column per embedding dimension
        vector_cols = raw['embeddings'].str.split(' ', expand=True).add_prefix('embedding_')
        compressed = pd.concat([raw[first_col_name], vector_cols], axis=1)

        compressed.to_csv(f'{file_name}.bz2', compression='bz2', index=False)
        # Only reached when the compressed copy was written successfully
        os.remove(file_name)
    except Exception as err:
        print(err)
        print(f'Failed to compress {file_name}')

In [12]:
def get_hyperparameters(yaml_file):
    """Recover the hyperparameters encoded in a YAML file name.

    The name is expected to have the structure
    'embed_size_n_layers_batch_size_decay_epochs_learning_rate_eval_epoch_top_k'.
    The decimal point of `decay` and `learning_rate` was replaced by an
    underscore when the name was built, so each of the two occupies two
    underscore-separated fields that are joined back with a '.' here.

    Args:
        yaml_file: File name (without extension) in the structure above.

    Returns:
        Tuple (embed_size, n_layers, batch_size, decay, epochs, learning_rate,
        eval_epoch, top_k); decay and learning_rate are floats, the rest ints.
    """
    fields = yaml_file.split('_')

    embed_size = int(fields[0])
    n_layers = int(fields[1])
    batch_size = int(fields[2])
    decay = float(f'{fields[3]}.{fields[4]}')  # e.g. '0', '01' -> 0.01
    epochs = int(fields[5])
    learning_rate = float(f'{fields[6]}.{fields[7]}')  # e.g. '0', '001' -> 0.001
    eval_epoch = int(fields[8])
    top_k = int(fields[9])

    return (
        embed_size,
        n_layers,
        batch_size,
        decay,
        epochs,
        learning_rate,
        eval_epoch,
        top_k,
    )

In [13]:
def drop_duplicates(df):
    """Keep, for each hyperparameter combination, only the row with the highest MAP.

    The frame is modified in place and also returned. Afterwards the rows are
    sorted by 'MAP' in descending order and carry a fresh 0..n-1 index.

    Args:
        df: Results frame with a 'MAP' column and the hyperparameter columns
            'embed_size', 'n_layers', 'batch_size', 'decay', 'epochs',
            'learning_rate', 'eval_epoch' and 'top_k'.

    Returns:
        The same DataFrame object that was passed in.
    """
    hyperparameter_cols = [
        'embed_size', 'n_layers', 'batch_size', 'decay',
        'epochs', 'learning_rate', 'eval_epoch', 'top_k',
    ]
    # Best MAP first, so keep='first' retains the best row of each group
    df.sort_values(by=['MAP'], ascending=False, inplace=True)
    df.drop_duplicates(subset=hyperparameter_cols, keep='first', inplace=True)
    # drop=True discards the old index directly. Materialising it as a column
    # and then dropping a column called 'index' fails when the index is named
    # and would remove a genuine 'index' column if one existed.
    df.reset_index(drop=True, inplace=True)
    return df

# Load amazon data into `amazon_data`

In [14]:
# The ratings file has no header row, so the column names are supplied on read.
path = "https://5190-hav-recommendation-data.s3.us-east-1.amazonaws.com/ratings_Electronics.csv"
amazon_data = pd.read_csv(
    path,
    header=None,
    names=['userId', 'productId', 'rating', 'timestamp'],
)
amazon_data

Unnamed: 0,userId,productId,rating,timestamp
0,AKM1MP6P0OYPR,0132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,0321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,0439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,0439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,0439886341,1.0,1334707200
...,...,...,...,...
7824477,A2YZI3C9MOHC0L,BT008UKTMW,5.0,1396569600
7824478,A322MDK0M89RHN,BT008UKTMW,5.0,1313366400
7824479,A1MH90R0ADMIK0,BT008UKTMW,4.0,1404172800
7824480,A10M2KEFPEQDHN,BT008UKTMW,4.0,1297555200


# Preprocessing 
- subset to only users with 50+ ratings

In [15]:
# Preprocessing: attach each user's total number of ratings to every row,
# then keep only rows from users with at least 50 ratings.
user_rating_count = amazon_data['userId'].value_counts().rename('user_rating_count')
augmented_amazon_data = amazon_data.merge(
    user_rating_count.to_frame(),
    left_on='userId',
    right_index=True,
)
has_enough_ratings = augmented_amazon_data['user_rating_count'] >= 50
subset_df = augmented_amazon_data.loc[has_enough_ratings]
print(subset_df.shape)
subset_df.head()

(125871, 5)


Unnamed: 0,userId,productId,rating,timestamp,user_rating_count
94,A3BY5KCNQZXV5U,0594451647,5.0,1390176000,50
14863,A3BY5KCNQZXV5U,B00000JD4V,4.0,1118016000,50
134213,A3BY5KCNQZXV5U,B000063574,5.0,1016668800,50
338368,A3BY5KCNQZXV5U,B0000CDJP8,5.0,1258761600,50
634048,A3BY5KCNQZXV5U,B0007Y794O,5.0,1369872000,50


In [16]:
subset_df.tail()

Unnamed: 0,userId,productId,rating,timestamp,user_rating_count
7811895,A328S9RN3U5M68,B00JGL37FO,5.0,1400976000,76
7817686,A328S9RN3U5M68,B00K00FN3O,5.0,1400544000,76
7824063,A328S9RN3U5M68,B00L21HC7A,5.0,1405123200,76
7824081,A328S9RN3U5M68,B00L2442H0,5.0,1405123200,76
7824103,A328S9RN3U5M68,B00L26YDA4,5.0,1405123200,76


# Model 4: LightGCN - Run on Our Data

In [17]:
# top k items to recommend
TOP_K = 10

# Number of rows taken from the start of `subset_df` for this run
AMAZON_DATA_SIZE = 100000

# Model parameters
EPOCHS = 50
BATCH_SIZE = 1024

SEED = DEFAULT_SEED  # Set None for non-deterministic results

# Model config file passed to prepare_hparams, and the output paths for the
# embeddings exported after training
yaml_file = "lightgcn.yaml"
user_file = "embeddings/amazon/100k_user_embeddings.csv"
item_file = "embeddings/amazon/100k_item_embeddings.csv"

In [18]:
# Take the first AMAZON_DATA_SIZE rows, switch to the userID/itemID column
# names, and drop the helper count column that was only needed for filtering.
df = (
    subset_df[:AMAZON_DATA_SIZE]
    .rename(columns={'userId': 'userID', 'productId': 'itemID'})
    .drop(columns='user_rating_count')
)
df.head()

Unnamed: 0,userID,itemID,rating,timestamp
94,A3BY5KCNQZXV5U,0594451647,5.0,1390176000
14863,A3BY5KCNQZXV5U,B00000JD4V,4.0,1118016000
134213,A3BY5KCNQZXV5U,B000063574,5.0,1016668800
338368,A3BY5KCNQZXV5U,B0000CDJP8,5.0,1258761600
634048,A3BY5KCNQZXV5U,B0007Y794O,5.0,1369872000


In [19]:
# Stratified 75/25 train/test split. SEED is passed explicitly so the split
# follows the notebook-level seed, like the data loader and the model below.
train, test = python_stratified_split(df, ratio=0.75, seed=SEED)

In [20]:
# Bundle the train/test splits into the ImplicitCF data object that the
# LightGCN model is constructed from below.
data = ImplicitCF(train=train, test=test, seed=SEED)

  df = train if test is None else train.append(test)


In [21]:
# Keyword arguments handed to prepare_hparams together with the YAML file.
# NOTE(review): these presumably take precedence over the values in the YAML
# file — confirm against the prepare_hparams documentation.
hparam_overrides = dict(
    n_layers=3,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=0.005,
    eval_epoch=5,
    top_k=TOP_K,
)
hparams = prepare_hparams(yaml_file, **hparam_overrides)

In [22]:
# Build the LightGCN model from the prepared hyperparameters and the
# ImplicitCF data object, using the notebook-level seed.
model = LightGCN(hparams, data, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


2023-04-26 15:49:11.227545: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-26 15:49:11.238898: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled


In [23]:
# Train the model and measure how long the whole fit takes.
with Timer() as train_time:
    model.fit()

print(f"Took {train_time.interval} seconds for training.")

Epoch 1 (train)4.1s: train loss = 0.51765 = (mf)0.51751 + (embed)0.00014
Epoch 2 (train)4.0s: train loss = 0.16956 = (mf)0.16891 + (embed)0.00065
Epoch 3 (train)4.0s: train loss = 0.09665 = (mf)0.09566 + (embed)0.00099
Epoch 4 (train)4.0s: train loss = 0.06747 = (mf)0.06625 + (embed)0.00122
Epoch 5 (train)4.0s + (eval)0.9s: train loss = 0.05316 = (mf)0.05176 + (embed)0.00140, recall = 0.01625, ndcg = 0.03706, precision = 0.03427, map = 0.00614
Epoch 6 (train)4.0s: train loss = 0.04242 = (mf)0.04087 + (embed)0.00155
Epoch 7 (train)3.9s: train loss = 0.03598 = (mf)0.03429 + (embed)0.00168
Epoch 8 (train)3.9s: train loss = 0.03176 = (mf)0.02995 + (embed)0.00180
Epoch 9 (train)3.9s: train loss = 0.02699 = (mf)0.02508 + (embed)0.00191
Epoch 10 (train)3.9s + (eval)0.8s: train loss = 0.02328 = (mf)0.02127 + (embed)0.00201, recall = 0.01567, ndcg = 0.03601, precision = 0.03333, map = 0.00585
Epoch 11 (train)3.9s: train loss = 0.02062 = (mf)0.01852 + (embed)0.00210
Epoch 12 (train)3.9s: train l

In [24]:
# Ask the trained model for TOP_K recommendations, passing the test split.
# NOTE(review): remove_seen=True presumably excludes items the user already
# interacted with in training — confirm against the recommend_k_items docs.
topk_scores = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

topk_scores.head()

Unnamed: 0,userID,itemID,prediction
0,A100UD67AHFODS,B001FA1NK0,12.427428
1,A100UD67AHFODS,B002V88HFE,10.693367
2,A100UD67AHFODS,B005CLPP84,10.614769
3,A100UD67AHFODS,B001TH7GUU,10.468667
4,A100UD67AHFODS,B0019EHU8G,10.422582


In [25]:
# Ranking metrics at cutoff TOP_K, comparing the test split with the
# recommendations produced above.
eval_map = map_at_k(test, topk_scores, k=TOP_K)
eval_ndcg = ndcg_at_k(test, topk_scores, k=TOP_K)
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

metric_report = [
    f"MAP:\t{eval_map:f}",
    f"NDCG:\t{eval_ndcg:f}",
    f"Precision@K:\t{eval_precision:f}",
    f"Recall@K:\t{eval_recall:f}",
]
print("\n".join(metric_report))

MAP:	0.004391
NDCG:	0.028074
Precision@K:	0.026956
Recall@K:	0.012454


In [26]:
# Export the learned user/item embeddings, then turn each raw file into a
# compressed CSV keyed by the matching ID column.
model.infer_embedding(user_file, item_file)
for embedding_file, id_column in ((user_file, "userID"), (item_file, "itemID")):
    save_and_compress_embeddings(embedding_file, id_column)

# Grid Search over LightGCN Hyperparameters

In [27]:
import itertools

import yaml

yaml_dir = 'yamls'
yaml_files = []

# Configurations whose name already appears in the results file are skipped.
# On a first run there is no results file yet, so nothing is skipped.
if os.path.exists('lightgcn_results.csv'):
    old_results_df = pd.read_csv('lightgcn_results.csv')
else:
    old_results_df = pd.DataFrame(columns=['yaml_file'])
written_yaml_files = old_results_df['yaml_file'].tolist()

# Candidate values for each hyperparameter; every combination gets a YAML file
embed_size_list = [64]
n_layers_list = [7]
batch_size_list = [2048]
decay_list = [0.01, 0.1]
epochs_list = [1000]
learning_rate_list = [0.001, 0.01]
eval_epoch_list = [-1]
top_k_list = [20]

# open(..., 'w') below fails if the target directory does not exist
os.makedirs(yaml_dir, exist_ok=True)

# itertools.product visits the combinations in the same order as nested loops
# over the lists in the order given (the last list varies fastest)
hyperparameter_grid = itertools.product(
    embed_size_list,
    n_layers_list,
    batch_size_list,
    decay_list,
    epochs_list,
    learning_rate_list,
    eval_epoch_list,
    top_k_list,
)

for (embed_size, n_layers, batch_size, decay,
     epochs, learning_rate, eval_epoch, top_k) in hyperparameter_grid:
    # Dots are replaced by underscores so the name is a plain file stem;
    # get_hyperparameters() reverses this encoding.
    filename = f'{embed_size}_{n_layers}_{batch_size}_{decay}_{epochs}_{learning_rate}_{eval_epoch}_{top_k}'
    filename = filename.replace('.', '_')

    if filename in written_yaml_files:
        continue

    # Named yaml_config (not `data`) so the ImplicitCF object called `data`
    # is not overwritten by a plain dict.
    yaml_config = {
        'model': {
            'model_type': 'lightgcn',
            'embed_size': embed_size,
            'n_layers': n_layers
        },
        'train': {
            'batch_size': batch_size,
            'decay': decay,
            'epochs': epochs,
            'learning_rate': learning_rate,
            'eval_epoch': eval_epoch,
            'top_k': top_k
        },
        'info': {
            'save_model': True,  # whether to save model
            'save_epoch': 100,  # if save_model is set to True, save the model every save_epoch
            'metrics': ["recall", "ndcg", "precision", 'map'],  # metrics for evaluation
            'MODEL_DIR': './models/'  # directory of saved models
        }
    }

    # The with-block closes the file; no explicit close() is needed
    with open(f'{yaml_dir}/{filename}.yaml', 'w') as outfile:
        yaml.dump(yaml_config, outfile, default_flow_style=False)

    yaml_files.append(filename)

yaml_files

[]

In [28]:
# Notebook-level default for the number of items to recommend. The grid-search
# runs below use the top_k encoded in each configuration instead.
TOP_K = 10

# Number of rows of `subset_df` to use: here, all of them
AMAZON_DATA_SIZE = len(subset_df)

# Notebook-level defaults. They are NOT applied to the grid-search runs, whose
# hyperparameters come from their YAML files.
EPOCHS = 50
BATCH_SIZE = 1024

SEED = DEFAULT_SEED  # Set None for non-deterministic results

results_csv = 'lightgcn_results.csv'
metric_cols = ['MAP', 'NDCG', 'Precision', 'Recall', 'Train Time']
hyperparameter_cols = [
    'embed_size', 'n_layers', 'batch_size', 'decay',
    'epochs', 'learning_rate', 'eval_epoch', 'top_k',
]

results = {}

# The train/test split does not depend on the hyperparameters, so it is
# computed once instead of once per configuration.
df = subset_df[:AMAZON_DATA_SIZE].rename(columns={'userId': 'userID', 'productId': 'itemID'})
train, test = python_stratified_split(df, ratio=0.75, seed=SEED)

for filename in yaml_files:
    yaml_file = f'{yaml_dir}/{filename}.yaml'
    user_file = f"embeddings/{filename}_user_embeddings.csv"
    item_file = f"embeddings/{filename}_item_embeddings.csv"

    # A fresh data object per run, built with the same seed each time
    data = ImplicitCF(train=train, test=test, seed=SEED)

    # Use the hyperparameters exactly as written in this run's YAML file.
    # Passing keyword overrides here would replace them, so every run would
    # train with the same settings while the results table records the values
    # parsed from the file name.
    hparams = prepare_hparams(yaml_file)
    # Evaluate at the same cutoff that is recorded in the `top_k` column
    eval_top_k = get_hyperparameters(filename)[-1]

    model = LightGCN(hparams, data, seed=SEED)
    with Timer() as train_time:
        model.fit()

    print("Took {} seconds for training.".format(train_time.interval))
    topk_scores = model.recommend_k_items(test, top_k=eval_top_k, remove_seen=True)

    eval_map = map_at_k(test, topk_scores, k=eval_top_k)
    eval_ndcg = ndcg_at_k(test, topk_scores, k=eval_top_k)
    eval_precision = precision_at_k(test, topk_scores, k=eval_top_k)
    eval_recall = recall_at_k(test, topk_scores, k=eval_top_k)

    print("MAP:\t%f" % eval_map,
          "NDCG:\t%f" % eval_ndcg,
          "Precision@K:\t%f" % eval_precision,
          "Recall@K:\t%f" % eval_recall, sep='\n')

    results[filename] = [eval_map, eval_ndcg, eval_precision, eval_recall, train_time.interval]
    model.infer_embedding(user_file, item_file)
    save_and_compress_embeddings(user_file, "userID")
    save_and_compress_embeddings(item_file, "itemID")

    # Write results after every run so a crash does not lose completed runs
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=metric_cols)
    hyperparameters_df = pd.DataFrame(
        [get_hyperparameters(name) for name in results_df.index],
        columns=hyperparameter_cols,
        index=results_df.index,
    )
    results_df = (
        results_df.join(hyperparameters_df)
        .rename_axis('yaml_file')
        .reset_index()
    )

    # Combine with results from earlier sessions, if there are any.
    # pd.concat replaces DataFrame.append, which is deprecated in the pandas
    # version shown in this notebook's output and removed in pandas 2.
    frames = [results_df]
    if os.path.exists(results_csv):
        frames.insert(0, pd.read_csv(results_csv))
    old_results_df = drop_duplicates(pd.concat(frames, ignore_index=True))
    old_results_df.to_csv(results_csv, index=False)

# pass results dict into a dataframe
