In [1]:
# """
# Colab execution only
# """
# import os
# from pathlib import Path

# !git clone "https://github.com/antoniosh97/Recommender-System-2023.git"

# path = Path('Recommender-System-2023/Implementation/3_LabReplication')
# print(f"Current path: \n{path}\nContent inside the folder:\n{os.listdir(path)}")
# os.chdir(path)

In [2]:
#====================== Import de librerias =====================#

from pathlib import Path
import json
import gzip
from urllib.request import urlopen
import datetime
import plotly.express as px
import plotly.graph_objects as go

# !pip install wget
# import wget
import logging

import torch
import pandas as pd
import numpy as np
import csv
import os
import scipy.sparse as sp
from typing import Tuple, Dict, Any, List
from tqdm import tqdm, trange
from IPython import embed
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import sys 
# Guarda train_dataset y test_x
import pickle


sampling_method = "FM_RND_POP_pickleLoad"

if not os.getcwd().split(os.sep)[-1] == "Implementation":
    os.chdir("..")
execution_path = os.getcwd()
execution_path

'c:\\Users\\brend\\OneDrive\\Escritorio\\Postgrado\\RecSys_Project\\GitHub_repo\\Clone\\Recommender-System-2023\\Implementation'

In [3]:
%load_ext tensorboard

logs_base_dir = "runs_"+sampling_method
os.environ["run_tensorboard"] = logs_base_dir

os.makedirs(f'{execution_path}/{"4_Modelling"}/{logs_base_dir}', exist_ok=True)
tb_fm = SummaryWriter(log_dir=f'{execution_path}/{"4_Modelling"}/{logs_base_dir}/{logs_base_dir}_FM/')
tb_rnd = SummaryWriter(log_dir=f'{execution_path}/{"4_Modelling"}/{logs_base_dir}/{logs_base_dir}_RANDOM/')
tb_pop = SummaryWriter(log_dir=f'{execution_path}/{"4_Modelling"}/{logs_base_dir}/{logs_base_dir}_POP/')

def save_data_configuration(text):
    save_data_dir = "data_config_" + sampling_method +".txt" 
    path = f'{execution_path}/{"4_Modelling"}/{save_data_dir}'
    with open(path, "a") as data_file:
        data_file.write(text+"\n")

    return text

In [4]:
# Let's define some hyper-parameters
hparams = {
    'batch_size':64,
    'num_epochs':12,
    'hidden_size': 32,
    'learning_rate':1e-4,
}

# we select to work on GPU if it is available in the machine, otherwise
# will run on CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#============ Definicion de valores de configuracion ============#

min_reviews, min_usuarios = [6,6]
col_names = {"col_id_reviewer": "reviewerID",
             "col_id_product": "asin",
             "col_rating": "overall",
             "col_unix_time": "unixReviewTime",
             "col_timestamp": "timestamp",
             "col_year": "year"}

csv_filename = execution_path/Path("3_DataPreparation/interactions_minR{}_minU{}.csv".format(min_reviews,min_usuarios))
df = pd.read_csv(csv_filename)
df.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,timestamp,year
0,0,9132,5.0,1477785600,2016-10-30 02:00:00,1970
1,0,10612,5.0,1467244800,2016-06-30 02:00:00,1970
2,0,257,1.0,1454716800,2016-02-06 01:00:00,1970
3,0,4425,5.0,1434844800,2015-06-21 02:00:00,1970
4,0,2523,4.0,1420329600,2015-01-04 01:00:00,1970


In [5]:
print(save_data_configuration(str(df.nunique())))


data = df[[*col_names.values()][:4]].astype('int32').to_numpy()

add_dims=0
for i in range(data.shape[1] - 2):  # do not affect to timestamp and rating // origin::: for i in range(data.shape[1] - 1)
    # MAKE IT START BY 0
    data[:, i] -= np.min(data[:, i])
    # RE-INDEX
    data[:, i] += add_dims
    add_dims = np.max(data[:, i]) + 1
dims = np.max(data, axis=0) + 1
print("Dim of users: {}\nDim of items: {}\nDims of unixtime: {}".format(dims[0], dims[1], dims[2]))
data

asin               6178
reviewerID        14138
overall               5
unixReviewTime     3622
timestamp          3622
year                  1
dtype: int64
Dim of users: 14138
Dim of items: 20316
Dims of unixtime: 6


array([[      9132,      14138,          5, 1477785600],
       [     10612,      14138,          5, 1467244800],
       [       257,      14138,          1, 1454716800],
       ...,
       [      9051,      20315,          5, 1530144000],
       [      3412,      20315,          5, 1527465600],
       [      9805,      20315,          5, 1527206400]])

## Pickle LOAD

In [6]:
class PointData(Dataset):
    def __init__(self,
                 data: np.ndarray,
                 dims: list) -> None:
        """
        Dataset formatter adapted point-wise algorithms
        Parameters
        """
        super(PointData, self).__init__()
        self.interactions = data
        self.dims = dims

    def __len__(self) -> int:
        return len(self.interactions)
        
    def __getitem__(self, index: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Return the pairs user-item and the target.
        """
        return self.interactions[index][:-1], self.interactions[index][-1]

In [7]:
mod_path = execution_path / Path("4_Modelling/mod_baseline")
assert os.path.exists(mod_path), f"The following path does not exist: \n{mod_path}"
timestamp = "{d}day{m}mon{y}year".format(d="04", m="03", y="2023")

assert os.path.exists(mod_path / f"MOD_baseline_train_{timestamp}.pkl"), f"The following path does not exist: \n{mod_path}"
# Train
with open(mod_path / f"MOD_baseline_train_{timestamp}.pkl", 'rb') as handle:
    train_dataset = pickle.load(handle)

# Test
with open(mod_path / f"MOD_baseline_test_{timestamp}.pkl", 'rb') as handle:
    test_x = pickle.load(handle)

# Popularity Rec
with open(mod_path / f"MOD_baseline_popRec_{timestamp}.pkl", 'rb') as handle:
    popularity_recommendations = pickle.load(handle)


In [8]:
train_dataset

<__main__.PointData at 0x26b73fd81c8>

In [9]:
test_x

[array([[    0, 17249],
        [    0, 14138],
        [    0, 14139],
        ...,
        [    0, 20313],
        [    0, 20314],
        [    0, 20315]]),
 array([[    1, 18015],
        [    1, 14140],
        [    1, 14141],
        ...,
        [    1, 20311],
        [    1, 20312],
        [    1, 20313]]),
 array([[    2, 14196],
        [    2, 14138],
        [    2, 14140],
        ...,
        [    2, 20313],
        [    2, 20314],
        [    2, 20315]]),
 array([[    3, 17499],
        [    3, 14140],
        [    3, 14145],
        ...,
        [    3, 20300],
        [    3, 20307],
        [    3, 20309]]),
 array([[    4, 14230],
        [    4, 14141],
        [    4, 14144],
        ...,
        [    4, 20307],
        [    4, 20308],
        [    4, 20313]]),
 array([[    5, 19906],
        [    5, 15598],
        [    5, 15761],
        [    5, 16626],
        [    5, 16641],
        [    5, 17112],
        [    5, 17655],
        [    5, 17700],
        [    

# Building Factorization Machines model

In [10]:
class FM_operation(torch.nn.Module):

    def __init__(self, 
                 reduce_sum: bool=True) -> None:
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self,
                x: torch.Tensor) -> float:
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        # square_of_sum = np.sum(x, dim=1) ** 2 # ...
        # sum_of_square = np.sum(x ** 2, dim=1) # ...
        
        square_of_sum = torch.pow(torch.sum(x, dim=1),2)
        sum_of_square = torch.sum(torch.pow(x,2), dim=1)
        ix = square_of_sum - sum_of_square
        if self.reduce_sum:
            ix = torch.sum(ix, dim=1, keepdim=True)
        return 0.5 * ix
        

In [11]:
class FactorizationMachineModel(torch.nn.Module):
    """
    A pytorch implementation of Factorization Machine.

    Reference:
        S Rendle, Factorization Machines, 2010.
    """

    def __init__(self, 
                 field_dims: list,
                 embed_dim: float) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(len(field_dims), 1)
        self.embedding = torch.nn.Embedding(field_dims[-1], embed_dim)
        self.fm = FM_operation(reduce_sum=True)

        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, interaction_pairs: torch.Tensor) -> torch.Tensor:
        """
        :param interaction_pairs: Long tensor of size ``(batch_size, num_fields)``
        """
        out = self.linear(interaction_pairs.float()) + self.fm(self.embedding(interaction_pairs))
        return out.squeeze(1)
        
    def predict(self, 
                interactions: np.ndarray,
                device: torch.device) -> torch.Tensor:
        # return the score, inputs are numpy arrays, outputs are tensors
        test_interactions = torch.from_numpy(interactions).to(dtype=torch.long, device=device) #, dtype=torch.long)
        output_scores = self.forward(test_interactions)
        return output_scores

# Pipeline functions

## Training

In [12]:
from statistics import mean

def train_one_epoch(model: torch.nn.Module,
                    optimizer: torch.optim,
                    data_loader: torch.utils.data.DataLoader,
                    criterion: torch.nn.functional,
                    device: torch.device) -> float:
    model.train()
    total_loss = []

    for i, (interactions, targets) in enumerate(data_loader):
        interactions = interactions.to(device)
        targets = targets.to(device)

        predictions = model(interactions[:,:2])
    
        loss = criterion(predictions, targets.float())
        model.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss.append(loss.item())

    return mean(total_loss)

# Define metrics

In [13]:
import math

def getHitRatio(recommend_list: list,
                gt_item: int) -> bool:
    if gt_item in recommend_list:
        return 1
    else:
        return 0

def getNDCG(recommend_list: list,
            gt_item: int) -> float:
    idx = np.where(recommend_list == gt_item)[0]
    if len(idx) > 0:
        return math.log(2)/math.log(idx+2)
    else:
        return 0

# Inference


In [14]:
def test(model: torch.nn.Module,
         test_x: np.ndarray,
         device: torch.device,
         topk: int=10) -> Tuple[float, float]:
    # Test the HR and NDCG for the model @topK
    model.eval()

    user_recommend_list = np.zeros(len(test_x)).tolist()
    index = 0

    HR, NDCG = [], []
    for user_test in test_x:
        gt_item = user_test[0][1]
        predictions = model.predict(user_test, device)
        _, indices = torch.topk(predictions, topk)
        recommend_list = user_test[indices.cpu().detach().numpy()][:, 1]
        user_recommend_list[index] = recommend_list.tolist().copy()
        index +=1

        HR.append(getHitRatio(recommend_list, gt_item))
        NDCG.append(getNDCG(recommend_list, gt_item))
        
    coverage = len(set(np.hstack(user_recommend_list))) / (dims[1]-dims[0]) *100
    
    return mean(HR), mean(NDCG), user_recommend_list, coverage

In [15]:
def test_pop(model: torch.nn.Module,
         test_x: np.ndarray,
         device: torch.device,
         topk: int=10) -> Tuple[float, float]:
    # Test the HR and NDCG for the model @topK
    model.eval()

    user_recommend_list = np.zeros(len(test_x)).tolist()
    index = 0

    HR, NDCG = [], []
    for user_test in test_x:
        gt_item = user_test[0][1]
        predictions = model.predict(user_test, device)
        # print(predictions)
        # _, indices = torch.topk(predictions, topk)
        # print(indices)
        # print(user_test)
        recommend_list = predictions[:topk]
        # user_test[indices.cpu().detach().numpy()][:, 1]
        # print(recommend_list)
        user_recommend_list[index] = np.hstack(recommend_list.tolist().copy())
        index +=1

        HR.append(getHitRatio(recommend_list, gt_item))
        NDCG.append(getNDCG(recommend_list, gt_item))
        
    coverage = len(set(np.hstack(user_recommend_list))) / (dims[1]-dims[0]) *100
    
    return mean(HR), mean(NDCG), user_recommend_list, coverage

# PIPELINE
## Defining the model, the loss and the optimizer



In [16]:
dims = train_dataset.dims
model = FactorizationMachineModel(dims, hparams['hidden_size']).to(device)

criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
optimizer = torch.optim.Adam(params=model.parameters(), lr=hparams['learning_rate'])

## Random evaluation

In [17]:
import random
class RandomModel(torch.nn.Module):
    def __init__(self, 
                 dims: list) -> None:
        super(RandomModel, self).__init__()
        """
        Simple random based recommender system
        """
        self.all_items = list(range(dims[0], dims[1]))

    def forward(self) -> None:
        pass

    def predict(self,
                interactions: np.ndarray,
                device=None) -> torch.Tensor:
        return torch.FloatTensor(random.sample(self.all_items, len(interactions)))

rnd_model = RandomModel(dims)

## Popularity evaluation

In [18]:
# Popularity-Based Recommender System
class PopularityBasedModel(torch.nn.Module):
  """
  https://github.com/LaxmiChaudhary/Amzon-Product-Recommendation/blob/master/Recommendation%20System.ipynb
  The Popularity-based recommender system is a non-personalised recommender system and these are based on frequecy counts, which may be not suitable to the user.
  We can see the differance above for the user id 4, 6 & 8, The Popularity based model has recommended the same set of 5 products to both.
  """
  def __init__(self, 
                 popularity_recommendations) -> None:
        super(PopularityBasedModel, self).__init__()
        """
        Simple random based recommender system
        """
        self.popularity_recommendations = popularity_recommendations
  
  def predict(self,
              interactions: np.ndarray,
              device=None) -> torch.Tensor:
      return torch.IntTensor(self.popularity_recommendations)

pop_model = PopularityBasedModel(popularity_recommendations)

## NCF evaluation

In [None]:
class NCF(torch.nn.Module):
    def __init__(self, 
                 field_dims: list,
                 embed_dim: float) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        self.embedding = torch.nn.Embedding(field_dims[-1], embed_dim)
        sequential = [[self.embed_dim*2, 32], [32, 16], [16, 8]]
        self.mlp = torch.nn.Sequential(
                        torch.nn.Linear(sequential[0][0], sequential[0][1]), #  (64, 32)
                        torch.nn.ReLU(),
                        torch.nn.Linear(sequential[1][0], sequential[1][1]),
                        torch.nn.ReLU(),
                        torch.nn.Linear(sequential[2][0], sequential[2][1]),
                        torch.nn.ReLU())
        self.last_fc = torch.nn.Linear(sequential[2][1]+self.embed_dim, 1)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, interaction_pairs: np.ndarray) -> torch.Tensor:
        user_embedding_mf = self.embedding(interaction_pairs[:,0])
        item_embedding_mf = self.embedding(interaction_pairs[:,1])

        # MLP vector reshaped to 2 dimensions
        mlp_vector = self.embedding(interaction_pairs)  # (6142, 2, 32)
        mlp_vector = mlp_vector.reshape(-1, self.embed_dim*2) # (6142, 64)

        # MatrixFactorization vector
        mf_vector = torch.mul(user_embedding_mf, item_embedding_mf) # ( _ , 32)

        # MLP thought layers
        mlp_vector = self.mlp(mlp_vector) # ( _ , sequential[2][1]) = ( _ , 8)

        # NCF: concat MLP_output and MF_output
        # NCF: Last fully connected layer of size (40x1) --> (8: mlp output + 32: mf output => 40)
        # NCF: Activation function ReLU
        concatenation = torch.cat([mlp_vector, mf_vector], dim=-1) # (_, 40)
        concatenation = self.last_fc(concatenation)
        # concatenation = self.relu(concatenation)
        return concatenation.squeeze()

    def predict(self,
                interactions: np.ndarray,
                device: torch.device) -> torch.Tensor:
        test_interactions = torch.from_numpy(interactions).to(dtype=torch.long, device=device)
        output_scores = self.forward(test_interactions)
        return output_scores

ncf_model = NCF(dims, hparams['hidden_size']).to(device)

## Final pipeline

In [19]:
data_loader = DataLoader(train_dataset, batch_size=hparams['batch_size'], shuffle=True, num_workers=0)

# Start training the model

In [20]:
# DO EPOCHS NOW
from datetime import datetime
import time
save_data_configuration(datetime.now().strftime("%d-%b-%Y  %H:%M"))
time_start = time.time()
topk = 10

fm = np.zeros([hparams['num_epochs'],3])
rnd = np.zeros([hparams['num_epochs'],3])
pop = np.zeros([hparams['num_epochs'],3])
ncf = np.zeros([hparams['num_epochs'],3])

for epoch_i in range(hparams['num_epochs']):
    #data_loader.dataset.negative_sampling()
    train_loss = train_one_epoch(model, optimizer, data_loader, criterion, device)

    hr, ndcg, recommend_list_fm, cov_fm = test(model, test_x, device, topk=topk)
    print(save_data_configuration(f'MODEL: FACTORIZATION MACHINE'))
    print(save_data_configuration(f'epoch {epoch_i}:'))
    print(save_data_configuration(f'training loss = {train_loss:.4f} | Eval: HR@{topk} = {hr:.4f}, NDCG@{topk} = {ndcg:.4f} '))
    fm[epoch_i] = [hr, ndcg, cov_fm]
    tb_fm.add_scalar('train/loss', train_loss, epoch_i)
    tb_fm.add_scalar(f'eval/HR@{topk}', hr, epoch_i)
    tb_fm.add_scalar(f'eval/NDCG@{topk}', ndcg, epoch_i)

    hr, ndcg, recommend_list_rnd, cov_rnd = test(rnd_model, test_x, device, topk=topk)
    print(save_data_configuration(f'MODEL: RANDOM'))
    print(save_data_configuration(f'epoch {epoch_i}:'))
    print(save_data_configuration(f'training loss = {train_loss:.4f} | Eval: HR@{topk} = {hr:.4f}, NDCG@{topk} = {ndcg:.4f} '))
    rnd[epoch_i] = [hr, ndcg, cov_rnd]
    tb_rnd.add_scalar(f'eval/HR@{topk}', hr, epoch_i)
    tb_rnd.add_scalar(f'eval/NDCG@{topk}', ndcg, epoch_i)

    hr, ndcg, recommend_list_pop, cov_pop = test_pop(pop_model, test_x, device, topk=topk)
    print(save_data_configuration(f'MODEL: POPULARITY-BASED'))
    print(save_data_configuration(f'epoch {epoch_i}:'))
    print(save_data_configuration(f'training loss = {train_loss:.4f} | Eval: HR@{topk} = {hr:.4f}, NDCG@{topk} = {ndcg:.4f} '))
    pop[epoch_i] = [hr, ndcg, cov_pop]
    tb_pop.add_scalar(f'eval/HR@{topk}', hr, epoch_i)
    tb_pop.add_scalar(f'eval/NDCG@{topk}', ndcg, epoch_i)

    hr, ndcg, recommend_list_ncf, cov_ncf = test(ncf_model, test_x, device, topk=topk)
    print(save_data_configuration(f'MODEL: NEURAL COLLABORATIVE FILTERING'))
    print(save_data_configuration(f'epoch {epoch_i}:'))
    print(save_data_configuration(f'training loss = {train_loss:.4f} | Eval: HR@{topk} = {hr:.4f}, NDCG@{topk} = {ndcg:.4f} '))
    ncf[epoch_i] = [hr, ndcg, cov_ncf]
    tb_pop.add_scalar(f'eval/HR@{topk}', hr, epoch_i)
    tb_pop.add_scalar(f'eval/NDCG@{topk}', ndcg, epoch_i)

    save_data_configuration("_"*65)
 
save_data_configuration(f"# Training duration: {(time.time()-time_start):.4f}")

MODEL: FACTORIZATION MACHINE
epoch 0:
training loss = 60.8743 | Eval: HR@10 = 0.0016, NDCG@10 = 0.0008 
MODEL: RANDOM
epoch 0:
training loss = 60.8743 | Eval: HR@10 = 0.0023, NDCG@10 = 0.0009 
MODEL: POPULARITY-BASED
epoch 0:
training loss = 60.8743 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: FACTORIZATION MACHINE
epoch 1:
training loss = 0.5127 | Eval: HR@10 = 0.0165, NDCG@10 = 0.0089 
MODEL: RANDOM
epoch 1:
training loss = 0.5127 | Eval: HR@10 = 0.0022, NDCG@10 = 0.0010 
MODEL: POPULARITY-BASED
epoch 1:
training loss = 0.5127 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: FACTORIZATION MACHINE
epoch 2:
training loss = 0.5054 | Eval: HR@10 = 0.0244, NDCG@10 = 0.0174 
MODEL: RANDOM
epoch 2:
training loss = 0.5054 | Eval: HR@10 = 0.0015, NDCG@10 = 0.0007 
MODEL: POPULARITY-BASED
epoch 2:
training loss = 0.5054 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: FACTORIZATION MACHINE
epoch 3:
training loss = 0.4879 | Eval: HR@10 = 0.0255, NDCG@10 = 0.0187 
MODEL: RANDOM
epoch 3:
trai

'# Training duration: 2806.7924'

In [21]:
print(f"Coverage from FM: {cov_fm}")
print(f"Coverage from RAND: {cov_rnd}")
print(f"Coverage from POP: {cov_pop}")


Coverage from FM: 6.587892521851732
Coverage from RAND: 100.0
Coverage from POP: 0.16186468112657817


# Visualization

In [22]:
# os.chdir(os.getcwd() / Path("4_Modelling"))


In [23]:
# %tensorboard --logdir run_tensorboard