In [1]:
# """
# Colab execution only
# """
# import os
# from pathlib import Path

# !git clone "https://github.com/antoniosh97/Recommender-System-2023.git"

# path = Path('Recommender-System-2023/Implementation/3_LabReplication')
# print(f"Current path: \n{path}\nContent inside the folder:\n{os.listdir(path)}")
# os.chdir(path)

In [1]:
#====================== Import de librerias =====================#

from pathlib import Path
import json
import gzip
from urllib.request import urlopen
import datetime
import plotly.express as px
import plotly.graph_objects as go

# !pip install wget
# import wget
import logging

import torch
import pandas as pd
import numpy as np
import csv
import os
import scipy.sparse as sp
from typing import Tuple, Dict, Any, List
from tqdm import tqdm, trange
from IPython import embed
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import sys 

sampling_method = "FullSampling"

os.chdir("Z:\\Personal\\UPC_POSTGRADO_IA_DL\\9-PROJECT\\Recommender-System-2023\\Implementation\\")
execution_path = os.getcwd()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext tensorboard

logs_base_dir = "runs_"+sampling_method
os.environ["run_tensorboard"] = logs_base_dir

os.makedirs(f'{execution_path}/{"4_Modelling"}/{logs_base_dir}', exist_ok=True)
tb_fm = SummaryWriter(log_dir=f'{execution_path}/{"4_Modelling"}/{logs_base_dir}/{logs_base_dir}_FM/')
tb_rnd = SummaryWriter(log_dir=f'{execution_path}/{"4_Modelling"}/{logs_base_dir}/{logs_base_dir}_RANDOM/')

def save_data_configuration(text):
    save_data_dir = "data_config_" + sampling_method +".txt" 
    path = f'{execution_path}/{"4_Modelling"}/{save_data_dir}'
    with open(path, "a") as data_file:
        data_file.write(text+"\n")

    return text

In [3]:
# Let's define some hyper-parameters
hparams = {
    'batch_size':64,
    'num_epochs':12,
    'hidden_size': 32,
    'learning_rate':1e-4,
    'strategy':"TLOO",
    'num_neg':4,
}

# we select to work on GPU if it is available in the machine, otherwise
# will run on CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
#============ Definicion de valores de configuracion ============#

min_reviews, min_usuarios = [6,6]
col_names = {"col_id_reviewer": "reviewerID",
             "col_id_product": "asin",
             "col_unix_time": "unixReviewTime",
             "col_rating": "overall",
             "col_timestamp": "timestamp",
             "col_year": "year"}

csv_filename = execution_path/Path("3_DataPreparation/interactions_minR{}_minU{}.csv".format(min_reviews,min_usuarios))

In [5]:
df = pd.read_csv(csv_filename)
df.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,timestamp,year
0,0,9132,5.0,1477785600,2016-10-30 02:00:00,1970
1,0,10612,5.0,1467244800,2016-06-30 02:00:00,1970
2,0,257,1.0,1454716800,2016-02-06 01:00:00,1970
3,0,4425,5.0,1434844800,2015-06-21 02:00:00,1970
4,0,2523,4.0,1420329600,2015-01-04 01:00:00,1970


In [6]:
save_data_configuration(str(df.nunique()))
df.nunique()

asin               6178
reviewerID        14138
overall               5
unixReviewTime     3622
timestamp          3622
year                  1
dtype: int64

# Splitting dataset (TLOO/RLOO strategy)

In [7]:
def split_train_test(data: np.ndarray,
                     n_users: int, strategy) -> Tuple[np.ndarray, np.ndarray]:
    # Split and remove timestamp
    train_x, test_x = [], []
    for u in trange(n_users, desc='spliting train/test and removing timestamp...'):
        user_data = data[data[:, 0] == u]
        sorted_data = user_data[user_data[:, -1].argsort()]
        if len(sorted_data) == 1:
            train_x.append(sorted_data[0][:-1])
        else:
            if (strategy=="TLOO"):
                train_x.append(sorted_data[:-1][:, :-1])
                test_x.append(sorted_data[-1][:-1])
            else:
                # seleccionar uno random RLOO Random Leave One Out
                idx = np.random.choice(np.arange(sorted_data.shape[0]), size=1) # devuelve size indices de la dimension 0
                test_x.append(sorted_data[idx,:-1]) # añado el registro a test
                sorted_data = np.delete(sorted_data, (idx), axis=0) # lo borramos de la lista que va a train            
                train_x.append(sorted_data[:,:-1]) # añadimos el resto a train                
    return np.vstack(train_x), np.stack(test_x)

In [8]:
data = df[[*col_names.values()][:3]].astype('int32').to_numpy()
print(max(data[:,0]))
print(max(data[:,1]))
data

14137
6177


array([[      9132,          0, 1477785600],
       [     10612,          0, 1467244800],
       [       257,          0, 1454716800],
       ...,
       [      9051,       6177, 1530144000],
       [      3412,       6177, 1527465600],
       [      9805,       6177, 1527206400]])

In [9]:
add_dims=0
for i in range(data.shape[1] - 1):  # do not affect to timestamp
    # MAKE IT START BY 0
    data[:, i] -= np.min(data[:, i])
    # RE-INDEX
    data[:, i] += add_dims
    add_dims = np.max(data[:, i]) + 1
dims = np.max(data, axis=0) + 1
print("Dim of users: {}\nDim of items: {}\nDims of unixtime: {}".format(dims[0], dims[1], dims[2]))
data

Dim of users: 14138
Dim of items: 20316
Dims of unixtime: 1538006401


array([[      9132,      14138, 1477785600],
       [     10612,      14138, 1467244800],
       [       257,      14138, 1454716800],
       ...,
       [      9051,      20315, 1530144000],
       [      3412,      20315, 1527465600],
       [      9805,      20315, 1527206400]])

In [10]:
len(data[:,1])

137364

In [11]:
max(data[:,1])

20315

In [12]:
cols = [0,1,2]
print([max(data[:,col]) for col in cols])
dims

[14137, 20315, 1538006400]


array([     14138,      20316, 1538006401])

In [13]:
train_x, test_x = split_train_test(data, dims[0], strategy=hparams["strategy"])
train_x

spliting train/test and removing timestamp...: 100%|██████████| 14138/14138 [00:05<00:00, 2586.93it/s]


array([[    0, 19248],
       [    0, 19249],
       [    0, 14823],
       ...,
       [14137, 14159],
       [14137, 18245],
       [14137, 18904]])

# Negative sampling

In [14]:
print(test_x.shape)
test_x

(14138, 2)


array([[    0, 17249],
       [    1, 18015],
       [    2, 14196],
       ...,
       [14135, 19938],
       [14136, 20214],
       [14137, 15542]])

In [15]:
train_x = train_x[:, :2]
dims = dims[:2]
print("New dims:",dims)
print("New train_x:\n",train_x)

New dims: [14138 20316]
New train_x:
 [[    0 19248]
 [    0 19249]
 [    0 14823]
 ...
 [14137 14159]
 [14137 18245]
 [14137 18904]]


In [16]:
def build_adj_mx(n_feat:int, data:np.ndarray) -> sp.dok_matrix :
    train_mat = sp.dok_matrix((n_feat, n_feat), dtype=np.float32)
    for x in tqdm(data, desc=f"BUILDING ADJACENCY MATRIX..."):
        train_mat[x[0], x[1]] = 1.0
        train_mat[x[1], x[0]] = 1.0
        # IDEA: We treat features that are not user or item differently because we do not consider
        #  interactions between contexts
        if data.shape[1] > 2:
            for idx in range(len(x[2:])):
                train_mat[x[0], x[2 + idx]] = 1.0
                train_mat[x[1], x[2 + idx]] = 1.0
                train_mat[x[2 + idx], x[0]] = 1.0
                train_mat[x[2 + idx], x[1]] = 1.0
    return train_mat

In [17]:
def ng_sample(data: np.ndarray, dims: list, num_ng:int=hparams["num_neg"]) -> Tuple[np.ndarray, sp.dok_matrix]:
    rating_mat = build_adj_mx(dims[-1], data)
    interactions = []
    min_item, max_item = dims[0], dims[1]
    for num, x in tqdm(enumerate(data), desc='perform negative sampling...'):
        interactions.append(np.append(x, 1))
        for t in range(num_ng):
            j = np.random.randint(min_item, max_item) #if not pop else random.sample(items_to_sample, 1)[0]
            # IDEA: Loop to exclude true interactions (set to 1 in adj_train) user - item
            while (x[0], j) in rating_mat or j == int(x[1]):
                j = np.random.randint(min_item, max_item) #if not pop else random.sample(items_to_sample, 1)[0]
            interactions.append(np.concatenate([[x[0], j], x[2:], [0]]))
    # eliminando pares user item duplicados
    # user = -1
    # min_item, max_item = dims[0], dims[1]
    # for num, x in tqdm(enumerate(data), desc='perform negative sampling...'):
    #     interactions.append(x) # Añadimos de uno en uno, x, que es positivo (ya viene el rating '1')  
    #     if user != x[0]: # en cada cambio de usuario
    #         userItem = [] 
    #         user = x[0]    
    #     userItem.append(x)
    #     for t in range(num_ng): # vamos añadir k negativos random
    #         j = np.random.randint(min_item, max_item) #if not pop else random.sample(items_to_sample, 1)[0]
    #         # IDEA: Loop to exclude true interactions (set to 1 in adj_train) user - item
    #         #       also exclude items duplicated for the same user
    #         while ((x[0], j) in rating_mat or j == int(x[1]) or isInList(np.concatenate([[x[0], j], x[2:], [0]]), userItem)):
    #             j = np.random.randint(min_item, max_item) #if not pop else random.sample(items_to_sample, 1)[0]
    #         interactions.append(np.concatenate([[x[0], j], x[2:], [0]]))
             
    return np.vstack(interactions), rating_mat

"""
Check if element is in list
"""
def isInList(element, list):
    for negative in list:
        if (set(element) == set(element) & set(negative)):
            return True
    return False


In [18]:
train_x, rating_mat = ng_sample(train_x, dims)
print("Dimensions matrix:\n",dims)
print("\nRating matrix:")
rating_mat

BUILDING ADJACENCY MATRIX...: 100%|██████████| 123226/123226 [00:06<00:00, 18399.67it/s]
perform negative sampling...: 123226it [00:11, 10614.08it/s]


Dimensions matrix:
 [14138 20316]

Rating matrix:


<20316x20316 sparse matrix of type '<class 'numpy.float32'>'
	with 246452 stored elements in Dictionary Of Keys format>

In [19]:
dims[-1]-dims[0]

6178

In [20]:
# Exercise 2

## Evaristo
#### number of ones
print(np.count_nonzero(rating_mat.toarray())/(dims[-1]*dims[-1]))
### number of zeros
print(1 - np.count_nonzero(rating_mat.toarray())/(dims[-1]*dims[-1]))

# ## Brenda
# #### Who sparse is the matrix??
# print(1 - rating_mat.shape[0] / rating_mat.count_nonzero())

0.000597112191656141
0.9994028878083439


In [21]:
train_x[:10]

array([[    0, 19248,     1],
       [    0, 19637,     0],
       [    0, 15478,     0],
       [    0, 15470,     0],
       [    0, 18580,     0],
       [    0, 19249,     1],
       [    0, 17061,     0],
       [    0, 19238,     0],
       [    0, 17826,     0],
       [    0, 17615,     0]])

# Creating dataset class

In [22]:
class PointData(Dataset):
    def __init__(self,
                 data: np.ndarray,
                 dims: list) -> None:
        """
        Dataset formatter adapted point-wise algorithms
        Parameters
        """
        super(PointData, self).__init__()
        self.interactions = data
        self.dims = dims

    def __len__(self) -> int:
        return len(self.interactions)
        
    def __getitem__(self, 
                    index: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Return the pairs user-item and the target.
        """
        return self.interactions[index][:-1], self.interactions[index][-1]

train_dataset = PointData(train_x, dims)

In [23]:
train_dataset[0]

(array([    0, 19248]), 1)

# Preparing the test set for inference

In [24]:
test_x

array([[    0, 17249],
       [    1, 18015],
       [    2, 14196],
       ...,
       [14135, 19938],
       [14136, 20214],
       [14137, 15542]])

In [25]:
import math
print(rating_mat.shape)
bits = math.ceil(math.log(rating_mat.shape[0],2))
print("rating_mat contains log2(rating_mat.shape[0]) = {} bits".format(bits))

(20316, 20316)
rating_mat contains log2(rating_mat.shape[0]) = 15 bits


In [26]:
def items_to_compute(dims):

    # LAB
    # items2compute = []
    # for user in trange(dims[0]):
    #     aux = zero_positions[zero_positions[:, 0] == user][:, 1]
    #     items2compute.append(aux[aux >= dims[0]])
    # items2compute[0]

    items2compute = []
    items_zero_per_user = []
    for user in trange(dims[0]):
        aux1 = rating_mat[user, (dims[0]+1):]        
        items_zero_per_user = np.where(aux1.A==0)  # devuelve los indices
        aux = items_zero_per_user[:] + (dims[0]+1) # le sumamos la dimension para ubicar correctamente la columna
        aux = aux[1]
        # lista items del user en train del mismo usuario que tengan rating 0 ó 1
        items_train_user = train_x[train_x[:,0]==user][:,1]
        # los retiramos del test (los 1 como no existen no se retiran, solo eliminan los 0)
        aux = list(set(aux) - set(items_train_user))
        items2compute.append(sorted(aux)) # añadimos los valores de item

    return items2compute

items2compute = items_to_compute(dims)


100%|██████████| 14138/14138 [01:30<00:00, 156.81it/s]


In [27]:
def build_test_set(itemsnoninteracted:list, gt_test_interactions: np.ndarray) -> list:
    #max_users, max_items = dims # number users (943), number items (2625)
    test_set = []
    for pair, negatives in tqdm(zip(gt_test_interactions, itemsnoninteracted), desc="BUILDING TEST SET..."):
        # APPEND TEST SETS FOR SINGLE USER
        negatives = np.delete(negatives, np.where(negatives == pair[1]))
        single_user_test_set = np.vstack([pair, ] * (len(negatives)+1))
        single_user_test_set[:, 1][1:] = negatives
        test_set.append(single_user_test_set.copy())
    return test_set

test_x = build_test_set(items2compute, test_x)
test_x[0]

BUILDING TEST SET...: 14138it [02:08, 110.38it/s]


array([[    0, 17249],
       [    0, 14139],
       [    0, 14141],
       ...,
       [    0, 20313],
       [    0, 20314],
       [    0, 20315]])

# Building Factorization Machines model

In [28]:
class FM_operation(torch.nn.Module):

    def __init__(self, 
                 reduce_sum: bool=True) -> None:
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self,
                x: torch.Tensor) -> float:
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        # square_of_sum = np.sum(x, dim=1) ** 2 # ...
        # sum_of_square = np.sum(x ** 2, dim=1) # ...
        
        square_of_sum = torch.pow(torch.sum(x, dim=1),2)
        sum_of_square = torch.sum(torch.pow(x,2), dim=1)
        ix = square_of_sum - sum_of_square
        if self.reduce_sum:
            ix = torch.sum(ix, dim=1, keepdim=True)
        return 0.5 * ix
        

In [29]:
class FactorizationMachineModel(torch.nn.Module):
    """
    A pytorch implementation of Factorization Machine.

    Reference:
        S Rendle, Factorization Machines, 2010.
    """

    def __init__(self, 
                 field_dims: list,
                 embed_dim: float) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(len(field_dims), 1)
        self.embedding = torch.nn.Embedding(field_dims[-1], embed_dim)
        self.fm = FM_operation(reduce_sum=True)

        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, interaction_pairs: torch.Tensor) -> torch.Tensor:
        """
        :param interaction_pairs: Long tensor of size ``(batch_size, num_fields)``
        """
        out = self.linear(interaction_pairs.float()) + self.fm(self.embedding(interaction_pairs))
        return out.squeeze(1)
        
    def predict(self, 
                interactions: np.ndarray,
                device: torch.device) -> torch.Tensor:
        # return the score, inputs are numpy arrays, outputs are tensors
        test_interactions = torch.from_numpy(interactions).to(dtype=torch.long, device=device) #, dtype=torch.long)
        output_scores = self.forward(test_interactions)
        return output_scores

# Pipeline functions

## Training

In [30]:
from statistics import mean

def train_one_epoch(model: torch.nn.Module,
                    optimizer: torch.optim,
                    data_loader: torch.utils.data.DataLoader,
                    criterion: torch.nn.functional,
                    device: torch.device) -> float:
    model.train()
    total_loss = []

    for i, (interactions, targets) in enumerate(data_loader):
        interactions = interactions.to(device)
        targets = targets.to(device)

        predictions = model(interactions)
    
        loss = criterion(predictions, targets.float())
        model.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss.append(loss.item())

    return mean(total_loss)

# Define metrics

In [31]:
import math

def getHitRatio(recommend_list: list,
                gt_item: int) -> bool:
    if gt_item in recommend_list:
        return 1
    else:
        return 0

def getNDCG(recommend_list: list,
            gt_item: int) -> float:
    idx = np.where(recommend_list == gt_item)[0]
    if len(idx) > 0:
        return math.log(2)/math.log(idx+2)
    else:
        return 0

# Inference


In [32]:
def test(model: torch.nn.Module,
         test_x: np.ndarray,
         device: torch.device,
         topk: int=10) -> Tuple[float, float]:
    # Test the HR and NDCG for the model @topK
    model.eval()

    rcmnd_lst = np.zeros(len(test_x)).tolist()
    index = 0

    HR, NDCG = [], []
    for user_test in test_x:
        gt_item = user_test[0][1]
        predictions = model.predict(user_test, device)
        _, indices = torch.topk(predictions, topk)
        recommend_list = user_test[indices.cpu().detach().numpy()][:, 1]
        rcmnd_lst[index] = recommend_list.tolist().copy()
        index +=1

        HR.append(getHitRatio(recommend_list, gt_item))
        NDCG.append(getNDCG(recommend_list, gt_item))
    return mean(HR), mean(NDCG), rcmnd_lst

# PIPELINE
## Defining the model, the loss and the optimizer



In [33]:
dims = train_dataset.dims
model = FactorizationMachineModel(dims, hparams['hidden_size']).to(device)

criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
optimizer = torch.optim.Adam(params=model.parameters(), lr=hparams['learning_rate'])

## Random evaluation

In [34]:
import random
class RandomModel(torch.nn.Module):
    def __init__(self, 
                 dims: list) -> None:
        super(RandomModel, self).__init__()
        """
        Simple random based recommender system
        """
        self.all_items = list(range(dims[0], dims[1]))

    def forward(self) -> None:
        pass

    def predict(self,
                interactions: np.ndarray,
                device=None) -> torch.Tensor:
        return torch.FloatTensor(random.sample(self.all_items, len(interactions)))

rnd_model = RandomModel(dims)

## Final pipeline

In [35]:
data_loader = DataLoader(train_dataset, batch_size=hparams['batch_size'], shuffle=True, num_workers=0)

# Start training the model

In [36]:
# DO EPOCHS NOW
topk = 10
for epoch_i in range(hparams['num_epochs']):
    #data_loader.dataset.negative_sampling()
    train_loss = train_one_epoch(model, optimizer, data_loader, criterion, device)
    hr, ndcg, recommend_list_fm = test(model, test_x, device, topk=topk)

    print(save_data_configuration(f'epoch {epoch_i}:'))
    print(save_data_configuration(f'training loss = {train_loss:.4f} | Eval: HR@{topk} = {hr:.4f}, NDCG@{topk} = {ndcg:.4f} '))
    print('\n')
 
    tb_fm.add_scalar('train/loss', train_loss, epoch_i)
    tb_fm.add_scalar('eval/HR@{topk}', hr, epoch_i)
    tb_fm.add_scalar('eval/NDCG@{topk}', ndcg, epoch_i)

    hr, ndcg, recommend_list_rnd = test(rnd_model, test_x, device, topk=topk)
    tb_rnd.add_scalar('eval/HR@{topk}', hr, epoch_i)
    tb_rnd.add_scalar('eval/NDCG@{topk}', ndcg, epoch_i)

epoch 0:
training loss = 3468.9819 | Eval: HR@10 = 0.0018, NDCG@10 = 0.0010 


epoch 1:
training loss = 0.5130 | Eval: HR@10 = 0.0135, NDCG@10 = 0.0055 


epoch 2:
training loss = 0.5047 | Eval: HR@10 = 0.0254, NDCG@10 = 0.0170 


epoch 3:
training loss = 0.4844 | Eval: HR@10 = 0.0262, NDCG@10 = 0.0172 


epoch 4:
training loss = 0.4593 | Eval: HR@10 = 0.0283, NDCG@10 = 0.0178 


epoch 5:
training loss = 0.4389 | Eval: HR@10 = 0.0292, NDCG@10 = 0.0181 


epoch 6:
training loss = 0.4231 | Eval: HR@10 = 0.0284, NDCG@10 = 0.0178 


epoch 7:
training loss = 0.4115 | Eval: HR@10 = 0.0302, NDCG@10 = 0.0176 


epoch 8:
training loss = 0.4032 | Eval: HR@10 = 0.0315, NDCG@10 = 0.0168 


epoch 9:
training loss = 0.3998 | Eval: HR@10 = 0.0351, NDCG@10 = 0.0178 


epoch 10:
training loss = 0.3962 | Eval: HR@10 = 0.0317, NDCG@10 = 0.0164 


epoch 11:
training loss = 0.3949 | Eval: HR@10 = 0.0338, NDCG@10 = 0.0170 




In [37]:
def coverage(recommend_list, train_x):
  coverage = np.zeros(len(recommend_list)).tolist()
  items_train = np.unique(train_x[:,1])
  index = 0

  for items_per_user in recommend_list:
    recomend_items_unique = np.unique(items_per_user)
    coverage[index] = len(recomend_items_unique) / len(items_train) *100
    index+=1

  return coverage, sum(coverage)/len(coverage)

coverage, mean_coverage = coverage(recommend_list_fm, train_x)

In [44]:
print(recommend_list_fm[:10])
mean_coverage

[[14521, 14526, 14513, 17337, 15023, 15349, 15021, 18856, 15028, 14612], [15023, 15021, 14612, 14521, 15256, 14526, 14513, 15349, 15028, 17337], [14521, 14526, 15023, 14513, 17337, 15021, 14612, 15028, 15256, 18856], [15023, 15021, 14612, 14521, 15256, 14513, 17337, 15349, 15028, 14526], [14521, 15023, 15021, 14612, 14526, 15256, 14513, 17337, 15349, 15028], [14521, 14526, 14513, 15349, 15028, 17337, 18856, 16919, 18932, 18854], [14521, 15023, 15021, 14612, 14526, 14513, 17337, 15256, 16919, 18854], [15023, 14521, 15021, 14612, 15256, 14513, 14526, 17337, 18856, 16919], [14521, 15023, 15021, 14612, 14513, 14526, 17337, 15028, 15256, 15349], [14521, 15023, 15021, 14612, 15256, 14513, 14526, 17337, 15349, 15028]]


0.16186468112660563

# Visualization

In [38]:
os.chdir(os.getcwd() / Path("4_Modelling"))


In [39]:
%tensorboard --logdir run_tensorboard