In [1]:
#====================== Import de librerias =====================#

import time
from pathlib import Path
import json
import gzip
from urllib.request import urlopen
import datetime
import plotly.express as px
import plotly.graph_objects as go
import wget
import logging
from tqdm import tqdm
import random
import torch
import pandas as pd
import numpy as np
import csv
import os
import scipy.sparse as sp
from typing import Tuple, Dict, Any, List
from tqdm import tqdm, trange
from IPython import embed
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import tensorboard
import webbrowser

sampling_method = os.listdir()[4].split(".")[-2][3:].split("_")[-1]
logfile = "project.log"
old_path = os.getcwd()
os.chdir("..")
execution_path = os.getcwd()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext tensorboard

logs_base_dir = "runs_"+sampling_method
os.environ["run_tensorboard"] = logs_base_dir

os.makedirs(f'{execution_path}/{"4_Modelling"}/{logs_base_dir}', exist_ok=True)
tb_fm = SummaryWriter(log_dir=f'{execution_path}/{"4_Modelling"}/{logs_base_dir}/{logs_base_dir}_FM/')
tb_rnd = SummaryWriter(log_dir=f'{execution_path}/{"4_Modelling"}/{logs_base_dir}/{logs_base_dir}_RANDOM/')

def save_data_configuration(text):
    save_data_dir = "data_config_" + sampling_method +  ".txt"
    path = f'{execution_path}/{"4_Modelling"}/{save_data_dir}'
    with open(path, "a") as data_file:
        data_file.write(text+"\n")

    return text

In [3]:
# Let's define some hyper-parameters
hparams = {
    'batch_size':64,
    'num_epochs':12,
    'hidden_size': 32,
    'learning_rate':1e-4,
}

# we select to work on GPU if it is available in the machine, otherwise
# will run on CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
#============ Definicion de valores de configuracion ============#

min_reviews, min_usuarios = [6,6]
col_names = {"col_id_reviewer": "reviewerID",
             "col_id_product": "asin",
             "col_unix_time": "unixReviewTime",
             "col_rating": "overall",
             "col_timestamp": "timestamp",
             "col_year": "year"}

csv_filename = execution_path/Path("3_DataPreparation/interactions_minR{}_minU{}.csv".format(min_reviews,min_usuarios))

In [5]:
df = pd.read_csv(csv_filename)
df.head()

Unnamed: 0,asin,reviewerID,overall,unixReviewTime,timestamp,year
0,0,9132,5.0,1477785600,2016-10-30 02:00:00,1970
1,0,10612,5.0,1467244800,2016-06-30 02:00:00,1970
2,0,257,1.0,1454716800,2016-02-06 01:00:00,1970
3,0,4425,5.0,1434844800,2015-06-21 02:00:00,1970
4,0,2523,4.0,1420329600,2015-01-04 01:00:00,1970


In [6]:
save_data_configuration(str(df.nunique()))
df.nunique()

asin               6178
reviewerID        14138
overall               5
unixReviewTime     3622
timestamp          3622
year                  1
dtype: int64

# Splitting dataset (TLOO strategy)

In [7]:
def split_train_test(data: np.ndarray,
                     n_users: int) -> Tuple[np.ndarray, np.ndarray]:
    # Split and remove timestamp
    train_x, test_x = [], []
    for u in trange(n_users, desc='spliting train/test and removing timestamp...'):
        user_data = data[data[:, 0] == u]
        sorted_data = user_data[user_data[:, -1].argsort()]
        if len(sorted_data) == 1:
            train_x.append(sorted_data[0][:-1])
        else:
            train_x.append(sorted_data[:-1][:, :-1])
            test_x.append(sorted_data[-1][:-1])
    return np.vstack(train_x), np.stack(test_x)

In [8]:
data = df[[*col_names.values()][:3]].astype('int32').to_numpy()
data

array([[      9132,          0, 1477785600],
       [     10612,          0, 1467244800],
       [       257,          0, 1454716800],
       ...,
       [      9051,       6177, 1530144000],
       [      3412,       6177, 1527465600],
       [      9805,       6177, 1527206400]])

In [9]:
add_dims=0
for i in range(data.shape[1] - 1):  # do not affect to timestamp
    # MAKE IT START BY 0
    data[:, i] -= np.min(data[:, i])
    # RE-INDEX
    data[:, i] += add_dims
    add_dims = np.max(data[:, i]) + 1
dims = np.max(data, axis=0) + 1
print("Dim of users: {}\nDim of items: {}\nDims of unixtime: {}".format(dims[0], dims[1], dims[2]))
data

Dim of users: 14138
Dim of items: 20316
Dims of unixtime: 1538006401


array([[      9132,      14138, 1477785600],
       [     10612,      14138, 1467244800],
       [       257,      14138, 1454716800],
       ...,
       [      9051,      20315, 1530144000],
       [      3412,      20315, 1527465600],
       [      9805,      20315, 1527206400]])

In [10]:
train_x, test_x = split_train_test(data, dims[0])
train_x

spliting train/test and removing timestamp...: 100%|██████████| 14138/14138 [00:05<00:00, 2818.91it/s]


array([[    0, 19248],
       [    0, 19249],
       [    0, 14823],
       ...,
       [14137, 14159],
       [14137, 18245],
       [14137, 18904]])

# Negative sampling

In [11]:
train_x = train_x[:, :2]
dims = dims[:2]
print("New dims:",dims)
print("New train_x:\n",train_x)

New dims: [14138 20316]
New train_x:
 [[    0 19248]
 [    0 19249]
 [    0 14823]
 ...
 [14137 14159]
 [14137 18245]
 [14137 18904]]


In [12]:
def build_adj_mx(n_feat:int, data:np.ndarray) -> sp.dok_matrix :
    train_mat = sp.dok_matrix((n_feat, n_feat), dtype=np.float32)
    for x in tqdm(data, desc=f"BUILDING ADJACENCY MATRIX..."):
        train_mat[x[0], x[1]] = 1.0
        train_mat[x[1], x[0]] = 1.0
        # IDEA: We treat features that are not user or item differently because we do not consider
        #  interactions between contexts
        if data.shape[1] > 2:
            for idx in range(len(x[2:])):
                train_mat[x[0], x[2 + idx]] = 1.0
                train_mat[x[1], x[2 + idx]] = 1.0
                train_mat[x[2 + idx], x[0]] = 1.0
                train_mat[x[2 + idx], x[1]] = 1.0
    return train_mat

In [41]:
def ng_sample(data: np.ndarray, dims: list, num_ng:int=4) -> Tuple[np.ndarray, sp.dok_matrix]:
    rating_mat = build_adj_mx(dims[-1], data)
    interactions = []
    min_item, max_item = dims[0], dims[1]
    for num, x in tqdm(enumerate(data), desc='perform negative sampling...'):
        interactions.append(np.append(x, 1))
        for t in range(num_ng):
            j = np.random.randint(min_item, max_item) #if not pop else random.sample(items_to_sample, 1)[0]
            # IDEA: Loop to exclude true interactions (set to 1 in adj_train) user - item
            print([[x[0], j], x[2:], [0]])
            borrar = np.vstack(interactions)
            print(borrar[:,:2])
            # print(interactions[j])
            print([x[0], j])
            print( [x[0], j] in borrar[:,:2])
            # print(j == int(x[1]))
            # print((x[0], j) in rating_mat)
            while (x[0], j) in rating_mat or j == int(x[1]) :#or [x[0], j] in interactions[:,:2]:
                j = np.random.randint(min_item, max_item) #if not pop else random.sample(items_to_sample, 1)[0]
                
            interactions.append(np.concatenate([[x[0], j], x[2:], [0]]))
        break
    return np.vstack(interactions), rating_mat


In [52]:
"""
Check if element is in list
"""
def isInList(element, list):
    for negative in list:
        if (set(element) == set(element) & set(negative)):
            return True
    return False

def ng_sample(data: np.ndarray, dims: list, num_ng:int=4) -> Tuple[np.ndarray, sp.dok_matrix]:
    rating_mat = build_adj_mx(dims[-1], data)
    interactions = []
    user = -1
    min_item, max_item = dims[0], dims[1]
    for num, x in tqdm(enumerate(data), desc='perform negative sampling...'):
        interactions.append(x) # Añadimos de uno en uno, x, que es positivo (ya viene el rating '1')  
        if user != x[0]: # en cada cambio de usuario
            userItem = [] 
            user = x[0]    
        userItem.append(x)
        for t in range(num_ng): # vamos añadir k negativos random
            j = np.random.randint(min_item, max_item) #if not pop else random.sample(items_to_sample, 1)[0]
            # IDEA: Loop to exclude true interactions (set to 1 in adj_train) user - item
            #       also exclude items duplicated for the same user
            while ((x[0], j) in rating_mat or j == int(x[1]) or isInList(np.concatenate([[x[0], j], x[3:], [0]]), userItem)):
                j = np.random.randint(min_item, max_item) #if not pop else random.sample(items_to_sample, 1)[0]
            interactions.append(np.concatenate([[x[0], j], x[3:], [0]]))
    return np.vstack(interactions), rating_mat

In [15]:
dup=pd.DataFrame(train_x)
dup[dup.duplicated()]

train_x

array([[    0, 19248],
       [    0, 19249],
       [    0, 14823],
       ...,
       [14137, 14159],
       [14137, 18245],
       [14137, 18904]])

In [53]:
a, b = ng_sample(train_x, dims)

BUILDING ADJACENCY MATRIX...: 100%|██████████| 123226/123226 [00:03<00:00, 36537.54it/s]
perform negative sampling...: 123226it [00:25, 4904.32it/s]


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 2 and the array at index 1 has size 3

In [16]:
train_x, rating_mat = ng_sample(train_x, dims)
print("Dimensions matrix:\n",dims)
# print("\nRating matrix:")
# rating_mat

BUILDING ADJACENCY MATRIX...: 100%|██████████| 123226/123226 [00:03<00:00, 38693.11it/s]
  if sys.path[0] == "":
  if sys.path[0] == "":
perform negative sampling...: 0it [00:00, ?it/s]


[[0, 19166], array([], dtype=int32), [0]]
[array([    0, 19248,     1])]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [51]:
t = [0, 16654]
a = np.array([[    0, 19248],
 [    0, 18304]])

"True" in pd.DataFrame(a).duplicated()

False

In [33]:
dup=pd.DataFrame(train_x)
print(dup[dup.duplicated()])
# train_x
# t = [0, 18053, 0]
# a = np.array([[    0, 19248,     1],
#        [    0, 18053,     0],
#        [    0, 15058,     0],
#        [14137, 16599,     0],
#        [14137, 17383,     0],
#        [14137, 15885,     0]])
# t in a

            0      1  2
709        11  16982  0
1279       25  20183  0
1306       25  16952  0
1599       30  14156  0
1621       31  15548  0
...       ...    ... ..
615247  14118  14408  0
615427  14122  19335  0
615762  14127  18717  0
615819  14128  14632  0
615857  14128  16571  0

[2066 rows x 3 columns]


In [40]:
dup[dup[0]==25].sort_values(by=[1], ascending=False)


Unnamed: 0,0,1,2
1282,25,20303,0
1295,25,20280,1
1279,25,20183,0
1247,25,20183,0
1319,25,20078,0
...,...,...,...
1312,25,14306,0
1292,25,14291,0
1272,25,14255,0
1267,25,14176,0


In [49]:
dims[-1]-dims[0]

6178

In [50]:
# Exercise 2

## Evaristo
#### number of ones
print(np.count_nonzero(rating_mat.toarray())/(dims[-1]*dims[-1]))
### number of zeros
print(1 - np.count_nonzero(rating_mat.toarray())/(dims[-1]*dims[-1]))

# ## Brenda
# #### Who sparse is the matrix??
# print(1 - rating_mat.shape[0] / rating_mat.count_nonzero())

0.000597112191656141
0.9994028878083439


In [51]:
train_x[:10]

array([[    0, 19248,     1],
       [    0, 16988,     0],
       [    0, 15331,     0],
       [    0, 16732,     0],
       [    0, 18032,     0],
       [    0, 19249,     1],
       [    0, 18445,     0],
       [    0, 14442,     0],
       [    0, 17291,     0],
       [    0, 16161,     0]])

# Creating dataset class

In [52]:
class PointData(Dataset):
    def __init__(self,
                 data: np.ndarray,
                 dims: list) -> None:
        """
        Dataset formatter adapted point-wise algorithms
        Parameters
        """
        super(PointData, self).__init__()
        self.interactions = data
        self.dims = dims

    def __len__(self) -> int:
        return len(self.interactions)
        
    def __getitem__(self, 
                    index: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Return the pairs user-item and the target.
        """
        return self.interactions[index][:-1], self.interactions[index][-1]

train_dataset = PointData(train_x, dims)

In [53]:
train_dataset[0]

(array([    0, 19248]), 1)

# Preparing the test set for inference

In [54]:
test_x

array([[    0, 17249],
       [    1, 18015],
       [    2, 14196],
       ...,
       [14135, 19938],
       [14136, 20214],
       [14137, 15542]])

In [55]:
import math
print(rating_mat.shape)
bits = math.ceil(math.log(rating_mat.shape[0],2))
print("rating_mat contains log2(rating_mat.shape[0]) = {} bits".format(bits))

(20316, 20316)
rating_mat contains log2(rating_mat.shape[0]) = 15 bits


In [56]:
def create_test_no_interactions_stratified_users(train_x: np.ndarray, test_x: np.ndarray, dims_usuarios_productos: Tuple[int, int],  num_samples: int) -> np.ndarray:
    """
    Esta funcion se encarga de crear de manera eficiente un dataset que contenga las interacciones usuario-producto en test que no se hayan producido.
    
    Argumentos:
        train_x (np.ndarray): matriz de entrenamiento con las interacciones usuario-producto previas
        test_x (np.ndarray): matriz de prueba con las interacciones usuario-producto previas
        dims_usuarios_productos (Tuple[int, int]): rango de productos y usuarios disponibles
    
    Retorno:
        np.ndarray: una matriz con todas las interacciones usuario-producto en test que no se hayan producido
    """
    save_data_configuration("\n"+"#"*4+"  zero_positions: Stratified Sampling By Users  "+"#"*4)

    
    seed = 2   # get same results
    random.seed(a = seed)

    usuarios_test = np.unique(test_x[:, 0]) # Identificamos los usuarios presentes en la prueba
    total_productos = range(dims_usuarios_productos[0]-1, dims_usuarios_productos[1]) # Identificamos el rango de productos disponibles
    zero_positions = np.zeros((num_samples * len(usuarios_test), 2)).astype(int)
    start_index = 0
    
    # Recorremos cada usuario presente en la prueba
    for user in tqdm(usuarios_test):
        # Identificamos los productos en los que el usuario ha interactuado previamente en entrenamiento
        strata_items_train = np.unique(train_x[train_x[:, 0] == user][:, 1])
        strata_items_test = np.unique(test_x[test_x[:, 0] == user][:, 1])
        # Seleccionamos al azar 199 productos con los que el usuario no ha interactuado previamente
        random_selection_items = random.choices(list(set(total_productos) - set(strata_items_test) - set(strata_items_train)), k=num_samples)
        # Creamos una lista de interacciones usuario-producto para este usuario
        # lista_por_usuario = [[usuario, x] for x in productos_a_machear]
        # We must avoid using for loops!

        zero_positions[start_index:start_index + num_samples, 0] = user
        zero_positions[start_index:start_index + num_samples, 1] = random_selection_items
        start_index += num_samples
        # lista_por_usuario = np.vstack([(np.ones(len(productos_a_machear))*usuario).astype(int), productos_a_machear]).T
        
        # Si es el primer usuario, inicializamos una matriz con sus interacciones
        # if usuario == 0:
        #     zero_positions = lista_por_usuario
        # # Si no es el primer usuario, concatenamos sus interacciones a la matriz existente
        # else:
        #     zero_positions = np.concatenate((zero_positions, lista_por_usuario), axis=0)
    # [00:59<00:00, 236.39it/s]
    return zero_positions

In [57]:
# def zero_positions_mode(mode, rating_mat, train_x, test_x, dims):
   
#     if mode == 0:
#         save_data_configuration("\n"+"#"*4+"  zero_positions: all data  "+"#"*4)
#         return np.asarray(np.where(rating_mat.A==0)).T
#     elif mode == 1:
#         zero_true_matrix = np.where(rating_mat.A==0)
#         save_data_configuration("\n"+"#"*4+"  zero_positions: all data separated by rows  "+"#"*4)
#         return np.asarray([zero_true_matrix[0],zero_true_matrix[1]]).T
#     else:
#         save_data_configuration("\n"+"#"*4+"  zero_positions: random sampling  "+"#"*4)
#         zp = create_test_no_interactions(train_x, test_x, dims,  num_samples=199)
#         return zp

In [58]:
# zp = zero_positions_mode(2, rating_mat, train_x, test_x, dims)

In [59]:
# zero_positions = np.asarray(np.where(rating_mat.A==0)).T
zero_positions = create_test_no_interactions_stratified_users(train_x, test_x, dims,  num_samples=199)
print(save_data_configuration(str(zero_positions.shape)+"\n"))
zero_positions

100%|██████████| 14138/14138 [00:32<00:00, 437.08it/s]

(2813462, 2)






array([[    0, 16115],
       [    0, 16065],
       [    0, 16733],
       ...,
       [14137, 16332],
       [14137, 15660],
       [14137, 14457]])

In [60]:
"""
Esta parte del código se usaba porque el zero positions venia de la rating matrix la cual tenia informacion no útil como (user,user) o (item,item), por eso nos quedábamos con items mayores a dims[0]
Ya que si recordamos, la matrix tenia size: (users + items)
rango items iniciales = 0 ... 6177
rango items actuales  = 14138 ... 20135
"""

items2compute = []
for user in trange(dims[0]):
    aux = zero_positions[zero_positions[:, 0] == user][:, 1]
    items2compute.append(aux[aux >= dims[0]])
items2compute[0]

# MUY RÁPIDO !!!!!
# items2compute = []
# start = 0
# num_samples = 199
# for user in trange(dims[0]):
#     aux = zero_positions[start:start+num_samples,1]
#     start += num_samples 
#     items2compute.append(aux)
# items2compute[2]

100%|██████████| 14138/14138 [01:59<00:00, 117.97it/s]


array([16115, 16065, 16733, 16914, 15372, 14755, 14338, 18290, 20123,
       20128, 19971, 17366, 19049, 18819, 14674, 16352, 16075, 19747,
       19135, 18044, 16605, 16553, 19258, 18352, 18735, 15722, 19635,
       19846, 17842, 16531, 18392, 17230, 19539, 16376, 14367, 17510,
       15733, 15133, 14746, 15812, 14922, 15089, 18568, 16265, 16150,
       17383, 14867, 14624, 19235, 19662, 19412, 15924, 19480, 15348,
       18568, 15666, 15770, 19233, 19890, 15896, 14679, 19392, 17755,
       18389, 14526, 17412, 15820, 18044, 15842, 18298, 16123, 14566,
       19501, 19586, 14224, 20013, 18312, 17670, 19550, 15982, 20228,
       16852, 15280, 14693, 15818, 17569, 14810, 16746, 14233, 18074,
       17784, 15621, 17045, 19614, 15487, 17896, 17686, 15653, 18999,
       14635, 16580, 18622, 17447, 14357, 16902, 16106, 16541, 14714,
       16514, 17963, 15236, 17358, 17522, 14475, 18769, 16649, 16322,
       17323, 16607, 18508, 20180, 14795, 17087, 18466, 16573, 19158,
       14942, 14779,

In [61]:
def build_test_set(itemsnoninteracted:list, gt_test_interactions: np.ndarray) -> list:
    #max_users, max_items = dims # number users (943), number items (2625)
    test_set = []
    for pair, negatives in tqdm(zip(gt_test_interactions, itemsnoninteracted), desc="BUILDING TEST SET..."):
        # APPEND TEST SETS FOR SINGLE USER
        negatives = np.delete(negatives, np.where(negatives == pair[1]))
        single_user_test_set = np.vstack([pair, ] * (len(negatives)+1))
        single_user_test_set[:, 1][1:] = negatives
        test_set.append(single_user_test_set.copy()) # siempre tendremos 1 positivo y el resto negativos
        break
    return test_set

test_x = build_test_set(items2compute, test_x)

BUILDING TEST SET...: 0it [00:00, ?it/s]


# Building Factorization Machines model

In [74]:
test_x

[array([[    0, 17249],
        [    0, 16115],
        [    0, 16065],
        [    0, 16733],
        [    0, 16914],
        [    0, 15372],
        [    0, 14755],
        [    0, 14338],
        [    0, 18290],
        [    0, 20123],
        [    0, 20128],
        [    0, 19971],
        [    0, 17366],
        [    0, 19049],
        [    0, 18819],
        [    0, 14674],
        [    0, 16352],
        [    0, 16075],
        [    0, 19747],
        [    0, 19135],
        [    0, 18044],
        [    0, 16605],
        [    0, 16553],
        [    0, 19258],
        [    0, 18352],
        [    0, 18735],
        [    0, 15722],
        [    0, 19635],
        [    0, 19846],
        [    0, 17842],
        [    0, 16531],
        [    0, 18392],
        [    0, 17230],
        [    0, 19539],
        [    0, 16376],
        [    0, 14367],
        [    0, 17510],
        [    0, 15733],
        [    0, 15133],
        [    0, 14746],
        [    0, 15812],
        [    0, 

In [62]:
class FM_operation(torch.nn.Module):

    def __init__(self, 
                 reduce_sum: bool=True) -> None:
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self,
                x: torch.Tensor) -> float:
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        # square_of_sum = np.sum(x, dim=1) ** 2 # ...
        # sum_of_square = np.sum(x ** 2, dim=1) # ...
        
        square_of_sum = torch.pow(torch.sum(x, dim=1),2)
        sum_of_square = torch.sum(torch.pow(x,2), dim=1)
        ix = square_of_sum - sum_of_square
        if self.reduce_sum:
            ix = torch.sum(ix, dim=1, keepdim=True)
        return 0.5 * ix
        

In [63]:
class FactorizationMachineModel(torch.nn.Module):
    """
    A pytorch implementation of Factorization Machine.

    Reference:
        S Rendle, Factorization Machines, 2010.
    """

    def __init__(self, 
                 field_dims: list,
                 embed_dim: float) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(len(field_dims), 1)
        self.embedding = torch.nn.Embedding(field_dims[-1], embed_dim)
        self.fm = FM_operation(reduce_sum=True)

        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, interaction_pairs: torch.Tensor) -> torch.Tensor:
        """
        :param interaction_pairs: Long tensor of size ``(batch_size, num_fields)``
        """
        out = self.linear(interaction_pairs.float()) + self.fm(self.embedding(interaction_pairs))
        return out.squeeze(1)
        
    def predict(self, 
                interactions: np.ndarray,
                device: torch.device) -> torch.Tensor:
        # return the score, inputs are numpy arrays, outputs are tensors
        test_interactions = torch.from_numpy(interactions).to(dtype=torch.long, device=device) #, dtype=torch.long)
        output_scores = self.forward(test_interactions)
        return output_scores

# Pipeline functions

## Training

In [64]:
from statistics import mean

def train_one_epoch(model: torch.nn.Module,
                    optimizer: torch.optim,
                    data_loader: torch.utils.data.DataLoader,
                    criterion: torch.nn.functional,
                    device: torch.device) -> float:
    model.train()
    total_loss = []

    for i, (interactions, targets) in enumerate(data_loader):
        interactions = interactions.to(device)
        targets = targets.to(device)

        predictions = model(interactions)
    
        loss = criterion(predictions, targets.float())
        model.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss.append(loss.item())

    return mean(total_loss)

# Define metrics

In [65]:
import math

def getHitRatio(recommend_list: list,
                gt_item: int) -> bool:
    if gt_item in recommend_list:
        return 1
    else:
        return 0

def getNDCG(recommend_list: list,
            gt_item: int) -> float:
    idx = np.where(recommend_list == gt_item)[0]
    if len(idx) > 0:
        return math.log(2)/math.log(idx+2)
    else:
        return 0

# Inference


In [66]:
def test(model: torch.nn.Module,
         test_x: np.ndarray,
         device: torch.device,
         topk: int=10) -> Tuple[float, float]:
    # Test the HR and NDCG for the model @topK
    model.eval()

    HR, NDCG = [], []
    for user_test in test_x:
        gt_item = user_test[0][1]
        predictions = model.predict(user_test, device)
        _, indices = torch.topk(predictions, topk)
        recommend_list = user_test[indices.cpu().detach().numpy()][:, 1]

        HR.append(getHitRatio(recommend_list, gt_item))
        NDCG.append(getNDCG(recommend_list, gt_item))
    return mean(HR), mean(NDCG)

# PIPELINE
## Defining the model, the loss and the optimizer



In [67]:
dims = train_dataset.dims
model = FactorizationMachineModel(dims, hparams['hidden_size']).to(device)

criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
optimizer = torch.optim.Adam(params=model.parameters(), lr=hparams['learning_rate'])

## Random evaluation

In [68]:
import random
class RandomModel(torch.nn.Module):
    def __init__(self, 
                 dims: list) -> None:
        super(RandomModel, self).__init__()
        """
        Simple random based recommender system
        """
        self.all_items = list(range(dims[0], dims[1]))

    def forward(self) -> None:
        pass

    def predict(self,
                interactions: np.ndarray,
                device=None) -> torch.Tensor:
        return torch.FloatTensor(random.sample(self.all_items, len(interactions)))

rnd_model = RandomModel(dims)

## Final pipeline

In [69]:
data_loader = DataLoader(train_dataset, batch_size=hparams['batch_size'], shuffle=True, num_workers=0)

# Start training the model

In [70]:
# DO EPOCHS NOW
from datetime import datetime
save_data_configuration(datetime.now().strftime("%d-%b-%Y  %H:%M"))
time_start = time.time()
topk = 10
for epoch_i in range(hparams['num_epochs']):
    #data_loader.dataset.negative_sampling()
    train_loss = train_one_epoch(model, optimizer, data_loader, criterion, device)
    hr, ndcg = test(model, test_x, device, topk=topk)
    
    print(save_data_configuration(f'MODEL: FACTORIZATION MACHINE'))
    print(save_data_configuration(f'epoch {epoch_i}:'))
    print(save_data_configuration(f'training loss = {train_loss:.4f} | Eval: HR@{topk} = {hr:.4f}, NDCG@{topk} = {ndcg:.4f} '))
    
 
    tb_fm.add_scalar('train/loss', train_loss, epoch_i)
    tb_fm.add_scalar('eval/HR@{topk}', hr, epoch_i)
    tb_fm.add_scalar('eval/NDCG@{topk}', ndcg, epoch_i)

    hr, ndcg = test(rnd_model, test_x, device, topk=topk)
    
    print(save_data_configuration(f'MODEL: RANDOM'))
    print(save_data_configuration(f'epoch {epoch_i}:'))
    print(save_data_configuration(f'training loss = {train_loss:.4f} | Eval: HR@{topk} = {hr:.4f}, NDCG@{topk} = {ndcg:.4f} '))
    save_data_configuration("_"*65)
 
    tb_rnd.add_scalar('eval/HR@{topk}', hr, epoch_i)
    tb_rnd.add_scalar('eval/NDCG@{topk}', ndcg, epoch_i)
save_data_configuration(f"# Training duration: {time.time()-time_start}")

MODEL: FACTORIZATION MACHINE
epoch 0:
training loss = 61.2017 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: RANDOM
epoch 0:
training loss = 61.2017 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: FACTORIZATION MACHINE
epoch 1:
training loss = 0.5166 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: RANDOM
epoch 1:
training loss = 0.5166 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: FACTORIZATION MACHINE
epoch 2:
training loss = 0.5054 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: RANDOM
epoch 2:
training loss = 0.5054 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: FACTORIZATION MACHINE
epoch 3:
training loss = 0.4869 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: RANDOM
epoch 3:
training loss = 0.4869 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: FACTORIZATION MACHINE
epoch 4:
training loss = 0.4625 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: RANDOM
epoch 4:
training loss = 0.4625 | Eval: HR@10 = 0.0000, NDCG@10 = 0.0000 
MODEL: FACTORIZATION MACHINE
epoch 5:
train

'# Training duration: 1142.6090068817139'

# Visualization

In [71]:

tb = tensorboard.program.TensorBoard()
tb.configure(bind_all=True, logdir=f"4_Modelling/{logs_base_dir}")
url = tb.launch()
webbrowser.open_new_tab(url.replace("MSI","localhost"))

INFO:pytorch_profiler:Monitor runs begin


True

In [72]:
%tensorboard --logdir {logs_base_dir}

Reusing TensorBoard on port 6006 (pid 7452), started 0:23:16 ago. (Use '!kill 7452' to kill it.)

In [73]:
%tensorboard --logdir run_tensorboard

Reusing TensorBoard on port 6006 (pid 17036), started 0:23:14 ago. (Use '!kill 17036' to kill it.)

