### References

* [DeepFM: A Factorization-Machine based Neural Network for CTR Prediction](https://arxiv.org/pdf/1703.04247.pdf)
* [Factorization Machines](https://d2l.ai/chapter_recommender-systems/fm.html)
* [Sistemas de Recomendación (Parte 1): Filtros Colaborativos | Clase 22 | Aprendizaje Profundo 2021](https://www.youtube.com/watch?v=YAvX3BBh7U4)
* https://github.com/rixwew/pytorch-fm
* https://www.kaggle.com/c/avazu-ctr-prediction/data

In [11]:
import sys
sys.path.append('../src')

import logging
import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split

from pytorch_common.util import get_device, set_device_name 

from dataset.movielens import MovieLens1MDataset, MovieLens20MDataset

from sklearn.metrics import roc_auc_score

In [12]:
set_device_name('gpu')

device = get_device()

In [13]:
def load_dataset(name):
    """Build one of the MovieLens rating datasets and log a short summary.

    :param name: ``'1m'`` selects ``MovieLens1MDataset``; any other value
        selects ``MovieLens20MDataset``.
    :return: the constructed dataset object.
    """
    if name == '1m':
        dataset_cls, dataset_path = MovieLens1MDataset, '../datasets/ml-1m/ratings.dat'
    else:
        dataset_cls, dataset_path = MovieLens20MDataset, '../datasets/ml-20m/ratings.csv'

    dataset = dataset_cls(dataset_path=dataset_path)

    # Log the source path, the dataset shape and the per-target counts.
    logging.info('{} dataset loaded! Shape: {}'.format(dataset_path, dataset.shape))
    logging.info('Target count: {}'.format(dataset.targets_count()))

    return dataset

In [14]:
dataset = load_dataset('1m')

In [15]:
len(dataset)

1000209

In [16]:
dataset.targets_count()

{0.0: 424928, 1.0: 575281}

(UserID, MovieID) pairs:

In [17]:
dataset.items.shape

(1000209, 2)

In [18]:
dataset.items[0:5]

array([[   0, 1192],
       [   0,  660],
       [   0,  913],
       [   0, 3407],
       [   0, 2354]])

In [19]:
len(dataset.user_ids())

6040

In [20]:
len(dataset.movie_ids())

3706

Max UserID/MovieID:

In [21]:
dataset.field_dims

array([6040, 3952])

FeaturesEmbedding:

Este módulo es una capa de embedding que puede alojar los embeddings de N features en una sola lookup table. Por esta razón se guarda un offset para cada feature, y también por eso el tamaño de la lookup table se especifica como un array con la dimensión de cada feature. Ej.: si UserID tiene 100 ids y MovieID tiene 1000, entonces field_dims == [100, 1000]. Luego, para hacer el forward, se le pasa como input un lote (batch) donde cada observación es un array de features, ej.: [UserID, MovieID].

In [22]:
class ModelMixin:
    """Mixin that exposes a module's named parameters as a list."""

    @property
    def weights(self):
        """Return every ``(name, parameter)`` pair from ``named_parameters()`` as a list."""
        return [*self.named_parameters()]

In [11]:
class FeaturesEmbedding(torch.nn.Module, ModelMixin):
    """Embedding layer that stores several categorical fields in one table.

    All fields share a single ``torch.nn.Embedding`` whose row count is the
    sum of the per-field sizes. Each field owns a contiguous range of rows;
    ``offsets`` holds the first row of every range.
    """

    def __init__(self, emb_lookup_table_size, emb_vector_size):
        """
        :param emb_lookup_table_size: sequence with the number of ids of each field
        :param emb_vector_size: length of every embedding vector
        """
        super().__init__()
        self.embedding = torch.nn.Embedding(
            sum(emb_lookup_table_size),
            emb_vector_size
        )
        # First row of each field inside the shared table:
        # [0, size_0, size_0 + size_1, ...].
        # np.int64 is used instead of np.long, which is missing from some
        # NumPy releases and is platform-dependent in others.
        self.offsets = np.array(
            (0, *np.cumsum(emb_lookup_table_size)[:-1]),
            dtype=np.int64
        )
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: embeddings looked up after shifting each column by its field offset
        """
        # Shift per-field ids into their row range of the shared table;
        # new_tensor keeps the offsets on the same device/dtype as x.
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)

In [12]:
emb_lookup_table_size = [3, 4]
emb_vector_size = 3
emb = FeaturesEmbedding(emb_lookup_table_size, emb_vector_size)
emb.weights

In [13]:
x = torch.LongTensor([
    [0, 1], 
    [2, 3]
])
x.shape

In [14]:
# First row of each field inside the shared lookup table: [0, size_0, ...].
# np.int64 replaces np.long, which is not available in every NumPy release.
offsets = np.array((0, *np.cumsum(emb_lookup_table_size)[:-1]), dtype=np.int64)
offsets

In [15]:
x.new_tensor(offsets).unsqueeze(0)

In [16]:
y = x + x.new_tensor(offsets).unsqueeze(0)
y

In [17]:
y.shape

In [18]:
emb_out = emb(x)
emb_out

In [19]:
emb_out.shape

La salida de la capa de embedding es la concatenación de los vectores de embedding correspondientes a cada una de las features para las cuales se creó la capa FeaturesEmbedding. En este caso tenemos dos features, UserID y MovieID, por lo cual solo tenemos dos tablas de embedding (ambas dentro de la misma lookup table).

In [20]:
embedding_output = len(emb_lookup_table_size) * emb_vector_size

Cada embedding tiene 3 posiciones y tenemos dos features; por eso la salida tiene dimensión 6.

### FeaturesLinear

In [21]:
class FeaturesLinear(torch.nn.Module, ModelMixin):
    """Linear term over categorical fields: sum of per-id weights plus a bias.

    The per-id weights are stored in a ``FeaturesEmbedding`` whose vectors
    have length ``output_dim``.
    """

    def __init__(self, emb_lookup_table_size, output_dim=1):
        """
        :param emb_lookup_table_size: sequence with the number of ids of each field
        :param output_dim: size of the output for each observation
        """
        super().__init__()
        self.fc      = FeaturesEmbedding(emb_lookup_table_size, output_dim)
        self.bias    = torch.nn.Parameter(torch.zeros((output_dim,)))

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        :return: per-field weights summed over the field axis, plus the bias
        """
        return torch.sum(self.fc(x), dim=1) + self.bias

    @property
    def weight(self):
        """Return the ``(per-id weight table, bias)`` pair."""
        # FeaturesEmbedding keeps its table in the `embedding` sub-module,
        # so the weight tensor must be reached through it.
        return self.fc.embedding.weight, self.bias

In [22]:
fl = FeaturesLinear(emb_lookup_table_size)
fl

In [23]:
fl.weights

In [24]:
fl.fc(x)

### MultiLayerPerceptron

In [25]:
class MultiLayerPerceptron(torch.nn.Module, ModelMixin):
    """Stack of ``Linear -> BatchNorm1d -> ReLU -> Dropout`` stages.

    An optional final ``Linear`` layer maps the last hidden size to one unit.
    """

    def __init__(self, input_units, units_per_layer, dropout, output_layer=True):
        """
        :param input_units: number of input features of the first layer
        :param units_per_layer: iterable with the width of each hidden stage
        :param dropout: dropout probability applied after every hidden stage
        :param output_layer: when true, append a final ``Linear(…, 1)`` layer
        """
        super().__init__()
        stages = []
        in_features = input_units

        for out_features in units_per_layer:
            stages += [
                torch.nn.Linear(in_features, out_features),
                torch.nn.BatchNorm1d(out_features),
                torch.nn.ReLU(),
                torch.nn.Dropout(p=dropout),
            ]
            # The next stage consumes what this one produced.
            in_features = out_features

        if output_layer:
            stages.append(torch.nn.Linear(in_features, 1))

        self.mlp = torch.nn.Sequential(*stages)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, embed_dim)``
        """
        return self.mlp(x)

In [26]:
mlp = MultiLayerPerceptron(
    input_units     = embedding_output, 
    units_per_layer = [5, 5], 
    dropout         = 0.2
)
mlp

In [27]:
mlp.weights

In [28]:
emb_out

In [29]:
mlp_input = emb_out.view(-1, embedding_output)
mlp_input

In [30]:
mlp_output = mlp(mlp_input)
mlp_output

### FactorizationMachine

In [45]:
class FactorizationMachine(torch.nn.Module, ModelMixin):
    """Pairwise interaction term, computed as
    ``0.5 * ((sum of vectors)^2 - sum of (vectors^2))`` over the field axis.
    """

    def __init__(self, reduce_sum=True):
        """
        :param reduce_sum: when true, also sum the result over the last axis
            (keeping that axis with size 1)
        """
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        squared_total = x.sum(dim=1).pow(2)
        total_of_squares = x.pow(2).sum(dim=1)
        interactions = squared_total - total_of_squares

        if self.reduce_sum:
            interactions = interactions.sum(dim=1, keepdim=True)

        return 0.5 * interactions

In [46]:
emb_out

In [47]:
emb_out.shape

In [48]:
torch.sum(emb_out, dim=1)

In [49]:
torch.sum(emb_out, dim=1).shape

In [50]:
torch.sum(emb_out, dim=1, keepdim=True)

In [51]:
torch.sum(emb_out, dim=1, keepdim=True).shape

In [52]:
sum_embs = torch.sum(emb_out, dim=1) + torch.sum(emb_out, dim=1)
sum_embs

In [53]:
torch.sum(sum_embs, dim=1, keepdim=True)

In [54]:
torch.sum(sum_embs, dim=1, keepdim=True) * 0.5

In [56]:
# FactorizationMachine only accepts the `reduce_sum` flag; the lookup-table
# sizes previously passed here were being interpreted as that flag.
fm = FactorizationMachine(reduce_sum=True)
fm(emb_out)

In [57]:
fm.weights

### DeepFactorizationMachineModel

In [70]:
class DeepFactorizationMachineModel(torch.nn.Module, ModelMixin):
    """
    A pytorch implementation of DeepFM.

    The output is the sigmoid of the sum of three terms: a linear term, a
    factorization-machine term and an MLP term. The last two share the same
    field embeddings.

    Reference:
        H Guo, et al. DeepFM: A Factorization-Machine based Neural Network for CTR Prediction, 2017.
    """

    def __init__(self, emb_lookup_table_size, emb_vector_size, mlp_units_per_layer, mlp_dropout):
        """
        :param emb_lookup_table_size: sequence with the number of ids of each field
        :param emb_vector_size: length of every embedding vector
        :param mlp_units_per_layer: widths of the hidden layers of the MLP
        :param mlp_dropout: dropout probability used inside the MLP
        """
        super().__init__()
        num_fields = len(emb_lookup_table_size)

        self.linear = FeaturesLinear(emb_lookup_table_size)
        self.fm = FactorizationMachine(reduce_sum=True)
        self.embedding = FeaturesEmbedding(emb_lookup_table_size, emb_vector_size)

        # Width of the flattened embeddings that feed the MLP.
        self.emb_output_dim = num_fields * emb_vector_size
        self.mlp = MultiLayerPerceptron(
            input_units=self.emb_output_dim,
            units_per_layer=mlp_units_per_layer,
            dropout=mlp_dropout,
        )

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        embedded = self.embedding(x)

        linear_term = self.linear(x)
        fm_term = self.fm(embedded)
        # The MLP consumes the per-field embeddings flattened into one vector.
        deep_term = self.mlp(embedded.view(-1, self.emb_output_dim))

        logits = linear_term + fm_term + deep_term
        return torch.sigmoid(logits.squeeze(1))

In [103]:
dfm = DeepFactorizationMachineModel(
    emb_lookup_table_size  = dataset.field_dims,
    emb_vector_size        = 50,
    mlp_units_per_layer    = [100, 100],
    mlp_dropout            = 0.2

).to(device)
dfm

In [111]:
import tqdm
def train(model, optimizer, data_loader, criterion, device, log_interval=100):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    :param model: module to train; switched to training mode first
    :param optimizer: optimizer whose ``step()`` is called once per batch
    :param data_loader: iterable yielding ``(fields, target)`` batches
    :param criterion: callable computing the loss from ``(prediction, target)``
    :param device: device the batches are moved to
    :param log_interval: number of batches averaged for the progress-bar loss
    """
    model.train()
    running_loss = 0

    progress = tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0)
    for step, (fields, target) in enumerate(progress, start=1):
        fields = fields.to(device)
        target = target.to(device)

        prediction = model(fields)
        loss = criterion(prediction, target.float())

        model.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if step % log_interval != 0:
            continue

        # Show the mean loss of the last window, then start a new window.
        progress.set_postfix(loss=running_loss / log_interval)
        running_loss = 0


def test(model, data_loader, device):
    """Score ``model`` on ``data_loader`` and return the ROC AUC.

    :param model: module to evaluate; switched to evaluation mode first
    :param data_loader: iterable yielding ``(fields, target)`` batches
    :param device: device the batches are moved to
    :return: value of ``roc_auc_score(targets, predictions)`` over all batches
    """
    model.eval()
    y_true = []
    y_score = []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        batches = tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0)
        for fields, target in batches:
            fields = fields.to(device)
            target = target.to(device)
            scores = model(fields)
            y_true += target.tolist()
            y_score += scores.tolist()

    return roc_auc_score(y_true, y_score)

In [112]:
class EarlyStopper:
    """Track the best score seen so far and decide when training should stop.

    Every strict improvement resets the patience counter and saves the model
    to ``save_path`` with ``torch.save``.
    """

    def __init__(self, num_trials, save_path):
        """
        :param num_trials: patience budget; training stops on the
            ``num_trials``-th consecutive call without an improvement
        :param save_path: destination passed to ``torch.save`` on improvement
        """
        self.num_trials = num_trials
        self.trial_counter = 0
        self.best_accuracy = 0
        self.save_path = save_path

    def is_continuable(self, model, accuracy):
        """Record ``accuracy`` and report whether training may continue.

        :param model: object saved when ``accuracy`` beats the best so far
        :param accuracy: score of the current epoch (higher is better)
        :return: ``True`` to keep training, ``False`` once patience runs out
        """
        improved = accuracy > self.best_accuracy
        if improved:
            self.best_accuracy = accuracy
            self.trial_counter = 0
            torch.save(model, self.save_path)
            return True

        # No improvement: stop if one more failed trial exhausts the budget.
        if self.trial_counter + 1 >= self.num_trials:
            return False

        self.trial_counter += 1
        return True

In [113]:
def train_val_test_split(X, train_percent=0.7, val_percent=0.15):
    """Randomly split ``X`` into train, validation and test subsets.

    :param X: dataset supporting ``len()``, as accepted by ``random_split``
    :param train_percent: fraction of ``X`` assigned to the training subset
    :param val_percent: fraction of ``X`` assigned to the validation subset
    :return: the ``(train, validation, test)`` subsets produced by
        ``torch.utils.data.random_split``
    """
    total = len(X)
    train_length = int(total * train_percent)
    valid_length = int(total * val_percent)
    # The test subset takes whatever is left, so the three lengths always
    # add up to len(X) despite the integer truncation above.
    test_length = total - train_length - valid_length

    return random_split(X, (train_length, valid_length, test_length))

# Split the full dataset with the default proportions of train_val_test_split.
train_dataset, valid_dataset, test_dataset = train_val_test_split(dataset)

batch_size = 2048

# One loader per subset, all sharing the same batch size and worker count.
train_data_loader, valid_data_loader, test_data_loader = (
    DataLoader(subset, batch_size=batch_size, num_workers=8)
    for subset in (train_dataset, valid_dataset, test_dataset)
)

In [120]:
# Optimisation hyper-parameters.
# NOTE(review): 1e-7 is a very small step size; the recorded run in this
# notebook shows the loss unchanged at 0.449 across epochs. Confirm the
# value is intended.
learning_rate = 0.0000001
weight_decay  = 1e-6
epochs        = 500

# Binary cross-entropy on the model's sigmoid output.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(
    params=dfm.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay
)

# Stop on the second consecutive epoch without a validation improvement;
# the model is saved to save_path on each improvement.
early_stopper = EarlyStopper(
    num_trials=2,
    save_path='./dfm.pt'
)

In [121]:
# Train for up to `epochs` epochs, validating after each one.
for epoch in range(epochs):
    train(dfm, optimizer, train_data_loader, criterion, device)
    auc = test(dfm, valid_data_loader, device)
    print('epoch:', epoch, 'validation: auc:', auc)

    if early_stopper.is_continuable(dfm, auc):
        continue

    # Patience exhausted: report the best validation score and stop.
    print(f'validation: best auc: {early_stopper.best_accuracy}')
    break

# NOTE(review): this scores the in-memory `dfm` from the last epoch, not the
# checkpoint that the early stopper saved on the best epoch. Confirm intent.
auc = test(dfm, test_data_loader, device)

print(f'test auc: {auc}')

100%|██████████| 391/391 [00:01<00:00, 275.03it/s, loss=0.449]
100%|██████████| 49/49 [00:00<00:00, 173.29it/s]


epoch: 0 validation: auc: 0.8535864606658227


100%|██████████| 391/391 [00:01<00:00, 286.66it/s, loss=0.449]
100%|██████████| 49/49 [00:00<00:00, 162.42it/s]


epoch: 1 validation: auc: 0.8535842706869562


100%|██████████| 391/391 [00:01<00:00, 284.69it/s, loss=0.449]
100%|██████████| 49/49 [00:00<00:00, 153.67it/s]


epoch: 2 validation: auc: 0.8535806560971877
validation: best auc: 0.8535864606658227


100%|██████████| 49/49 [00:00<00:00, 155.03it/s]

test auc: 0.8536817549862673



