# Extract Matrix Factorization Embeddings for Train

The code for this notebook's item matrix factorization comes from CPMP's notebook [here][1] and Radek's notebook [here][2]. 

[1]: https://www.kaggle.com/code/cpmpml/matrix-factorization-with-gpu
[2]: https://www.kaggle.com/code/radek1/matrix-factorization-pytorch-merlin-dataloader

# Data Preprocessing
We will load and process data with RAPIDS cuDF

In [1]:
import cudf
print('RAPIDS cuDF version',cudf.__version__)

train = cudf.read_parquet('../../data/train_data/train.parquet')
train = train.sort_values(['session','ts'])

test = cudf.read_parquet('../../data/train_data/test.parquet')
test = test.sort_values(['session','ts'])

train_pairs = cudf.concat([train, test],axis=0,ignore_index=True)[['session', 'aid']]
del train, test

train_pairs['aid_next'] = train_pairs.groupby('session').aid.shift(-1)
train_pairs = train_pairs[['aid', 'aid_next']].dropna().reset_index(drop=True)

cardinality_aids = 1855602 
print('Cardinality of items is',cardinality_aids)

RAPIDS cuDF version 21.10.01
Cardinality of items is 1855602


# Install Merlin Dataloader!
We will feed our PyTorch model with Merlin dataloader!

In [2]:
!pip install merlin-dataloader==0.0.2
from merlin.loader.torch import Loader 

train_pairs.to_pandas().to_parquet('all_pairs.parquet')
#train_pairs[:-10_000_000].to_pandas().to_parquet('train_pairs.parquet')
#train_pairs[-10_000_000:].to_pandas().to_parquet('valid_pairs.parquet')

from merlin.loader.torch import Loader 
from merlin.io import Dataset

train_ds = Dataset('all_pairs.parquet')
train_dl_merlin = Loader(train_ds, 65536, True)

Collecting merlin-dataloader==0.0.2
  Downloading merlin-dataloader-0.0.2.tar.gz (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m562.9 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
[?25hCollecting merlin-core
  Downloading merlin-core-0.7.0.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
[?25h  Downloading merlin-core-0.6.0.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25

# Learn Item Embeddings with PyTorch Matrix Factorization Model
We will build a PyTorch model to generate item embeddings via matrix factorization.

In [3]:
import torch
from torch import nn

class MatrixFactorization(nn.Module):
    def __init__(self, n_aids, n_factors):
        super().__init__()
        self.aid_factors = nn.Embedding(n_aids, n_factors, sparse=True)
        
    def forward(self, aid1, aid2):
        aid1 = self.aid_factors(aid1)
        aid2 = self.aid_factors(aid2)
        
        return (aid1 * aid2).sum(dim=1)
    
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)

valid_ds = Dataset('valid_pairs.parquet')
valid_dl_merlin = Loader(valid_ds, 65536, True)

In [4]:
from torch.optim import SparseAdam

num_epochs=20
lr=0.1

model = MatrixFactorization(cardinality_aids+1, 32)
optimizer = SparseAdam(model.parameters(), lr=lr)
criterion = nn.BCEWithLogitsLoss()

model.to('cuda')
for epoch in range(num_epochs):
    for batch, _ in train_dl_merlin:
        model.train()
        losses = AverageMeter('Loss', ':.4e')
            
        aid1, aid2 = batch['aid'], batch['aid_next']
        aid1 = aid1.to('cuda')
        aid2 = aid2.to('cuda')
        output_pos = model(aid1, aid2)
        output_neg = model(aid1, aid2[torch.randperm(aid2.shape[0])])
        
        output = torch.cat([output_pos, output_neg])
        targets = torch.cat([torch.ones_like(output_pos), torch.zeros_like(output_pos)])
        loss = criterion(output, targets)
        losses.update(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    model.eval()
    
    with torch.no_grad():
        accuracy = AverageMeter('accuracy')
        for batch, _ in valid_dl_merlin:
            aid1, aid2 = batch['aid'], batch['aid_next']
            output_pos = model(aid1, aid2)
            output_neg = model(aid1, aid2[torch.randperm(aid2.shape[0])])
            accuracy_batch = torch.cat([output_pos.sigmoid() > 0.5, output_neg.sigmoid() < 0.5]).float().mean()
            accuracy.update(accuracy_batch, aid1.shape[0])
            
    print(f'{epoch+1:02d}: * Train_Loss {losses.avg:.3f}  * Valid_Accuracy {accuracy.avg:.3f}')

01: * Train_Loss 0.601  * Valid_Accuracy 0.717
02: * Train_Loss 0.597  * Valid_Accuracy 0.726
03: * Train_Loss 0.592  * Valid_Accuracy 0.729
04: * Train_Loss 0.590  * Valid_Accuracy 0.730
05: * Train_Loss 0.588  * Valid_Accuracy 0.730
06: * Train_Loss 0.589  * Valid_Accuracy 0.731
07: * Train_Loss 0.587  * Valid_Accuracy 0.731
08: * Train_Loss 0.587  * Valid_Accuracy 0.732
09: * Train_Loss 0.583  * Valid_Accuracy 0.732
10: * Train_Loss 0.585  * Valid_Accuracy 0.732
11: * Train_Loss 0.587  * Valid_Accuracy 0.732
12: * Train_Loss 0.584  * Valid_Accuracy 0.732
13: * Train_Loss 0.586  * Valid_Accuracy 0.732
14: * Train_Loss 0.587  * Valid_Accuracy 0.732
15: * Train_Loss 0.583  * Valid_Accuracy 0.732
16: * Train_Loss 0.585  * Valid_Accuracy 0.732
17: * Train_Loss 0.585  * Valid_Accuracy 0.732
18: * Train_Loss 0.585  * Valid_Accuracy 0.732
19: * Train_Loss 0.582  * Valid_Accuracy 0.732
20: * Train_Loss 0.583  * Valid_Accuracy 0.732


# Extract Item Embeddings
We extract item embeddings from our model's embedding table.

In [5]:
# EXTRACT EMBEDDINGS FROM MODEL EMBEDDING TABLE
import numpy as np
embeddings = model.aid_factors.weight.detach().cpu().numpy().astype('float32')
np.save('../../data/item_user_features/item_embed_32',embeddings)
print('Item Matrix Factorization embeddings have shape',embeddings.shape)

Item Matrix Factorization embeddings have shape (1855603, 32)
