In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from dataset import TransactionDataset, transaction_collate_fn

from torchmetrics.classification import MulticlassF1Score

In [2]:
# Load the raw transactions, parse the timestamp column and rename the
# columns to the names used by the rest of the notebook.
df = (
    pd.read_csv('data/rosbank/train.csv')
    .assign(
        TRDATETIME=lambda raw: pd.to_datetime(raw['TRDATETIME'], format='%d%b%y:%H:%M:%S')
    )
    .rename(columns={'cl_id': 'client_id', 'MCC': 'small_group', 'amount': 'amount_rur'})
)
df.head()

Unnamed: 0,PERIOD,client_id,small_group,channel_type,currency,TRDATETIME,amount_rur,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,2017-10-21 00:00:00,5023.0,POS,0,0.0
1,01/10/2017,0,6011,,810,2017-10-12 12:24:07,20000.0,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,2017-12-05 00:00:00,767.0,POS,0,0.0
3,01/10/2017,0,5411,,810,2017-10-21 00:00:00,2031.0,POS,0,0.0
4,01/10/2017,0,6012,,810,2017-10-24 13:14:24,36562.0,C2C_OUT,0,0.0


In [3]:
# Map every distinct MCC code to a dense integer id starting at 1, so that
# id 0 stays free (the model below treats 0 as padding).
unique_mccs = df['small_group'].unique()
mcc_to_id = dict(zip(unique_mccs, range(1, len(unique_mccs) + 1)))

# Discretise the amount into (up to) 10 ordinal bins, shifted by one for the
# same reason: bin id 0 is never produced.
amount_binner = KBinsDiscretizer(10, encode='ordinal', subsample=None)
amount_bins = amount_binner.fit_transform(df[['amount_rur']]).astype('int')
df['amount_rur_bin'] = 1 + amount_bins
df['small_group'] = df['small_group'].map(mcc_to_id)

In [4]:
# Build the per-client sequence dataset from the preprocessed frame.
# Sequences are keyed by `client_id`, use `TRDATETIME` as the datetime column
# and carry the two integer-encoded categorical features.
# NOTE(review): presumably sequences shorter than `min_length` are dropped and
# longer ones are cut to `max_length` -- confirm in dataset.py. The recorded
# run reports "383 sequences were filtered".
ds = TransactionDataset(
    df, 
    id_col="client_id", 
    dt_col="TRDATETIME", 
    cat_cols=["small_group", "amount_rur_bin"],
    min_length=10,
    max_length=100
)

383 sequences were filtered


In [5]:
# One-element list holding the first dataset item; collated later in the
# notebook to smoke-test the encoder.
batch = [ds[0]]

In [6]:
class TransactionEncoder(nn.Module):
    """Embed each categorical transaction field and concatenate the results.

    Args:
        embedding_dict: mapping ``feature name -> (vocab_size, embedding_dim)``.
            One ``nn.Embedding`` is created per entry, in the mapping's order.
        linear_proj: optional output width. When given, the concatenated
            embeddings go through ``nn.Linear(sum_of_dims, linear_proj)``;
            when ``None`` the concatenation is returned unchanged.

    Attributes:
        features: list of feature names, in concatenation order.
        embeddings: ``nn.ModuleDict`` of the per-feature embedding tables.
        linear_proj: the projection layer (``nn.Identity`` when disabled).
        embedding_dim: size of the last dimension of the forward output.
    """

    def __init__(self, embedding_dict, linear_proj=None):
        super().__init__()

        # Store the names as a list. A live ``dict_keys`` view cannot be
        # pickled or deep-copied (breaking ``copy.deepcopy(model)`` and
        # ``torch.save(model)``), and it would silently follow later
        # mutations of the caller's dict.
        self.features = list(embedding_dict)
        self.embeddings = nn.ModuleDict(
            {key: nn.Embedding(vocab, dim) for key, (vocab, dim) in embedding_dict.items()}
        )

        concat_dim = sum(dim for _, dim in embedding_dict.values())
        if linear_proj is not None:
            self.embedding_dim = linear_proj
            self.linear_proj = nn.Linear(concat_dim, linear_proj)
        else:
            self.embedding_dim = concat_dim
            self.linear_proj = nn.Identity()

    def forward(self, x):
        """Return the (optionally projected) concatenation of all feature embeddings.

        Args:
            x: mapping from feature name to an integer index tensor. The
                concatenation runs along dim 2, so each index tensor must be
                2-D (the embedding lookup adds the third dimension).

        Returns:
            Tensor whose last dimension has size ``self.embedding_dim``.
        """
        embeddings = [self.embeddings[key](x[key]) for key in self.features]
        proj = self.linear_proj(torch.cat(embeddings, dim=2))
        return proj

In [7]:
# Scratch encoder for a shape check: two 32-dim embedding tables (vocabulary
# sizes 500 and 11) whose concatenation is projected to 128 dims.
# NOTE(review): 500 is a hard-coded bound on the number of MCC ids -- confirm
# it exceeds len(mcc_to_id), otherwise the embedding lookup will go out of range.
a = TransactionEncoder(embedding_dict={"small_group": (500, 32), "amount_rur_bin": (11, 32)}, linear_proj=128)

In [8]:
# Smoke test: collate the one-item batch and run it through the scratch
# encoder; the recorded output shape is torch.Size([1, 60, 128]).
a(transaction_collate_fn(batch)).shape

torch.Size([1, 60, 128])

In [9]:
# Turn off gradient tracking for all parameters of the scratch encoder `a`.
# The call returns the module itself, which is why its repr is displayed.
a.requires_grad_(False)

TransactionEncoder(
  (embeddings): ModuleDict(
    (small_group): Embedding(500, 32)
    (amount_rur_bin): Embedding(11, 32)
  )
  (linear_proj): Linear(in_features=64, out_features=128, bias=True)
)

In [22]:
class Head(nn.Module):
    """Two-layer MLP (Linear -> GELU -> Linear) producing per-position logits.

    Args:
        embedding_dim: size of the last dimension of the input.
        hidden_dim: width of the hidden layer.
        vocab_size: number of output logits.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super().__init__()

        # Kept in an nn.Sequential named `head` so parameter names stay
        # `head.0.*` and `head.2.*`.
        layers = [
            nn.Linear(embedding_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, vocab_size),
        ]
        self.head = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x`` and return the logits."""
        logits = self.head(x)
        return logits


class TransformerModel(nn.Module):
    """Causal transformer over embedded transactions with one head per categorical feature.

    The transaction encoder turns every position into a vector, a stack of
    ``nn.TransformerEncoderLayer`` blocks contextualises it under a causal
    mask, and a separate ``Head`` per feature maps each position to logits
    over that feature's vocabulary.

    Args:
        encoder: module exposing ``embeddings`` (an ``nn.ModuleDict`` of
            ``nn.Embedding``) and ``embedding_dim``; called on the input
            mapping to produce a batch-first 3-D tensor.
        n_head: number of attention heads.
        dim_feedforward: width of the feed-forward sub-layer.
        dropout: dropout probability inside the transformer layers.
        num_layers: number of stacked encoder layers.
        head_hidden: hidden width of every prediction head.
    """

    def __init__(self, encoder, n_head, dim_feedforward, dropout, num_layers, head_hidden):
        super().__init__()

        self.transaction_encoder = encoder
        # Feature names, in the order the encoder's embedding tables were registered.
        self.cat_cols = list(self.transaction_encoder.embeddings.keys())

        self.embedding_dim = self.transaction_encoder.embedding_dim
        self.encoder_layer = nn.TransformerEncoderLayer(
            self.embedding_dim,
            n_head,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation="gelu",
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        # One head per feature; its output size equals that feature's embedding vocabulary.
        self.heads = nn.ModuleDict({
            key: Head(
                self.embedding_dim,
                head_hidden,
                self.transaction_encoder.embeddings[key].num_embeddings
            ) for key in self.cat_cols
        })

    def forward(self, x):
        """Return ``{feature: logits}`` for every position of every sequence.

        Args:
            x: mapping from feature name to a 2-D integer tensor; index 0 is
                treated as padding. All tensors must live on the model's device.

        Returns:
            Dict mapping each feature name to the output of its head applied
            to the transformer output.
        """
        tokens = x[self.cat_cols[0]]
        _, seq_len = tokens.shape
        embeddings = self.transaction_encoder(x)

        # Build the causal mask on the same device as the activations; a
        # CPU-only mask fails as soon as the model runs on CUDA.
        attn_mask = self.generate_square_subsequent_mask(seq_len, device=embeddings.device)
        # The padding mask derives from the first feature only; it inherits
        # the device of the input tensor.
        padding_mask = self.generate_padding_mask(tokens)

        encoded = self.transformer_encoder(
            embeddings,
            mask=attn_mask,
            is_causal=True,
            src_key_padding_mask=padding_mask,
        )
        logits = {key: self.heads[key](encoded) for key in self.cat_cols}
        return logits

    @staticmethod
    def generate_square_subsequent_mask(sz, device=None):
        """Return a ``(sz, sz)`` float mask: ``-inf`` above the diagonal, ``0`` elsewhere.

        Args:
            sz: sequence length.
            device: optional device on which to allocate the mask; defaults
                to PyTorch's current default device.
        """
        return torch.triu(torch.full((sz, sz), float('-inf'), device=device), diagonal=1)

    @staticmethod
    def generate_padding_mask(x):
        """Return a float mask shaped like ``x``: ``-inf`` where ``x == 0``, ``0`` elsewhere."""
        return torch.where(x == 0, float('-inf'), 0)

In [23]:
# Encoder: two 32-dim embedding tables, concatenated and projected to 128 dims.
transaction_encoder = TransactionEncoder(
    embedding_dict={
        "small_group": (500, 32),
        "amount_rur_bin": (11, 32),
    },
    linear_proj=128,
)

# Transformer on top of the encoder; hyper-parameters spelled out by name.
transformer = TransformerModel(
    transaction_encoder,
    n_head=8,
    dim_feedforward=128,
    dropout=0.1,
    num_layers=2,
    head_hidden=128,
)

In [38]:
# Display the weight matrix of the `small_group` embedding table.
# NOTE(review): this cell's execution count (38) is out of order and the
# recorded output shows device='cuda:0', so it was run after a later cell had
# moved the model to the GPU; it will show different values on a fresh
# Restart & Run All.
transformer.transaction_encoder.embeddings.small_group.weight

Parameter containing:
tensor([[ 1.4697, -0.6004,  0.0930,  ...,  1.3974,  0.3611,  1.1613],
        [ 1.4262, -1.0382,  0.8974,  ..., -0.5551, -1.5328, -0.4617],
        [ 0.3547, -1.5246,  1.3369,  ..., -0.3562,  2.6781, -1.6873],
        ...,
        [ 0.1102,  0.4043,  0.1082,  ..., -1.0991, -1.2011,  0.5216],
        [ 0.3473, -0.2307,  0.8292,  ...,  1.1241, -0.1690,  1.4924],
        [ 0.1354,  0.6890, -0.4192,  ..., -0.7222,  0.0679,  0.8775]],
       device='cuda:0', requires_grad=True)

In [24]:
# First three dataset items, kept as an un-collated list.
batch = [ds[idx] for idx in range(3)]

In [25]:
# Scratch check of the causal-mask expression used by
# TransformerModel.generate_square_subsequent_mask: a 10x10 matrix of -inf,
# keeping only the part strictly above the diagonal; the result is square.
torch.triu(torch.full((10, 10), float('-inf')), diagonal=1).shape

torch.Size([10, 10])

In [26]:
def _batch_to_device(batch_dict, device):
    """Return a shallow copy of ``batch_dict`` with every tensor value moved to ``device``."""
    return {
        key: value.to(device) if torch.is_tensor(value) else value
        for key, value in batch_dict.items()
    }


def _next_token_loss(logits_dict, batch_dict, warmup):
    """Sum the next-token cross-entropy over all features, or return ``None`` if nothing is scorable.

    The logits at position ``t`` are scored against the token at ``t + 1``.
    The first ``warmup`` tokens serve as context only; targets equal to 0
    (padding) are ignored.
    """
    # Position 0 has no preceding logits, so scoring can never start before 1.
    # Without the clamp, warmup=0 produces an empty prediction slice.
    start = max(int(warmup), 1)

    total = None
    for key, logits in logits_dict.items():
        y = batch_dict[key][:, start:]
        # With no non-padding targets, the mean-reduced cross-entropy is NaN.
        if not (y != 0).any():
            continue
        # cross_entropy expects the class dimension second: (N, C, S).
        y_pred = logits[:, start - 1: -1].permute(0, 2, 1)
        term = nn.functional.cross_entropy(y_pred, y, ignore_index=0)
        total = term if total is None else total + term
    return total


def train_epoch(model, optimizer, dataloader, warmup=10, device="cuda"):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: module with ``forward(batch_dict) -> {feature: logits}``.
        optimizer: optimiser over ``model``'s parameters.
        dataloader: iterable of batch mappings ``{feature: index tensor}``.
        warmup: number of leading tokens used as context only (not scored).
        device: device on which the model and every batch are placed.

    Returns:
        Mean of the per-batch losses as a float (``nan`` if no batch had any
        scorable target). Each batch loss is also printed.
    """
    model.train()
    model.to(device)

    running, steps = 0.0, 0
    for batch_dict in dataloader:
        # The model was moved to `device`; the inputs must follow it.
        batch_dict = _batch_to_device(batch_dict, device)
        logits_dict = model(batch_dict)

        loss = _next_token_loss(logits_dict, batch_dict, warmup)
        if loss is None:
            continue

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        print(loss_value)
        running += loss_value
        steps += 1

    return running / steps if steps else float("nan")


def val_epoch(model, dataloader, warmup=10, device="cuda"):
    """Evaluate ``model`` on ``dataloader`` without gradients and return the mean batch loss.

    Uses the same target/prediction alignment as ``train_epoch`` so the two
    numbers are directly comparable.

    Args:
        model: module with ``forward(batch_dict) -> {feature: logits}``.
        dataloader: iterable of batch mappings ``{feature: index tensor}``.
        warmup: number of leading tokens used as context only (not scored).
        device: device on which the model and every batch are placed.

    Returns:
        Mean of the per-batch losses as a float (``nan`` if no batch had any
        scorable target).
    """
    model.eval()
    model.to(device)

    running, steps = 0.0, 0
    with torch.no_grad():
        for batch_dict in dataloader:
            batch_dict = _batch_to_device(batch_dict, device)
            logits_dict = model(batch_dict)

            loss = _next_token_loss(logits_dict, batch_dict, warmup)
            if loss is None:
                continue

            running += loss.item()
            steps += 1

    return running / steps if steps else float("nan")

In [27]:
# Shuffled mini-batches of 32 sequences, assembled by the project's collate function.
train_loader = DataLoader(
    dataset=ds,
    batch_size=32,
    shuffle=True,
    collate_fn=transaction_collate_fn,
)

In [28]:
# Fresh encoder: two 32-dim embedding tables, concatenated and projected to 128 dims.
transaction_encoder = TransactionEncoder(
    embedding_dict={
        "small_group": (500, 32),
        "amount_rur_bin": (11, 32),
    },
    linear_proj=128,
)

# Fresh transformer and an Adam optimiser over all of its parameters.
transformer = TransformerModel(
    transaction_encoder,
    n_head=8,
    dim_feedforward=128,
    dropout=0.1,
    num_layers=2,
    head_hidden=128,
)
optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-3)

In [29]:
# Run one training epoch with the default warmup and device.
# NOTE(review): the recorded AttributeError ("'TransactionEncoder' object has
# no attribute 'device'") does not match any attribute access visible in this
# notebook -- it appears to be stale output from an earlier revision; re-run
# after Restart & Run All.
train_epoch(transformer, optimizer, train_loader)

AttributeError: 'TransactionEncoder' object has no attribute 'device'