In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from dataset import TransactionDataset, transaction_collate_fn

from torchmetrics.classification import MulticlassF1Score, Accuracy

from sklearn.model_selection import train_test_split

import time

import wandb

In [2]:
df = pd.read_csv('data/rosbank/train.csv')
df['TRDATETIME'] = pd.to_datetime(df['TRDATETIME'], format='%d%b%y:%H:%M:%S')
df = df.rename(columns={'cl_id':'client_id', 'MCC':'small_group', 'amount':'amount_rur'})
df.head()

Unnamed: 0,PERIOD,client_id,small_group,channel_type,currency,TRDATETIME,amount_rur,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,2017-10-21 00:00:00,5023.0,POS,0,0.0
1,01/10/2017,0,6011,,810,2017-10-12 12:24:07,20000.0,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,2017-12-05 00:00:00,767.0,POS,0,0.0
3,01/10/2017,0,5411,,810,2017-10-21 00:00:00,2031.0,POS,0,0.0
4,01/10/2017,0,6012,,810,2017-10-24 13:14:24,36562.0,C2C_OUT,0,0.0


In [3]:
# df = pd.read_csv('data/sberbank/transactions_train.csv')
# df.head()

Unnamed: 0,client_id,trans_date,small_group,amount_rur
0,33172,6,4,71.463
1,33172,6,35,45.017
2,33172,8,11,13.887
3,33172,9,11,15.983
4,33172,10,11,21.341


In [3]:
mcc_to_id = {mcc: i+1 for i, mcc in enumerate(df['small_group'].unique())}

df['amount_rur_bin'] = 1 + KBinsDiscretizer(10, encode='ordinal', subsample=None).fit_transform(df[['amount_rur']]).astype('int')
df['small_group'] = df['small_group'].map(mcc_to_id)

In [4]:
df

Unnamed: 0,PERIOD,client_id,small_group,channel_type,currency,TRDATETIME,amount_rur,trx_category,target_flag,target_sum,amount_rur_bin
0,01/10/2017,0,1,,810,2017-10-21 00:00:00,5023.00,POS,0,0.0,9
1,01/10/2017,0,2,,810,2017-10-12 12:24:07,20000.00,DEPOSIT,0,0.0,10
2,01/12/2017,0,3,,810,2017-12-05 00:00:00,767.00,POS,0,0.0,6
3,01/10/2017,0,4,,810,2017-10-21 00:00:00,2031.00,POS,0,0.0,8
4,01/10/2017,0,5,,810,2017-10-24 13:14:24,36562.00,C2C_OUT,0,0.0,10
...,...,...,...,...,...,...,...,...,...,...,...
490508,01/04/2017,10176,2,type1,810,2017-04-24 14:05:26,600.00,WD_ATM_ROS,1,405.0,5
490509,01/06/2017,10171,4,type1,810,2017-06-06 00:00:00,132.00,POS,0,0.0,2
490510,01/02/2017,10167,51,type1,810,2017-02-03 00:00:00,1000.00,POS,1,280428.2,7
490511,01/06/2017,10163,39,type1,810,2017-06-08 00:00:00,100.00,POS,0,0.0,2


In [11]:
# df = df.rename({'trans_date': 'TRDATETIME'}, axis=1)

In [5]:
clients_train, clients_val_test = train_test_split(df["client_id"].unique(), test_size=0.2, random_state=42)
clients_val, clients_test = train_test_split(clients_val_test, test_size=0.5, random_state=42)

train_ds = TransactionDataset(
    df[lambda x: x["client_id"].isin(clients_train)], 
    id_col="client_id", 
    dt_col="TRDATETIME", 
    cat_cols=["small_group", "amount_rur_bin"],
    min_length=20,
    max_length=100
)

val_ds = TransactionDataset(
    df[lambda x: x["client_id"].isin(clients_val)], 
    id_col="client_id", 
    dt_col="TRDATETIME", 
    cat_cols=["small_group", "amount_rur_bin"],
    min_length=20,
    max_length=100
)

test_ds = TransactionDataset(
    df[lambda x: x["client_id"].isin(clients_test)], 
    id_col="client_id", 
    dt_col="TRDATETIME", 
    cat_cols=["small_group", "amount_rur_bin"],
    min_length=20,
    max_length=100
)

628 sequences were filtered
77 sequences were filtered
70 sequences were filtered


In [None]:
!pip install performer-pytorch

In [6]:
from performer_pytorch import Performer

In [79]:
import torch
from performer_pytorch import Performer

model = Performer(
    dim = 512,
    depth = 1,
    heads = 8,
    causal = True,
    dim_head = 64
)

x = torch.randn(1, 2048, 512)
model(x).shape

unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version


torch.Size([1, 2048, 512])

In [81]:
import math
class TransactionEncoder(nn.Module):
    def __init__(self, feature_embeddings: dict[str, tuple[int, int]], linear_proj: int=None):
        super().__init__()
        
        self.feature_embeddings = feature_embeddings
        self.embeddings = nn.ModuleDict({key: nn.Embedding(vocab, dim) for key, (vocab, dim) in feature_embeddings.items()})
        
        if linear_proj is not None:
            self.embedding_dim = linear_proj
            self.linear_proj = nn.Linear(sum([dim for key, (vocab, dim) in feature_embeddings.items()]), linear_proj)
        else:
            self.embedding_dim = sum([dim for key, (vocab, dim) in feature_embeddings.items()])
            self.linear_proj = nn.Identity()

    def forward(self, x: torch.Tensor, device: str="cpu") -> torch.Tensor:
        embeddings = [self.embeddings[key](x[key].to(device)) for key in self.feature_embeddings]
        proj = self.linear_proj(torch.cat(embeddings, dim=2))
        return proj


class Head(nn.Module):
    def __init__(self, embedding_dim: int, hidden_dim: int, vocab_size: int):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, vocab_size),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.head(x)


class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim: int, dropout: float=0.1, max_len: int=1000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2) * (-math.log(10000.0) / embedding_dim))
        pe = torch.zeros(1, max_len, embedding_dim)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``
        """
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


class PerformerModel(nn.Module):
    def __init__(
            self, 
            feature_embeddings: dict[str, tuple[int, int]], 
            linear_proj: int=None,
            n_head: int=8, 
            dim_feedforward: int=128, 
            dropout: float=0.1, 
            num_layers: int=6, 
            head_hidden: int=128,
            max_len: int=1000,
        ):
        super().__init__()

        self.transaction_encoder = TransactionEncoder(feature_embeddings, linear_proj=linear_proj)
        self.embedding_dim = self.transaction_encoder.embedding_dim
        self.cat_cols = list(feature_embeddings.keys())
        self.num_classes_dict = {key: num_classes for key, (num_classes, _) in feature_embeddings.items()}
        
        self.pos_emb = PositionalEncoding(self.embedding_dim, dropout, max_len)

        self.transformer_encoder = Performer(
            dim = self.embedding_dim, 
            depth = num_layers,
            heads = n_head, 
            ff_dropout = dropout,
            causal = True,
            dim_head = 16,
            # use_rezero = True,
            # no_projection = True,
            # feature_redraw_interval = 10000000,
            # bucket_size = 25
        )
        
        self.heads = nn.ModuleDict({
            key: Head(
                self.embedding_dim, 
                head_hidden, 
                num_classes
            ) for key, num_classes in self.num_classes_dict.items()
        })

    def forward(self, x: torch.Tensor, device: str="cpu") -> torch.Tensor:
        N, S = x[self.cat_cols[0]].shape
        embeddings = self.transaction_encoder(x, device=device)
        embeddings = self.pos_emb(embeddings)
        
        attn_mask = self.generate_square_subsequent_mask(S).to(device)
        padding_mask = self.generate_padding_mask(x[self.cat_cols[0]]).to(device)
        embeddings = self.transformer_encoder(embeddings, input_mask=padding_mask)

        logits = {key: self.heads[key](embeddings) for key in self.cat_cols}
        return logits

    @staticmethod
    def generate_square_subsequent_mask(sz: int) -> torch.Tensor:
        return torch.triu(torch.full((sz, sz), True), diagonal=1).bool()
    
    @staticmethod
    def generate_padding_mask(x: torch.Tensor) -> torch.Tensor:
        return torch.where(x == 0, True, False).bool()

In [82]:
from tqdm.notebook import tqdm

In [83]:
import os

def train_epoch(model, optimizer, dataloader, warmup=10, device="cuda"):
    model.train()
    model.to(device)

    metrics = {
        key: {
            "f1_score": MulticlassF1Score(
                num_classes=num_classes, 
                average="weighted", 
                ignore_index=0
            ), 
            "accuracy": Accuracy(
                task="multiclass", 
                num_classes=num_classes, 
                ignore_index=0
            )
        } for key, num_classes in model.num_classes_dict.items()
    }

    loss_epoch = 0
    count = 0 
    for batch_dict in tqdm(dataloader):
        logits_dict = model(batch_dict, device=device)

        loss = 0
        for key, logits in logits_dict.items():
            y = batch_dict[key][:, warmup + 1:].to(device)
            logits_pred = logits[:, warmup: -1].permute(0, 2, 1)    # B x C x T

            loss += nn.functional.cross_entropy(logits_pred, y, ignore_index=0)
            
            y_pred = logits_pred.argmax(dim=1).to("cpu")
            metrics[key]["f1_score"].update(y_pred, y.to("cpu"))
            metrics[key]["accuracy"].update(y_pred, y.to("cpu"))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cur_count =  torch.sum((y != 0).float()).item()
        loss_epoch += loss.item() * cur_count
        count += cur_count

    return loss_epoch / count, {feature: {m: v.compute().item() for m, v in results.items()} for feature, results in metrics.items()}


def eval_epoch(model, dataloader, warmup=10, device="cuda"):
    model.eval()
    model.to(device)

    metrics = {
        key: {
            "f1_score": MulticlassF1Score(
                num_classes=num_classes, 
                average="weighted", 
                ignore_index=0
            ), 
            "accuracy": Accuracy(
                task="multiclass", 
                num_classes=num_classes, 
                ignore_index=0
            )
        } for key, num_classes in model.num_classes_dict.items()
    }

    with torch.no_grad():
        loss_epoch = 0
        count = 0 
        for batch_dict in dataloader:
            logits_dict = model(batch_dict, device=device)

            loss = 0
            for key, logits in logits_dict.items():
                y = batch_dict[key][:, warmup + 1:].to(device)
                logits_pred = logits[:, warmup: -1].permute(0, 2, 1)
        
                loss += nn.functional.cross_entropy(logits_pred, y, ignore_index=0)

                y_pred = logits_pred.argmax(dim=1).to("cpu")
                metrics[key]["f1_score"].update(y_pred, y.to("cpu"))
                metrics[key]["accuracy"].update(y_pred, y.to("cpu"))

            cur_count = torch.sum((y != 0).float()).item()
            loss_epoch += loss.item() * cur_count
            count += cur_count    

    return loss_epoch / count, {feature: {m: v.compute().item() for m, v in results.items()} for feature, results in metrics.items()}


def train_model(model, optimizer, dataloaders, n_epochs, warmup=10, device="cuda", save_path="./"):
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    best_loss = float("inf")
    for epoch in tqdm(range(n_epochs)):
        train_start = time.perf_counter()
        train_loss, train_metrics = train_epoch(model, optimizer, dataloaders["train"], warmup, device)
        train_end = time.perf_counter()
        val_loss, val_metrics = eval_epoch(model, dataloaders["val"], warmup, device)
        val_end = time.perf_counter()

        # wandb.log({
        #     "Epoch": epoch+1,
        #     "Train time": train_end - train_start,
        #     "Train loss": train_loss,
        #     "Train metrics": train_metrics,
        #     "Val time": val_end - train_end,
        #     "Val metrics": val_metrics,
        #     "Val loss": val_loss
        # })

        if val_loss < best_loss:
             best_loss = val_loss
             torch.save(model, os.path.join(save_path, "best_model.pt"))

    model = torch.load(os.path.join(save_path, "best_model.pt"))

    test_start = time.perf_counter()
    test_loss, test_metrics = eval_epoch(model, dataloaders["test"], warmup, device)
    test_end = time.perf_counter()

    # wandb.summary["Test time"] = test_end - test_start
    # wandb.summary["Test metrics"] = test_metrics
    # wandb.summary["Test loss"] = test_loss

In [84]:
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=transaction_collate_fn)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=transaction_collate_fn)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False, collate_fn=transaction_collate_fn)

In [85]:
transformer = PerformerModel(
    feature_embeddings={"small_group": (345, 64), "amount_rur_bin": (11, 64)}, 
    linear_proj=64,
    n_head=8, 
    dim_feedforward=128, 
    dropout=0.1, 
    num_layers=6, 
    head_hidden=128
)

optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-3)

unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version
unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version


In [86]:
train_model(
    transformer, 
    optimizer,
    {"train": train_loader, "val": val_loader, "test": test_loader},
    20
)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/106 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [107]:
a = df.sort_values("TRDATETIME").groupby("client_id").apply(lambda x: x["amount_rur_bin"] == x["amount_rur_bin"].shift())

In [111]:
a.reset_index()["amount_rur_bin"].mean()

0.17164274952957415

In [29]:
config = {
    "experiment_name": "run2",
    "dataset": "rosbank",
    "min_length": 20,
    "max_length": 100,
    "batch_size": 32,
    "type": "transformer",
    "transformer_params": {
        "feature_embeddings": {"small_group": (345, 64), "amount_rur_bin": (11, 64)}, 
        "linear_proj": 64,
        "n_head": 8, 
        "dim_feedforward": 128, 
        "dropout": 0.1, 
        "num_layers": 6, 
        "head_hidden": 128,
    },
    "lr": 1e-3,
    "n_epochs": 50,
    "warmup": 10,
    "device": "cuda",
    "save_path": "./"
}


In [49]:
import json

with open("config.json", "w") as f:
    json.dump(config, f)

In [56]:
from yaml import load, dump
# from yaml import CLoader as Loader, CDumper as Dumper

with open("config.yaml", "w") as f:
    dump(config, f)

In [27]:
os.path.exists(os.path.join("./", "./"))
os.path.join("./", "./")

'././'

In [59]:

with open("config.yaml", "r") as f:
    d = load(f)

d

TypeError: load() missing 1 required positional argument: 'Loader'

In [30]:
def main(config):
    # wandb.login()

    # wandb.init(
    #     project="deep-learning-project",
    #     name=config["experiment_name"], 
    #     tags=config["tags"],
    #     config=config
    # )

    if config["dataset"] == "rosbank":
        df = pd.read_csv('data/rosbank/train.csv')
        df['TRDATETIME'] = pd.to_datetime(df['TRDATETIME'], format='%d%b%y:%H:%M:%S')
        df = df.rename(columns={'cl_id':'client_id', 'MCC':'small_group', 'amount':'amount_rur'})
        
        mcc_to_id = {mcc: i+1 for i, mcc in enumerate(df['small_group'].unique())}

        df['amount_rur_bin'] = 1 + KBinsDiscretizer(10, encode='ordinal', subsample=None).fit_transform(df[['amount_rur']]).astype('int')
        df['small_group'] = df['small_group'].map(mcc_to_id)

    else:
        pass

    clients_train, clients_val_test = train_test_split(df["client_id"].unique(), test_size=0.2, random_state=42)
    clients_val, clients_test = train_test_split(clients_val_test, test_size=0.5, random_state=42)

    train_ds = TransactionDataset(
        df[lambda x: x["client_id"].isin(clients_train)], 
        id_col="client_id", 
        dt_col="TRDATETIME", 
        cat_cols=["small_group", "amount_rur_bin"],
        min_length=config["min_length"],
        max_length=config["max_length"],
        random_slice=True
    )

    val_ds = TransactionDataset(
        df[lambda x: x["client_id"].isin(clients_val)], 
        id_col="client_id", 
        dt_col="TRDATETIME", 
        cat_cols=["small_group", "amount_rur_bin"],
        min_length=config["min_length"],
        max_length=config["max_length"],
        random_slice=False
    )

    test_ds = TransactionDataset(
        df[lambda x: x["client_id"].isin(clients_test)], 
        id_col="client_id", 
        dt_col="TRDATETIME", 
        cat_cols=["small_group", "amount_rur_bin"],
        min_length=config["min_length"],
        max_length=config["max_length"],
        random_slice=False
    )

    train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True, collate_fn=transaction_collate_fn)
    val_loader = DataLoader(val_ds, batch_size=config["batch_size"], shuffle=False, collate_fn=transaction_collate_fn)
    test_loader = DataLoader(test_ds, batch_size=config["batch_size"], shuffle=False, collate_fn=transaction_collate_fn)

    transformer = TransformerModel(**config["transformer_params"])
    optimizer = torch.optim.Adam(transformer.parameters(), lr=config["lr"])

    train_model(
        transformer, 
        optimizer, 
        {"train": train_loader, "val": val_loader, "test": test_loader}, 
        n_epochs=config["n_epochs"],
        warmup=config["warmup"],
        device=config["device"],
        save_path=config["save_path"]
    )

    # wandb.finish()

In [31]:
main(config)



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

628 sequences were filtered
77 sequences were filtered
70 sequences were filtered


  return torch._transformer_encoder_layer_fwd(


0,1
Epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
Train loss,█▅▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train time,█▅▂▁▁▁▂▁▁▁▁▁▂▁▁▂▁▂▁▁▁▁▂▁▂▁▁▁▁▂▂▁▂▂▂▂▁▁▁▁
Val loss,█▅▄▄▃▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▁▂
Val time,▂▃▁▃▂▁▁▁▁▁▃▃▂▁▁▂▁▁▁▁▁▁▁█▁▁▂▁▂█▂▃▂▃▂▆▁▁▃▁

0,1
Epoch,50.0
Test loss,4.94315
Test time,0.19568
Train loss,4.75581
Train time,3.7141
Val loss,5.037
Val time,0.19415


In [12]:
wandb.login()

True