In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from dataset import TransactionDataset, transaction_collate_fn

from torchmetrics.classification import MulticlassF1Score, Accuracy

from sklearn.model_selection import train_test_split

import time

import wandb

In [24]:
df = pd.read_csv('data/rosbank/train.csv')
df['TRDATETIME'] = pd.to_datetime(df['TRDATETIME'], format='%d%b%y:%H:%M:%S')
df = df.rename(columns={'cl_id':'client_id', 'MCC':'small_group', 'amount':'amount_rur'})
df.head()

Unnamed: 0,PERIOD,client_id,small_group,channel_type,currency,TRDATETIME,amount_rur,trx_category,target_flag,target_sum
0,01/10/2017,0,5200,,810,2017-10-21 00:00:00,5023.0,POS,0,0.0
1,01/10/2017,0,6011,,810,2017-10-12 12:24:07,20000.0,DEPOSIT,0,0.0
2,01/12/2017,0,5921,,810,2017-12-05 00:00:00,767.0,POS,0,0.0
3,01/10/2017,0,5411,,810,2017-10-21 00:00:00,2031.0,POS,0,0.0
4,01/10/2017,0,6012,,810,2017-10-24 13:14:24,36562.0,C2C_OUT,0,0.0


In [25]:
mcc_to_id = {mcc: i+1 for i, mcc in enumerate(df['small_group'].unique())}

df['amount_rur_bin'] = 1 + KBinsDiscretizer(10, encode='ordinal', subsample=None).fit_transform(df[['amount_rur']]).astype('int')
df['small_group'] = df['small_group'].map(mcc_to_id)

In [26]:
clients_train, clients_val = train_test_split(df["client_id"].unique(), test_size=0.1, random_state=42)

train_ds = TransactionDataset(
    df[lambda x: x["client_id"].isin(clients_train)], 
    id_col="client_id", 
    dt_col="TRDATETIME", 
    cat_cols=["small_group", "amount_rur_bin"],
    min_length=20,
    max_length=100
)

val_ds = TransactionDataset(
    df[lambda x: x["client_id"].isin(clients_val)], 
    id_col="client_id", 
    dt_col="TRDATETIME", 
    cat_cols=["small_group", "amount_rur_bin"],
    min_length=20,
    max_length=100
)

696 sequences were filtered
79 sequences were filtered


In [43]:
class TransactionEncoder(nn.Module):
    def __init__(self, feature_embeddings, linear_proj=None):
        super().__init__()
        
        self.feature_embeddings = feature_embeddings
        self.embeddings = nn.ModuleDict({key: nn.Embedding(vocab, dim) for key, (vocab, dim) in feature_embeddings.items()})
        self.linear_proj = nn.Identity()
        if linear_proj is not None:
            self.embedding_dim = linear_proj
            self.linear_proj = nn.Linear(sum([dim for key, (vocab, dim) in feature_embeddings.items()]), linear_proj)
        else:
            self.embedding_dim = sum([dim for key, (vocab, dim) in feature_embeddings.items()])

    def forward(self, x, device="cpu"):
        embeddings = [self.embeddings[key](x[key].to(device)) for key in self.feature_embeddings]
        proj = self.linear_proj(torch.cat(embeddings, dim=2))
        return proj


class Head(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super().__init__()

        self.head = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, vocab_size),
        )

    def forward(self, x):
        return self.head(x)


class TransformerModel(nn.Module):
    def __init__(
            self, 
            feature_embeddings, 
            linear_proj=None,
            n_head=8, 
            dim_feedforward=128, 
            dropout=0.1, 
            num_layers=6, 
            head_hidden=128,
        ):
        super().__init__()

        self.transaction_encoder = TransactionEncoder(feature_embeddings, linear_proj=linear_proj)
        self.cat_cols = list(feature_embeddings.keys())
        self.num_classes_dict = {key: num_classes for key, (num_classes, _) in feature_embeddings.items()}

        self.embedding_dim = self.transaction_encoder.embedding_dim
        self.encoder_layer = nn.TransformerEncoderLayer(
            self.embedding_dim, 
            n_head, 
            dim_feedforward=dim_feedforward, 
            dropout=dropout, 
            activation="gelu",
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        
        self.heads = nn.ModuleDict({
            key: Head(
                self.embedding_dim, 
                head_hidden, 
                num_classes
            ) for key, num_classes in self.num_classes_dict.items()
        })

    def forward(self, x, device="cpu"):
        N, S = x[self.cat_cols[0]].shape
        embeddings = self.transaction_encoder(x, device=device)#.to(device)
        
        attn_mask = self.generate_square_subsequent_mask(S).to(device)
        padding_mask = self.generate_padding_mask(x[self.cat_cols[0]]).to(device)

        encoded = self.transformer_encoder(embeddings, mask=attn_mask, is_causal=True, src_key_padding_mask=padding_mask)
        logits = {key: self.heads[key](encoded) for key in self.cat_cols}
        return logits

    @staticmethod
    def generate_square_subsequent_mask(sz):
        return torch.triu(torch.full((sz, sz), True), diagonal=1).bool()
    
    @staticmethod
    def generate_padding_mask(x):
        return torch.where(x == 0, True, 0).bool()

In [39]:
def train_epoch(model, optimizer, dataloader, warmup=10, device="cuda"):
    model.train()
    model.to(device)

    metrics = {
        key: {
            "f1_score": MulticlassF1Score(
                num_classes=num_classes, 
                average="weighted", 
                ignore_index=0
            ), 
            "accuracy": Accuracy(
                task="multiclass", 
                num_classes=num_classes, 
                ignore_index=0
            )
        } for key, num_classes in model.num_classes_dict.items()
    }

    loss_epoch = 0
    count = 0 
    for batch_dict in dataloader:
        logits_dict = model(batch_dict, device=device)

        loss = 0
        for key, logits in logits_dict.items():
            y = batch_dict[key][:, warmup + 1:].to(device)
            logits_pred = logits[:, warmup: -1].permute(0, 2, 1)    # B x C x T

            loss += nn.functional.cross_entropy(logits_pred, y, ignore_index=0)
            
            y_pred = logits_pred.argmax(dim=1).to("cpu")
            metrics[key]["f1_score"].update(y_pred, y.to("cpu"))
            metrics[key]["accuracy"].update(y_pred, y.to("cpu"))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cur_count =  torch.sum((y != 0).float()).item()
        loss_epoch += loss.item() * cur_count
        count += cur_count

    return loss_epoch / count, {feature: {m: v.compute().item() for m, v in results.items()} for feature, results in metrics.items()}

def val_epoch(model, dataloader, warmup=10, device="cuda"):
    model.eval()
    model.to(device)

    metrics = {
        key: {
            "f1_score": MulticlassF1Score(
                num_classes=num_classes, 
                average="weighted", 
                ignore_index=0
            ), 
            "accuracy": Accuracy(
                task="multiclass", 
                num_classes=num_classes, 
                ignore_index=0
            )
        } for key, num_classes in model.num_classes_dict.items()
    }

    with torch.no_grad():
        loss_epoch = 0
        count = 0 
        for batch_dict in dataloader:
            logits_dict = model(batch_dict, device=device)

            loss = 0
            for key, logits in logits_dict.items():
                y = batch_dict[key][:, warmup + 1:].to(device)
                logits_pred = logits[:, warmup: -1].permute(0, 2, 1)
        
                loss += nn.functional.cross_entropy(logits_pred, y, ignore_index=0)

                y_pred = logits_pred.argmax(dim=1).to("cpu")
                metrics[key]["f1_score"].update(y_pred, y.to("cpu"))
                metrics[key]["accuracy"].update(y_pred, y.to("cpu"))

            cur_count = torch.sum((y != 0).float()).item()
            loss_epoch += loss.item() * cur_count
            count += cur_count    

    return loss_epoch / count, {feature: {m: v.compute().item() for m, v in results.items()} for feature, results in metrics.items()}


def train_model(model, optimizer, dataloaders, n_epochs, warmup=10, device="cuda"):
    for epoch in range(n_epochs):
        train_start = time.perf_counter()
        train_loss, train_metrics = train_epoch(model, optimizer, dataloaders["train"], warmup, device)
        train_end = time.perf_counter()
        val_loss, val_metrics = val_epoch(model, dataloaders["val"], warmup, device)
        val_end = time.time()

        wandb.log({
            "Epoch": epoch+1,
            "Train time": train_end - train_start,
            "Train loss": train_loss,
            "Train metrics": train_metrics,
            "Val time": val_end - train_end,
            "Val metrics": val_metrics,
            "Val loss": val_loss
        })

        # print("Epoch", epoch+1)
        # print("Train time:", )
        # print("Val time:", val_end - train_end)
        # print("Train loss", train_loss)
        # print("Val loss", val_loss)
        # print("Train metrics", train_metrics)
        # print("Val metrics", val_metrics, end="\n\n")

In [34]:
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=transaction_collate_fn)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=transaction_collate_fn)

In [35]:
transformer = TransformerModel(
    feature_embeddings={"small_group": (345, 64), "amount_rur_bin": (11, 64)}, 
    linear_proj=64,
    n_head=8, 
    dim_feedforward=128, 
    dropout=0.1, 
    num_layers=6, 
    head_hidden=128
)

optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-3)

In [36]:
train_model(
    transformer, 
    optimizer,
    {"train": train_loader, "val": val_loader},
    20
)

tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        ...,
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True]], device='cuda:0')
tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        ...,
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ...,  True,  True,  True]], device='cuda:0')
tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ...,  True,  True,  True

KeyboardInterrupt: 

In [107]:
a = df.sort_values("TRDATETIME").groupby("client_id").apply(lambda x: x["amount_rur_bin"] == x["amount_rur_bin"].shift())

In [111]:
a.reset_index()["amount_rur_bin"].mean()

0.17164274952957415

In [54]:
config = {
    "experiment_name": "run2",
    "dataset": "rosbank",
    "min_length": 20,
    "max_length": 100,
    "batch_size": 32,
    "transformer_params": {
        "feature_embeddings": {"small_group": (345, 64), "amount_rur_bin": (11, 64)}, 
        "linear_proj": 64,
        "n_head": 8, 
        "dim_feedforward": 128, 
        "dropout": 0.1, 
        "num_layers": 6, 
        "head_hidden": 128,
    },
    "lr": 1e-3,
    "n_epochs": 30,
    "warmup": 10,
    "device": "cuda"
}


In [49]:
import json

with open("config.json", "w") as f:
    json.dump(config, f)

In [56]:
from yaml import load, dump
# from yaml import CLoader as Loader, CDumper as Dumper

with open("config.yaml", "w") as f:
    dump(config, f)

In [59]:

with open("config.yaml", "r") as f:
    d = load(f)

d

TypeError: load() missing 1 required positional argument: 'Loader'

In [45]:
def main(config):
    wandb.login()

    wandb.init(
        project="deep-learning-project",
        name=config["experiment_name"], 
        config=config
    )

    if config["dataset"] == "rosbank":
        df = pd.read_csv('data/rosbank/train.csv')
        df['TRDATETIME'] = pd.to_datetime(df['TRDATETIME'], format='%d%b%y:%H:%M:%S')
        df = df.rename(columns={'cl_id':'client_id', 'MCC':'small_group', 'amount':'amount_rur'})
        
        mcc_to_id = {mcc: i+1 for i, mcc in enumerate(df['small_group'].unique())}

        df['amount_rur_bin'] = 1 + KBinsDiscretizer(10, encode='ordinal', subsample=None).fit_transform(df[['amount_rur']]).astype('int')
        df['small_group'] = df['small_group'].map(mcc_to_id)

    else:
        pass

    clients_train, clients_val = train_test_split(df["client_id"].unique(), test_size=0.1, random_state=42)

    train_ds = TransactionDataset(
        df[lambda x: x["client_id"].isin(clients_train)], 
        id_col="client_id", 
        dt_col="TRDATETIME", 
        cat_cols=["small_group", "amount_rur_bin"],
        min_length=config["min_length"],
        max_length=config["max_length"],
        random_slice=True
    )

    val_ds = TransactionDataset(
        df[lambda x: x["client_id"].isin(clients_val)], 
        id_col="client_id", 
        dt_col="TRDATETIME", 
        cat_cols=["small_group", "amount_rur_bin"],
        min_length=config["min_length"],
        max_length=config["max_length"],
        random_slice=False
    )

    train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True, collate_fn=transaction_collate_fn)
    val_loader = DataLoader(val_ds, batch_size=config["batch_size"], shuffle=False, collate_fn=transaction_collate_fn)

    transformer = TransformerModel(**config["transformer_params"])
    optimizer = torch.optim.Adam(transformer.parameters(), lr=config["lr"])

    train_model(
        transformer, 
        optimizer, 
        {"train": train_loader, "val": val_loader}, 
        n_epochs=config["n_epochs"],
        warmup=config["warmup"],
        device=config["device"]
    )

    wandb.finish()

In [46]:
main(config)



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

696 sequences were filtered
79 sequences were filtered


  return torch._transformer_encoder_layer_fwd(


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
Train loss,█▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Train time,▇█▄▂▁▁▁▂▂▂▃▂▆▇▅▄▅▅▅▅▅▆▆▂▂▅▆▄▄▄
Val loss,█▆▄▄▃▃▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Val time,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Epoch,30.0
Train loss,4.84069
Train time,4.01987
Val loss,4.96302
Val time,1684610611.3276


In [12]:
wandb.login()

True