In [1]:
# Restrict this kernel to physical GPU 3. With a single visible device the
# Trainer further down addresses it as index 0 (devices=[0]).
%env CUDA_VISIBLE_DEVICES=3

env: CUDA_VISIBLE_DEVICES=3


In [2]:
import os

#os.chdir("app/")

In [3]:
from functools import partial

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.trainer import Trainer

from ptls.preprocessing import PandasDataPreprocessor
from ptls.nn import TrxEncoder
from ptls.frames import PtlsDataModule
    
from nn.seq_encoder import ConvSeqEncoder
from modules.ts2vec_module_dynamic_pool import TS2VecDynamicPool
from datasets import TS2VecDataset
from utils.encode import encode_data
from utils.evaluation import bootstrap_eval

## Load data

In [4]:
# Location of the preprocessed transactions table (relative to the current
# working directory of the kernel).
parquet_path = "data/preprocessed_new/default_date.parquet"
df = pd.read_parquet(parquet_path)

# Keep the preview as the last expression so the notebook renders it.
df.head()

Unnamed: 0,user_id,mcc_code,amount,timestamp,holiday_target,weekend_target,global_target,default_target,time_delta
0,69,5,-342.89792,2021-03-05 02:52:36,0,0,0,0,0
1,69,21,-1251.8812,2021-03-05 09:43:28,0,0,0,0,24652
2,69,12,-87.30924,2021-03-05 11:17:23,0,0,0,0,5635
3,69,6,-1822.177,2021-03-05 13:41:03,0,0,0,0,8620
4,69,18,-427.12363,2021-03-05 19:14:23,0,0,0,0,20000


## Prepare dataset and dataloaders

In [5]:
# Preprocessor configuration for the transactions frame:
#   - records are keyed by "user_id";
#   - "timestamp" is the event-time column (transformed with
#     "dt_to_timestamp"; it shows up as `event_time` in the batches below);
#   - "mcc_code" is handled as a categorical feature;
#   - "global_target" is listed in cols_first_item (it appears as one value
#     per user in the batches below).
preprocessor = PandasDataPreprocessor(
    cols_first_item=["global_target"],
    cols_category=["mcc_code"],
    event_time_transformation="dt_to_timestamp",
    col_event_time="timestamp",
    col_id="user_id",
)

In [6]:
# Fit the preprocessor on the whole frame and transform it in one go.
# The result is an indexable collection of per-user records; each record is a
# dict of fields (see the `data[0]` preview in the next cell).
# NOTE(review): fitting happens before the train/val/test split, so whatever
# `fit` learns (presumably the mcc_code vocabulary) also sees test users --
# confirm this is acceptable.
data = preprocessor.fit_transform(df)

In [7]:
# Inspect the first record to check which fields the preprocessor produced.
data[0]

{'user_id': 69,
 'amount': tensor([-3.4290e+02, -1.2519e+03, -8.7309e+01, -1.8222e+03, -4.2712e+02,
         -1.5881e+02, -9.9954e+02, -3.2054e+02, -6.6186e+02, -1.3906e+02,
         -3.0815e+02, -2.3189e+03, -6.7941e+02, -8.6535e+02, -4.6291e+02,
         -5.7949e+03, -7.1238e+02, -1.2665e+02, -3.4464e+03, -1.2212e+03,
         -2.5147e+02, -1.4552e+03, -9.0377e+02, -3.8657e+01, -1.1423e+03,
         -8.2171e+02, -6.7822e+02, -8.9096e+01, -2.0213e+03, -7.9014e+02,
         -3.5479e+01, -7.4267e+01, -7.0800e+02, -2.2680e+02, -9.1732e+01,
         -4.4947e+02, -6.8309e+02, -3.5448e+02, -1.5044e+02, -3.9984e+02,
         -4.7700e+02, -4.9064e+02, -2.7975e+03, -1.4055e+03, -7.2766e+03,
         -2.2740e+03, -2.1838e+01, -9.8174e+02, -8.5465e+02, -5.5266e+02,
         -1.1802e+03, -7.8140e+02, -8.7765e+02, -4.7904e+02, -5.4868e+02,
         -2.4586e+03, -1.5617e+02, -4.6330e+01, -4.0802e+02, -6.9189e+01,
         -6.9850e+02, -7.5377e+01, -5.7819e+02, -2.9812e+01, -1.2153e+03,
         -3.

In [8]:
# Hold-out fractions, both expressed relative to the full dataset.
val_size = 0.1
test_size = 0.1

# Two-stage split with a fixed seed:
#   1) carve off the combined val+test part,
#   2) divide that part between val and test.
holdout_share = test_size + val_size
train, val_test = train_test_split(
    data,
    test_size=holdout_share,
    random_state=42,
)
val, test = train_test_split(
    val_test,
    test_size=test_size / holdout_share,
    random_state=42,
)

In [9]:
# Wrap every split in a TS2VecDataset using the same minimum sequence length.
# Construction order (train, val, test) is unchanged.
train_ds, val_ds, test_ds = (
    TS2VecDataset(split, min_seq_len=15) for split in (train, val, test)
)

In [10]:
# Data module used for training; only the train and validation splits are
# registered here (test_ds is encoded separately in the evaluation section).
datamodule = PtlsDataModule(
    # training loader
    train_data=train_ds,
    train_batch_size=128,
    train_num_workers=8,
    # validation loader
    valid_data=val_ds,
    valid_batch_size=128,
    valid_num_workers=8,
)

In [11]:
# Sanity check: draw exactly one training batch and keep it in `batch`, which
# the following cells inspect. An explicit next(iter(...)) states that intent
# directly and fails loudly (StopIteration) if the loader yields nothing,
# instead of leaving `batch` undefined.
batch = next(iter(datamodule.train_dataloader()))

# batch[0] carries the feature tensors in `.payload` (field name -> tensor);
# print every field together with its shape.
for field_name, field in batch[0].payload.items():
    print(field_name, field.shape)

user_id torch.Size([128])
amount torch.Size([128, 300])
holiday_target torch.Size([128, 300])
weekend_target torch.Size([128, 300])
global_target torch.Size([128])
default_target torch.Size([128, 300])
time_delta torch.Size([128, 300])
event_time torch.Size([128, 300])
mcc_code torch.Size([128, 300])


In [12]:
# Sequence lengths of the samples in the batch drawn from the train dataloader.
batch[0].seq_lens

tensor([300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300,
        300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300,
        300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300,
        300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300,
        300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300,
        300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300,
        300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300,
        300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300,
        300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300, 300,
        300, 300])

In [13]:
# Raw "time_delta" feature of the same batch (values are not rescaled here).
batch[0].payload["time_delta"]

tensor([[      0,     201,  100616,  ...,   46206,    9562,   15544],
        [      0,    4883,  110328,  ...,   19312,    3134,  309238],
        [      0,     262,   10370,  ...,    3132,   17424,  839443],
        ...,
        [      0,  145450,  128970,  ...,   35769,    1135,  177391],
        [      0,   82700,   11949,  ...,   58276,  160635,   51853],
        [      0,    2040,   87840,  ...,    4554, 2940358,  162698]])

# TS2Vec with time features

## Model training

In [14]:
# Seed torch's global RNG before any weights are created so that parameter
# initialisation (and every torch-driven random draw after this point) is
# repeatable on Restart & Run All. 42 matches the random_state used for the
# train/val/test split. This does not by itself guarantee bit-identical GPU
# training.
torch.manual_seed(42)

# Per-transaction encoder: one learned embedding for the categorical
# "mcc_code" plus three numeric features passed through as-is ("identity").
# NOTE(review): "in": 345 is hard-coded; presumably it must cover the largest
# encoded mcc_code id -- confirm against the fitted preprocessor.
# NOTE(review): "event_time" and "time_delta" are fed unscaled (time_delta
# reaches ~1e6 in the sample batch); normalisation presumably relies on
# use_batch_norm_with_lens -- confirm.
trx_encoder = TrxEncoder(
    embeddings={
        "mcc_code": {"in": 345, "out": 24}
    },
    numeric_values={
        "amount": "identity",
        "event_time": "identity",
        "time_delta": "identity",
    },
    use_batch_norm_with_lens=True,
    norm_embeddings=False,
    embeddings_noise=0.0003
)

# Sequence encoder built on top of the transaction encoder.
seq_encoder = ConvSeqEncoder(
    trx_encoder,
    hidden_size=128,
    num_layers=5,
    dropout=0.1,
)

In [15]:
# Factories handed to the Lightning module: an AdamW optimizer and a
# ReduceLROnPlateau scheduler configured to track a metric that should
# decrease (mode="min").
optimizer_partial = partial(torch.optim.AdamW, lr=4e-3)

lr_scheduler_partial = partial(
    torch.optim.lr_scheduler.ReduceLROnPlateau,
    mode="min",
    patience=5,
    factor=0.9025,
)

# TS2Vec module wrapping the sequence encoder defined earlier.
model = TS2VecDynamicPool(
    seq_encoder,
    lr_scheduler_partial=lr_scheduler_partial,
    optimizer_partial=optimizer_partial,
)

In [16]:
# Keep the checkpoint with the lowest "valid_loss"; its path is read back
# through `checkpoint.best_model_path` after training.
checkpoint = ModelCheckpoint(mode="min", monitor="valid_loss")

# Single-GPU training on device index 0.
# NOTE(review): EarlyStopping is imported at the top of the notebook but is
# not registered here; only the checkpoint callback is active.
trainer = Trainer(
    accelerator="gpu",
    devices=[0],
    max_epochs=50,
    log_every_n_steps=20,
    callbacks=[checkpoint],
)

trainer.fit(model, datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]

  | Name         | Type            | Params
-------------------------------------------------
0 | _loss        | DynamicPoolLoss | 0     
1 | _seq_encoder | ConvSeqEncoder  | 93.8 K
2 | _head        | Head            | 0     
3 | valid_loss   | MeanMetric      | 0     
-------------------------------------------------
93.8 K    Trainable params
0         Non-trainable params
93.8 K    Total params
0.375     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [17]:
# Restore the weights of the best checkpoint (lowest monitored valid_loss)
# into the in-memory model. The load_state_dict call stays last so its
# key-matching report is displayed by the notebook.
best_ckpt = torch.load(checkpoint.best_model_path)
model.load_state_dict(best_ckpt["state_dict"])

<All keys matched successfully>

In [18]:
# Persist only the sequence encoder's weights (not the whole Lightning module).
encoder_state = model.seq_encoder.state_dict()
torch.save(encoder_state, "ts2vec_dynamic_pool_default_date.pth")

## Model evaluation

In [19]:
# For the downstream evaluation the train and validation records are
# concatenated and used together as the fitting set.
train_val = train + val
train_val_ds = TS2VecDataset(train_val, min_seq_len=15)

In [20]:
# Encode both evaluation sets with the trained sequence encoder; encode_data
# returns a (features, targets) pair for each dataset.
encoder = model.seq_encoder
X_train, y_train = encode_data(encoder, train_val_ds)
X_test, y_test = encode_data(encoder, test_ds)

# Report how many samples ended up in each set.
for caption, targets in (("Train size:", y_train), ("Test size:", y_test)):
    print(caption, len(targets))

Train size: 6372
Test size: 708


In [21]:
# Evaluate the embeddings with the project's bootstrap_eval helper over 10
# runs. The returned table has one row per run and the columns ROC-AUC,
# PR-AUC and Accuracy (see the output of the next cell).
# NOTE(review): in the recorded output Accuracy is identical in every run
# (0.961864), which would be consistent with always predicting one class on
# an imbalanced target -- confirm, and rely on ROC-AUC / PR-AUC instead.
results = bootstrap_eval(X_train, X_test, y_train, y_test, n_runs=10)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:59<00:00,  5.94s/it]


In [22]:
# Per-run metrics table.
results

Unnamed: 0,ROC-AUC,PR-AUC,Accuracy
0,0.588514,0.06355,0.961864
1,0.603198,0.059045,0.961864
2,0.582259,0.058822,0.961864
3,0.630119,0.103049,0.961864
4,0.596563,0.064465,0.961864
5,0.605482,0.10179,0.961864
6,0.598738,0.067069,0.961864
7,0.535705,0.056891,0.961864
8,0.614891,0.084487,0.961864
9,0.593735,0.072825,0.961864


In [23]:
# Summarise the runs: mean and standard deviation of every metric column.
# (An Accuracy std around 1e-16 is floating-point noise, i.e. the value is constant.)
results.agg(["mean", "std"])

Unnamed: 0,ROC-AUC,PR-AUC,Accuracy
mean,0.59492,0.073199,0.9618644
std,0.02482,0.017372,1.170278e-16
