In [None]:
import os 

# Switch the working directory to the project folder; later cells use paths
# relative to it (e.g. "data/preprocessed_new/age.parquet").
# The guard keeps this cell idempotent: a second os.chdir("app/") issued from
# inside app/ would raise FileNotFoundError (or descend into a nested app/).
if os.path.basename(os.getcwd()) != "app":
    os.chdir("app/")

In [1]:
from functools import partial

import pandas as pd

from sklearn.model_selection import train_test_split

import torch

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.trainer import Trainer

from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.datasets import MemoryMapDataset
from ptls.preprocessing import PandasDataPreprocessor

from ptls.nn import TrxEncoder

from ptls.frames import PtlsDataModule

from nn.seq_encoder import ConvSeqEncoder
from modules.ts2vec_module import TS2Vec
from datasets.datasets import TS2VecDataset
from utils.encode import encode_data
from utils.evaluation import bootstrap_eval

comet_ml is installed but `COMET_API_KEY` is not set.


## Load data

In [2]:
# Load the preprocessed "age" table and preview its first rows
# (columns: user_id, timestamp, mcc_code, amount, global_target).
age_parquet_path = "data/preprocessed_new/age.parquet"
df = pd.read_parquet(age_parquet_path)
df.head()

Unnamed: 0,user_id,timestamp,mcc_code,amount,global_target
0,33172,6,4,71.463,0
1,33172,6,35,45.017,0
2,33172,8,11,13.887,0
3,33172,9,11,15.983,0
4,33172,10,11,21.341,0


## Prepare dataset and dataloaders

In [3]:
# Preprocessor settings: `user_id` is the id column, `timestamp` is the event
# time (transformation "none"), `mcc_code` is categorical and `global_target`
# is handled through `cols_first_item`.
# NOTE(review): `amount` is not listed in any cols_* argument - confirm it is
# passed through unchanged by the preprocessor.
preprocessor_config = {
    "col_id": "user_id",
    "col_event_time": "timestamp",
    "event_time_transformation": "none",
    "cols_category": ["mcc_code"],
    "cols_first_item": ["global_target"],
}
preprocessor = PandasDataPreprocessor(**preprocessor_config)

In [4]:
# Fit the preprocessor on the raw frame and transform it in the same call; the
# result is what gets split into train/val/test in the next cell.
data = preprocessor.fit_transform(df)

In [6]:
# Fractions of the full data reserved for validation and for testing.
val_size = 0.1
test_size = 0.1
holdout_size = test_size + val_size

# Two-stage split with a fixed seed: first carve off the combined val+test
# portion, then divide that portion so the test share equals `test_size` of the
# full data.
train, val_test = train_test_split(data, test_size=holdout_size, random_state=42)
val, test = train_test_split(
    val_test,
    test_size=test_size / holdout_size,
    random_state=42,
)

In [7]:
# Wrap each split in a TS2VecDataset, all with the same min_seq_len=25 setting.
train_ds, val_ds, test_ds = (
    TS2VecDataset(split, min_seq_len=25) for split in (train, val, test)
)

In [8]:
# Train and validation loaders share one batch size and one worker count.
loader_batch_size = 512
loader_num_workers = 16

datamodule = PtlsDataModule(
    train_data=train_ds,
    train_batch_size=loader_batch_size,
    train_num_workers=loader_num_workers,
    valid_data=val_ds,
    valid_batch_size=loader_batch_size,
    valid_num_workers=loader_num_workers,
)

# TS2Vec

## Model training

In [9]:
# Per-transaction encoder: `mcc_code` goes through an embedding table and
# `amount` is declared as a numeric value with the "identity" setting.
trx_encoder = TrxEncoder(
    use_batch_norm_with_lens=True,
    norm_embeddings=False,
    embeddings_noise=0.003,
    embeddings={
        # presumably "in" = dictionary size and "out" = embedding width;
        # see the ptls TrxEncoder docs to confirm
        "mcc_code": {"in": 250, "out": 16},
    },
    numeric_values={
        "amount": "identity",
    },
)

# Convolutional sequence encoder built on top of the transaction encoder.
seq_encoder_params = dict(
    is_reduce_sequence=False,
    hidden_size=800,
    num_layers=10,
    dropout=0.1,
)
seq_encoder = ConvSeqEncoder(trx_encoder, **seq_encoder_params)

In [10]:
# TS2Vec is handed partially-applied constructors (not instances) for the
# optimizer and the LR scheduler.
# NOTE(review): patience=50 is larger than the 15-epoch training budget - if
# the scheduler is stepped once per epoch it would never reduce the LR; confirm.
optimizer_partial = partial(torch.optim.Adam, lr=1e-3)
lr_scheduler_partial = partial(
    torch.optim.lr_scheduler.ReduceLROnPlateau,
    factor=0.99,
    patience=50,
)

model = TS2Vec(
    seq_encoder,
    lr_scheduler_partial=lr_scheduler_partial,
    optimizer_partial=optimizer_partial,
)

In [11]:
# Checkpoint callback monitoring "valid_loss" in "min" mode.
checkpoint = ModelCheckpoint(mode="min", monitor="valid_loss")

# 15 epochs on GPU device 0, with the checkpoint callback attached.
trainer_options = {
    "max_epochs": 15,
    "accelerator": "gpu",
    "devices": [0],
    "callbacks": [checkpoint],
}
trainer = Trainer(**trainer_options)

trainer.fit(model, datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Restore the weights of the best checkpoint recorded by the ModelCheckpoint
# callback before exporting anything.
best_state_dict = torch.load(checkpoint.best_model_path)["state_dict"]
model.load_state_dict(best_state_dict)

# Export only the sequence encoder. The file is named after the method trained
# in this notebook (TS2Vec); the previous "coles_age.pth" name was misleading
# and could overwrite an encoder exported from a CoLES run.
torch.save(model.seq_encoder.state_dict(), "ts2vec_age.pth")

## Model evaluation

In [12]:
# Evaluation uses train and validation records together as one dataset, with a
# SeqLenFilter(min_seq_len=25) applied.
min_len_filter = SeqLenFilter(min_seq_len=25)
train_val_ds = MemoryMapDataset(train + val, [min_len_filter])

In [13]:
# Run the trained sequence encoder over the train+val dataset and the test
# dataset; each call yields a (features, targets) pair.
encoder = model.seq_encoder
X_train, y_train = encode_data(encoder, train_val_ds)
X_test, y_test = encode_data(encoder, test_ds)

# Report how many targets ended up in each part.
for caption, targets in (("Train size:", y_train), ("Test size:", y_test)):
    print(caption, len(targets))

Train size: 27000
Test size: 3000


In [None]:
# Evaluate the embeddings over 10 runs; the resulting table (displayed below)
# has one row per run with ROC-AUC, PR-AUC and Accuracy columns.
results = bootstrap_eval(X_train, X_test, y_train, y_test, n_runs=10)

In [19]:
# Show the per-run metrics table.
results

Unnamed: 0,ROC-AUC,PR-AUC,Accuracy
0,0.817385,0.608651,0.569
1,0.816652,0.610217,0.57
2,0.818529,0.607414,0.574333
3,0.816339,0.607431,0.564667
4,0.817575,0.608622,0.569333
5,0.81579,0.606155,0.569333
6,0.815205,0.604773,0.563
7,0.816465,0.606905,0.57
8,0.813719,0.602924,0.559667
9,0.816431,0.608816,0.568


In [20]:
# Summarise each metric across the runs by its mean and standard deviation.
results.agg(["mean", "std"])

Unnamed: 0,ROC-AUC,PR-AUC,Accuracy
mean,0.816409,0.607191,0.567733
std,0.001331,0.002142,0.004183
