In [1]:
import os 

# Switch into the project directory (presumably so the project-local imports and
# the relative data paths used below resolve). The guard makes the cell safe to
# re-run in a live kernel: without it a second run would look for "app/app".
if os.path.basename(os.getcwd()) != "app":
    os.chdir("app/")

In [2]:
from functools import partial

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.trainer import Trainer

from ptls.preprocessing import PandasDataPreprocessor
from ptls.nn import TrxEncoder
from ptls.frames import PtlsDataModule
    
from nn.seq_encoder import ConvSeqEncoder
from modules.ts2vec_module import TS2Vec
from datasets import TS2VecDataset
from utils.encode import encode_data
from utils.evaluation import bootstrap_eval

## Load data

In [3]:
# Read the preprocessed churn transactions and preview the first rows.
churn_parquet = "data/preprocessed_new/churn.parquet"
df = pd.read_parquet(churn_parquet)
df.head()

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target,minute,hour,day,month,day_of_week,time_delta
0,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0,24,12,12,10,3,0
1,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0,0,0,21,10,5,732953
2,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0,0,0,21,10,5,0
3,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0,14,13,24,10,1,306864
4,0,10,2017-12-05 00:00:00,767.0,0,0,0,0,0,0,5,12,1,3581136


## Prepare dataset and dataloaders

In [4]:
# Preprocessor settings: rows are keyed by `user_id` and timed by `timestamp`
# (converted with the "dt_to_timestamp" transformation); `mcc_code` is declared
# categorical and `global_target` is listed under `cols_first_item`.
preprocessor_config = dict(
    col_id="user_id",
    col_event_time="timestamp",
    event_time_transformation="dt_to_timestamp",
    cols_category=["mcc_code"],
    cols_first_item=["global_target"],
)
preprocessor = PandasDataPreprocessor(**preprocessor_config)

In [5]:
# Fit the preprocessor on the full frame and transform it in one step.
# NOTE(review): the fit happens before the train/val/test split below, so
# whatever the preprocessor learns is learned from all users — confirm that
# this is intended.
data = preprocessor.fit_transform(df)

In [6]:
# Fractions of the records reserved for validation and for testing.
val_size = 0.1
test_size = 0.1
holdout_size = test_size + val_size

# First carve off the combined hold-out set, then divide it between val and test.
train, val_test = train_test_split(data, test_size=holdout_size, random_state=42)
val, test = train_test_split(
    val_test,
    test_size=test_size / holdout_size,
    random_state=42,
)

In [7]:
# Wrap every split in a TS2VecDataset with the same minimum sequence length.
train_ds, val_ds, test_ds = (
    TS2VecDataset(split, min_seq_len=15) for split in (train, val, test)
)

In [8]:
# Training and validation loaders share the same batch size and worker count.
loader_batch_size = 16
loader_num_workers = 8

datamodule = PtlsDataModule(
    train_data=train_ds,
    train_batch_size=loader_batch_size,
    train_num_workers=loader_num_workers,
    valid_data=val_ds,
    valid_batch_size=loader_batch_size,
    valid_num_workers=loader_num_workers,
)

# TS2Vec

## Model training

In [9]:
# Transaction encoder: `mcc_code` goes through an embedding table (345 -> 24)
# and `amount` is passed as a numeric value with the "identity" setting.
mcc_embedding = {"in": 345, "out": 24}

trx_encoder = TrxEncoder(
    embeddings={"mcc_code": mcc_embedding},
    numeric_values={"amount": "identity"},
    use_batch_norm_with_lens=True,
    norm_embeddings=False,
    embeddings_noise=0.0003,
)

# Convolutional sequence encoder built on top of the transaction encoder.
conv_config = dict(hidden_size=1024, num_layers=10, dropout=0.1)
seq_encoder = ConvSeqEncoder(trx_encoder, **conv_config)

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F


def hierarchical_contrastive_loss_joint(z1, z2, alpha=0.5, temporal_unit=0):
    """Average the joint contrastive loss over a pyramid of temporal resolutions.

    Starting from the full-resolution embeddings, the value of
    ``level_contrastive_loss`` is accumulated, both views are max-pooled along
    dimension 1 with kernel size 2, and the process repeats until a single
    step is left along that dimension. The accumulated sum is divided by the
    number of levels visited.

    Args:
        z1: Embeddings of the first view, indexed as (batch, time, channels)
            by the pooling below.
        z2: Embeddings of the second view, same layout as ``z1``.
        alpha: Only consulted at the final single-step level: when it is 0
            that level adds nothing to the sum (it still counts in the divisor).
        temporal_unit: Kept for interface compatibility; not used.

    Returns:
        Scalar tensor with the level-averaged loss. When the time dimension is
        empty a zero loss is returned, because no level is visited and the
        average would otherwise be ``0 / 0`` (NaN).
    """
    if z1.size(1) == 0:
        # Same convention as the degenerate case in level_contrastive_loss:
        # a zero that can still be passed to backward().
        return z1.new_tensor(0., requires_grad=True)

    def _halve(z):
        # max_pool1d pools the last axis, so move time there and back.
        return F.max_pool1d(z.transpose(1, 2), kernel_size=2).transpose(1, 2)

    loss = torch.tensor(0., device=z1.device)
    d = 0  # number of resolution levels visited
    while z1.size(1) > 1:
        loss += level_contrastive_loss(z1, z2)
        d += 1
        z1, z2 = _halve(z1), _halve(z2)
    if z1.size(1) == 1:
        if alpha != 0:
            loss += level_contrastive_loss(z1, z2)
        d += 1
    return loss / d

def level_contrastive_loss(z1, z2):
    """Joint contrastive loss between two views at a single resolution level.

    Both inputs are flattened to one row per (sample, step) pair. The loss is
    the mean, over all rows of the stacked views, of the log-sum-exp of that
    row's dot products with every row (itself included), minus twice the mean
    dot product between matching rows of ``z1`` and ``z2``.

    Args:
        z1: Tensor of shape (B, T, C) for the first view.
        z2: Tensor of the same shape for the second view.

    Returns:
        Scalar tensor. For ``B == 1`` a zero tensor with ``requires_grad=True``
        is returned instead.
    """
    B, _, C = z1.shape

    if B == 1:
        return z1.new_tensor(0., requires_grad=True)

    z1 = z1.reshape(-1, C)  # (BT, C)
    z2 = z2.reshape(-1, C)  # (BT, C)

    z = torch.cat([z1, z2], dim=0)  # (2BT, C)
    sim = torch.matmul(z, z.T)  # (2BT, 2BT)

    # logsumexp is the same quantity as exp().sum().log() but does not overflow
    # to inf when the similarities are large.
    log_partition = torch.logsumexp(sim, dim=1).mean()
    positive_term = (z1 * z2).sum(dim=1).mean()

    return log_partition - 2 * positive_term


class HierarchicalContrastiveLossJoint(nn.Module):
    """Module wrapper around ``hierarchical_contrastive_loss_joint``.

    Stores ``alpha`` and ``temporal_unit`` at construction time and forwards
    them, together with the two embedding views, on every call.
    """

    def __init__(self, alpha=0.5, temporal_unit=0):
        super().__init__()
        self.alpha, self.temporal_unit = alpha, temporal_unit

    def forward(self, embeddings, _):
        """Return the loss for a pair of views; the second argument is ignored."""
        first_view, second_view = embeddings
        return hierarchical_contrastive_loss_joint(
            first_view,
            second_view,
            alpha=self.alpha,
            temporal_unit=self.temporal_unit,
        )

In [11]:
# Factories for the optimizer and the plateau-based learning-rate scheduler;
# they are handed to TS2Vec as partials.
optimizer_partial = partial(torch.optim.Adam, lr=4e-3)
lr_scheduler_partial = partial(
    torch.optim.lr_scheduler.ReduceLROnPlateau,
    mode="min",
    factor=.9025,
    patience=5,
)

# TS2Vec module trained with the joint hierarchical contrastive loss.
model = TS2Vec(
    seq_encoder,
    loss=HierarchicalContrastiveLossJoint(),
    optimizer_partial=optimizer_partial,
    lr_scheduler_partial=lr_scheduler_partial,
)

In [12]:
# Checkpoint callback that tracks "valid_loss" and treats lower as better.
checkpoint = ModelCheckpoint(monitor="valid_loss", mode="min")

# Train for at most 50 epochs on GPU 0.
trainer_config = dict(
    max_epochs=50,
    accelerator="gpu",
    devices=[0],
    callbacks=[checkpoint],
)
trainer = Trainer(**trainer_config)

trainer.fit(model, datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name         | Type                             | Params
------------------------------------------------------------------
0 | _loss        | HierarchicalContrastiveLossJoint | 0     
1 | _seq_encoder | ConvSeqEncoder                   | 3.3 M 
2 | _head        | Head                             | 0     
3 | valid_loss   | MeanMetric                       | 0     
------------------------------------------------------------------
3.3 M     Trainable params
0         Non-trainable params
3.3 M     Total params
13.190    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [13]:
# Restore the weights of the best checkpoint. map_location="cpu" stops
# torch.load from placing the tensors on the GPU they were saved from;
# load_state_dict then copies them into the model's existing parameters.
best_state = torch.load(checkpoint.best_model_path, map_location="cpu")["state_dict"]
model.load_state_dict(best_state)

<All keys matched successfully>

In [60]:
# Persist only the sequence-encoder weights.
encoder_weights = model.seq_encoder.state_dict()
torch.save(encoder_weights, "ts2vec_churn.pth")

## Model evaluation

In [14]:
# Downstream evaluation fits on train and validation records together.
train_val_records = train + val
train_val_ds = TS2VecDataset(train_val_records, min_seq_len=15)

In [15]:
# Run both evaluation sets through the trained sequence encoder.
encoder = model.seq_encoder
X_train, y_train = encode_data(encoder, train_val_ds)
X_test, y_test = encode_data(encoder, test_ds)

# Report how many labelled examples each set contains.
print("Train size:", len(y_train))
print("Test size:", len(y_test))

Train size: 3961
Test size: 443


In [16]:
# Score the embeddings with bootstrap_eval over 10 runs; the result is shown
# in the next cell.
results = bootstrap_eval(X_train, X_test, y_train, y_test, n_runs=10)

100%|██████████| 10/10 [00:34<00:00,  3.41s/it]


In [17]:
# Display the per-run metrics returned by bootstrap_eval.
results

Unnamed: 0,ROC-AUC,PR-AUC,Accuracy
0,0.613827,0.680214,0.598194
1,0.605616,0.682446,0.602709
2,0.590719,0.687127,0.623025
3,0.605616,0.684079,0.61851
4,0.613933,0.700334,0.591422
5,0.605331,0.680462,0.582393
6,0.584137,0.659196,0.591422
7,0.60586,0.688606,0.591422
8,0.595163,0.682059,0.593679
9,0.615171,0.68674,0.595937


In [59]:
# Summarise the runs: mean and standard deviation of every metric column.
# NOTE(review): the output saved under this cell looks stale — it has no
# Accuracy column and its mean ROC-AUC is higher than every per-run value in
# the table above. Re-run this cell before quoting the numbers.
results.agg(["mean", "std"])

Unnamed: 0,ROC-AUC,PR-AUC
mean,0.675032,0.756812
std,0.012706,0.007555


# TS2Vec with time features

## Model training

In [23]:
# Transaction encoder with time features: in addition to `amount`, the numeric
# inputs now include `event_time` and `time_delta`, all with the "identity" setting.
mcc_embedding = {"in": 345, "out": 24}
numeric_features = {
    name: "identity" for name in ("amount", "event_time", "time_delta")
}

trx_encoder = TrxEncoder(
    embeddings={"mcc_code": mcc_embedding},
    numeric_values=numeric_features,
    use_batch_norm_with_lens=True,
    norm_embeddings=False,
    embeddings_noise=0.0003,
)

# Convolutional sequence encoder built on top of the transaction encoder.
conv_config = dict(hidden_size=1024, num_layers=10, dropout=0.1)
seq_encoder = ConvSeqEncoder(trx_encoder, **conv_config)

In [24]:
# Factories for the optimizer and the plateau-based learning-rate scheduler.
optimizer_partial = partial(torch.optim.Adam, lr=4e-3)
lr_scheduler_partial = partial(
    torch.optim.lr_scheduler.ReduceLROnPlateau,
    mode="min",
    factor=.9025,
    patience=5,
)

# NOTE(review): no `loss` is passed here, so TS2Vec falls back to whatever its
# default is, unlike the first experiment which passed
# HierarchicalContrastiveLossJoint — confirm that the difference is intended.
model = TS2Vec(
    seq_encoder,
    optimizer_partial=optimizer_partial,
    lr_scheduler_partial=lr_scheduler_partial,
)

In [None]:
# Checkpoint callback that tracks "valid_loss" and treats lower as better.
checkpoint = ModelCheckpoint(monitor="valid_loss", mode="min")

# Train for at most 50 epochs on GPU 3.
trainer_config = dict(
    max_epochs=50,
    accelerator="gpu",
    devices=[3],
    callbacks=[checkpoint],
)
trainer = Trainer(**trainer_config)

trainer.fit(model, datamodule)

In [34]:
# Restore the weights of the best checkpoint. map_location="cpu" stops
# torch.load from placing the tensors on the GPU they were saved from;
# load_state_dict then copies them into the model's existing parameters.
best_state = torch.load(checkpoint.best_model_path, map_location="cpu")["state_dict"]
model.load_state_dict(best_state)

<All keys matched successfully>

In [40]:
# Persist only the sequence-encoder weights of the time-feature model.
encoder_weights = model.seq_encoder.state_dict()
torch.save(encoder_weights, "ts2vec_churn_date.pth")

## Model evaluation

In [35]:
# Downstream evaluation fits on train and validation records together.
train_val_records = train + val
train_val_ds = TS2VecDataset(train_val_records, min_seq_len=15)

In [36]:
# Run both evaluation sets through the trained sequence encoder.
encoder = model.seq_encoder
X_train, y_train = encode_data(encoder, train_val_ds)
X_test, y_test = encode_data(encoder, test_ds)

# Report how many labelled examples each set contains.
print("Train size:", len(y_train))
print("Test size:", len(y_test))

Train size: 3961
Test size: 443


In [37]:
# Score the time-feature embeddings with bootstrap_eval over 10 runs; the
# result is shown in the next cell.
results = bootstrap_eval(X_train, X_test, y_train, y_test, n_runs=10)

In [38]:
# Display the per-run metrics returned by bootstrap_eval.
results

Unnamed: 0,ROC-AUC,PR-AUC
0,0.734573,0.807961
1,0.715507,0.813229
2,0.726342,0.804885
3,0.721855,0.796088
4,0.72886,0.818177
5,0.709264,0.79614
6,0.732415,0.812827
7,0.718575,0.813551
8,0.716311,0.802546
9,0.708757,0.794835


In [39]:
# Summarise the runs: mean and standard deviation of every metric column.
results.agg(["mean", "std"])

Unnamed: 0,ROC-AUC,PR-AUC
mean,0.721246,0.806024
std,0.009133,0.008441
