In [1]:
from functools import partial

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.trainer import Trainer
from pytorch_lightning.loggers import CometLogger

from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.datasets import MemoryMapDataset
from ptls.preprocessing import PandasDataPreprocessor
from ptls.frames import PtlsDataModule

from nn.trx_encoder import TimeTrxEncoder
from nn.seq_encoder import ContConvSeqEncoder

from datasets import TS2VecDataset

from utils.encode import encode_data
from utils.evaluation import bootstrap_eval
from utils.preprocessing import CustomDatetimeNormalization

comet_ml is installed but `COMET_API_KEY` is not set.


In [2]:
from ptls.frames.abs_module import ABSModule
from ptls.data_load.padded_batch import PaddedBatch
from ptls.nn.head import Head

from torchmetrics import MeanMetric

from losses.hierarchical_contrastive_loss import HierarchicalContrastiveLoss
from modules import take_per_row, mask_input

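# TS2Vec module adapted for TimeTrxEncoder (its output carries event times alongside the embeddings).
# TODO: merge the two TS2Vec module versions into a single implementation.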

class TS2VecTime(ABSModule):
    '''TS2Vec contrastive module for sequence encoders that also consume event times.

    Two overlapping random crops of every sequence are masked, encoded, and the
    representations of the shared (overlapping) part are returned for the loss.
    '''
    def __init__(
        self,
        seq_encoder,
        mask_mode="binomial",
        head=None,
        loss=None,
        validation_metric=None,
        optimizer_partial=None,
        lr_scheduler_partial=None
    ):
        ''' Initialize a TS2Vec model.

        Args:
            seq_encoder: encoder exposing `trx_encoder` and `seq_encoder`
                sub-modules; both are called separately in `shared_step`.
            mask_mode: masking mode forwarded to `mask_input`.
            head: module applied to both crop representations. Defaults to
                `Head(use_norm_encoder=True)`.
            loss: loss called as `loss((out1, out2), y)`; it must expose a
                `temporal_unit` attribute. Defaults to
                `HierarchicalContrastiveLoss(alpha=0.5, temporal_unit=0)`.
            validation_metric: forwarded to `ABSModule`. The validation step of
                this class tracks the mean loss in `valid_loss` instead.
            optimizer_partial: forwarded to `ABSModule`.
            lr_scheduler_partial: forwarded to `ABSModule`.
        '''
        if head is None:
            head = Head(use_norm_encoder=True)

        if loss is None:
            loss = HierarchicalContrastiveLoss(alpha=0.5, temporal_unit=0)

        super().__init__(validation_metric,
                         seq_encoder,
                         loss,
                         optimizer_partial,
                         lr_scheduler_partial)

        # Assigned after the parent constructor so that the module state is
        # fully initialised before any attribute is attached to it.
        self.temporal_unit = loss.temporal_unit
        self.mask_mode = mask_mode
        self._head = head
        self.valid_loss = MeanMetric()

    def shared_step(self, x, y):
        '''Encode two overlapping, masked crops of the batch.

        Args:
            x: batch accepted by the transaction encoder; must expose `seq_lens`.
            y: targets, returned unchanged.

        Returns:
            `((out1, out2), y)` where `out1` and `out2` are the representations
            of the overlapping part of the two crops (each `crop_l` steps long).

        Raises:
            ValueError: if the sequence dimension is shorter than the minimal
                crop length `2 ** (temporal_unit + 1)`.
        '''
        trx_encoder = self._seq_encoder.trx_encoder
        seq_encoder = self._seq_encoder.seq_encoder

        seq_lens = x.seq_lens
        encoder_out = trx_encoder(x).payload

        embeddings = encoder_out["embeddings"]
        event_time = encoder_out["event_time"]

        ts_l = embeddings.size(1)
        min_crop_l = 2 ** (self.temporal_unit + 1)
        if ts_l < min_crop_l:
            # Same condition under which np.random.randint would fail with an
            # opaque "low >= high" message.
            raise ValueError(
                f"Sequence dimension ({ts_l}) is shorter than the minimal crop length ({min_crop_l})"
            )

        # Crop 1 covers [crop_eleft, crop_right), crop 2 covers [crop_left, crop_eright);
        # they share the window [crop_left, crop_right) of length crop_l.
        crop_l = np.random.randint(low=min_crop_l, high=ts_l + 1)
        crop_left = np.random.randint(ts_l - crop_l + 1)
        crop_right = crop_left + crop_l
        crop_eleft = np.random.randint(crop_left + 1)
        crop_eright = np.random.randint(low=crop_right, high=ts_l + 1)
        # Independent per-row shift of both crops.
        crop_offset = np.random.randint(low=-crop_eleft, high=ts_l - crop_eright + 1, size=embeddings.size(0))

        input1 = take_per_row(embeddings, crop_offset + crop_eleft, crop_right - crop_eleft)
        input2 = take_per_row(embeddings, crop_offset + crop_left, crop_eright - crop_left)

        t1 = take_per_row(event_time, crop_offset + crop_eleft, crop_right - crop_eleft)
        t2 = take_per_row(event_time, crop_offset + crop_left, crop_eright - crop_left)

        input1_masked = mask_input(input1, self.mask_mode)
        input2_masked = mask_input(input2, self.mask_mode)

        # NOTE(review): seq_lens are the lengths of the uncropped batch; confirm the
        # encoder tolerates lengths larger than the cropped sequence dimension.
        out1 = seq_encoder(PaddedBatch({"embeddings": input1_masked, "event_time": t1}, seq_lens)).payload
        out1 = out1[:, -crop_l:]  # the shared window is the tail of crop 1

        out2 = seq_encoder(PaddedBatch({"embeddings": input2_masked, "event_time": t2}, seq_lens)).payload
        out2 = out2[:, :crop_l]  # ... and the head of crop 2

        if self._head is not None:
            out1 = self._head(out1)
            out2 = self._head(out2)

        return (out1, out2), y

    def validation_step(self, batch, _):
        '''Compute the loss on a validation batch and accumulate it in `valid_loss`.'''
        y_h, y = self.shared_step(*batch)
        loss = self._loss(y_h, y)
        self.valid_loss(loss)

    def validation_epoch_end(self, outputs):
        '''Log the accumulated validation loss under `metric_name`.'''
        self.log(self.metric_name, self.valid_loss, prog_bar=True)

    @property
    def is_requires_reduced_sequence(self):
        '''The full per-step output of the encoder is used, not a reduced one.'''
        return False

    @property
    def metric_name(self):
        '''Name under which the validation loss is logged.'''
        return "valid_loss"

# Read and preprocess data

In [3]:
# Load the preprocessed churn transactions and preview the first rows.
churn_parquet_path = "data/preprocessed_new/churn.parquet"
df = pd.read_parquet(churn_parquet_path)
df.head()

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target
0,0,147,2017-10-21 00:00:00,5023.0,0,0,1,0
1,0,244,2017-10-12 12:24:07,20000.0,0,0,0,0
2,0,204,2017-12-05 00:00:00,767.0,0,0,0,0
3,0,158,2017-10-21 00:00:00,2031.0,0,0,1,0
4,0,245,2017-10-24 13:14:24,36562.0,0,0,0,0


In [4]:
# Normalize times for the convolutions: the earliest timestamp (as integer
# epoch seconds) is handed to the datetime normalizer.
earliest_event = df["timestamp"].min()
min_timestamp = int(earliest_event.timestamp())

time_transformer = CustomDatetimeNormalization(
    col_name_original="timestamp",
    min_timestamp=min_timestamp,
    col_name_target="event_time",
)

# "user_id" identifies a sequence, "mcc_code" is categorical and
# "global_target" is taken from the first item of each sequence.
preprocessor = PandasDataPreprocessor(
    col_id="user_id",
    col_event_time=time_transformer,
    cols_category=["mcc_code"],
    cols_first_item=["global_target"],
)

data = preprocessor.fit_transform(df)

In [5]:
val_size = 0.1
test_size = 0.1
holdout_size = test_size + val_size

# Carve out the combined validation+test hold-out first, then split it in two.
train, val_test = train_test_split(data, test_size=holdout_size, random_state=42)
val, test = train_test_split(val_test, test_size=test_size / holdout_size, random_state=42)

# One TS2Vec dataset per split, all with the same minimal sequence length.
train_ds, val_ds, test_ds = (
    TS2VecDataset(split, min_seq_len=15) for split in (train, val, test)
)

datamodule = PtlsDataModule(
    train_data=train_ds,
    valid_data=val_ds,
    train_batch_size=16,
    valid_batch_size=16,
    train_num_workers=8,
    valid_num_workers=8,
)

In [6]:
# Transaction-level encoder: embedded MCC code plus the raw amount.
trx_encoder = TimeTrxEncoder(
    use_batch_norm_with_lens=True,
    norm_embeddings=False,
    embeddings_noise=0.003,
    embeddings={"mcc_code": {"in": 345, "out": 24}},
    numeric_values={"amount": "identity"},
)

# Sequence-level encoder built on top of the transaction encoder.
seq_encoder = ContConvSeqEncoder(
    trx_encoder,
    is_reduce_sequence=False,
    kernel_hiddens=[8, 16, 8],
    hidden_size=32,
    num_layers=10,
    kernel_size=5,
    dropout=0.1,
)

num_params = sum(param.numel() for param in seq_encoder.parameters())
print("Num parameters:", num_params)

Num parameters: 122474


In [7]:
# Optimizer and LR scheduler factories handed to the Lightning module.
optimizer_partial = partial(torch.optim.Adam, lr=3e-4)
lr_scheduler_partial = partial(
    torch.optim.lr_scheduler.ReduceLROnPlateau,
    factor=.9025,
    patience=5,
    mode="min",
)

model = TS2VecTime(
    seq_encoder,
    optimizer_partial=optimizer_partial,
    lr_scheduler_partial=lr_scheduler_partial,
)

In [13]:
# Keep the checkpoint with the lowest validation loss.
checkpoint = ModelCheckpoint(
    monitor="valid_loss",
    mode="min",
    dirpath="logs/"
)

# SECURITY: the Comet API key must never be stored in the notebook. It is not
# passed here, so the logger resolves it from the COMET_API_KEY environment
# variable or the Comet config file. The key that used to be hardcoded in this
# cell should be revoked.
comet_logger = CometLogger(
    project_name="ts2vec-irregular",
    workspace="stalex2902",
    experiment_name="CCNN_TS2Vec_churn_check_logs",
    display_summary_level=0,
)

trainer = Trainer(
    max_epochs=5,
    accelerator="gpu",
    devices=[1],
    callbacks=[checkpoint],
    logger=comet_logger,
    accumulate_grad_batches=4
)

trainer.fit(model, datamodule)

# Optional: restore the best checkpoint and export the encoder weights.
#model.load_state_dict(torch.load(checkpoint.best_model_path)["state_dict"])
#torch.save(model.seq_encoder.state_dict(), "ts2vec_ccnn_churn.pth")

CometLogger will be initialized in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name         | Type                        | Params
-------------------------------------------------------------
0 | _loss        | HierarchicalContrastiveLoss | 0     
1 | _seq_encoder | ContConvSeqEncoder          | 120 K 
2 | _head        | Head                        | 0     
3 | valid_loss   | MeanMetric                  | 0     
-------------------------------------------------------------
120 K     Trainable params
0         Non-trainable params
120 K     Total params
0.481     Total estimated model params size (MB)
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/stalex2902/ts2vec-irregular/499cfa4d0f1c46169b152b50fea4a283



Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[1;38;5;39mCOMET INFO:[0m Please wait for metadata to finish uploading (timeout is 3600 seconds)


In [8]:
# Load the pretrained encoder weights. map_location="cpu" makes loading
# independent of the device the checkpoint was saved from; load_state_dict then
# copies the values into the existing parameters.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
encoder_state = torch.load("ts2vec_ccnn_churn_v2.pth", map_location="cpu")
model.seq_encoder.load_state_dict(encoder_state)

<All keys matched successfully>

# Evaluation

In [9]:
# Train and validation splits are merged for the downstream evaluation;
# sequences are filtered with min_seq_len=15.
seq_len_filter = SeqLenFilter(min_seq_len=15)
train_val_ds = MemoryMapDataset(train + val, [seq_len_filter])

encoder = model.seq_encoder
X_train, y_train = encode_data(encoder, train_val_ds)
X_test, y_test = encode_data(encoder, test_ds)

print("Train size:", len(y_train))
print("Test size:", len(y_test))

Train size: 3961
Test size: 443


In [10]:
# Evaluate the encoded features with 10 bootstrap runs.
results = bootstrap_eval(
    X_train,
    X_test,
    y_train,
    y_test,
    n_runs=10,
)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:17<00:00,  1.70s/it]


In [11]:
# Mean and standard deviation of every metric over the evaluation runs.
# NOTE(review): the saved output of this cell (execution 11) differs from the
# identical cell executed later (execution 15), so `results` was recomputed in
# between; re-run the notebook top to bottom before relying on these numbers.
results.agg(["mean", "std"])

Unnamed: 0,ROC-AUC,PR-AUC,Accuracy
mean,0.734072,0.796331,0.694357
std,0.008304,0.007069,0.011954


In [17]:
# Per-run metrics table.
results

Unnamed: 0,ROC-AUC,PR-AUC,Accuracy
0,0.677374,0.737225,0.6614
1,0.673396,0.727659,0.668172
2,0.659239,0.718063,0.638826
3,0.669629,0.726024,0.645598
4,0.676507,0.739796,0.647856
5,0.672973,0.734677,0.650113
6,0.676993,0.742876,0.641084
7,0.674941,0.741765,0.645598
8,0.658647,0.7178,0.643341
9,0.668211,0.739289,0.638826


In [15]:
# Mean and standard deviation of every metric over the evaluation runs.
results.agg(["mean", "std"])

Unnamed: 0,ROC-AUC,PR-AUC,Accuracy
mean,0.670791,0.732517,0.648081
std,0.00693,0.009485,0.009662
