In [1]:
import os 

#os.chdir("app/")

In [2]:
%env CUDA_VISIBLDE_DEVICES=5

env: CUDA_VISIBLDE_DEVICES=5


In [3]:
from functools import partial

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import torch.nn as nn
import torch

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.trainer import Trainer

from ptls.preprocessing import PandasDataPreprocessor
from ptls.nn import TrxEncoder
from ptls.frames import PtlsDataModule
    
from nn.seq_encoder import ConvSeqEncoder
from modules.ts2vec_module import TS2Vec
from datasets import TS2VecDataset
from utils.encode import encode_data
from utils.evaluation import bootstrap_eval

## Load data

In [4]:
df = pd.read_parquet("data/preprocessed_new/churn.parquet")
df.head()

Unnamed: 0,user_id,mcc_code,timestamp,amount,global_target,holiday_target,weekend_target,churn_target,minute,hour,day,month,day_of_week,time_delta
0,0,2,2017-10-12 12:24:07,20000.0,0,0,0,0,24,12,12,10,3,0
1,0,19,2017-10-21 00:00:00,5023.0,0,0,1,0,0,0,21,10,5,732953
2,0,1,2017-10-21 00:00:00,2031.0,0,0,1,0,0,0,21,10,5,0
3,0,9,2017-10-24 13:14:24,36562.0,0,0,0,0,14,13,24,10,1,306864
4,0,10,2017-12-05 00:00:00,767.0,0,0,0,0,0,0,5,12,1,3581136


## Prepare dataset and dataloaders

In [5]:
preprocessor = PandasDataPreprocessor(
    col_id="user_id",
    col_event_time="timestamp",
    event_time_transformation="dt_to_timestamp",
    cols_category=["mcc_code"],
    cols_first_item=["global_target"]
)

In [6]:
data = preprocessor.fit_transform(df)

In [7]:
data[0]

{'user_id': 0,
 'amount': tensor([20000.,  5023.,  2031., 36562.,   767.], dtype=torch.float64),
 'global_target': 0,
 'holiday_target': tensor([0, 0, 0, 0, 0]),
 'weekend_target': tensor([0, 1, 1, 0, 0]),
 'churn_target': tensor([0, 0, 0, 0, 0]),
 'minute': tensor([24,  0,  0, 14,  0], dtype=torch.int32),
 'hour': tensor([12,  0,  0, 13,  0], dtype=torch.int32),
 'day': tensor([12, 21, 21, 24,  5], dtype=torch.int32),
 'month': tensor([10, 10, 10, 10, 12], dtype=torch.int32),
 'day_of_week': tensor([3, 5, 5, 1, 1], dtype=torch.int32),
 'time_delta': tensor([      0,  732953,       0,  306864, 3581136]),
 'event_time': tensor([1507811047, 1508544000, 1508544000, 1508850864, 1512432000]),
 'mcc_code': tensor([ 2, 19,  1,  9, 10])}

In [8]:
val_size = 0.1
test_size = 0.1

train, val_test = train_test_split(data, test_size=test_size+val_size, random_state=42)
val, test = train_test_split(val_test, test_size=test_size/(test_size+val_size), random_state=42)

In [9]:
train_ds = TS2VecDataset(train, min_seq_len=15)
val_ds = TS2VecDataset(val, min_seq_len=15)
test_ds = TS2VecDataset(test, min_seq_len=15)

In [10]:
datamodule = PtlsDataModule(
    train_data=train_ds,
    valid_data=val_ds,
    train_batch_size=128,
    valid_batch_size=128,
    train_num_workers=8,
    valid_num_workers=8,
)

In [11]:
for batch in datamodule.train_dataloader():
    batch
    for field_name, field in batch[0].payload.items():
        print(field_name, field.shape)
    break

user_id torch.Size([128])
amount torch.Size([128, 439])
global_target torch.Size([128])
holiday_target torch.Size([128, 439])
weekend_target torch.Size([128, 439])
churn_target torch.Size([128, 439])
minute torch.Size([128, 439])
hour torch.Size([128, 439])
day torch.Size([128, 439])
month torch.Size([128, 439])
day_of_week torch.Size([128, 439])
time_delta torch.Size([128, 439])
event_time torch.Size([128, 439])
mcc_code torch.Size([128, 439])


In [184]:
batch[0].seq_lens

tensor([112, 120,  65, 171,  49, 147,  77,  30,  22,  36,  56, 119, 160,  17,
        108,  17, 135,  42,  37,  34,  19, 206,  22, 104, 104,  93,  94, 118,
         22,  47,  37,  88, 127, 140, 190,  19, 144, 162,  37, 127,  23, 187,
        115, 164,  24,  60,  94,  55,  44,  44,  71,  77, 140, 144, 259,  55,
        174, 133,  21, 100, 153,  32, 141,  94,  41, 111,  23, 323,  89, 112,
        186, 290, 172,  23, 278,  79, 281, 125, 108,  44,  93,  66,  85, 131,
         40,  16,  63,  65, 159, 105, 173,  65,  33, 104,  40,  70, 147, 164,
        111,  43,  20, 125, 101,  37, 123,  41, 138, 145,  91,  41, 155,  31,
        233,  62, 161, 439,  35, 222, 148, 208, 126,  47,  16, 135, 131,  18,
        206,  25])

In [18]:
event_time = batch[0].payload["event_time"]
ones = torch.ones_like(batch[0].payload["amount"])


In [22]:
event_time[0, :10].diff()

tensor([ 86400,      0, 172800, 259200,      0,      0,  86400,      0,      0])

In [25]:
24 * 60 * 60

86400

In [269]:
def pool_fixed_span(stepwise_embeds, timestamps, non_padded_lengths, span: int, stride: int):
    batch_size = len(stepwise_embeds)
    seq_len, dim = stepwise_embeds[0].shape
    
    pooled_embeds = []
    pooled_time = []
    next_none_padded_lengths = []

    for sample_i in range(batch_size):
        print(sample_i)
        left = 0
        right = 1

        pooled_embeds_i = []
        pooled_time_i = []

        while right <= non_padded_lengths[sample_i]:
            if right < non_padded_lengths[sample_i] and timestamps[sample_i][right] - timestamps[sample_i][left] < span:
                right += 1
            else:
                #print(stepwise_embeds[sample_i, :, left:right+1])
                      
                pooled_embeds_i.append(stepwise_embeds[sample_i][left:right].sum(axis=-2))
                pooled_time_i.append(timestamps[sample_i][left:right].sum(axis=-1).item() // (right - left) )

                print(timestamps[sample_i][left:right+1] // (24 * 60 * 60), "->", pooled_time_i[-1] // (24 * 60 * 60))
                
                left += stride
                right = max(right, left + 1)

        #print([s.shape for s in pooled_embeds_i])
        
        pooled_embeds.append(torch.stack(pooled_embeds_i))
        pooled_time.append(torch.tensor(pooled_time_i))
        next_none_padded_lengths.append(len(pooled_time_i))

    return pooled_embeds, pooled_time, next_none_padded_lengths

In [270]:
embeds = torch.ones(2, 439, 4)
none_padded_lengths = [10, 8]

In [271]:
event_time[0, :15] // (24 * 60 * 60)

tensor([17141, 17142, 17142, 17144, 17147, 17147, 17147, 17148, 17148, 17148,
        17149, 17149, 17149, 17150, 17150])

In [272]:
event_time[1, :15] // (24 * 60 * 60)

tensor([17114, 17114, 17117, 17117, 17118, 17118, 17118, 17118, 17118, 17119,
        17119, 17121, 17126, 17128, 17129])

In [273]:
pooled_embeds, pooled_time, next_none_padded_lengths = pool_fixed_span(embeds, event_time, none_padded_lengths, 2 * 24 * 60 * 60, 2)

0
tensor([17141, 17142, 17142, 17144]) -> 17141
tensor([17142, 17144]) -> 17142
tensor([17147, 17147, 17147, 17148, 17148, 17148, 17149]) -> 17147
tensor([17147, 17148, 17148, 17148, 17149]) -> 17147
tensor([17148, 17148, 17149]) -> 17148
1
tensor([17114, 17114, 17117]) -> 17114
tensor([17117, 17117, 17118, 17118, 17118, 17118, 17118]) -> 17117
tensor([17118, 17118, 17118, 17118, 17118]) -> 17118
tensor([17118, 17118, 17118]) -> 17118


In [274]:
pooled_embeds

[tensor([[3., 3., 3., 3.],
         [1., 1., 1., 1.],
         [6., 6., 6., 6.],
         [4., 4., 4., 4.],
         [2., 2., 2., 2.]]),
 tensor([[2., 2., 2., 2.],
         [6., 6., 6., 6.],
         [4., 4., 4., 4.],
         [2., 2., 2., 2.]])]

In [275]:
pooled_embeds, pooled_time, next_none_padded_lengths = pool_fixed_span(pooled_embeds, pooled_time, next_none_padded_lengths, 4 * 24 * 60 * 60, 2)

0
tensor([17141, 17142, 17147]) -> 17141
tensor([17147, 17147, 17148]) -> 17147
tensor([17148]) -> 17148
1
tensor([17114, 17117, 17118]) -> 17115
tensor([17118, 17118]) -> 17118


In [276]:
pooled_embeds

[tensor([[ 4.,  4.,  4.,  4.],
         [12., 12., 12., 12.],
         [ 2.,  2.,  2.,  2.]]),
 tensor([[8., 8., 8., 8.],
         [6., 6., 6., 6.]])]

In [277]:
pooled_embeds, pooled_time, next_none_padded_lengths = pool_fixed_span(pooled_embeds, pooled_time, next_none_padded_lengths, 8 * 24 * 60 * 60, 2)

0
tensor([17141, 17147, 17148]) -> 17145
tensor([17148]) -> 17148
1
tensor([17115, 17118]) -> 17116


In [278]:
pooled_embeds

[tensor([[18., 18., 18., 18.],
         [ 2.,  2.,  2.,  2.]]),
 tensor([[14., 14., 14., 14.]])]

In [279]:
pooled_time

[tensor([1481402400, 1481587200]), tensor([1478901600])]

In [282]:
pooled_embeds, pooled_time, next_none_padded_lengths = pool_fixed_span(pooled_embeds, pooled_time, next_none_padded_lengths, 16 * 24 * 60 * 60, 2)

0
tensor([17145, 17148]) -> 17146
1
tensor([17116]) -> 17116


In [283]:
pooled_embeds

[tensor([[20., 20., 20., 20.]]), tensor([[14., 14., 14., 14.]])]

In [284]:
pooled_time

[tensor([1481494800]), tensor([1478901600])]

In [285]:
pooled_time[0]

tensor([1481494800])

In [286]:
next_none_padded_lengths

[1, 1]

# TS2Vec with time features

## Model training

In [23]:
trx_encoder = TrxEncoder(
    embeddings={
        "mcc_code": {"in": 345, "out": 24}
    },
    numeric_values={
        "amount": "identity",
        "event_time": "identity",
        "time_delta": "identity",
    },
    use_batch_norm_with_lens=True,
    norm_embeddings=False,
    embeddings_noise=0.0003
)

seq_encoder = ConvSeqEncoder(
    trx_encoder,
    hidden_size=1024,
    num_layers=10,
    dropout=0.1,
)

In [24]:
lr_scheduler_partial = partial(torch.optim.lr_scheduler.ReduceLROnPlateau, factor=.9025, patience=5, mode="min")
optimizer_partial = partial(torch.optim.Adam, lr=4e-3)

model = TS2Vec(
    seq_encoder,
    optimizer_partial=optimizer_partial,
    lr_scheduler_partial=lr_scheduler_partial
)

In [None]:
checkpoint = ModelCheckpoint(
    monitor="valid_loss", 
    mode="min"
)

trainer = Trainer(
    max_epochs=50,
    devices=[3],
    accelerator="gpu",
    callbacks=[checkpoint]
)

trainer.fit(model, datamodule)

In [34]:
model.load_state_dict(torch.load(checkpoint.best_model_path)["state_dict"])

<All keys matched successfully>

In [40]:
torch.save(model.seq_encoder.state_dict(), "ts2vec_churn_date.pth")

## Model evaluation

In [35]:
train_val_ds = TS2VecDataset(train + val, min_seq_len=15)

In [36]:
X_train, y_train = encode_data(model.seq_encoder, train_val_ds)
X_test, y_test = encode_data(model.seq_encoder, test_ds)

print("Train size:", len(y_train))
print("Test size:", len(y_test))

Train size: 3961
Test size: 443


In [37]:
results = bootstrap_eval(X_train, X_test, y_train, y_test, n_runs=10)

In [38]:
results

Unnamed: 0,ROC-AUC,PR-AUC
0,0.734573,0.807961
1,0.715507,0.813229
2,0.726342,0.804885
3,0.721855,0.796088
4,0.72886,0.818177
5,0.709264,0.79614
6,0.732415,0.812827
7,0.718575,0.813551
8,0.716311,0.802546
9,0.708757,0.794835


In [39]:
results.agg(["mean", "std"])

Unnamed: 0,ROC-AUC,PR-AUC
mean,0.721246,0.806024
std,0.009133,0.008441
