In [1]:
import os

# Work from the `app/` directory so the relative paths used below resolve.
# Guarded so that re-running this cell (when already inside `app/`) is a no-op
# instead of raising FileNotFoundError.
if os.path.basename(os.getcwd()) != 'app':
    os.chdir('app/')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ptls.preprocessing import PandasDataPreprocessor
import torch
from torch.utils.data import Dataset

import pytorch_lightning as pl

from pytorch_lightning.callbacks import ModelCheckpoint

from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder, AggFeatureSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.datasets import MemoryMapDataset
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices, NoSplit
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import inference_data_loader
from ptls.data_load.utils import collate_feature_dict

from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression, LinearRegression
from lightgbm import LGBMClassifier, LGBMRegressor

In [3]:
# Directory (relative to the current working directory) that holds the input files.
path_to_data = 'data'

In [4]:
# Load the training table and preview its first rows.
train_file = path_to_data + '/train_dataset_hackaton2023_train.gzip'
df_train = pd.read_parquet(train_file)
df_train.head()

Unnamed: 0,customer_id,date_diff_post,buy_post,group_name,revenue,startdatetime,dish_name,ownareaall_sqm,format_name
0,29891,9.0,1,train,69.99,2022-12-05 12:03:58,Кинг Фри станд,300.0,Отдельно стоящий без внешней зоны
1,29891,9.0,1,train,190.0,2022-12-05 12:03:58,Чикен Тар-Тар,300.0,Отдельно стоящий без внешней зоны
2,29891,9.0,1,train,9.99,2022-12-05 12:03:58,Соус Сырный,300.0,Отдельно стоящий без внешней зоны
3,29891,9.0,1,train,119.99,2022-12-05 12:03:58,Энергет.нап. Адреналин Раш,300.0,Отдельно стоящий без внешней зоны
4,29891,9.0,1,train,119.99,2022-12-05 14:28:35,Латте (СТАНД.),300.0,Отдельно стоящий без внешней зоны


In [5]:
# df_train = df_train.drop_duplicates()

In [6]:
# Collapse the item-level rows into one row per (customer, order timestamp):
# summed revenue plus the first `buy_post` / `date_diff_post` value of the group.
data_check_agg = (
    df_train
    .groupby(['customer_id', 'startdatetime'])
    .agg(
        revenue=('revenue', 'sum'),
        buy_post=('buy_post', 'first'),
        date_diff_post=('date_diff_post', 'first'),
    )
    .reset_index()
)
data_check_agg.head()

Unnamed: 0,customer_id,startdatetime,revenue,buy_post,date_diff_post
0,29891,2022-12-05 12:03:58,389.97,1,9.0
1,29891,2022-12-05 14:28:35,119.99,1,9.0
2,29891,2022-12-15 00:37:19,269.99,1,9.0
3,29891,2022-12-20 09:20:38,144.97,1,9.0
4,29891,2022-12-21 09:46:23,184.96,1,9.0


In [7]:
# Time elapsed since the same customer's previous row (missing for the first row).
order_times = data_check_agg.groupby('customer_id')['startdatetime']
data_check_agg['delta'] = order_times.diff()

In [8]:
# Preview the table with the new `delta` column.
data_check_agg.head()

Unnamed: 0,customer_id,startdatetime,revenue,buy_post,date_diff_post,delta
0,29891,2022-12-05 12:03:58,389.97,1,9.0,NaT
1,29891,2022-12-05 14:28:35,119.99,1,9.0,0 days 02:24:37
2,29891,2022-12-15 00:37:19,269.99,1,9.0,9 days 10:08:44
3,29891,2022-12-20 09:20:38,144.97,1,9.0,5 days 08:43:19
4,29891,2022-12-21 09:46:23,184.96,1,9.0,1 days 00:25:45


In [9]:
# Whole hours / whole days since the customer's previous row; rows without a
# previous one (missing delta) get 0.
delta_seconds = data_check_agg['delta'].dt.total_seconds()
data_check_agg['delta_hours'] = (delta_seconds // (60 * 60)).fillna(0)
data_check_agg['delta_days'] = (delta_seconds // (24 * 60 * 60)).fillna(0)

In [10]:
# The raw timedelta is no longer needed once the numeric deltas exist.
data_check_agg = data_check_agg.drop(columns=['delta'])

In [11]:
# Preview the table with `delta_hours` / `delta_days` in place of `delta`.
data_check_agg.head()

Unnamed: 0,customer_id,startdatetime,revenue,buy_post,date_diff_post,delta_hours,delta_days
0,29891,2022-12-05 12:03:58,389.97,1,9.0,0.0,0.0
1,29891,2022-12-05 14:28:35,119.99,1,9.0,2.0,0.0
2,29891,2022-12-15 00:37:19,269.99,1,9.0,226.0,9.0
3,29891,2022-12-20 09:20:38,144.97,1,9.0,128.0,5.0
4,29891,2022-12-21 09:46:23,184.96,1,9.0,24.0,1.0


In [12]:
# Turns the order-level table into per-customer records: numeric sequences
# ordered by `startdatetime`, plus the first `buy_post` value of each customer.
preprocessor = PandasDataPreprocessor(
    col_id='customer_id',
    col_event_time='startdatetime',
    cols_numerical=['revenue', 'delta_hours', 'delta_days'],
    # Must be a list of column names. A bare string only appears to work because
    # membership tests on a string are substring checks (e.g. 'post' in 'buy_post').
    cols_first_item=['buy_post'],
)

In [13]:
class BKDataset(Dataset):
    """Map-style dataset over an in-memory, indexable collection of records."""

    def __init__(self, data):
        # Any object supporting `len()` and integer indexing.
        self.data = data

    def __len__(self):
        """Number of records."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the record stored at position `index`."""
        return self.data[index]

    @staticmethod
    def collate_fn(batch):
        """Collate records into a batch and return `(batch, target)`.

        The target is the `buy_post` entry of the collated payload.
        """
        features = collate_feature_dict(batch)
        return features, features.payload['buy_post']

In [14]:
# Fit the preprocessor on the order-level table and transform it in one pass.
data = preprocessor.fit_transform(data_check_agg)

In [15]:
# Hold out 20% of the preprocessed records for validation (fixed seed for reproducibility).
train_data, valid_data = train_test_split(data, test_size=.2, random_state=42)

In [16]:
# Wrap both splits in datasets and hand them to the data module.
train_dataset = BKDataset(train_data)
valid_dataset = BKDataset(valid_data)

datamodule = PtlsDataModule(
    train_data=train_dataset,
    train_batch_size=1024,
    valid_data=valid_dataset,
    valid_batch_size=1024,
)

In [18]:
from typing import Any, List
import torch
import torch.nn as nn
import pytorch_lightning as pl

from torchmetrics import AUROC


class RNNModel(pl.LightningModule):
    """Binary classifier: a sequence encoder followed by an MLP head.

    Args:
        seq_encoder: module mapping a padded batch to embeddings. It must expose
            `embedding_size` and `is_reduce_sequence`. When `is_reduce_sequence`
            is false, its per-step output is mean-pooled over the valid steps.
        optimizer_partial: callable `params -> optimizer`.
        lr_scheduler_partial: callable `optimizer -> scheduler`.
        head_hidden: width of the hidden layer of the head.
        dropout: dropout probability used in the head.

    Logged metrics: `train_loss` / `valid_loss` per step, and `train_auc` /
    `valid_auc` once per epoch, computed over the whole epoch.
    """

    def __init__(self, seq_encoder, optimizer_partial, lr_scheduler_partial, head_hidden=512, dropout=0.1):
        super().__init__()

        self.seq_encoder = seq_encoder
        self.head = nn.Sequential(
            nn.Dropout(dropout),
            nn.BatchNorm1d(seq_encoder.embedding_size),
            nn.Linear(seq_encoder.embedding_size, head_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.BatchNorm1d(head_hidden),
            nn.Linear(head_hidden, 1)
        )

        self._optimizer_partial = optimizer_partial
        self._lr_scheduler_partial = lr_scheduler_partial

        # Assigning the metrics as attributes registers them as submodules, so they
        # follow the model across devices. (An nn.ModuleDict cannot be used with
        # these keys: "train" clashes with nn.Module.train.)
        self.train_auroc = AUROC(task="binary")
        self.valid_auroc = AUROC(task="binary")
        # Plain lookup table by stage; it references the registered metrics above.
        self.metric = {"train": self.train_auroc, "valid": self.valid_auroc}
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, X):
        """Return one logit per sequence, shape `(batch,)`."""
        embeddings = self.seq_encoder(X)

        if not self.seq_encoder.is_reduce_sequence:
            # Mean pool over valid time steps only: padded positions are masked out
            # so they cannot leak into the average.
            payload = embeddings.payload  # (batch, time, features)
            seq_lens = X.seq_lens
            steps = torch.arange(payload.shape[1], device=seq_lens.device)
            valid = (steps.unsqueeze(0) < seq_lens.unsqueeze(1)).unsqueeze(-1)
            # clamp avoids a division by zero for empty sequences
            embeddings = (payload * valid).sum(dim=1) / seq_lens.clamp(min=1).unsqueeze(1)

        # squeeze(-1) keeps the batch dimension even when the batch has one element
        return self.head(embeddings).squeeze(-1)

    def shared_step(self, stage, batch, _):
        """Compute the loss, accumulate the stage metric and log the loss.

        `stage` is "train" or "valid"; `batch` is an `(X, y)` pair.
        """
        X, y = batch

        logits = self(X)
        loss = self.loss(logits, y.float())

        # Only accumulate here; the AUC is computed once per epoch in the
        # epoch-end hooks instead of over a growing buffer at every step.
        self.metric[stage].update(logits.detach(), y.long())
        self.log(f'{stage}_loss', loss, prog_bar=True, batch_size=y.shape[0])

        return loss

    def training_step(self, *args, **kwargs):
        return self.shared_step('train', *args, **kwargs)

    def validation_step(self, *args, **kwargs):
        return self.shared_step('valid', *args, **kwargs)

    def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any:
        """Return predicted probabilities for a batch of features (no targets)."""
        return torch.sigmoid(self(batch))

    @property
    def metric_name(self):
        """Name of the logged metric monitored by plateau schedulers."""
        return 'valid_auc'

    def on_train_epoch_end(self):
        # AUC over every training batch of the epoch, then start fresh.
        self.log('train_auc', self.metric["train"].compute(), prog_bar=True)
        self.metric["train"].reset()

    def on_validation_epoch_end(self):
        # AUC over the whole validation set. Resetting here (and not after each
        # batch) makes the logged value a true epoch-level AUC; it also clears the
        # state left by the sanity-check run.
        self.log(self.metric_name, self.metric["valid"].compute(), prog_bar=True)
        self.metric["valid"].reset()

    def configure_optimizers(self):
        """Build the optimizer and scheduler from the partials given at init."""
        optimizer = self._optimizer_partial(self.parameters())
        scheduler = self._lr_scheduler_partial(optimizer)

        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            # Plateau schedulers need to know which logged metric to watch.
            # NOTE: the monitored metric is an AUC (higher is better), so the
            # scheduler passed in should be created with mode='max'.
            scheduler = {
                'scheduler': scheduler,
                'monitor': self.metric_name,
            }
        return [optimizer], [scheduler]


In [29]:
class BOTModel(pl.LightningModule):
    """Binary classifier: mean-pooled transaction embeddings followed by an MLP head.

    Args:
        trx_encoder: module mapping a padded batch to per-step embeddings. It must
            expose `output_size`.
        optimizer_partial: callable `params -> optimizer`.
        lr_scheduler_partial: callable `optimizer -> scheduler`.
        head_hidden: width of the hidden layer of the head.
        dropout: dropout probability used in the head.

    Logged metrics: `train_loss` / `valid_loss` per step, and `train_auc` /
    `valid_auc` once per epoch, computed over the whole epoch.
    """

    def __init__(self, trx_encoder, optimizer_partial, lr_scheduler_partial, head_hidden=512, dropout=0.1):
        super().__init__()

        self.trx_encoder = trx_encoder

        # NOTE: `query` and `attn` are not used by `forward` at the moment (the
        # attention pooling there is commented out). They are kept so the module's
        # parameters and state_dict layout stay unchanged.
        self.query = nn.Parameter(torch.randn(trx_encoder.output_size), requires_grad=True)
        self.attn = nn.MultiheadAttention(trx_encoder.output_size, 4, batch_first=True)

        self.head = nn.Sequential(
            nn.Dropout(dropout),
            nn.BatchNorm1d(trx_encoder.output_size),
            nn.Linear(trx_encoder.output_size, head_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.BatchNorm1d(head_hidden),
            nn.Linear(head_hidden, 1)
        )

        self._optimizer_partial = optimizer_partial
        self._lr_scheduler_partial = lr_scheduler_partial

        # Assigning the metrics as attributes registers them as submodules, so they
        # follow the model across devices. (An nn.ModuleDict cannot be used with
        # these keys: "train" clashes with nn.Module.train.)
        self.train_auroc = AUROC(task="binary")
        self.valid_auroc = AUROC(task="binary")
        # Plain lookup table by stage; it references the registered metrics above.
        self.metric = {"train": self.train_auroc, "valid": self.valid_auroc}
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, X):
        """Return one logit per sequence, shape `(batch,)`."""
        payload = self.trx_encoder(X).payload  # (batch, time, features)

        # Mean pool over valid time steps only: padded positions are masked out so
        # they cannot leak into the average.
        seq_lens = X.seq_lens
        steps = torch.arange(payload.shape[1], device=seq_lens.device)
        valid = (steps.unsqueeze(0) < seq_lens.unsqueeze(1)).unsqueeze(-1)
        # clamp avoids a division by zero for empty sequences
        embeddings = (payload * valid).sum(dim=1) / seq_lens.clamp(min=1).unsqueeze(1)

        # attn_output, _ = self.attn(
        #     self.query.unsqueeze(0).expand(embeddings.shape[0], 1, -1), 
        #     embeddings,
        #     embeddings,
        #     key_padding_mask=(1-X.seq_len_mask).bool()
        # )

        # squeeze(-1) keeps the batch dimension even when the batch has one element
        return self.head(embeddings).squeeze(-1)

    def shared_step(self, stage, batch, _):
        """Compute the loss, accumulate the stage metric and log the loss.

        `stage` is "train" or "valid"; `batch` is an `(X, y)` pair.
        """
        X, y = batch

        logits = self(X)
        loss = self.loss(logits, y.float())

        # Only accumulate here; the AUC is computed once per epoch in the
        # epoch-end hooks instead of over a growing buffer at every step.
        self.metric[stage].update(logits.detach(), y.long())
        self.log(f'{stage}_loss', loss, prog_bar=True, batch_size=y.shape[0])

        return loss

    def training_step(self, *args, **kwargs):
        return self.shared_step('train', *args, **kwargs)

    def validation_step(self, *args, **kwargs):
        return self.shared_step('valid', *args, **kwargs)

    def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any:
        """Return predicted probabilities for a batch of features (no targets)."""
        return torch.sigmoid(self(batch))

    @property
    def metric_name(self):
        """Name of the logged metric monitored by plateau schedulers."""
        return 'valid_auc'

    def on_train_epoch_end(self):
        # AUC over every training batch of the epoch, then start fresh.
        self.log('train_auc', self.metric["train"].compute(), prog_bar=True)
        self.metric["train"].reset()

    def on_validation_epoch_end(self):
        # AUC over the whole validation set. Resetting here (and not after each
        # batch) makes the logged value a true epoch-level AUC; it also clears the
        # state left by the sanity-check run.
        self.log(self.metric_name, self.metric["valid"].compute(), prog_bar=True)
        self.metric["valid"].reset()

    def configure_optimizers(self):
        """Build the optimizer and scheduler from the partials given at init."""
        optimizer = self._optimizer_partial(self.parameters())
        scheduler = self._lr_scheduler_partial(optimizer)

        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            # Plateau schedulers need to know which logged metric to watch.
            # NOTE: the monitored metric is an AUC (higher is better), so the
            # scheduler passed in should be created with mode='max'.
            scheduler = {
                'scheduler': scheduler,
                'monitor': self.metric_name,
            }
        return [optimizer], [scheduler]

In [30]:
# trx_encoder = TrxEncoder(
#     numeric_values={
#         'revenue': 'identity',
#         'delta_hours': 'identity',
#         'delta_days': 'identity',
#     },
#     linear_projection_size=32,
#     use_batch_norm_with_lens=True,
#     embeddings_noise=1e-3,
# )

# seq_encoder = RnnSeqEncoder(
#     trx_encoder,
#     hidden_size=64,
#     type='lstm',
#     is_reduce_sequence=True
# )


# model = RNNModel(
#     seq_encoder,
#     optimizer_partial=partial(torch.optim.Adam, lr=0.001),
#     lr_scheduler_partial=partial(torch.optim.lr_scheduler.ReduceLROnPlateau, factor=.99),    
# )

In [31]:
# Per-transaction encoder over the three numeric features.
trx_encoder = TrxEncoder(
    numeric_values={
        'revenue': 'identity',
        'delta_hours': 'identity',
        'delta_days': 'identity',
    },
    linear_projection_size=32,
    use_batch_norm_with_lens=True,
    embeddings_noise=1e-3,
)

model = BOTModel(
    trx_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    # The scheduler monitors `valid_auc`, which should be maximised. The default
    # mode='min' would cut the learning rate whenever the AUC stops *decreasing*.
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.ReduceLROnPlateau, mode='max', factor=.99),
)

In [32]:
# Keep the checkpoint with the highest validation AUC.
checkpoint = ModelCheckpoint(monitor='valid_auc', mode='max')

# Train for at most 50 epochs on GPU device 0.
trainer = pl.Trainer(
    callbacks=[checkpoint],
    devices=[0],
    accelerator='gpu',
    max_epochs=50,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [33]:
# Run training (with validation) on the splits provided by the data module.
trainer.fit(model, datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name        | Type               | Params
---------------------------------------------------
0 | trx_encoder | TrxEncoder         | 134   
1 | attn        | MultiheadAttention | 4.2 K 
2 | head        | Sequential         | 18.5 K
3 | loss        | BCEWithLogitsLoss  | 0     
---------------------------------------------------
22.9 K    Trainable params
0         Non-trainable params
22.9 K    Total params
0.092     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [89]:
# Per-customer summary statistics over the order-level table.
spread_stats = ['mean', 'median', 'std', 'max', 'min']
customer_stats = {
    'revenue': spread_stats + ['count'],
    'delta_hours': spread_stats,
    'delta_days': spread_stats,
    'startdatetime': ['min', 'max'],
    'buy_post': 'first',
    'date_diff_post': 'first',
}
data_cust_agg = data_check_agg.groupby('customer_id').agg(customer_stats).reset_index()

# Flatten the two-level column index into "<column>_<stat>" names. The group key
# has an empty second level, so it becomes "customer_id_" (trailing underscore).
data_cust_agg.columns = ['_'.join(levels) for levels in data_cust_agg.columns]

In [90]:
# Preview the per-customer aggregates.
data_cust_agg.head()

Unnamed: 0,customer_id_,revenue_mean,revenue_median,revenue_std,revenue_max,revenue_min,revenue_count,delta_hours_mean,delta_hours_median,delta_hours_std,...,delta_hours_min,delta_days_mean,delta_days_median,delta_days_std,delta_days_max,delta_days_min,startdatetime_min,startdatetime_max,buy_post_first,date_diff_post_first
0,29891,203.494,199.96,123.170275,439.98,1.0,25,55.24,24.0,72.50246,...,0.0,1.92,1.0,2.998889,11.0,0.0,2022-12-05 12:03:58,2023-02-01 09:55:59,1,9.0
1,30477,227.024,229.99,124.933425,499.95,44.99,25,49.4,23.0,60.578462,...,0.0,1.52,0.0,2.518597,9.0,0.0,2022-10-04 09:25:05,2022-11-25 08:53:01,1,10.0
2,31426,349.2775,274.99,327.626906,1079.97,1.0,24,49.708333,34.5,46.635944,...,0.0,1.666667,1.0,1.809796,6.0,0.0,2023-05-12 16:05:44,2023-07-01 20:54:48,1,4.0
3,44491,128.725,59.98,144.471912,344.97,49.97,4,166.75,11.5,318.351352,...,0.0,6.5,0.0,13.0,26.0,0.0,2023-06-10 21:59:25,2023-07-08 18:10:44,1,42.0
4,44939,554.943333,554.94,49.985,604.93,504.96,3,123.333333,172.0,107.598017,...,0.0,5.0,7.0,4.358899,8.0,0.0,2022-12-10 11:54:04,2022-12-25 22:38:48,1,9.0


In [101]:
# Customer lifetime features, measured against a fixed reference date.
reference_date = pd.to_datetime('2023-08-02')
first_order = data_cust_agg['startdatetime_min']
last_order = data_cust_agg['startdatetime_max']

data_cust_agg['recency'] = (last_order - first_order).dt.days
data_cust_agg['T'] = (reference_date - first_order).dt.days
data_cust_agg['days_from_last_purchase'] = (reference_date - last_order).dt.days
# Orders per day of observed activity (+1 avoids dividing by zero for same-day customers).
data_cust_agg['lambda'] = data_cust_agg['revenue_count'] / (data_cust_agg['recency'] + 1)
# data_cust_agg = data_cust_agg.drop(['startdatetime_min', 'startdatetime_max'], axis=1)

In [102]:
# Preview the per-customer table with the recency features added.
data_cust_agg.head()

Unnamed: 0,customer_id_,revenue_mean,revenue_median,revenue_std,revenue_max,revenue_min,revenue_count,delta_hours_mean,delta_hours_median,delta_hours_std,...,delta_days_max,delta_days_min,startdatetime_min,startdatetime_max,buy_post_first,date_diff_post_first,recency,T,lambda,days_from_last_purchase
0,29891,203.494,199.96,123.170275,439.98,1.0,25,55.24,24.0,72.50246,...,11.0,0.0,2022-12-05 12:03:58,2023-02-01 09:55:59,1,9.0,57,239,0.431034,181
1,30477,227.024,229.99,124.933425,499.95,44.99,25,49.4,23.0,60.578462,...,9.0,0.0,2022-10-04 09:25:05,2022-11-25 08:53:01,1,10.0,51,301,0.480769,249
2,31426,349.2775,274.99,327.626906,1079.97,1.0,24,49.708333,34.5,46.635944,...,6.0,0.0,2023-05-12 16:05:44,2023-07-01 20:54:48,1,4.0,50,81,0.470588,31
3,44491,128.725,59.98,144.471912,344.97,49.97,4,166.75,11.5,318.351352,...,26.0,0.0,2023-06-10 21:59:25,2023-07-08 18:10:44,1,42.0,27,52,0.142857,24
4,44939,554.943333,554.94,49.985,604.93,504.96,3,123.333333,172.0,107.598017,...,8.0,0.0,2022-12-10 11:54:04,2022-12-25 22:38:48,1,9.0,15,234,0.1875,219


In [103]:
# Hold out 20% of the customers for validation (fixed seed for reproducibility).
data_train, data_valid = train_test_split(data_cust_agg, test_size=.2, random_state=42)

In [104]:
# Targets, the customer id and the raw timestamps are not model inputs.
non_feature_cols = ['date_diff_post_first', 'buy_post_first', 'customer_id_', 'startdatetime_min', 'startdatetime_max']

X_train = data_train.drop(columns=non_feature_cols)
y_train_class, y_train_reg = data_train['buy_post_first'], data_train['date_diff_post_first']

X_valid = data_valid.drop(columns=non_feature_cols)
y_valid_class, y_valid_reg = data_valid['buy_post_first'], data_valid['date_diff_post_first']

In [111]:
# Cross-validated ROC-AUC of a gradient-boosting classifier on the training split.
model = LGBMClassifier(verbose=-1)
# model = LogisticRegression()

res = cross_val_score(estimator=model, X=X_train, y=y_train_class, scoring='roc_auc')
auc_mean, auc_std = res.mean(), res.std()
print(f'ROC-AUC: {auc_mean:.3f} ± {auc_std:.3f}')

ROC-AUC: 0.749 ± 0.001


In [113]:
# Same classifier, scored with F1 instead of ROC-AUC.
res = cross_val_score(estimator=model, X=X_train, y=y_train_class, scoring='f1')
f1_mean, f1_std = res.mean(), res.std()
print(f'f1-score: {f1_mean:.3f} ± {f1_std:.3f}')

f1-score: 0.841 ± 0.001


In [106]:
# Cross-validated RMSE of a gradient-boosting regressor for `date_diff_post`.
model = LGBMRegressor(verbose=-1)
# model = LinearRegression()

# Only rows where the regression target is present can be used.
has_target = y_train_reg.notna()
res = cross_val_score(model, X_train[has_target], y_train_reg[has_target], scoring='neg_root_mean_squared_error')
print(f'RMSE: {-res.mean():.3f} ± {res.std():.3f}')

RMSE: 12.845 ± 0.028
