In [1]:
import os 

os.chdir("app")

In [2]:
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm

from ptls.preprocessing import PandasDataPreprocessor
import torch
import pytorch_lightning as pl

from pytorch_lightning.callbacks import ModelCheckpoint


from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder,RnnEncoder,AggFeatureSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import inference_data_loader
from ptls.data_load.utils import collate_feature_dict



import torch.nn as nn
from torch.utils.data import Dataset

# from pyhocon import ConfigFactory
from catboost import CatBoostClassifier, metrics

import logging

In [3]:
path_to_data = "data"

In [4]:
train = pd.read_csv(path_to_data + '/train.csv', usecols=['client_id', 'gender'])
mcc_codes = pd.read_csv(path_to_data + '/mcc_codes.csv', sep=';')
test_sample_submission = pd.read_csv(path_to_data + '/test_sample_submission.csv', usecols=['client_id', 'probability'])
test = pd.read_csv(path_to_data + '/test.csv', usecols=['client_id'])
trans_types = pd.read_csv(path_to_data + '/trans_types.csv', sep=';')
transactions = pd.read_csv(path_to_data + '/transactions.csv')

In [13]:
trans_train = pd.merge(transactions, train, on='client_id')
trans_test = pd.merge(transactions, test, on='client_id')

In [14]:
trans_train.head()

Unnamed: 0,client_id,trans_time,mcc_code,trans_type,amount,term_id,trans_city,gender
0,d1bbbc9a0e0410d3cf12a3d2f44f3450,35 08:24:41,4829,2370,-1808.56,,Tver,0
1,d1bbbc9a0e0410d3cf12a3d2f44f3450,105 12:57:32,4829,2370,-3390.41,,Tver,0
2,d1bbbc9a0e0410d3cf12a3d2f44f3450,455 19:32:01,4814,1030,-144.5,889003.0,Tver,0
3,d1bbbc9a0e0410d3cf12a3d2f44f3450,83 09:22:26,6011,2010,-3542.3,,Tver,0
4,d1bbbc9a0e0410d3cf12a3d2f44f3450,74 13:31:57,6011,2010,-3542.7,,Tver,0


In [15]:
def preproc_amount(x):
    if x >= 0 :
        return np.log(1 + x)
    
    return -np.log(1 - x)

base_date = pd.to_datetime('2022-01-01')

for df in [trans_train, trans_test]:
    df['trans_time'] = df['trans_time'].astype(str)
    df[['days', 'time']] = df['trans_time'].str.split(' ', expand=True)
    df['days'] = pd.to_timedelta(df['days'].astype(int), unit='D')
    df['time'] = pd.to_timedelta(df['time'])
    df['event_time'] = base_date + df['days'] + df['time']

    df['amount_pos'] = df['amount'] * (df['amount']>=0)
    df['amount_neg'] = -df['amount'] * (df['amount']<0)

    df.drop(['trans_time', 'days', 'time', 'term_id', 'amount'], axis=1, inplace=True)


In [16]:
# trans_train.drop('gender', axis=1, inplace=True)

In [18]:
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time = 'event_time',
    cols_category=['mcc_code','trans_type', 'trans_city'],
    cols_numerical=['amount_pos', 'amount_neg'],
    cols_first_item=['gender'],
    return_records=True,
)

In [19]:
%%time

dataset_train = preprocessor.fit_transform(trans_train)
dataset_test = preprocessor.transform(trans_test)

CPU times: user 14 s, sys: 1.85 s, total: 15.9 s
Wall time: 15.9 s


In [20]:
# import pickle

# with open('preprocessor/preprocessor.p', 'wb') as f:
#     pickle.dump(preprocessor, f)

In [21]:
# with open('preprocessor/preprocessor.p', 'rb') as f:
#     a = pickle.load(f)

In [22]:
dataset_train = sorted(dataset_train, key=lambda x: x['client_id'])
dataset_test = sorted(dataset_test, key=lambda x: x['client_id'])

In [23]:
from sklearn.model_selection import train_test_split

train_cv, test_cv = train_test_split(dataset_train, test_size=0.2, random_state=42)

len(train_cv), len(test_cv)

(6048, 1512)

# CoLES pretraining

In [57]:
train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=train_cv,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    valid_data=ColesDataset(
        MemoryMapDataset(
            data=test_cv,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    train_num_workers=2,
    train_batch_size=256,
    valid_batch_size=256,
)

In [59]:
trx_encoder = TrxEncoder(
    embeddings={
        # 'trans_type': {'in': 100, 'out': 16},
        'mcc_code': {'in': 200, 'out': 24},
        # 'trans_city': {'in': 15, 'out': 4}
    },
    numeric_values={
        'amount_pos': 'log',
        'amount_neg': 'log',
    },
    use_batch_norm_with_lens=True,
    embeddings_noise=1e-3,
)

seq_encoder = RnnSeqEncoder(
    trx_encoder,
    hidden_size=128,
    type='lstm',
)

coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.ReduceLROnPlateau, factor=.99),
)

In [60]:
checkpoint = ModelCheckpoint(
    monitor=coles.metric_name,
    mode='max'
)

trainer = pl.Trainer(
    max_epochs=30,
    accelerator='gpu',
    devices=[1],
    callbacks=[checkpoint]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(coles, train_dl)

# RNNModel

In [73]:
class SberDataset(MemoryMapDataset):
    @staticmethod
    def collate_fn(batch):
        X = collate_feature_dict(batch)
        y = X.payload["gender"]
        return X, y

In [74]:
from torchmetrics import AUROC


class RNNModel(pl.LightningModule):
    def __init__(self, seq_encoder, optimizer_partial, lr_scheduler_partial, head_hidden=512, dropout=0.1):
        super().__init__()


        self.seq_encoder = seq_encoder
        self.head = nn.Sequential(
            nn.Dropout(dropout),
            nn.BatchNorm1d(seq_encoder.embedding_size),
            nn.Linear(seq_encoder.embedding_size, head_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.BatchNorm1d(head_hidden),
            nn.Linear(head_hidden, 1)
        ) 

        self._optimizer_partial = optimizer_partial
        self._lr_scheduler_partial = lr_scheduler_partial

        self.metric = {"train": AUROC(task="binary"), "valid": AUROC(task="binary")} 
        self.loss = nn.BCEWithLogitsLoss()

    def training_step(self, batch, _):
        X, y = batch

        embeddings = self.seq_encoder(X)
        preds = self.head(embeddings).squeeze()

        loss = self.loss(preds, y.float())

        self.metric["train"].update(preds, y.long())
        self.log('train_auc', self.metric["train"].compute(), prog_bar=True)

        return loss
    
    def validation_step(self, batch, _):
        X, y = batch

        embeddings = self.seq_encoder(X)
        preds = nn.functional.sigmoid(self.head(embeddings)).squeeze()
        
        self.metric["valid"].update(preds, y.long())

    @property
    def metric_name(self):
        return 'valid_auc'

    def on_train_epoch_end(self):
        self.metric["train"].reset()

    def on_validation_batch_end(self, outputs, batch, batch_idx, dataloader_idx) -> None:
        self.log('valid_auc', self.metric["valid"].compute(), prog_bar=True)
        self.metric["valid"].reset()

    def configure_optimizers(self):
        optimizer = self._optimizer_partial(self.parameters())
        scheduler = self._lr_scheduler_partial(optimizer)
        
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler = {
                'scheduler': scheduler,
                'monitor': self.metric_name,
            }
        return [optimizer], [scheduler]

In [75]:
datamodule = PtlsDataModule(
    train_data=SberDataset(
        data=train_cv,
        i_filters=[
            SeqLenFilter(min_seq_len=25),
        ],
    ),
    valid_data=SberDataset(
        data=test_cv,
        i_filters=[
            SeqLenFilter(min_seq_len=25),
        ],
    ),
    train_num_workers=2,
    train_batch_size=16,
    valid_batch_size=16,
)

In [77]:
trx_encoder = TrxEncoder(
    embeddings={
        # 'trans_type': {'in': 100, 'out': 16},
        'mcc_code': {'in': 200, 'out': 24},
        # 'trans_city': {'in': 15, 'out': 4}
    },
    numeric_values={
        'amount_pos': 'log',
        'amount_neg': 'log',
    },
    use_batch_norm_with_lens=True,
    embeddings_noise=1e-3,
)

seq_encoder = RnnSeqEncoder(
    trx_encoder,
    hidden_size=128,
    type='lstm',
)

seq_encoder.load_state_dict(coles.seq_encoder.state_dict())

model = RNNModel(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.ReduceLROnPlateau, factor=.99),
    dropout=0.3
)

In [78]:
checkpoint = ModelCheckpoint(
    monitor='valid_auc',
    mode='max'
)

trainer = pl.Trainer(
    max_epochs=30,
    accelerator='gpu',
    devices=[1],
    callbacks=[checkpoint]б
    gradient_clip_val=1,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [79]:
trainer.fit(model, datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name        | Type              | Params
--------------------------------------------------
0 | seq_encoder | RnnSeqEncoder     | 84.8 K
1 | head        | Sequential        | 67.8 K
2 | loss        | BCEWithLogitsLoss | 0     
--------------------------------------------------
152 K     Trainable params
0         Non-trainable params
152 K     Total params
0.611     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [56]:
checkpoint.best_model_score

tensor(0.8468, device='cuda:1')

In [35]:
model.load_state_dict(torch.load(checkpoint.best_model_path)['state_dict'])

<All keys matched successfully>

In [62]:
model_ = CoLESModule(coles.seq_encoder)

In [63]:
train_dl = inference_data_loader(train_cv, num_workers=0, batch_size=16)
train_embeds = torch.vstack(trainer.predict(model_, train_dl))

test_dl = inference_data_loader(test_cv, num_workers=0, batch_size=16)
test_embeds = torch.vstack(trainer.predict(model_, test_dl))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
  rank_zero_warn(


Predicting: 24it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Predicting: 24it [00:00, ?it/s]

In [64]:
train_df = pd.DataFrame(data=train_embeds, columns=[f'embed_{i}' for i in range(train_embeds.shape[1])])
train_df['client_id'] = [x['client_id'] for x in train_cv]

test_df = pd.DataFrame(data=test_embeds, columns=[f'embed_{i}' for i in range(test_embeds.shape[1])])
test_df['client_id'] = [x['client_id'] for x in test_cv]

print(train_df.shape, test_df.shape)

(6048, 129) (1512, 129)


In [65]:
trans_train = pd.merge(transactions, train, on='client_id')

In [66]:
client_gender_dict = pd.Series(trans_train.drop_duplicates('client_id')['gender'].values, index=trans_train.drop_duplicates('client_id')['client_id']).to_dict()

In [67]:
train_df['gender'] = train_df['client_id'].map(client_gender_dict)
test_df['gender'] = test_df['client_id'].map(client_gender_dict)

In [68]:
train_df.dropna(inplace=True)
# test_df.dropna(inplace=True)

In [69]:
embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['gender']
x_test, y_test = test_df[embed_columns], test_df['gender']

In [70]:
CatBoostModel = CatBoostClassifier(
    iterations= 500,
    learning_rate = 0.05,
    use_best_model = True,
    eval_metric ='AUC', 
    loss_function='Logloss',
    random_seed = 42,
    logging_level = 'Silent',
    depth = 5
)

In [71]:
CatBoostModel.fit(
    x_train, y_train,
    eval_set=(x_test, y_test),
    plot=False
)

<catboost.core.CatBoostClassifier at 0x7efe37affca0>

In [72]:
CatBoostModel.get_best_score()

{'learn': {'Logloss': 0.3392787711330192},
 'validation': {'Logloss': 0.5574050991152946, 'AUC': 0.7841683207622072}}

In [42]:
predictions = CatBoostModel.predict_proba(x_test)[:, 1]
test_sample_submission['probability'] = predictions
test_sample_submission.to_csv('/kaggle/working/result.csv', index=False)