In [1]:
import os 

# Work from the project's "app" directory. The path is relative, so a second
# run of this cell would look for "app/app" and fail; skip the change when the
# kernel is already inside a directory named "app".
if os.path.basename(os.getcwd()) != "app":
    os.chdir("app")

In [2]:
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm

from ptls.preprocessing import PandasDataPreprocessor
import torch
import pytorch_lightning as pl

from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder,RnnEncoder,AggFeatureSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import inference_data_loader
from ptls.data_load.utils import collate_feature_dict

import torch.nn as nn
from torch.utils.data import Dataset

# from pyhocon import ConfigFactory
from catboost import CatBoostClassifier, metrics

import logging

In [3]:
# Directory holding the input CSV files; relative to the current working directory.
path_to_data = "data"

In [4]:
def _load_csv(relative_name, **read_kwargs):
    """Read one CSV stored under ``path_to_data``; extra keywords go to ``pd.read_csv``."""
    return pd.read_csv(path_to_data + relative_name, **read_kwargs)


# Labelled clients, lookup tables, submission template, unlabelled clients
# and the raw transaction log, loaded in that order.
train = _load_csv('/train.csv', usecols=['client_id', 'gender'])
mcc_codes = _load_csv('/mcc_codes.csv', sep=';')
test_sample_submission = _load_csv(
    '/test_sample_submission.csv',
    usecols=['client_id', 'probability'],
)
test = _load_csv('/test.csv', usecols=['client_id'])
trans_types = _load_csv('/trans_types.csv', sep=';')
transactions = _load_csv('/transactions.csv')

In [5]:
# Attach every transaction to its client. The join is an inner join (pandas'
# default), so only clients present in the respective client table survive.
trans_train = transactions.merge(train, on='client_id')
trans_test = transactions.merge(test, on='client_id')

In [6]:
# Preview the first five merged training transactions.
trans_train.head(n=5)

Unnamed: 0,client_id,trans_time,mcc_code,trans_type,amount,term_id,trans_city,gender
0,d1bbbc9a0e0410d3cf12a3d2f44f3450,35 08:24:41,4829,2370,-1808.56,,Tver,0
1,d1bbbc9a0e0410d3cf12a3d2f44f3450,105 12:57:32,4829,2370,-3390.41,,Tver,0
2,d1bbbc9a0e0410d3cf12a3d2f44f3450,455 19:32:01,4814,1030,-144.5,889003.0,Tver,0
3,d1bbbc9a0e0410d3cf12a3d2f44f3450,83 09:22:26,6011,2010,-3542.3,,Tver,0
4,d1bbbc9a0e0410d3cf12a3d2f44f3450,74 13:31:57,6011,2010,-3542.7,,Tver,0


In [7]:
base_date = pd.to_datetime('2022-01-01')


def add_event_time(frame, origin=base_date):
    """Replace the raw ``trans_time`` column of ``frame`` with a real timestamp.

    ``trans_time`` is a string of the form ``"<day number> <HH:MM:SS>"``
    (e.g. ``"35 08:24:41"``). The day number and the clock time are turned
    into offsets and added to ``origin`` to produce ``event_time``.

    The frame is modified in place: ``event_time`` is added, ``trans_time``
    and ``term_id`` are dropped. A frame that no longer has ``trans_time``
    is assumed to be converted already and is left untouched, so the cell
    can be re-run safely.

    Args:
        frame: transactions DataFrame to convert.
        origin: timestamp that day number 0 is mapped to.

    Returns:
        The same ``frame`` object.
    """
    if 'trans_time' not in frame.columns:
        return frame

    # n=1 guarantees at most two pieces: the day number and the clock time.
    parts = frame['trans_time'].astype(str).str.split(' ', n=1, expand=True)
    day_offset = pd.to_timedelta(parts[0].astype(int), unit='D')
    clock_offset = pd.to_timedelta(parts[1])

    frame['event_time'] = origin + day_offset + clock_offset
    frame.drop(['trans_time', 'term_id'], axis=1, inplace=True)
    return frame


for df in [trans_train, trans_test]:
    add_event_time(df)

In [8]:
# trans_train.drop('gender', axis=1, inplace=True)

In [9]:
# Preprocessor configuration: 'client_id' identifies a sequence, 'event_time'
# is the event timestamp, three columns are categorical, 'amount' is numeric
# and 'gender' is passed through `cols_first_item`.
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='event_time',
    cols_numerical=['amount'],
    cols_category=['mcc_code', 'trans_type', 'trans_city'],
    cols_first_item=['gender'],
    return_records=True,
)

In [10]:
%%time

# Fit the preprocessor on the labelled transactions only, then reuse the
# fitted state for the unlabelled ones. Order matters: fit before transform.
dataset_train = preprocessor.fit_transform(trans_train)
dataset_test = preprocessor.transform(trans_test)

CPU times: user 13.7 s, sys: 1.26 s, total: 15 s
Wall time: 15 s


In [11]:
# import pickle

# with open('preprocessor/preprocessor.p', 'wb') as f:
#     pickle.dump(preprocessor, f)

In [12]:
# with open('preprocessor/preprocessor.p', 'rb') as f:
#     a = pickle.load(f)

In [13]:
def _client_id_of(record):
    """Sort key: the client identifier of one preprocessed record."""
    return record['client_id']


# Put both record collections into a deterministic, client-ordered list.
dataset_train = sorted(dataset_train, key=_client_id_of)
dataset_test = sorted(dataset_test, key=_client_id_of)

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled clients for validation; the seed is fixed so
# the split is reproducible.
split_options = dict(test_size=0.2, random_state=42)
train_cv, test_cv = train_test_split(dataset_train, **split_options)

(len(train_cv), len(test_cv))

(6048, 1512)

In [15]:
class SberDataset(MemoryMapDataset):
    """In-memory dataset whose batches are ``(features, gender)`` pairs."""

    @staticmethod
    def collate_fn(batch):
        """Collate records into one feature batch and pull out the target.

        Args:
            batch: records to combine, as accepted by ``collate_feature_dict``.

        Returns:
            Tuple of the collated batch and its ``payload["gender"]`` entry.
        """
        collated = collate_feature_dict(batch)
        return collated, collated.payload["gender"]

In [29]:
from torchmetrics import AUROC


class RNNModel(pl.LightningModule):
    """Binary classifier: a sequence encoder followed by a single-logit linear head.

    Training minimises ``BCEWithLogitsLoss`` on the raw logits. Validation
    accumulates a binary AUROC over the whole validation epoch and logs it
    once per epoch under ``valid_auc``.

    Args:
        seq_encoder: module that maps a feature batch to one embedding per
            sequence; it must expose ``embedding_size``.
        optimizer_partial: callable that receives the model parameters and
            returns an optimizer.
        lr_scheduler_partial: callable that receives the optimizer and
            returns a learning-rate scheduler.
    """

    def __init__(self, seq_encoder, optimizer_partial, lr_scheduler_partial):
        super().__init__()

        self.seq_encoder = seq_encoder
        self.head = nn.Linear(seq_encoder.embedding_size, 1)

        self._optimizer_partial = optimizer_partial
        self._lr_scheduler_partial = lr_scheduler_partial

        self.metric = AUROC(task="binary")
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, x):
        """Return the sequence-encoder embeddings (not the logits) for a batch.

        Lightning's default ``predict_step`` calls ``forward``, so
        ``trainer.predict(model, loader)`` yields these embeddings.
        """
        return self.seq_encoder(x)

    def _logits(self, x):
        """Return one raw logit per sequence.

        Only the trailing size-1 dimension of the head output is removed, so
        a batch with a single sequence keeps its batch dimension and still
        matches the shape of the target.
        """
        return self.head(self.seq_encoder(x)).squeeze(-1)

    def training_step(self, batch, _):
        """Compute the BCE-with-logits loss for one ``(features, target)`` batch."""
        X, y = batch
        return self.loss(self._logits(X), y.float())

    def validation_step(self, batch, _):
        """Add this batch's predicted probabilities and targets to the AUROC state."""
        X, y = batch
        preds = torch.sigmoid(self._logits(X))
        self.metric.update(preds, y.long())

    @property
    def metric_name(self):
        """Name under which the validation metric is logged and monitored."""
        return 'valid_auc'

    def on_validation_epoch_end(self) -> None:
        """Log the AUROC accumulated over the whole validation epoch, then reset it.

        Computing the metric once per epoch (rather than after every batch)
        gives a score over the full validation set, which is also what the
        plateau scheduler monitors.
        """
        self.log(self.metric_name, self.metric.compute(), prog_bar=True)
        self.metric.reset()

    def configure_optimizers(self):
        """Build the optimizer and scheduler from the partials given at construction.

        A ``ReduceLROnPlateau`` scheduler is wrapped in a dict so that it is
        stepped on the logged validation metric.
        """
        optimizer = self._optimizer_partial(self.parameters())
        scheduler = self._lr_scheduler_partial(optimizer)

        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler = {
                'scheduler': scheduler,
                'monitor': self.metric_name,
            }
        return [optimizer], [scheduler]

In [30]:
def _dataset_with_length_filter(records):
    """Wrap ``records`` in a ``SberDataset`` filtered by ``SeqLenFilter(min_seq_len=25)``."""
    return SberDataset(
        data=records,
        i_filters=[SeqLenFilter(min_seq_len=25)],
    )


# The train and validation parts share the same filtering and batch size.
datamodule = PtlsDataModule(
    train_data=_dataset_with_length_filter(train_cv),
    valid_data=_dataset_with_length_filter(test_cv),
    train_num_workers=2,
    train_batch_size=32,
    valid_batch_size=32,
)

In [31]:
# Per-transaction encoder: one embedding table per categorical feature plus
# the raw amount. Presumably 'in' is the table size and 'out' the vector
# width; verify the 'in' values against the fitted preprocessor's dictionaries.
trx_encoder = TrxEncoder(
    embeddings={
        'trans_type': {'in': 100, 'out': 16},
        'mcc_code': {'in': 200, 'out': 16},
        'trans_city': {'in': 15, 'out': 16}
    },
    numeric_values={'amount': 'identity'}
)

# Recurrent encoder that turns a client's transaction sequence into one vector.
seq_encoder = RnnSeqEncoder(
    trx_encoder,
    hidden_size=128,
)

model = RNNModel(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    # The scheduler monitors 'valid_auc', where higher is better. The default
    # mode='min' would count every improvement as a bad epoch, so use 'max'.
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.ReduceLROnPlateau,
        mode='max',
        factor=.99,
    ),
)

In [32]:
# Train for at most 30 epochs on the GPU with index 1.
# NOTE(review): the device index is hard-coded; adjust it on machines with a
# different GPU layout.
trainer = pl.Trainer(
    accelerator='gpu',
    devices=[1],
    max_epochs=30,
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [33]:
# Run training and validation using the loaders provided by the data module.
trainer.fit(model, datamodule=datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name        | Type              | Params
--------------------------------------------------
0 | seq_encoder | RnnSeqEncoder     | 73.9 K
1 | head        | Linear            | 129   
2 | metric      | BinaryAUROC       | 0     
3 | loss        | BCEWithLogitsLoss | 0     
--------------------------------------------------
74.0 K    Trainable params
0         Non-trainable params
74.0 K    Total params
0.296     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
# Both splits are run through the trained model with identical loader settings.
loader_options = dict(num_workers=0, batch_size=16)

train_dl = inference_data_loader(train_cv, **loader_options)
test_dl = inference_data_loader(test_cv, **loader_options)

# trainer.predict returns one tensor per batch; stack them row-wise.
train_embeds = torch.vstack(trainer.predict(model, train_dl))
test_embeds = torch.vstack(trainer.predict(model, test_dl))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
  rank_zero_warn(


Predicting: 24it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Predicting: 24it [00:00, ?it/s]

In [108]:
def _embeddings_frame(embeds, records):
    """Build a DataFrame with one ``embed_<i>`` column per embedding dimension.

    A ``client_id`` column is appended, taken from ``records`` in order, so
    ``embeds`` and ``records`` are expected to be aligned row by row.
    """
    frame = pd.DataFrame(
        data=embeds,
        columns=[f'embed_{dim}' for dim in range(embeds.shape[1])],
    )
    frame['client_id'] = [record['client_id'] for record in records]
    return frame


train_df = _embeddings_frame(train_embeds, train_cv)
test_df = _embeddings_frame(test_embeds, test_cv)

print(train_df.shape, test_df.shape)

NameError: name 'train_embeds' is not defined

In [57]:
# Rebuild the labelled transaction table from the raw inputs.
trans_train = transactions.merge(train, on='client_id')

In [58]:
# Map client_id -> gender using the first transaction row of every client.
# Deduplicate once instead of scanning the full transaction table twice.
first_row_per_client = trans_train.drop_duplicates('client_id')
client_gender_dict = first_row_per_client.set_index('client_id')['gender'].to_dict()

In [59]:
# Attach the target to both embedding frames via the client_id lookup.
for frame in (train_df, test_df):
    frame['gender'] = frame['client_id'].map(client_gender_dict)

In [60]:
# Remove training rows that contain any missing value; test_df is not filtered here.
train_df.dropna(axis=0, how='any', inplace=True)

In [61]:
# Feature columns are the ones whose name starts with 'embed'.
embed_columns = train_df.columns[train_df.columns.str.startswith('embed')].tolist()

x_train = train_df[embed_columns]
y_train = train_df['gender']

x_test = test_df[embed_columns]
y_test = test_df['gender']

In [65]:
# Shallow gradient-boosted classifier trained on the embeddings. It optimises
# Logloss, is evaluated with AUC and keeps the best iteration on the eval set.
CatBoostModel = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    use_best_model=True,
    iterations=500,
    learning_rate=0.05,
    depth=3,
    random_seed=42,
    logging_level='Silent',
)

In [66]:
# Fit on the training embeddings; the hold-out split is the evaluation set
# used for best-iteration selection.
CatBoostModel.fit(
    X=x_train,
    y=y_train,
    eval_set=(x_test, y_test),
    plot=False,
)

<catboost.core.CatBoostClassifier at 0x7fe60f15cbb0>

In [67]:
# Best metric values recorded during fitting, per dataset.
best_scores = CatBoostModel.get_best_score()
best_scores

{'learn': {'Logloss': 0.43719619670242227},
 'validation': {'Logloss': 0.5400039787411766, 'AUC': 0.7997282872392043}}

In [42]:
# Score the real test clients. The hold-out split (x_test) belongs to the
# labelled data and has a different set of clients than the submission file,
# so its probabilities must not be written into the submission.
submission_dl = inference_data_loader(dataset_test, num_workers=0, batch_size=16)
submission_embeds = torch.vstack(trainer.predict(model, submission_dl))

submission_df = pd.DataFrame(
    data=submission_embeds,
    columns=[f'embed_{i}' for i in range(submission_embeds.shape[1])],
)
submission_df['client_id'] = [x['client_id'] for x in dataset_test]
submission_df['probability'] = CatBoostModel.predict_proba(submission_df[embed_columns])[:, 1]

# Align by client_id rather than by row position. Clients without any
# transactions have no embedding and keep the template's placeholder value.
predictions = (
    test_sample_submission['client_id']
    .map(submission_df.set_index('client_id')['probability'])
    .fillna(test_sample_submission['probability'])
)
test_sample_submission['probability'] = predictions

# NOTE(review): absolute Kaggle path; confirm the directory exists where this runs.
test_sample_submission.to_csv('/kaggle/working/result.csv', index=False)