In [1]:
import os 

# Enter the project's "app" directory so that the relative paths used by the
# later cells ("data/...", "preprocessor/...") resolve against it.
# Guarded: re-running this cell in a live kernel would otherwise try to
# descend into "app/app" and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != "app":
    os.chdir("app")

In [2]:
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm

from ptls.preprocessing import PandasDataPreprocessor
import torch
import pytorch_lightning as pl

from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder,RnnEncoder,AggFeatureSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import inference_data_loader

# from pyhocon import ConfigFactory
from catboost import CatBoostClassifier, metrics

import logging

In [3]:
# Directory (relative to the current working directory) that holds the input
# CSV files; the read_csv calls in the next cell build their paths from it.
path_to_data = "data"

In [4]:
def _data_file(suffix):
    """Return the path of an input file given its '/name.csv' suffix."""
    return path_to_data + suffix


# Client-level tables: only the columns that are used later are read.
train = pd.read_csv(_data_file('/train.csv'), usecols=['client_id', 'gender'])
test = pd.read_csv(_data_file('/test.csv'), usecols=['client_id'])
test_sample_submission = pd.read_csv(
    _data_file('/test_sample_submission.csv'),
    usecols=['client_id', 'probability'],
)

# Dictionary tables are ';'-separated.
mcc_codes = pd.read_csv(_data_file('/mcc_codes.csv'), sep=';')
trans_types = pd.read_csv(_data_file('/trans_types.csv'), sep=';')

# Raw transaction log for all clients (train and test together).
transactions = pd.read_csv(_data_file('/transactions.csv'))

In [5]:
# Split the transaction log by client list. The default (inner) join keeps
# only transactions whose client_id appears in the respective table;
# the train side also picks up the `gender` column.
trans_train = transactions.merge(train, on='client_id')
trans_test = transactions.merge(test, on='client_id')

In [6]:
# Preview the first rows of the merged training transactions.
trans_train.head()

Unnamed: 0,client_id,trans_time,mcc_code,trans_type,amount,term_id,trans_city,gender
0,d1bbbc9a0e0410d3cf12a3d2f44f3450,35 08:24:41,4829,2370,-1808.56,,Tver,0
1,d1bbbc9a0e0410d3cf12a3d2f44f3450,105 12:57:32,4829,2370,-3390.41,,Tver,0
2,d1bbbc9a0e0410d3cf12a3d2f44f3450,455 19:32:01,4814,1030,-144.5,889003.0,Tver,0
3,d1bbbc9a0e0410d3cf12a3d2f44f3450,83 09:22:26,6011,2010,-3542.3,,Tver,0
4,d1bbbc9a0e0410d3cf12a3d2f44f3450,74 13:31:57,6011,2010,-3542.7,,Tver,0


In [7]:
# `trans_time` is stored as "<day number> <HH:MM:SS>" (see the preview above).
# Both parts are turned into offsets and added to a fixed anchor date to get
# a real timestamp column, `event_time`.
base_date = pd.to_datetime('2022-01-01')

# Both frames are modified in place: the helper columns, the original
# `trans_time` and the unused `term_id` are removed at the end.
for frame in (trans_train, trans_test):
    frame['trans_time'] = frame['trans_time'].astype(str)
    frame[['days', 'time']] = frame['trans_time'].str.split(' ', expand=True)

    day_offset = pd.to_timedelta(frame['days'].astype(int), unit='D')
    clock_offset = pd.to_timedelta(frame['time'])
    frame['event_time'] = base_date + day_offset + clock_offset

    frame.drop(columns=['trans_time', 'days', 'time', 'term_id'], inplace=True)

In [8]:
# Remove the target column before the transactions go through the
# preprocessor; labels are re-attached per client later from `train`.
trans_train.drop(columns='gender', inplace=True)

In [9]:
# Column roles handed to the ptls preprocessor.
categorical_columns = ['mcc_code', 'trans_type', 'trans_city']
numerical_columns = ['amount']

# One record per `client_id`, with events ordered by `event_time`
# (presumably -- confirm against the ptls PandasDataPreprocessor docs).
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time='event_time',
    cols_category=categorical_columns,
    cols_numerical=numerical_columns,
    return_records=True,
)

In [10]:
%%time

# The preprocessor is fitted on the training transactions only ...
dataset_train = preprocessor.fit_transform(trans_train)
# ... and the test transactions only go through transform(), i.e. they are
# not used for fitting.
dataset_test = preprocessor.transform(trans_test)

CPU times: user 12.9 s, sys: 1.99 s, total: 14.9 s
Wall time: 14.9 s


In [11]:
import pickle

# Persist the fitted preprocessor so it can be reloaded without refitting.
# open(..., 'wb') does not create missing parent directories, so make sure
# the target folder exists first (no-op when it is already there).
os.makedirs('preprocessor', exist_ok=True)

with open('preprocessor/preprocessor.p', 'wb') as f:
    pickle.dump(preprocessor, f)

In [12]:
# with open('preprocessor/preprocessor.p', 'rb') as f:
#     a = pickle.load(f)

In [13]:
def _client_id_of(record):
    """Sort key: the `client_id` field of a preprocessed record."""
    return record['client_id']


# Put the records into a fixed order (ascending client_id); presumably this
# keeps the seeded split below reproducible -- confirm.
dataset_train = sorted(dataset_train, key=_client_id_of)
dataset_test = sorted(dataset_test, key=_client_id_of)

In [14]:
from sklearn.model_selection import train_test_split

# Seeded 80/20 split of the labelled client records into a training part and
# a local validation part.
train_cv, test_cv = train_test_split(
    dataset_train,
    random_state=42,
    test_size=0.2,
)

# Show the size of each part.
len(train_cv), len(test_cv)

(6048, 1512)

In [21]:
# Embedding spec per categorical transaction field.
# 'in' / 'out' are presumably the dictionary size and the embedding width --
# confirm against the ptls TrxEncoder docs.
embedding_specs = {
    'trans_type': {'in': 100, 'out': 8},
    'mcc_code': {'in': 200, 'out': 16},
    'trans_city': {'in': 15, 'out': 4},
}

# Per-transaction encoder: embedded categoricals plus `amount` with the
# 'identity' setting.
trx_encoder = TrxEncoder(
    embeddings=embedding_specs,
    numeric_values={'amount': 'identity'},
)

# Recurrent encoder over the sequence of encoded transactions.
seq_encoder = RnnSeqEncoder(trx_encoder, hidden_size=128)

# Factories that CoLESModule receives for building the optimizer / scheduler.
# NOTE(review): ReduceLROnPlateau defaults to mode='min'; confirm that this is
# the right direction for whatever metric the module monitors.
make_optimizer = partial(torch.optim.Adam, lr=0.001)
make_scheduler = partial(torch.optim.lr_scheduler.ReduceLROnPlateau, factor=.99)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=make_optimizer,
    lr_scheduler_partial=make_scheduler,
)

In [25]:
def _coles_dataset(records):
    """Wrap `records` in the CoLES dataset used for both training and validation.

    Records pass through a SeqLenFilter(min_seq_len=25) and each remaining
    record is split by SampleSlices into 5 slices of 25..200 events.
    """
    return ColesDataset(
        MemoryMapDataset(
            data=records,
            i_filters=[
                SeqLenFilter(min_seq_len=25),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    )


# Data module consumed by `trainer.fit`: identical dataset settings for the
# training and the validation part, batches of 256 on both sides.
train_dl = PtlsDataModule(
    train_data=_coles_dataset(train_cv),
    valid_data=_coles_dataset(test_cv),
    train_num_workers=2,
    train_batch_size=256,
    valid_batch_size=256,
)

In [29]:
# Use one GPU when CUDA is available, otherwise stay on the CPU
# (int(True) == 1, int(False) == 0).
gpu_count = int(torch.cuda.is_available())

trainer = pl.Trainer(
    max_epochs=30,
    gpus=gpu_count,
    enable_progress_bar=True,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [30]:
# Train the CoLES model on the data module built above (it supplies both the
# training and the validation data).
trainer.fit(model, datamodule=train_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 65.2 K
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
65.2 K    Trainable params
0         Non-trainable params
65.2 K    Total params
0.261     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f94c2b77940>
Traceback (most recent call last):
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f94c2b77940>
Traceback (most recent call last):
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/macro-m

Validation: 0it [00:00, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f94c2b77940>
Traceback (most recent call last):
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f94c2b77940>
Traceback (most recent call last):
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/macro-m

Validation: 0it [00:00, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f94c2b77940>
Traceback (most recent call last):
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f94c2b77940>
Traceback (most recent call last):
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/macro-m

Validation: 0it [00:00, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f94c2b77940>
Traceback (most recent call last):
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f94c2b77940>
    Traceback (most recent call last):
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
self._shutdown_workers()    
self._shutdown_workers()
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():    
if w.is_alive():  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/multiprocessing/process.py", l

Validation: 0it [00:00, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f94c2b77940>
Traceback (most recent call last):
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
Exception ignored in:     self._shutdown_workers()<function _MultiProcessingDataLoaderIter.__del__ at 0x7f94c2b77940>

  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
Traceback (most recent call last):
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    if w.is_alive():    
self._shutdown_workers()
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
  File "/home/macro-micro-coles/miniconda/envs/env/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [18]:
# Compute one embedding per client for the training and validation parts.
#
# The inference loaders get their own names: `train_dl` is the PtlsDataModule
# that `trainer.fit(model, train_dl)` consumes, and overwriting it here would
# break any later re-run of the training cell in the same kernel.
#
# The recorded run of this cell ended in a CUDA out-of-memory error; the
# batch size is therefore a single named knob that can be lowered.
INFERENCE_BATCH_SIZE = 256

train_inference_dl = inference_data_loader(
    train_cv, num_workers=0, batch_size=INFERENCE_BATCH_SIZE
)
train_embeds = torch.vstack(trainer.predict(model, train_inference_dl))

test_inference_dl = inference_data_loader(
    test_cv, num_workers=0, batch_size=INFERENCE_BATCH_SIZE
)
test_embeds = torch.vstack(trainer.predict(model, test_inference_dl))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
  rank_zero_warn(


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.18 GiB (GPU 0; 47.45 GiB total capacity; 1.17 GiB already allocated; 5.90 GiB free; 1.18 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [28]:
def _embeddings_frame(embeds, records):
    """Build a frame with columns embed_0..embed_{d-1} plus `client_id`.

    Row i of `embeds` is paired with `records[i]['client_id']`, so the two
    inputs must be in the same order.
    """
    frame = pd.DataFrame(data=embeds, columns=[f'embed_{i}' for i in range(embeds.shape[1])])
    frame['client_id'] = [rec['client_id'] for rec in records]
    return frame


train_df = _embeddings_frame(train_embeds, train_cv)
test_df = _embeddings_frame(test_embeds, test_cv)

print(train_df.shape, test_df.shape)

(6048, 953) (1512, 953)


In [29]:
# Rebuild the labelled transaction table: the earlier `trans_train` had its
# `gender` column dropped before preprocessing.
trans_train = transactions.merge(train, on='client_id')

In [30]:
# client_id -> gender lookup, taken from the first transaction row of each
# client (deduplicated once and reused for both values and index).
first_row_per_client = trans_train.drop_duplicates('client_id')
client_gender_dict = pd.Series(
    first_row_per_client['gender'].values,
    index=first_row_per_client['client_id'],
).to_dict()

In [31]:
# Attach the label to every embedding row via its client_id; ids missing from
# the lookup end up as NaN.
for frame in (train_df, test_df):
    frame['gender'] = frame['client_id'].map(client_gender_dict)

In [32]:
# Discard training rows that contain any missing value (in place).
train_df.dropna(axis=0, how='any', inplace=True)
# test_df.dropna(inplace=True)

In [33]:
# Feature columns are exactly those whose name starts with 'embed'.
embed_columns = [col for col in train_df.columns if col.startswith('embed')]

# Features / target for the training part ...
x_train = train_df[embed_columns]
y_train = train_df['gender']

# ... and for the local validation part.
x_test = test_df[embed_columns]
y_test = test_df['gender']

In [34]:
# Hyper-parameters of the downstream gender classifier, collected in one
# place. AUC is the evaluation metric, Logloss the optimised loss.
catboost_params = dict(
    iterations=500,
    learning_rate=0.05,
    depth=5,
    loss_function='Logloss',
    eval_metric='AUC',
    use_best_model=True,
    random_seed=42,
    logging_level='Silent',
)

CatBoostModel = CatBoostClassifier(**catboost_params)

In [35]:
# Fit on the training embeddings and evaluate on the validation embeddings.
# NOTE(review): with use_best_model=True the same eval_set also selects the
# iteration count, so the validation score reported for it is likely
# optimistic -- confirm whether a separate hold-out is wanted.
CatBoostModel.fit(
    X=x_train,
    y=y_train,
    eval_set=(x_test, y_test),
    plot=False,
)

<catboost.core.CatBoostClassifier at 0x7f58add7bb20>

In [36]:
# Show the best metric values recorded during fitting, for the training data
# ('learn') and for the eval_set ('validation').
CatBoostModel.get_best_score()

{'learn': {'Logloss': 0.3061159325655175},
 'validation': {'Logloss': 0.44215489881523645, 'AUC': 0.8770535044770853}}

In [42]:
# Build the submission from the real test clients.
#
# `x_test` holds the embeddings of `test_cv`, the local validation split of
# the labelled data, so its predictions do not belong to the rows of
# `test_sample_submission`. The unlabelled clients live in `dataset_test`;
# embed them with the trained encoder and score those embeddings instead.
submission_dl = inference_data_loader(dataset_test, num_workers=0, batch_size=256)
submission_embeds = torch.vstack(trainer.predict(model, submission_dl))

# Reusing `embed_columns` guarantees the classifier sees the same feature
# names it was fitted on (and fails loudly if the embedding width differs).
submission_features = pd.DataFrame(
    data=submission_embeds.cpu().numpy(),
    columns=embed_columns,
)

# Index the probabilities by client_id so they can be aligned explicitly
# instead of relying on row order.
predictions = pd.Series(
    CatBoostModel.predict_proba(submission_features)[:, 1],
    index=[rec['client_id'] for rec in dataset_test],
)

# Clients of the sample file that have no transactions (and hence no
# embedding) keep the placeholder probability from the sample file.
test_sample_submission['probability'] = (
    test_sample_submission['client_id']
    .map(predictions)
    .fillna(test_sample_submission['probability'])
)

# NOTE(review): absolute Kaggle path kept from the original cell; confirm it
# exists in the environment where this notebook is run.
test_sample_submission.to_csv('/kaggle/working/result.csv', index=False)