In [1]:
!pip install pytorch-lifestream -q
!pip install pyhocon -q

[33mDEPRECATION: pytorch-lightning 1.6.5 has a non-standard dependency specifier torch>=1.8.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.7 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 11.0.0 which is incompatible.
cudf 23.8.0 r

In [3]:
import pandas as pd
import numpy as np
import re
from tqdm.auto import tqdm

from ptls.preprocessing import PandasDataPreprocessor
import torch
import pytorch_lightning as pl

from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder,RnnEncoder,AggFeatureSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import inference_data_loader

from pyhocon import ConfigFactory

import logging

In [4]:
# Client tables: labelled training clients, clients to score, and the submission template.
train = pd.read_csv('data/train.csv', usecols=['client_id', 'gender'])
test = pd.read_csv('data/test.csv', usecols=['client_id'])
test_sample_submission = pd.read_csv(
    'data/test_sample_submission.csv',
    usecols=['client_id', 'probability'],
)

# Semicolon-separated lookup files.
mcc_codes = pd.read_csv('data/mcc_codes.csv', sep=';')
trans_types = pd.read_csv('data/trans_types.csv', sep=';')

# Transaction log (all columns).
transactions = pd.read_csv('data/transactions.csv')

In [5]:
# Join the transaction log with each client table on client_id.
# The default inner join keeps only rows whose client_id occurs in both frames.
trans_train = transactions.merge(train, on='client_id')
trans_test = transactions.merge(test, on='client_id')

In [6]:
base_date = pd.to_datetime('2022-01-01')


def _add_event_time(frame, origin):
    """Add an ``event_time`` column to ``frame`` in place and drop the columns no longer needed.

    ``trans_time`` is cast to string and split on a single space into a day
    count and a clock time. Both parts are converted to timedeltas and added
    to ``origin``. Afterwards ``trans_time``, the temporary ``days``/``time``
    columns, ``trans_city`` and ``term_id`` are removed.
    """
    frame['trans_time'] = frame['trans_time'].astype(str)
    frame[['days', 'time']] = frame['trans_time'].str.split(' ', expand=True)
    frame['days'] = pd.to_timedelta(frame['days'].astype(int), unit='D')
    frame['time'] = pd.to_timedelta(frame['time'])
    frame['event_time'] = origin + frame['days'] + frame['time']
    frame.drop(
        columns=['trans_time', 'days', 'time', 'trans_city', 'term_id'],
        inplace=True,
    )


# Both frames are modified in place, so this cell cannot be re-run on its own output.
for frame in (trans_train, trans_test):
    _add_event_time(frame, base_date)

In [7]:
# Remove the gender column from the training transactions (in place) before preprocessing.
trans_train.drop(columns=['gender'], inplace=True)

In [8]:
# Column roles handed to the ptls preprocessor.
preprocessor_options = dict(
    col_id='client_id',
    col_event_time='event_time',
    cols_category=['mcc_code', 'trans_type'],
    cols_numerical=['amount'],
    return_records=True,
)
preprocessor = PandasDataPreprocessor(**preprocessor_options)

In [9]:
%%time

# Fit the preprocessor on the training transactions and transform them in the same call.
dataset_train = preprocessor.fit_transform(trans_train)
# Reuse the fitted preprocessor (transform only, no refit) for the test transactions.
dataset_test = preprocessor.transform(trans_test)

CPU times: total: 10.1 s
Wall time: 10.3 s


import pickle

# Persist the preprocessor object to disk so it can be reloaded later.
with open('/kaggle/working/preprocessor.p', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

In [10]:
def _client_id_of(record):
    """Sort key: the ``client_id`` entry of one preprocessed record."""
    return record['client_id']


# Order both record collections by client_id; `sorted` returns new lists.
dataset_train = sorted(dataset_train, key=_client_id_of)
dataset_test = sorted(dataset_test, key=_client_id_of)

from sklearn.model_selection import train_test_split

# Hold out 20% of the preprocessed training records with a fixed seed.
# The records to split live in `dataset_train`; the notebook defines no variable
# named `dataset`, so passing that name raises NameError on a fresh kernel.
train_cv, test_cv = train_test_split(dataset_train, test_size=0.2, random_state=42)

# Display the sizes of the two parts.
len(train_cv), len(test_cv)

In [11]:
def get_conf():
    """Build the encoder configuration as a pyhocon config object.

    Returns:
        The result of ``ConfigFactory.from_dict`` for a dict with two
        top-level keys, ``numeric_values`` and ``embeddings``.
    """
    # NOTE(review): {'identity'} is a one-element set, not the string 'identity' —
    # confirm this is the form the encoder expects.
    numeric_values = {'amount': {'identity'}}
    embeddings = {
        'trans_type': {'in': 100},
        'mcc_code': {'in': 200},
    }
    return ConfigFactory.from_dict({
        'numeric_values': numeric_values,
        'embeddings': embeddings,
    })

# The config entries are expanded into keyword arguments of the encoder.
seq_encoder = AggFeatureSeqEncoder(**get_conf())

# Optimizer and LR-scheduler factories; CoLESModule receives them as partials.
adam_factory = partial(torch.optim.Adam, lr=0.001)
step_lr_factory = partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=adam_factory,
    lr_scheduler_partial=step_lr_factory,
)

In [12]:
# In-memory dataset over the training records with a minimum-sequence-length filter.
memory_dataset = MemoryMapDataset(
    data=dataset_train,
    i_filters=[SeqLenFilter(min_seq_len=25)],
)

# Slice sampler used by ColesDataset as its splitter.
slice_sampler = SampleSlices(split_count=5, cnt_min=25, cnt_max=200)

# NOTE(review): the name `train_dl` is reassigned in the inference cell further down,
# and no `trainer.fit` call is visible in this notebook — confirm this module is still needed.
train_dl = PtlsDataModule(
    train_data=ColesDataset(memory_dataset, splitter=slice_sampler),
    train_num_workers=2,
    train_batch_size=256,
)

In [13]:
# Request one GPU when CUDA is available, otherwise pass gpus=0.
gpu_count = 1 if torch.cuda.is_available() else 0

trainer = pl.Trainer(max_epochs=30, gpus=gpu_count, enable_progress_bar=False)

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
# Sequences per inference batch, shared by the train and test passes.
# Lower this value if `trainer.predict` fails with a CUDA out-of-memory error.
INFERENCE_BATCH_SIZE = 256


def _embed_records(records, batch_size=INFERENCE_BATCH_SIZE, num_workers=0):
    """Run ``model`` over ``records`` with ``trainer.predict`` and stack the batch outputs.

    Args:
        records: preprocessed records passed to ``inference_data_loader``.
        batch_size: number of sequences per inference batch.
        num_workers: ``num_workers`` forwarded to ``inference_data_loader``.

    Returns:
        A ``(loader, embeddings)`` pair: the data loader that was used and one
        tensor holding all per-batch predictions stacked with ``torch.vstack``.
    """
    loader = inference_data_loader(records, num_workers=num_workers, batch_size=batch_size)
    return loader, torch.vstack(trainer.predict(model, loader))


# Same procedure for both record sets; the loaders stay available under their original names.
train_dl, train_embeds = _embed_records(dataset_train)
test_dl, test_embeds = _embed_records(dataset_test)

Missing logger folder: d:\Skoltech\SberHse hackaton\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.91 GiB (GPU 0; 4.00 GiB total capacity; 2.02 GiB already allocated; 806.65 MiB free; 2.03 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [27]:
def _embeddings_to_frame(embeds, records):
    """Build a DataFrame with one ``embed_<i>`` column per embedding dimension plus ``client_id``.

    Args:
        embeds: 2-D torch tensor of embeddings, one row per record.
        records: the records the embeddings were computed from, in the same
            order; each must provide a ``client_id`` entry.

    Returns:
        DataFrame with columns ``embed_0 .. embed_{d-1}`` followed by ``client_id``.

    Raises:
        ValueError: raised by pandas if ``records`` and ``embeds`` differ in length.
    """
    # Convert explicitly to NumPy so the columns are numeric regardless of how the
    # installed pandas version treats a torch tensor passed as `data`.
    values = embeds.detach().cpu().numpy()
    frame = pd.DataFrame(
        data=values,
        columns=[f'embed_{i}' for i in range(values.shape[1])],
    )
    frame['client_id'] = [record['client_id'] for record in records]
    return frame


train_df = _embeddings_to_frame(train_embeds, dataset_train)
test_df = _embeddings_to_frame(test_embeds, dataset_test)

print(train_df.shape, test_df.shape)

(7560, 907) (840, 907)


In [30]:
# Rebuild the merged training frame: `gender` was dropped from the earlier `trans_train`.
trans_train = transactions.merge(train, on='client_id')

In [31]:
# Keep the first row per client once, instead of de-duplicating the whole merged
# table twice, then map client_id -> gender from that single result.
first_row_per_client = trans_train.drop_duplicates('client_id')
client_gender_dict = pd.Series(
    first_row_per_client['gender'].values,
    index=first_row_per_client['client_id'],
).to_dict()

In [32]:
# Look up each client's gender by client_id; ids absent from the mapping become NaN.
train_df['gender'] = train_df['client_id'].map(client_gender_dict)
# test_df['gender'] = test_df['client_id'].map(client_gender_dict)

In [33]:
# Drop, in place, every training row that has a missing value in any column.
# NOTE(review): no `subset` is given, so a NaN in an embedding column removes the row too,
# not only a missing gender — confirm that is intended.
train_df.dropna(inplace=True)
# test_df.dropna(inplace=True)

In [34]:
# Feature columns are those whose name starts with 'embed'; the target is `gender`.
embed_columns = [column for column in train_df.columns if column.startswith('embed')]

x_train = train_df[embed_columns]
y_train = train_df['gender']
x_test = test_df[embed_columns]

In [37]:
from catboost import CatBoostClassifier, metrics

# Classifier hyper-parameters. `use_best_model` is not set.
catboost_params = {
    'iterations': 500,
    'learning_rate': 0.05,
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'random_seed': 42,
    'logging_level': 'Silent',
    'depth': 5,
}
CatBoostModel = CatBoostClassifier(**catboost_params)

In [38]:
# Fit on the training embeddings; no eval_set is supplied and plotting is off.
CatBoostModel.fit(x_train, y_train, plot=False)

<catboost.core.CatBoostClassifier at 0x790191fcfeb0>

In [42]:
# Column 1 of predict_proba for every row of x_test.
predictions = CatBoostModel.predict_proba(x_test)[:, 1]

# x_test rows follow test_df (built from records sorted by client_id), which is not
# guaranteed to match the row order of the submission template. Align by client_id
# rather than by position. Assumes client_id is unique in test_df.
proba_by_client = pd.Series(predictions, index=test_df['client_id'].values)
test_sample_submission['probability'] = (
    test_sample_submission['client_id']
    .map(proba_by_client)
    # Template clients without a prediction keep the template's own probability.
    .fillna(test_sample_submission['probability'])
)
test_sample_submission.to_csv('/kaggle/working/result.csv', index=False)