In [2]:
import os 

# Enter the project directory only once: re-running this cell without a kernel
# restart would otherwise try to descend into a non-existent "app/app" and fail.
if os.path.basename(os.getcwd()) != "app":
    os.chdir("app")

In [3]:
import pandas as pd
import numpy as np

from ptls.preprocessing import PandasDataPreprocessor
import torch
import pytorch_lightning as pl

from pytorch_lightning.callbacks import ModelCheckpoint

from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder, AggFeatureSeqEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.data_load.datasets import MemoryMapDataset
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices, NoSplit
from ptls.frames import PtlsDataModule
from ptls.data_load.datasets import inference_data_loader

from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

from dataset import SberDataset
from models import RNNModel, BOTModel

In [4]:
# Locations (relative to the current working directory) of the input CSV files
# and of the exported model weights.
path_to_data = 'data'
path_to_checkpoints = 'checkpoints'

# torch.save() does not create missing parent directories, so make sure the
# export directory exists before any training starts; otherwise a finished
# training run would be followed by a failed save.
os.makedirs(path_to_checkpoints, exist_ok=True)

In [5]:
# Client-level tables: only the listed columns are read from each file.
train = pd.read_csv(f'{path_to_data}/train.csv', usecols=['client_id', 'gender'])
test = pd.read_csv(f'{path_to_data}/test.csv', usecols=['client_id'])
test_sample_submission = pd.read_csv(
    f'{path_to_data}/test_sample_submission.csv',
    usecols=['client_id', 'probability'],
)

# Reference tables; these files use ';' as the field separator.
mcc_codes = pd.read_csv(f'{path_to_data}/mcc_codes.csv', sep=';')
trans_types = pd.read_csv(f'{path_to_data}/trans_types.csv', sep=';')

# Full transaction log (all columns).
transactions = pd.read_csv(f'{path_to_data}/transactions.csv')

In [6]:
# Join the transaction log with each client table on `client_id` (default
# inner join); the train join also attaches the `gender` column.
trans_train = transactions.merge(train, on='client_id')
trans_test = transactions.merge(test, on='client_id')

In [7]:
# Preview the first rows of the merged training transactions.
trans_train.head()

Unnamed: 0,client_id,trans_time,mcc_code,trans_type,amount,term_id,trans_city,gender
0,d1bbbc9a0e0410d3cf12a3d2f44f3450,35 08:24:41,4829,2370,-1808.56,,Tver,0
1,d1bbbc9a0e0410d3cf12a3d2f44f3450,105 12:57:32,4829,2370,-3390.41,,Tver,0
2,d1bbbc9a0e0410d3cf12a3d2f44f3450,455 19:32:01,4814,1030,-144.5,889003.0,Tver,0
3,d1bbbc9a0e0410d3cf12a3d2f44f3450,83 09:22:26,6011,2010,-3542.3,,Tver,0
4,d1bbbc9a0e0410d3cf12a3d2f44f3450,74 13:31:57,6011,2010,-3542.7,,Tver,0


In [8]:
# `trans_time` holds "<day number> <HH:MM:SS>"; it is converted to an absolute
# timestamp by adding both parts to this fixed anchor date.
base_date = pd.to_datetime('2022-01-01')

# Both frames are modified in place, so this cell can only be run once per
# load of the data (the source columns are dropped at the end).
for df in (trans_train, trans_test):
    # Split "<day> <clock>" into two temporary columns.
    df[['days', 'time']] = df['trans_time'].astype(str).str.split(' ', expand=True)

    day_offset = pd.to_timedelta(df['days'].astype(int), unit='D')
    clock_offset = pd.to_timedelta(df['time'])
    df['event_time'] = base_date + day_offset + clock_offset

    # Calendar features taken from the reconstructed timestamp.
    event_time = df['event_time']
    df['hour'] = event_time.dt.hour
    df['minute'] = event_time.dt.minute
    df['day_of_week'] = event_time.dt.day_of_week

    # Amount split by sign: each column is zero where the other is non-zero.
    amount = df['amount']
    df['amount_pos'] = amount * (amount >= 0)
    df['amount_neg'] = -amount * (amount < 0)

    # Flag rows where no terminal id is present.
    df['is_na_term_id'] = df['term_id'].isna()

    # Remove the raw / temporary columns that were only needed above.
    df.drop(columns=['trans_time', 'days', 'time', 'term_id'], inplace=True)


In [9]:
# Configure the ptls preprocessor that turns the flat transaction table into
# per-client data keyed by `client_id` and timed by `event_time`.
# `gender` is passed as a "first item" column; later cells read it as a single
# value per record (cl['gender']).
# NOTE(review): the engineered columns `amount_pos`, `amount_neg` and
# `is_na_term_id` are not listed in any cols_* argument here - confirm whether
# they are meant to be used.
preprocessor = PandasDataPreprocessor(
    col_id='client_id',
    col_event_time = 'event_time',
    cols_category=['mcc_code','trans_type', 'trans_city', 'day_of_week', 'hour'],
    cols_numerical=['amount', 'minute'],
    cols_first_item=['gender'],
    return_records=True,
)

In [10]:
%%time

# Fit the preprocessor on the training transactions only, then apply the
# already fitted preprocessor to the test transactions.
dataset_train = preprocessor.fit_transform(trans_train)
dataset_test = preprocessor.transform(trans_test)

CPU times: user 21.4 s, sys: 3.59 s, total: 25 s
Wall time: 25 s


In [11]:
from sklearn.model_selection import train_test_split

# Split the preprocessed training records 80 / 10 / 10 into train, validation
# and hold-out test parts (20% is held back, then halved), with a fixed seed.
# NOTE: this rebinds `train` and `test`, which until now referred to the
# DataFrames read from train.csv / test.csv; cells above that use those
# DataFrames cannot be re-run after this cell without reloading the data.
train, valid_test = train_test_split(dataset_train, test_size=0.2, random_state=42)
valid, test = train_test_split(valid_test, test_size=0.5, random_state=42)

# Display the size of each part.
len(train), len(valid), len(test)

(6048, 756, 756)

# AggFeatureSeqEncoder + LightGBM

In [12]:
# Configuration of the aggregate-feature encoder: one numeric column and five
# categorical columns, each with an 'in' size.
# NOTE(review): the value for 'amount' is a one-element set ({'identity'}),
# whereas the TrxEncoder configs further down pass the plain string
# 'identity' - confirm which form AggFeatureSeqEncoder expects before changing.
params = {
    'numeric_values': {
        'amount': {'identity'},
    },
    'embeddings': {
        'trans_type': {'in': 100},
        'mcc_code': {'in': 200},
        'trans_city': {'in': 15},
        'day_of_week': {'in': 10},
        'hour': {'in': 24},
    },
}

seq_encoder = AggFeatureSeqEncoder(**params)

# The encoder is wrapped in a CoLESModule so it can be passed to
# trainer.predict(); this model is never passed to trainer.fit() in this
# notebook, so the optimizer / scheduler settings are presumably unused.
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [13]:
# Trainer used only to run `predict` for the aggregate-feature model.
# The device is selected with `accelerator` / `devices`, like the other
# trainers in this notebook, instead of the legacy `gpus` argument (deprecated
# and later removed in PyTorch Lightning). One GPU is used when CUDA is
# available, otherwise the CPU.
trainer = pl.Trainer(
    max_epochs=30,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    devices=1,
    enable_progress_bar=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
def _agg_embeddings(records):
    """Return (loader, stacked predictions of `model`) for the given records."""
    loader = inference_data_loader(records, num_workers=0, batch_size=16)
    return loader, torch.vstack(trainer.predict(model, loader))


# Embed the train part first, then the hold-out test part.
train_dl, train_embeds = _agg_embeddings(train)
test_dl, test_embeds = _agg_embeddings(test)

In [15]:
# Feature matrices: the embedding tensors as NumPy arrays.
X_train, X_test = train_embeds.numpy(), test_embeds.numpy()

# Targets: the `gender` value of every record, in record order.
y_train = np.array([record['gender'] for record in train])
y_test = np.array([record['gender'] for record in test])

In [16]:
# Gradient-boosting classifier on top of the aggregate features
# (100 trees, library logging silenced).
lgbm = LGBMClassifier(n_estimators=100, verbose=-1)
lgbm.fit(X_train, y_train)

In [17]:
# ROC AUC computed from the predicted probability of class 1.
train_auc = roc_auc_score(y_train, lgbm.predict_proba(X_train)[:, 1])
print(f'Train score: {train_auc:.4f}')

test_auc = roc_auc_score(y_test, lgbm.predict_proba(X_test)[:, 1])
print(f'Test score: {test_auc:.4f}')

Train score: 0.9985
Test score: 0.8751


# CoLES embeddings + LightGBM

In [19]:
def _coles_dataset(records):
    """Wrap preprocessed client records into a CoLES dataset.

    Records pass through ``SeqLenFilter(min_seq_len=25)`` and are split with
    ``SampleSlices(split_count=5, cnt_min=25, cnt_max=400)``.
    """
    filtered = MemoryMapDataset(
        data=records,
        i_filters=[SeqLenFilter(min_seq_len=25)],
    )
    slicer = SampleSlices(split_count=5, cnt_min=25, cnt_max=400)
    return ColesDataset(filtered, splitter=slicer)


# Train and validation parts go through the identical dataset pipeline.
train_dl = PtlsDataModule(
    train_data=_coles_dataset(train),
    valid_data=_coles_dataset(valid),
    train_num_workers=2,
    train_batch_size=256,
    valid_batch_size=256,
)

In [20]:
# Per-transaction encoder: one embedding table per categorical column plus the
# raw `amount` value.
# 'in' / 'out' are presumably the dictionary size and the embedding width -
# confirm against the ptls TrxEncoder docs.
# NOTE(review): `hour` can take 24 distinct values; if the preprocessor
# reserves an id (e.g. for padding), 'in': 24 may be one short - confirm.
trx_encoder = TrxEncoder(
    embeddings={
        'trans_type': {'in': 100, 'out': 16},
        'mcc_code': {'in': 200, 'out': 24},
        'trans_city': {'in': 15, 'out': 4},
        'day_of_week': {'in': 10, 'out': 4},
        'hour': {'in': 24, 'out': 4},
    },
    numeric_values={
        'amount': 'identity',
    },
    linear_projection_size=32,
    use_batch_norm_with_lens=True,
    embeddings_noise=1e-3,
)

# LSTM over the encoded transactions; `is_reduce_sequence=True` presumably
# makes the encoder return one vector per sequence - confirm.
seq_encoder = RnnSeqEncoder(
    trx_encoder,
    hidden_size=64,
    type='lstm',
    is_reduce_sequence=True
)

# CoLES module trained with Adam; the learning rate is multiplied by 0.99
# whenever the plateau scheduler triggers.
coles = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.ReduceLROnPlateau, factor=.99),
)

In [21]:
# Track the metric named by `coles.metric_name` and checkpoint on its maximum.
checkpoint = ModelCheckpoint(
    monitor=coles.metric_name,
    mode='max'
)

# NOTE: unlike the first trainer in this notebook, this one hard-codes GPU
# device 0 and has no CPU fallback.
trainer = pl.Trainer(
    max_epochs=50,
    accelerator='gpu',
    devices=[0],
    callbacks=[checkpoint]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train the CoLES module; `train_dl` is the PtlsDataModule built above
# (despite its name it carries both the train and the validation data).
trainer.fit(coles, train_dl)

In [23]:
# Best value of the monitored metric recorded by the checkpoint callback.
checkpoint.best_model_score

tensor(0.9506, device='cuda:0')

In [24]:
# Restore the weights stored in the best checkpoint file into `coles`.
coles.load_state_dict(torch.load(checkpoint.best_model_path)['state_dict'])

<All keys matched successfully>

In [25]:
# Export only the sequence encoder's weights (not the whole CoLES module).
# The target directory must already exist.
torch.save(coles.seq_encoder.state_dict(), f'{path_to_checkpoints}/coles.pth')

In [None]:
def _coles_embeddings(records):
    """Return (loader, stacked predictions of `coles`) for the given records."""
    loader = inference_data_loader(records, num_workers=0, batch_size=16)
    return loader, torch.vstack(trainer.predict(coles, loader))


# Embed the train part first, then the hold-out test part.
train_dl, train_embeds = _coles_embeddings(train)
test_dl, test_embeds = _coles_embeddings(test)

In [27]:
# Feature matrices: the CoLES embedding tensors as NumPy arrays.
X_train, X_test = train_embeds.numpy(), test_embeds.numpy()

# Targets: the `gender` value of every record, in record order.
y_train = np.array([record['gender'] for record in train])
y_test = np.array([record['gender'] for record in test])

In [28]:
# Same classifier settings as for the aggregate features, now fitted on the
# CoLES embeddings.
lgbm = LGBMClassifier(n_estimators=100, verbose=-1)
lgbm.fit(X_train, y_train)

In [29]:
# ROC AUC computed from the predicted probability of class 1.
train_auc = roc_auc_score(y_train, lgbm.predict_proba(X_train)[:, 1])
print(f'Train score: {train_auc:.4f}')

test_auc = roc_auc_score(y_test, lgbm.predict_proba(X_test)[:, 1])
print(f'Test score: {test_auc:.4f}')

Train score: 0.9858
Test score: 0.7297


# RNN

In [30]:
# Data module for the supervised RNN model, built on the project-local
# SberDataset. Both parts drop records via SeqLenFilter(min_seq_len=25).
# The train part is split with SampleSlices (presumably 5 random slices of
# 25-200 events per client - confirm against ptls docs); the validation part
# uses NoSplit.
datamodule = PtlsDataModule(
    train_data=SberDataset(
        data=train,
        i_filters=[
            SeqLenFilter(min_seq_len=25),
        ],
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    valid_data=SberDataset(
        data=valid,
        i_filters=[
            SeqLenFilter(min_seq_len=25),
        ],
        splitter=NoSplit()
    ),
    train_num_workers=1,
    train_batch_size=128,
    valid_batch_size=16,
)

In [31]:
# Fresh transaction encoder with the same settings as the one used for CoLES.
# NOTE(review): `hour` can take 24 distinct values; if the preprocessor
# reserves an id (e.g. for padding), 'in': 24 may be one short - confirm.
trx_encoder = TrxEncoder(
    embeddings={
        'trans_type': {'in': 100, 'out': 16},
        'mcc_code': {'in': 200, 'out': 24},
        'trans_city': {'in': 15, 'out': 4},
        'day_of_week': {'in': 10, 'out': 4},
        'hour': {'in': 24, 'out': 4},
    },
    numeric_values={
        'amount': 'identity',
    },
    linear_projection_size=32,
    use_batch_norm_with_lens=True,
    embeddings_noise=1e-3,
)

seq_encoder = RnnSeqEncoder(
    trx_encoder,
    hidden_size=64,
    type='lstm',
    is_reduce_sequence=True
)

# Optionally, the encoder can be initialised with the pretrained CoLES weights
# exported earlier in this notebook:
# seq_encoder.load_state_dict(torch.load(f'{path_to_checkpoints}/coles.pth'))

# Project-local supervised model: the sequence encoder followed by a head
# configured through `head_hidden` and `dropout`.
rnn_model = RNNModel(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.ReduceLROnPlateau, factor=.99),
    head_hidden=128,
    dropout=0.1
)

In [32]:
# Checkpoint on the maximum of the 'valid_auc' metric (presumably logged by
# RNNModel during validation - confirm in models.py).
checkpoint = ModelCheckpoint(
    monitor='valid_auc',
    mode='max'
)

# NOTE: hard-codes GPU device 0 and has no CPU fallback.
trainer = pl.Trainer(
    max_epochs=50,
    accelerator='gpu',
    devices=[0],
    callbacks=[checkpoint]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train the supervised RNN model on the data module defined above.
trainer.fit(rnn_model, datamodule)

In [34]:
# Best 'valid_auc' recorded by the checkpoint callback.
checkpoint.best_model_score

tensor(0.8481, device='cuda:0')

In [35]:
# Restore the weights stored in the best checkpoint file into `rnn_model`.
rnn_model.load_state_dict(torch.load(checkpoint.best_model_path)['state_dict'])

<All keys matched successfully>

In [36]:
# Export the full model weights; the target directory must already exist.
torch.save(rnn_model.state_dict(), f'{path_to_checkpoints}/rnn.pth')

In [None]:
# Ground-truth labels, in record order.
y_train = [record['gender'] for record in train]
y_test = [record['gender'] for record in test]

# Model outputs for the train part ...
train_dl = inference_data_loader(train, batch_size=16)
# ... and for the hold-out test part.
test_dl = inference_data_loader(test, batch_size=16)

# Per-batch predictions are concatenated into one NumPy array per part.
y_train_pred = torch.cat(trainer.predict(rnn_model, train_dl)).numpy()
y_test_pred = torch.cat(trainer.predict(rnn_model, test_dl)).numpy()

In [38]:
# ROC AUC of the model outputs against the true labels.
train_auc = roc_auc_score(y_train, y_train_pred)
print(f'Train score: {train_auc:.4f}')

test_auc = roc_auc_score(y_test, y_test_pred)
print(f'Test score: {test_auc:.4f}')

Train score: 0.8919
Test score: 0.8446


# BoT (bag of transactions) 

In [39]:
# Data module for the bag-of-transactions model; same settings as the one
# used for the RNN model. Both parts drop records via
# SeqLenFilter(min_seq_len=25); the train part is split with SampleSlices, the
# validation part uses NoSplit.
datamodule = PtlsDataModule(
    train_data=SberDataset(
        data=train,
        i_filters=[
            SeqLenFilter(min_seq_len=25),
        ],
        splitter=SampleSlices(
            split_count=5,
            cnt_min=25,
            cnt_max=200,
        ),
    ),
    valid_data=SberDataset(
        data=valid,
        i_filters=[
            SeqLenFilter(min_seq_len=25),
        ],
        splitter=NoSplit()
    ),
    train_num_workers=1,
    train_batch_size=128,
    valid_batch_size=16,
)

In [40]:
# Fresh transaction encoder with the same settings as in the previous sections.
# NOTE(review): `hour` can take 24 distinct values; if the preprocessor
# reserves an id (e.g. for padding), 'in': 24 may be one short - confirm.
trx_encoder = TrxEncoder(
    embeddings={
        'trans_type': {'in': 100, 'out': 16},
        'mcc_code': {'in': 200, 'out': 24},
        'trans_city': {'in': 15, 'out': 4},
        'day_of_week': {'in': 10, 'out': 4},
        'hour': {'in': 24, 'out': 4},
    },
    numeric_values={
        'amount': 'identity',
    },
    linear_projection_size=32,
    use_batch_norm_with_lens=True,
    embeddings_noise=1e-3,
)

# Project-local bag-of-transactions model: it receives the transaction encoder
# directly (no RNN sequence encoder is built in this section).
bot_model = BOTModel(
    trx_encoder=trx_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.ReduceLROnPlateau, factor=.99),
    head_hidden=128,
    dropout=0.1
)

In [41]:
# Checkpoint on the maximum of the 'valid_auc' metric (presumably logged by
# BOTModel during validation - confirm in models.py).
checkpoint = ModelCheckpoint(
    monitor='valid_auc',
    mode='max'
)

# NOTE: hard-codes GPU device 0 and has no CPU fallback.
trainer = pl.Trainer(
    max_epochs=50,
    accelerator='gpu',
    devices=[0],
    callbacks=[checkpoint]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train the bag-of-transactions model on the data module defined above.
trainer.fit(bot_model, datamodule)

In [43]:
# Best 'valid_auc' recorded by the checkpoint callback.
checkpoint.best_model_score

tensor(0.8642, device='cuda:0')

In [44]:
# Restore the weights stored in the best checkpoint file into `bot_model`.
bot_model.load_state_dict(torch.load(checkpoint.best_model_path)['state_dict'])

<All keys matched successfully>

In [45]:
# Export the full model weights; the target directory must already exist.
torch.save(bot_model.state_dict(), f'{path_to_checkpoints}/bot.pth')

In [None]:
# Ground-truth labels, in record order.
y_train = [record['gender'] for record in train]
y_test = [record['gender'] for record in test]

# Model outputs for the train part ...
train_dl = inference_data_loader(train, batch_size=16)
# ... and for the hold-out test part.
test_dl = inference_data_loader(test, batch_size=16)

# Per-batch predictions are concatenated into one NumPy array per part.
y_train_pred = torch.cat(trainer.predict(bot_model, train_dl)).numpy()
y_test_pred = torch.cat(trainer.predict(bot_model, test_dl)).numpy()

In [47]:
# ROC AUC of the model outputs against the true labels.
train_auc = roc_auc_score(y_train, y_train_pred)
print(f'Train score: {train_auc:.4f}')

test_auc = roc_auc_score(y_test, y_test_pred)
print(f'Test score: {test_auc:.4f}')

Train score: 0.8950
Test score: 0.8568
