In [21]:
from google.colab import drive
import os
import sys
if 'google.colab' in str(get_ipython()):
  if 'drive' not in os.listdir():
      drive.mount('/content/drive')
sys.path.append('/content/drive/MyDrive/pytorch-lifestream/seq_encoders')

# Скрипт по обучению coles модели

## Импорт модулей и подгрузка конфига

In [22]:
# !pip install -q pytorch-lifestream==0.5.2
# !pip install -q pytorch-lightning==1.6.*
# !pip install -q catboost
# !pip install -q einops

In [23]:
from functools import partial
import os
import yaml
import joblib
import gc

from catboost import CatBoostClassifier, metrics

import torchmetrics
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

import torch
import warnings
from torch import nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from ptls.nn import TrxEncoder, RnnSeqEncoder,TransformerSeqEncoder,TransformerEncoder
from ptls.frames.coles import CoLESModule
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import ISeqLenLimit,FeatureFilter,SeqLenFilter
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices,SampleUniform
from ptls.frames import PtlsDataModule
from ptls.frames.inference_module import InferenceModule
from ptls.data_load.utils import collate_feature_dict
from ptls.data_load.datasets import inference_data_loader
import ptls

# import own models
from simple_seq_encoder import SimpleSeqEncoder
from rmt import ReccurentMemoryTransformerEncoder

import logging

import warnings
warnings.filterwarnings('ignore')

In [24]:
# sberhack_gender_prediction_data, rosbank_churn_prediction_data, datafusion_churn_prediction_data
# ['transformer','rmt']

data_type = 'datafusion_churn_prediction_data'
model_type = 'transformer'
list(filter(lambda x: model_type in x and data_type.replace('_data','') in x and 'coles' in x,
            os.listdir('drive/My Drive/pytorch-lifestream/configs')))

['datafusion_churn_predictioncoles_transformer_64.yaml',
 'datafusion_churn_predictioncoles_transformer_128.yaml']

In [25]:
config_name = 'datafusion_churn_predictioncoles_transformer_128.yaml'
path_to_working_directory = 'drive/My Drive/pytorch-lifestream'

with open(os.path.join(path_to_working_directory,'configs',config_name),'r') as f:
  model_config = yaml.safe_load(f)

with open(os.path.join(path_to_working_directory,'configs',model_config['data_config']),'r') as f:
  data_config = yaml.safe_load(f)

In [26]:
# опредедяем i filters и splitter
i_filters = [SeqLenFilter(min_seq_len=model_config['min_seq_len']),FeatureFilter(drop_feature_names=[data_config['target_col']])]
splitter = SampleSlices(**model_config['splitter_params'])

##Загрузка предобработанных данных

In [27]:
df_data_train = joblib.load(os.path.join(data_config['path_folder'],'train_'+model_config['data_config'].replace('yaml','pickle')))
df_data_valid = joblib.load(os.path.join(data_config['path_folder'],'valid_'+model_config['data_config'].replace('yaml','pickle')))
# df_data_test = joblib.load(os.path.join(data_config['path_folder'],'test_'+model_config['data_config'].replace('yaml','pickle')))
preprocessor = joblib.load(os.path.join(data_config['path_folder'],'preprocessor_'+model_config['data_config'].replace('yaml','pickle')))

## Построение моделей

In [28]:
# если надо поменять, то сами ручками в файле все меняем
if data_config.get('trx_embed_dim',False) == False:
    cat_feature_params = {k: {'in' : v, 'out' : v // model_config['rnn_config']['category_emb_dim_reduction']} for k,v in preprocessor.get_category_dictionary_sizes().items()}
else:
    cat_feature_params = data_config['trx_embed_dim']

if (sum([v['out'] for k,v in cat_feature_params.items()]) + len(data_config['numeric_cols'])) % 8 != 0:
    cat_feature_params[list(cat_feature_params.keys())[0]]['out'] += (8 - (sum([v['out'] for k,v in cat_feature_params.items()]) + len(data_config['numeric_cols'])) % 8)

num_feature_params = {f:'identity' for f in data_config['numeric_cols']}
trx_encoder_params = dict(
    embeddings_noise=0.001,
    numeric_values=num_feature_params,
    embeddings=cat_feature_params)

trx_encoder = TrxEncoder(**trx_encoder_params)

if model_config['rnn_config']['rnn_type'] == 'rmt':
    encoder = ReccurentMemoryTransformerEncoder(
            input_size=trx_encoder.output_size,
            num_memory_tokens=model_config['rnn_config']['hidden_state'],
            n_layers=4,
            max_seq_len=5000,
            dim_hidden = model_config['rnn_config']['hidden_state'],
            n_heads=4,
            ff_mult=4,
            attn_dropout=0.,
            ff_dropout=0.,
            use_flash_attn=False,
            abs_pos_emb=True,
            rotary_pos_emb=False,
            token_shift=True,
            is_reduce_sequence=True,
    )
    seq_encoder = SimpleSeqEncoder(
    trx_encoder=trx_encoder,
    seq_encoder=encoder)

elif model_config['rnn_config']['rnn_type'] == 'transformer':
    seq_encoder = TransformerSeqEncoder(
        trx_encoder=trx_encoder,
        input_size=trx_encoder.output_size,
        is_reduce_sequence=True,
        **dict(n_heads=8, dim_hidden=model_config['rnn_config']['hidden_state'], dropout=0, n_layers=6)
    )

# coles model
coles_model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.NAdam),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.ReduceLROnPlateau, mode='min', factor=0.2, patience=2)
)
# partial(torch.optim.lr_scheduler.StepLR, step_size=20, gamma=0.9) -
# ошибка MisconfigurationException: The provided lr scheduler `StepLR` doesn't follow PyTorch's LRScheduler API. You should override the `LightningModule.lr_scheduler_step` hook with your own logic if you are using a custom LR scheduler.

train_dl = PtlsDataModule(
    train_data = ColesDataset(MemoryMapDataset(data=df_data_train,i_filters=i_filters,),splitter=splitter),
    valid_data = ColesDataset(MemoryMapDataset(data=df_data_valid,i_filters=i_filters,),splitter=splitter),
    train_num_workers=model_config['num_workers'],
    train_batch_size=16#model_config['batch_size'],
)

## Обучение модели

In [29]:
early_stop_callback = EarlyStopping(monitor='recall_top_k', min_delta=0.005, patience=5, verbose=False, mode='max')
trainer = pl.Trainer(
    max_epochs=100,
    gpus= 1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
    callbacks = [early_stop_callback]
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [30]:
%%time
gc.collect()
torch.cuda.empty_cache()
trainer.fit(coles_model, train_dataloaders=train_dl)

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name               | Type                  | Params
-------------------------------------------------------------
0 | _loss              | ContrastiveLoss       | 0     
1 | _seq_encoder       | TransformerSeqEncoder | 81.8 K
2 | _validation_metric | BatchRecallTopK       | 0     
3 | _head              | Head                  | 0     
-------------------------------------------------------------
81.8 K    Trainable params
0         Non-trainable params
81.8 K    Total params
0.327     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

CPU times: user 13min 32s, sys: 16.7 s, total: 13min 49s
Wall time: 13min 2s


In [31]:
# torch.save(coles_model.state_dict(), os.path.join(path_to_working_directory,'models',config_name.replace('yaml','pt')))

## Сохранение эмбеддингов

In [32]:
def embedding_dataset(coles_model,prep_records_data,trainer,id_col,target_col=None):

  dl = inference_data_loader(prep_records_data, num_workers=1, batch_size=16)
  embeds = torch.vstack(trainer.predict(coles_model, dl))

  df = pd.DataFrame(data=embeds, columns=[f'embed_{i}' for i in range(embeds.shape[1])])
  df[id_col] = [x[id_col] for x in prep_records_data]
  if target_col:
    df[target_col] = [x[target_col] for x in prep_records_data]

  return df.drop_duplicates(subset=id_col)

In [33]:
gc.collect()
torch.cuda.empty_cache()
train_embded_df = embedding_dataset(coles_model,
                                    list(filter(lambda x: x['event_time'].shape[0] < 5000,df_data_train)),trainer,data_config['id_col'],data_config['target_col'])
valid_embded_df = embedding_dataset(coles_model,
                                    list(filter(lambda x: x['event_time'].shape[0] < 5000,df_data_valid)),trainer,data_config['id_col'],data_config['target_col'])
# test_embded_df = embedding_dataset(coles_model,df_data_test[:4],trainer,data_config['id_col'])

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 941it [00:00, ?it/s]

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 941it [00:00, ?it/s]

In [34]:
# train_embded_df.to_csv(os.path.join(data_config['path_folder'],'train_emb_'+config_name.replace('yaml','csv')),encoding='utf-8',index=False)
# valid_embded_df.to_csv(os.path.join(data_config['path_folder'],'valid_emb_'+config_name.replace('yaml','csv')),encoding='utf-8',index=False)
# test_embded_df.to_csv(os.path.join(data_config['path_folder'],'test_emb_'+config_name.replace('yaml','csv')),encoding='utf-8',index=False)

Оценим при помощи catboost наши эмбеддинги - код с примеров быстро вставил

In [35]:
from catboost import CatBoostClassifier, metrics

In [36]:
y_train = train_embded_df[data_config['target_col']].values
X_train = train_embded_df.drop([data_config['target_col'],data_config['id_col']], axis=1)
# if model_config['rnn_config']['rnn_type'] == 'gru':
X_train_emb = X_train
# else:
#     X_train_emb = pd.DataFrame(np.arange(len(X_train)))
#     X_train_emb['embeddings'] = X_train.values.tolist()
#     X_train_emb = X_train_emb.drop([0], axis=1)

y_val = valid_embded_df[data_config['target_col']].values
X_val = valid_embded_df.drop([data_config['target_col'],data_config['id_col']], axis=1)
# if model_config['rnn_config']['rnn_type'] == 'gru':
X_val_emb = X_val
# else:
#     X_val_emb = pd.DataFrame(np.arange(len(X_val)))
#     X_val_emb['embeddings'] = X_val.values.tolist()
#     X_val_emb = X_val_emb.drop([0], axis=1)

# if model_config['rnn_config']['rnn_type'] == 'gru':
CatBoostModel_emb = CatBoostClassifier(
  devices='gpu',
  iterations=1000,
  learning_rate=0.05,
  use_best_model=True,
  custom_metric=[metrics.Accuracy(),metrics.AUC()],
  random_seed=42,
  logging_level='Silent',
  # embedding_features=['embeddings'],
  depth=5
)
# else:
#     CatBoostModel_emb = CatBoostClassifier(
#         devices='gpu',
#         iterations=1000,
#         learning_rate=0.05,
#         use_best_model=True,
#         custom_metric=[metrics.Accuracy(),metrics.AUC()],
#         random_seed=42,
#         logging_level='Silent',
#         embedding_features=['embeddings'],
#         depth=5
#     )

CatBoostModel_emb.fit(
    X_train_emb, y_train,
    eval_set=(X_val_emb, y_val)
)

<catboost.core.CatBoostClassifier at 0x7f252008ac50>

In [37]:
from sklearn.metrics import roc_auc_score, accuracy_score,f1_score

val_preds = CatBoostModel_emb.predict(X_val_emb)
val_preds_proba = CatBoostModel_emb.predict_proba(X_val_emb)[:,1]
train_preds = CatBoostModel_emb.predict(X_train_emb)
train_preds_proba = CatBoostModel_emb.predict_proba(X_train_emb)[:,1]

results = [
            {'dataset':'train',
            'roc-auc':round(roc_auc_score(y_train,train_preds_proba,),3),
            'accuracy':round(accuracy_score(y_train,train_preds),3),
            'f1':round(f1_score(y_train,train_preds,average='micro'),3),
            'model_type' : model_config['rnn_config']['rnn_type'],
            'hidden_size' :  model_config['rnn_config']['hidden_state'],
            'num_layers' : 1,
            'bidir' : False,
            },
            {'dataset':'valid',
            'roc-auc':round(roc_auc_score(y_val,val_preds_proba),3),
            'accuracy':round(accuracy_score(y_val,val_preds),3),
            'f1':round(f1_score(y_val,val_preds,average='micro'),3),
            'model_type' : model_config['rnn_config']['rnn_type'],
            'hidden_size' :  model_config['rnn_config']['hidden_state'],
            'num_layers' : 1,
            'bidir' : False,
            }
           ]

pd.DataFrame(results)

Unnamed: 0,dataset,roc-auc,accuracy,f1,model_type,hidden_size,num_layers,bidir
0,train,0.746,0.92,0.92,transformer,128,1,False
1,valid,0.69,0.918,0.918,transformer,128,1,False


In [38]:
pd.DataFrame(results).to_csv(os.path.join(path_to_working_directory,'results','metrics_'+config_name.replace('yaml','csv')),
                             encoding='utf-8',index=False)