# Скрипт по обучению supervised модели

## Импорт модулей и подгрузка конфига

In [None]:
import os
import sys

# Mount Google Drive only when running inside Colab. The Colab-only import is
# deferred until the environment check has passed, so this cell no longer dies
# with ModuleNotFoundError elsewhere. The check uses `sys.modules` (the Colab
# kernel has `google.colab` loaded) and therefore also works outside IPython,
# where `get_ipython` is undefined.
if 'google.colab' in sys.modules:
    # Skip mounting when a `drive` entry already exists in the working directory.
    if 'drive' not in os.listdir():
        from google.colab import drive
        drive.mount('/content/drive')

# Make the project's own sequence-encoder modules importable.
sys.path.insert(0, '/content/drive/MyDrive/pytorch-lifestream/seq_encoders')

Mounted at /content/drive


In [None]:
# !pip install -q pytorch-lifestream==0.5.2
# !pip install -q pytorch-lightning==1.6.*
# !pip install -q s5-pytorch

In [None]:
from functools import partial
import os
import yaml
import joblib
import gc

import torchmetrics
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from ptls.nn import TrxEncoder, RnnSeqEncoder, Head,TransformerSeqEncoder
from ptls.frames.supervised import SeqToTargetDataset, SequenceToTarget
from ptls.frames import PtlsDataModule
from ptls.data_load.iterable_processing import ISeqLenLimit,FeatureFilter,SeqLenFilter
from ptls.data_load.utils import collate_feature_dict
from ptls.frames import PtlsDataModule
from ptls.data_load.utils import collate_feature_dict
from ptls.frames.inference_module import InferenceModule
import ptls
from ptls.preprocessing import PandasDataPreprocessor
from ptls.data_load.datasets import MemoryMapDataset


# import own models
from simple_seq_encoder import SimpleSeqEncoder
from new_rnn_encoder import NewRnnEncoder

import logging
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Candidate values for data_type: sberhack_gender_prediction_data, rosbank_churn_prediction_data,
# sber_age_prediction_data, datafusion_churn_prediction_data
# Model names as they appear in config file names:
# ['gru','lstm','cornn','urnn','indrnn','qrnn','lmu','lem','lru','s5']

# Very slow to train: 'cornn','urnn','lmu', 'lru'
data_type = 'rosbank_churn_prediction_data'
model_type = '128'
# Show the config files whose name contains the model tag, the dataset prefix
# (data_type without the trailing 'data') and the 's2t' marker.
[
    config_file
    for config_file in os.listdir('drive/My Drive/pytorch-lifestream/configs')
    if model_type in config_file
    and data_type.replace('data', '') in config_file
    and 's2t' in config_file
]

['rosbank_churn_prediction_s2t_gru_128_1.yaml',
 'rosbank_churn_prediction_s2t_lstm_128_1.yaml',
 'rosbank_churn_prediction_s2t_cornn_128_1.yaml',
 'rosbank_churn_prediction_s2t_urnn_128_1.yaml',
 'rosbank_churn_prediction_s2t_indrnn_128_1.yaml',
 'rosbank_churn_prediction_s2t_qrnn_128_1_1.yaml',
 'rosbank_churn_prediction_s2t_lmu_128_1.yaml',
 'rosbank_churn_prediction_s2t_lem_128_1.yaml',
 'rosbank_churn_prediction_s2t_lru_128_1.yaml',
 'rosbank_churn_prediction_s2t_s5_128_1.yaml']

In [None]:
config_name = 'rosbank_churn_prediction_s2t_gru_128_1.yaml'
path_to_working_directory = 'drive/My Drive/pytorch-lifestream'

# Both YAML files live in the `configs` folder of the working directory.
configs_dir = os.path.join(path_to_working_directory, 'configs')

# Model config first: it names the data config that belongs to it.
with open(os.path.join(configs_dir, config_name), 'r') as model_file:
    model_config = yaml.safe_load(model_file)

with open(os.path.join(configs_dir, model_config['data_config']), 'r') as data_file:
    data_config = yaml.safe_load(data_file)

In [None]:
# Define the loss and the metric for the task (binary classification).
class BCELossWithTargetCast(torch.nn.BCELoss):
    """Binary cross-entropy on probabilities that also accepts integer targets.

    The head used in this notebook emits one probability per sample (a 1-D
    tensor). `CrossEntropyLoss` given such an input together with a same-shaped
    integer target interprets the target as class probabilities and fails with
    "Expected floating point type for target with class probabilities, got
    Long". Binary cross-entropy is the matching loss; both tensors are cast to
    float so the datasets may keep an integer `target_dtype`.
    """

    def forward(self, pred, target):
        """Return the BCE between predicted probabilities and 0/1 targets."""
        return super().forward(pred.float(), target.float())


loss = BCELossWithTargetCast()
metric = torchmetrics.Accuracy(task='binary')

## Загрузка предобработанных данных

In [None]:
# Every preprocessed artifact sits in `path_folder` and shares a file-name
# suffix: the data config's file name with 'yaml' replaced by 'pickle'.
artifact_dir = data_config['path_folder']
artifact_suffix = model_config['data_config'].replace('yaml', 'pickle')

df_data_train = joblib.load(os.path.join(artifact_dir, 'train_' + artifact_suffix))
df_data_valid = joblib.load(os.path.join(artifact_dir, 'valid_' + artifact_suffix))
# The test split is intentionally left unloaded here:
# df_data_test = joblib.load(os.path.join(artifact_dir, 'test_' + artifact_suffix))
preprocessor = joblib.load(os.path.join(artifact_dir, 'preprocessor_' + artifact_suffix))

In [None]:
# Wrap the loaded splits in `MemoryMapDataset` objects.
dataset_train = MemoryMapDataset(df_data_train)
dataset_valid = MemoryMapDataset(df_data_valid)
# The test split is not loaded (its `joblib.load` line is commented out), so
# building a dataset from `df_data_test` raised NameError. Re-enable this line
# together with the loading line when the test split is needed.
# dataset_test = MemoryMapDataset(df_data_test)

## Построение моделей

In [None]:
# HACK: training could only be started with this override in place.
logger = logging.getLogger(__name__)


class SequenceToTarget2(SequenceToTarget):
    """`SequenceToTarget` whose optimizer configuration names a metric to monitor.

    `configure_optimizers` returns a dictionary that, besides the optimizer and
    the LR scheduler, carries a "monitor" key.
    """

    def configure_optimizers(self):
        """Create the optimizer and LR scheduler from the stored partials.

        `self.hparams.pretrained_lr` selects what is handed to the optimizer:

        * `None`: all module parameters.
        * `'freeze'`: encoder parameters get `requires_grad = False`; all
          module parameters are still handed over.
        * any other value: two groups, the encoder with that value as its
          learning rate and the head without an explicit learning rate.

        Returns:
            dict with "optimizer", "lr_scheduler" and "monitor", where the
            monitor is `val_<key>` for the first key of `self.valid_metrics`.
        """
        encoder_lr = self.hparams.pretrained_lr

        if encoder_lr is None:
            trainable = self.parameters()
        elif encoder_lr == 'freeze':
            for weight in self.seq_encoder.parameters():
                weight.requires_grad = False
            logger.info('Created optimizer with frozen encoder')
            trainable = self.parameters()
        else:
            trainable = [
                {'params': self.seq_encoder.parameters(), 'lr': encoder_lr},
                # no 'lr' here: the head keeps the lr preset in `self.optimizer_partial`
                {'params': self.head.parameters()},
            ]
            logger.info('Created optimizer with two lr groups')

        optimizer = self.optimizer_partial(trainable)
        scheduler = self.lr_scheduler_partial(optimizer)
        # Iterating `valid_metrics` yields its keys; the first one is monitored.
        first_metric_name = list(self.valid_metrics)[0]
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": f"val_{first_metric_name}",
        }

In [None]:
# To change the embedding sizes, edit them by hand in the config file.
# A missing (or False/0) `trx_embed_dim` means "derive the sizes from the
# preprocessor"; the `!= False` comparison is deliberate so that any other
# configured value, even an empty dict, is taken as-is.
preset_embeddings = data_config.get('trx_embed_dim', False)
if preset_embeddings != False:
    cat_feature_params = data_config['trx_embed_dim']
else:
    cat_feature_params = {}
    for col_name, dict_size in preprocessor.get_category_dictionary_sizes().items():
        cat_feature_params[col_name] = {
            'in': dict_size,
            'out': dict_size // model_config['rnn_config']['category_emb_dim_reduction'],
        }

# Keep the total embedding width even by widening the first feature by one.
# NOTE(review): presumably an even width is required by a downstream encoder - confirm.
total_embedding_width = sum(spec['out'] for spec in cat_feature_params.values())
if total_embedding_width % 2 == 1:
    first_feature = next(iter(cat_feature_params))
    cat_feature_params[first_feature]['out'] += 1

# Numeric columns are passed with the 'identity' setting.
num_feature_params = dict.fromkeys(data_config['numeric_cols'], 'identity')
trx_encoder_params = {
    'embeddings_noise': 0.001,
    'numeric_values': num_feature_params,
    'embeddings': cat_feature_params,
}


trx_encoder = TrxEncoder(**trx_encoder_params)

# Arguments every RNN type receives; type-specific extras come from the
# `additional_params` section of the config when it is non-empty.
rnn_settings = model_config['rnn_config']
rnn_base_kwargs = {
    'input_size': trx_encoder.output_size,
    'hidden_size': rnn_settings['hidden_state'],
    'type': rnn_settings['rnn_type'],
    'bidir': rnn_settings['bidir'],
    'num_layers': rnn_settings['num_layers'],
}
rnn_extra_kwargs = rnn_settings['additional_params'] or {}
# Two separate ** expansions: a key present in both still raises TypeError.
rnn_encoder = NewRnnEncoder(**rnn_base_kwargs, **rnn_extra_kwargs)

# Chain the transaction encoder and the RNN into one sequence encoder.
seq_encoder = SimpleSeqEncoder(trx_encoder=trx_encoder, seq_encoder=rnn_encoder)


# Default head: ptls `Head` with batch norm, configured from the `task`
# section of the model config.
head = Head(input_size=seq_encoder.embedding_size, use_batch_norm=True, **model_config['task'])

if model_config['rnn_config']['own_head']:
    own_head_width = model_config['rnn_config']['hidden_state']
    # Custom head: Linear -> ReLU -> Linear(1) -> Sigmoid, flattened from dim 0.
    # NOTE(review): its input width is `hidden_state`, not
    # `seq_encoder.embedding_size` - confirm the two agree (e.g. when bidir).
    own_head = torch.nn.Sequential(
        torch.nn.Linear(own_head_width, own_head_width // 2),
        torch.nn.ReLU(),
        torch.nn.Linear(own_head_width // 2, 1),
        torch.nn.Sigmoid(),
        torch.nn.Flatten(start_dim=0),
    )
    assert own_head is not None,'Не определили голову'
    head = own_head


# Assemble the Lightning module.
# `SequenceToTarget2` tells Lightning to monitor `val_<first validation
# metric>`, which here is accuracy - a quantity that is better when higher.
# The plateau scheduler must therefore run with mode='max'; with mode='min'
# it would cut the learning rate whenever accuracy stopped *decreasing*.
model = SequenceToTarget2(
    seq_encoder=seq_encoder,
    head=head,
    loss=loss,
    metric_list=metric,
    optimizer_partial=partial(torch.optim.NAdam),
    lr_scheduler_partial=partial(
        torch.optim.lr_scheduler.ReduceLROnPlateau, mode='max', factor=0.2, patience=2
    ),
)

# NOTE: partial(torch.optim.lr_scheduler.StepLR, step_size=20, gamma=0.9) fails with
# MisconfigurationException: The provided lr scheduler `StepLR` doesn't follow PyTorch's LRScheduler API. You should override the `LightningModule.lr_scheduler_step` hook with your own logic if you are using a custom LR scheduler.
# Do not forget target_dtype.
# Train and validation datasets differ only in the wrapped data.
make_target_dataset = partial(
    SeqToTargetDataset,
    target_col_name=data_config['target_col'],
    target_dtype=torch.long,
)
batch_size = model_config['batch_size']
sup_data = PtlsDataModule(
    train_data=make_target_dataset(dataset_train),
    valid_data=make_target_dataset(dataset_valid),
    train_batch_size=batch_size,
    valid_batch_size=batch_size,
    # a quarter of the configured workers, truncated to an int
    train_num_workers=int(model_config['num_workers'] / 4),
)

## Обучение и оценка модели

In [None]:
# Name under which the validation metric is monitored.
monitored_metric = f"val_{metric._get_name()}"

# Stop once the metric has not improved by at least 0.001 for 3 validation rounds.
early_stop_callback = EarlyStopping(monitor=monitored_metric, min_delta=0.001, patience=3, verbose=False, mode='max')

# Track the best checkpoint by the same metric. Without an explicitly monitored
# ModelCheckpoint, `ckpt_path='best'` would resolve to the most recent
# checkpoint, i.e. the epoch where early stopping fired, not the best one.
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor=monitored_metric, mode='max', save_top_k=1)

trainer = pl.Trainer(
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0,
    enable_progress_bar=True,
    callbacks=[early_stop_callback, checkpoint_callback],
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
%%time
# Run the garbage collector and release cached CUDA memory before training.
gc.collect()
torch.cuda.empty_cache()

# Train the model on the data module defined earlier.
trainer.fit(model, sup_data)

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name          | Type             | Params
---------------------------------------------------
0 | seq_encoder   | SimpleSeqEncoder | 105 K 
1 | head          | Head             | 385   
2 | loss          | CrossEntropyLoss | 0     
3 | train_metrics | ModuleDict       | 0     
4 | valid_metrics | ModuleDict       | 0     
5 | test_metrics  | ModuleDict       | 0     
---------------------------------------------------
106 K     Trainable params
0         Non-trainable params
106 K     Total params
0.425     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

RuntimeError: Expected floating point type for target with class probabilities, got Long

In [None]:
# Run the garbage collector and release cached CUDA memory before evaluation.
gc.collect()
torch.cuda.empty_cache()
# Evaluate the 'best' checkpoint on the validation dataloader.
# NOTE(review): the recorded output of this cell is a MisconfigurationException
# about ModelCheckpoint not being configured to save the best model - make sure
# a successful `fit` with checkpointing has run before calling this.
trainer.test(ckpt_path='best', dataloaders=sup_data.val_dataloader())

MisconfigurationException: `.test(ckpt_path="best")` is set but `ModelCheckpoint` is not configured to save the best model.

Сохранение модели

In [None]:
# torch.save(model.state_dict(), os.path.join(path_to_working_directory,'models',config_name.replace('yaml','pt')))

Своя оценка модели на val и train

In [None]:
def model_result(emb_model, prep_data, threshold=0.5, batch_size=8, num_workers=1):
    '''Return the predictions of a classification model for a dataset.

    Args:
        emb_model: model to run; wrapped in `InferenceModule`.
        prep_data: dataset handed to the `DataLoader`.
        threshold: cut-off applied to `pred_proba` when the model produces a
            single `pred_proba` column.
        batch_size: inference batch size (default 8).
        num_workers: number of `DataLoader` workers (default 1).

    Returns:
        DataFrame concatenated from the per-batch inference outputs, with an
        extra integer `pred` column. If a `pred_proba` column exists, `pred`
        is `pred_proba >= threshold`; otherwise it is the arg-max index over
        all columns whose name contains 'pred_proba'.
    '''
    inference_dl = torch.utils.data.DataLoader(
        dataset=prep_data,
        collate_fn=collate_feature_dict,
        shuffle=False,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    inference_module = InferenceModule(model=emb_model, pandas_output=True, model_out_name='pred_proba')
    predictor = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0)
    full_result = pd.concat(predictor.predict(inference_module, inference_dl))

    # Choose the branch explicitly instead of catching every exception, so
    # unrelated failures (e.g. a bad threshold) surface rather than silently
    # falling through to the arg-max path.
    if 'pred_proba' in full_result.columns:
        full_result['pred'] = (full_result['pred_proba'] >= threshold).astype(int)
    else:
        proba_cols = [col for col in full_result.columns if 'pred_proba' in col]
        full_result['pred'] = np.argmax(full_result[proba_cols].values, axis=1)

    return full_result

def model_emb_retrieval(emb_model, prep_data, id_lin_layer=1):
    '''Run inference with the head replaced by one of its own layers.

    Useful when the output of a particular layer of the head is wanted
    instead of the final prediction.

    Args:
        emb_model: trained model; its first two children are used.
            NOTE(review): assumes they are the sequence encoder and an
            indexable head (e.g. `torch.nn.Sequential`) - confirm.
        prep_data: dataset handed to the `DataLoader`.
        id_lin_layer: index of the head layer that replaces the head.

    Returns:
        DataFrame concatenated from the per-batch inference outputs; the
        model output is stored under the name 'emb'.
    '''
    loader = torch.utils.data.DataLoader(
        dataset=prep_data,
        collate_fn=collate_feature_dict,
        shuffle=False,
        batch_size=16,
        num_workers=2,
    )

    # In our case the embeddings are extracted like this: keep the first two
    # children of the model, then swap the second for the selected layer of it.
    leading_children = list(emb_model.children())[:2]
    truncated = torch.nn.Sequential(*leading_children)
    truncated[-1] = truncated[1][id_lin_layer]

    wrapped = InferenceModule(model=truncated, pandas_output=True, model_out_name='emb')
    runner = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0)
    batches = runner.predict(wrapped, loader)

    return pd.concat(batches)

In [None]:
# Run the garbage collector and release cached CUDA memory before inference.
gc.collect()
torch.cuda.empty_cache()

# Predictions for the validation and training splits, thresholded at 0.5.
val_result = model_result(model,df_data_valid,0.5)
train_result = model_result(model,df_data_train,0.5)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score


def _split_metrics(split_name, scored):
    """Return one results row (metrics plus model settings) for a scored split.

    `scored` must hold the target column named in the data config together
    with the `pred_proba` and `pred` columns.
    """
    truth = scored[data_config['target_col']]
    rnn_settings = model_config['rnn_config']
    return {
        'dataset': split_name,
        'roc-auc': round(roc_auc_score(truth, scored.pred_proba), 3),
        'accuracy': round(accuracy_score(truth, scored.pred), 3),
        # NOTE(review): micro-averaged F1 coincides with accuracy for
        # single-label predictions - confirm this averaging is intended.
        'f1': round(f1_score(truth, scored.pred, average='micro'), 3),
        'model_type': rnn_settings['rnn_type'],
        'hidden_size': rnn_settings['hidden_state'],
        'num_layers': rnn_settings['num_layers'],
        'bidir': rnn_settings['bidir'],
    }


# One row per split, train first.
results = [
    _split_metrics('train', train_result),
    _split_metrics('valid', val_result),
]

pd.DataFrame(results)

Unnamed: 0,dataset,roc-auc,accuracy,f1,model_type,hidden_size,num_layers,bidir
0,train,0.393,0.554,0.554,gru,128,1,False
1,valid,0.413,0.553,0.553,gru,128,1,False


In [None]:
# Write the metrics table to the `results` folder, named after the model config.
metrics_file = 'metrics_' + config_name.replace('yaml', 'csv')
metrics_path = os.path.join(path_to_working_directory, 'results', metrics_file)
pd.DataFrame(results).to_csv(metrics_path, encoding='utf-8', index=False)