In [1]:
import os
import re
import glob
import json
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import lightning.pytorch as pl
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from lightning.pytorch import Trainer, seed_everything, LightningDataModule
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll


Данные предварительно обработаны: ко всем признакам, содержащим значение 0, прибавлена единица, так как 0 используется в качестве паддинга последовательностей.  
Обучающая выборка разбита на 10 файлов (id клиентов разбиты на 10 равных частей), из этих файлов удалены значения id, чтобы можно было считать в формате int16 и хранить все данные в оперативной памяти.  
Также созданы таблицы с индексами (id клиента - номер строки в файле с данными, которая относится к нему) для извлечения признаков по id.

In [2]:
def get_emb_size(categories):
    size = len(categories) + 1
    return (size, int(np.sqrt(size + 1)))


class FeaturesMaster(object):
    
    def __init__(self, base_path, features=None, cat_features=None, num_features=None):

        self.base_path = base_path

        with open(os.path.join(base_path, 'cat_count.json'), 'r') as f:
            self.features_dict = json.load(f)

        with open(os.path.join(base_path, 'cat_count_test.json'), 'r') as f:
            self.features_dict_test = json.load(f)
        
        self.features = features if features is not None else [
            'rn', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm',
            'pre_fterm', 'pre_till_pclose', 'pre_till_fclose',
            'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
            'pre_loans_outstanding', 'pre_loans_total_overdue',
            'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_loans5',
            'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_loans90',
            'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060',
            'is_zero_loans6090', 'is_zero_loans90', 'pre_util', 'pre_over2limit',
            'pre_maxover2limit', 'is_zero_util', 'is_zero_over2limit',
            'is_zero_maxover2limit', 'enc_paym_0', 'enc_paym_1', 'enc_paym_2',
            'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7',
            'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12',
            'enc_paym_13', 'enc_paym_14', 'enc_paym_15', 'enc_paym_16',
            'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20',
            'enc_paym_21', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24',
            'enc_loans_account_holder_type', 'enc_loans_credit_status',
            'enc_loans_credit_type', 'enc_loans_account_cur', 'pclose_flag',
            'fclose_flag'
        ]
        self.cat_features = cat_features if cat_features is not None else [
            'rn', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm',
            'pre_fterm', 'pre_till_pclose', 'pre_till_fclose',
            'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
            'pre_loans_outstanding', 'pre_loans_total_overdue',
            'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate',
            'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060',
            'is_zero_loans6090', 'is_zero_loans90', 'pre_util', 'pre_over2limit',
            'pre_maxover2limit', 'is_zero_util', 'is_zero_over2limit',
            'is_zero_maxover2limit', 'enc_paym_0', 'enc_paym_1', 'enc_paym_2',
            'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7',
            'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12',
            'enc_paym_13', 'enc_paym_14', 'enc_paym_15', 'enc_paym_16',
            'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20',
            'enc_paym_21', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24',
            'enc_loans_account_holder_type', 'enc_loans_credit_status',
            'enc_loans_credit_type', 'enc_loans_account_cur', 'pclose_flag',
            'fclose_flag'
        ]

        self.num_features = num_features if num_features is not None else [
            'pre_loans5', 'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_loans90'
        ]
        
        self.size_features = len(self.features)
        self.size_cat_features = len(self.cat_features)
        self.size_num_features = len(self.num_features)
        
        assert set(self.features) == set(self.cat_features) | set(self.num_features)
        assert self.size_cat_features + self.size_num_features == self.size_features
        assert self.size_cat_features + self.size_num_features == len(set(self.cat_features) | set(self.num_features))

    def get_prelayer_params(self):
        num = len(self.features)
        cat_idx = [i for i in range(num) if self.features[i] in self.cat_features]
        num_idx = [i for i in range(num) if self.features[i] in self.num_features]
        embeddings_param = [get_emb_size(self.features_dict[self.features[i]]['categories']) for i in cat_idx]
        return { 'cat_idx': cat_idx, 'num_idx': num_idx, 'embeddings_param': embeddings_param }

    def get_features(self):
        return self.features
    
    def print_size(self):
        print(f'features number is {self.size_features}')
        print(f'categorical features number is {self.size_cat_features}')
        print(f'numerical features number is {self.size_num_features}')

In [3]:
class PklScoringDataset(Dataset):

    def __init__(self, files, index, idx_tables, targets, split_file_idx):
        self.num_files = len(files)
        self.files = files
        self.idx_tables = idx_tables
        self.index = index
        self.targets = targets
        size = sum(f.shape[0] for f in files)
        part_size = size // self.num_files
        self.split_file_idx = split_file_idx

    def __getitem__(self, index):
        item_id = self.index[index]
        file_num = (self.split_file_idx < item_id).sum()
        item_idx = self.idx_tables[file_num].loc[item_id][['idx']].to_numpy().ravel()
        item = self.files[file_num][item_idx]
        item = torch.LongTensor(item)
        return item, torch.LongTensor([self.targets[item_id]])

    def __len__(self):
        return len(self.index)


class PklScoringDatasetPredict(Dataset):

    def __init__(self, file, index, idx_table):
        self.file = file
        self.idx_table = idx_table
        self.index = index

    def __getitem__(self, index):
        item_id = self.index[index]
        item_idx = self.idx_table.loc[item_id][['idx']].to_numpy().ravel()
        item = self.file[item_idx]
        item = torch.LongTensor(item)
        return item

    def __len__(self):
        return len(self.index)


def collate_function(batch):

    items, targets, mask = [], [], []
    for item, target in batch:
        items.append(item)
        targets.append(target)
        mask.append(torch.full(size=(len(item),), fill_value=True))

    items = torch.nn.utils.rnn.pad_sequence(items, batch_first=True, padding_value=0)
    mask = torch.nn.utils.rnn.pad_sequence(mask, batch_first=True, padding_value=False)
    targets = torch.Tensor(targets)

    return dict(tensor=items, mask=mask, targets=targets)


def collate_function_predict(batch):
    
    mask = list(map(lambda it: torch.full(size=(len(it),), fill_value=True), batch))
    items = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0)
    mask = torch.nn.utils.rnn.pad_sequence(mask, batch_first=True, padding_value=False)
    return dict(tensor=items, mask=mask, targets=None)


class HistoryDataModule(LightningDataModule):
    
    def __init__(
        self, data_dir, features_names, batch_size=1024, num_workers=0,
        train_size=None, val_size=None, test_size=None, num_train_files=10,
        test_perc=0.15, val_perc=0.015
    ):

        super().__init__()

        self.data_dir = data_dir
        self.features_names = features_names
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.num_train_files = num_train_files
        self.test_perc = test_perc
        self.val_perc = test_perc
        
        self.targets = None
        self.train_idx = None
        self.valid_idx = None
        self.test_idx = None
        self.predict_idx = None
        
        self.train_dataset = None
        self.valid_dataset = None
        self.test_dataset = None
        self.predict_dataset = None
        
        self.train_file = None
        self.test_file = None
        
        self.set_targets(train_size, val_size, test_size)
        train_size = self.targets.size
        part_size = train_size // self.num_train_files
        self.split_file_idx = np.arange(part_size - 1, train_size - 1, part_size)
    
    def set_targets(self, train_size, val_size, test_size):
        
        train_targets = pd.read_csv(os.path.join(self.data_dir, 'train_target.csv'), index_col=0)
        test_targets = pd.read_csv(os.path.join(self.data_dir, 'test_target.csv'), index_col=0)
        
        self.targets = train_targets['flag'].to_numpy()
        
        self.predict_idx = test_targets.index.to_numpy()
        idx = train_targets.index.to_numpy()

        test_size_ = int(idx.size * self.test_perc)
        val_size_ = int(idx.size * self.val_perc)
        train_idx, self.test_idx = train_test_split(idx, test_size=test_size_, shuffle=False)
        self.train_idx, self.valid_idx = train_test_split(train_idx, test_size=val_size_, shuffle=True)
        
        if train_size is not None:
            self.train_idx = self.train_idx[:train_size]
            
        if val_size is not None:
            self.valid_idx = self.valid_idx[:val_size]
            
        if test_size is not None:
            self.test_idx = self.test_idx[:test_size]

    def setup(self, stage: str):
        if stage == 'fit' or (stage == 'test' and len(self.train_files) == 0):
            self.train_files = []
            self.train_idx_tables = []
            for i in range(self.num_train_files):
                file_path = os.path.join(self.data_dir, f'train_part_{i}.csv')
                table_path = os.path.join(self.data_dir, f'train_table_part_{i}.csv')
                file = pd.read_csv(file_path, dtype=np.int16)[self.features_names].to_numpy()
                file_table = pd.read_csv(table_path, index_col=0)
                self.train_files.append(file)
                self.train_idx_tables.append(file_table)
        elif stage == 'predict':
            test_path = os.path.join(self.data_dir, 'test_data.csv')
            table_path = os.path.join(self.data_dir, 'test_idx_table.csv')
            self.test_file = pd.read_csv(test_path, dtype=np.int16)[self.features_names].to_numpy()
            self.test_idx_table = pd.read_csv(table_path, index_col=0)

    def train_dataloader(self):
        
        train_targets = self.targets[self.train_idx]
        zeros_p = train_targets.sum() / train_targets.size
        weights = np.where(train_targets == 1, 1 - zeros_p, zeros_p)
        
        self.train_dataset = PklScoringDataset(
            files=self.train_files, index=self.train_idx, targets=self.targets,
            idx_tables=self.train_idx_tables, split_file_idx=self.split_file_idx
        )
        sampler = WeightedRandomSampler(weights, 2 * int(train_targets.sum()), replacement=False)
        
        return DataLoader(
            self.train_dataset, batch_size=self.batch_size, sampler=sampler,
            num_workers=self.num_workers, collate_fn=collate_function
        )

    def val_dataloader(self):
        self.valid_dataset = PklScoringDataset(
            files=self.train_files, index=self.valid_idx, targets=self.targets,
            idx_tables=self.train_idx_tables, split_file_idx=self.split_file_idx
        )
        return DataLoader(
            self.valid_dataset, batch_size=self.batch_size, shuffle=False,
            num_workers=self.num_workers, collate_fn=collate_function
        )

    def test_dataloader(self):
        self.test_dataset = PklScoringDataset(
            files=self.train_files, index=self.test_idx, targets=self.targets,
            idx_tables=self.train_idx_tables, split_file_idx=self.split_file_idx
        )
        return DataLoader(
            self.test_dataset, batch_size=self.batch_size, shuffle=False,
            num_workers=self.num_workers, collate_fn=collate_function
        )

    def predict_dataloader(self):
        self.predict_dataset = PklScoringDatasetPredict(
            file=self.test_file, index=self.predict_idx, idx_table=self.test_idx_table
        )
        return DataLoader(
            self.predict_dataset, batch_size=self.batch_size, shuffle=False,
            num_workers=self.num_workers, collate_fn=collate_function_predict
        )

    def teardown(self, stage: str):
        if stage == 'fit':
            self.train_dataset = None
            self.valid_dataset = None
        if stage == 'test':
            self.train_files = None
            self.train_idx_tables = None
            self.test_dataset = None
        elif stage == 'predict':
            self.test_file = None
            self.predict_dataset = None


In [4]:
class PreDataBlock(pl.LightningModule):

    def __init__(self, cat_idx, num_idx, embeddings_param, output_size):
        super().__init__()
        assert len(embeddings_param) == len(cat_idx)
        self.cat_idx = cat_idx
        self.num_idx =  num_idx
        self.embeddings_param = embeddings_param
        self.output_size = output_size
        
        layer_output_size = sum([_[1] for _ in embeddings_param]) + len(num_idx)
        self.emb_layer = nn.ModuleList(
            [nn.Embedding(inp, outp, padding_idx=0) for inp, outp in embeddings_param]
        )
        self.linear = nn.Linear(layer_output_size, output_size)
        
    def forward(self, batch):
        tensor = batch['tensor']
        embs_list = [e(tensor[:, :, idx]) for e, idx in zip(self.emb_layer, self.cat_idx)]
        embs = torch.cat(embs_list, dim=-1)
        all_features = torch.cat([embs, tensor[:, :, self.num_idx]], dim=-1)
        next_state = self.linear(all_features)
        return dict(tensor=next_state, mask=batch['mask'], targets=batch['targets'])


class SelfAttentionLayer(pl.LightningModule):

    def __init__(self, input_size, num_heads=8, dropout=0.0):
        
        super().__init__()
        self.cls_token = nn.Parameter(torch.rand(input_size))
        self.mha = nn.MultiheadAttention(input_size, num_heads, dropout=dropout, batch_first=True)
        
    def forward(self, batch):
        batch_size = batch['tensor'].shape[0]
        with_token = torch.cat([self.cls_token.repeat((batch_size, 1, 1)), batch['tensor']], dim=1)
        new_mask = torch.cat([torch.full((batch_size, 1), fill_value=True), batch['mask']], dim=1)
        next_state, _ = self.mha(with_token, with_token, with_token, key_padding_mask=~new_mask)
        return dict(tensor=next_state, mask=new_mask, targets=batch['targets'])


class LSTM(pl.LightningModule):

    def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=True, dropout=0):
        
        super().__init__()
        self.lstm = nn.LSTM(
            input_size, hidden_size, batch_first=True,
            num_layers=num_layers, bidirectional=bidirectional, dropout=dropout
        )
        
    def forward(self, batch):
        lengths = batch['mask'].sum(-1).detach().cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            batch['tensor'], lengths, batch_first=True, enforce_sorted=False
        )
        output, (h, c) = self.lstm(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        return dict(tensor=output, mask=batch['mask'], targets=batch['targets'])

class GRU(pl.LightningModule):

    def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=True, dropout=0):
        
        super().__init__()
        self.gru = nn.GRU(
            input_size, hidden_size, batch_first=True,
            num_layers=num_layers, bidirectional=bidirectional, dropout=dropout
        )
        
    def forward(self, batch):
        lengths = batch['mask'].sum(-1).detach().cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            batch['tensor'], lengths, batch_first=True, enforce_sorted=False
        )
        output, h = self.gru(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        return dict(tensor=output, mask=batch['mask'], targets=batch['targets'])

class ResidualLSTM(pl.LightningModule):

    def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=True, dropout=0):
        
        super().__init__()
        self.lstm = LSTM(
            input_size, hidden_size, num_layers=num_layers,
            bidirectional=bidirectional, dropout=dropout
        )
        D = 2 if bidirectional else 1
        self.linear =  nn.Sequential(
            nn.Linear(hidden_size * D, input_size),
            nn.ReLU(),
            nn.Dropout(p=dropout)
        )
        
    def forward(self, batch):
        
        x = batch['tensor']
        lstm_out = self.lstm(batch)['tensor']
        after_linear = self.linear(lstm_out)
        next_state = x + after_linear
        return dict(tensor=next_state, mask=batch['mask'], targets=batch['targets'])
    
    
class Squeezer(pl.LightningModule):

    def __init__(self, squeezer_type='positional', pos=0, pooling_types=['mean']):
        
        super().__init__()

        assert squeezer_type in ['positional', 'pooling']
        assert isinstance(pos, int)
        assert isinstance(pooling_types, list)

        self.squeezer_type = squeezer_type
        self.pos = pos
        self.pooling_types = pooling_types
        
    def forward(self, batch):
        if self.squeezer_type == 'positional':
            next_state = batch['tensor'][:, self.pos, :]
        elif self.squeezer_type == 'pooling':
            poolings = []
            for pooling_type in self.pooling_types:
                if pooling_type == 'mean':
                    poolings.append(batch['tensor'].mean(1))
                elif pooling_type == 'max':
                    poolings.append(batch['tensor'].max(1)[0])
                elif pooling_type == 'min':
                    poolings.append(batch['tensor'].min(1)[0])
            next_state = torch.cat(poolings, dim=-1)
        return dict(tensor=next_state, targets=batch['targets'])


class ClassificationLayer(pl.LightningModule):

    def __init__(self, input_size, num_layers, hidden_size, dropout=0.0):
        
        super().__init__()
        layers = [nn.Linear(input_size, hidden_size), nn.ReLU(), nn.Dropout(p=dropout)]
        cur_size = hidden_size
        for i in range(num_layers - 2):
            layers.append(nn.Linear(cur_size, cur_size // 2))
            cur_size = cur_size // 2
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(p=dropout))
        layers.append(nn.Linear(cur_size, 1))

        self.layers = nn.Sequential(*layers)
        
    def forward(self, batch):
        next_state = self.layers(batch['tensor']).flatten()
        return dict(tensor=next_state, targets=batch['targets'])


class Net(pl.LightningModule):

    def __init__(self, params):

        super().__init__()
        self.params = params
        self.layers = self.configure_net(params)

    def forward(self, batch):
        return self.layers(batch)
    
    def configure_net(self, params):
        
        embed_dim = params['dim']
        layers = [PreDataBlock(output_size=embed_dim, **params['prelayer'])]

        if params['type'] == 'lstm':
            lstm_param = params['lstm']
            layers.append(LSTM(input_size=embed_dim, **lstm_param))
            embed_dim = lstm_param['hidden_size'] * (2 if lstm_param['bidirectional'] else 1)
        elif params['type'] == 'attention':
            layers.append(SelfAttentionLayer(input_size=embed_dim, **params['attention']))
        elif params['type'] == 'residual_lstm':
            layers.append(ResidualLSTM(input_size=embed_dim, **params['lstm']))
        elif params['type'] == 'gru':
            lstm_param = params['lstm']
            layers.append(GRU(input_size=embed_dim, **lstm_param))
            embed_dim = lstm_param['hidden_size'] * (2 if lstm_param['bidirectional'] else 1)
        
        if params['squeezer']['squeezer_type'] == 'positional':
            after_squeezer_dim = embed_dim
        elif params['squeezer']['squeezer_type'] == 'pooling':
            after_squeezer_dim = embed_dim * len(params['squeezer']['pooling_types'])
            
        layers.extend([
            Squeezer(**params['squeezer']),
            ClassificationLayer(input_size=after_squeezer_dim, **params['classifier'])
        ])
        return nn.Sequential(*layers)

In [5]:
class ScoringModule(pl.LightningModule):
  
    def __init__(self, params):
        
        super().__init__()
        self.params = params
        self.net = Net(params['net'])
        self.loss_func = nn.BCEWithLogitsLoss()
        
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []

    def forward(self, batch):
        return self.net(batch)

    def training_step(self, batch, batch_idx):
        logits = self(batch)['tensor']
        loss = self.loss_func(logits, batch['targets'])
        self.log('train_loss', loss, prog_bar=True)
        self.training_step_outputs.append(torch.stack([logits, batch['targets']], dim=-1))
        return loss

    def validation_step(self, batch, batch_idx):
        logits = self(batch)['tensor']
        loss = self.loss_func(logits, batch['targets'])
        self.log('val_loss', loss, prog_bar=True)
        self.validation_step_outputs.append(torch.stack([logits, batch['targets']], dim=-1))
        return loss

    def test_step(self, batch, batch_idx):
        logits = self(batch)['tensor']
        loss = self.loss_func(logits, batch['targets'])
        self.log('test_loss', loss, prog_bar=True)
        self.test_step_outputs.append(torch.stack([logits, batch['targets']], dim=-1))
        return loss
    
    def predict_step(self, batch, batch_idx):
        logits = self(batch)['tensor']
        return logits

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.net.parameters(), lr=self.params['optimizer']['lr'])
        return optimizer
        
    def on_train_epoch_end(self):
        all_preds = torch.cat(self.training_step_outputs, dim=0).detach()
        roc_auc = roc_auc_score(all_preds[:, 1].int(), all_preds[:, 0])
        self.log('train_roc_auc', roc_auc, prog_bar=True)
        self.training_step_outputs.clear()

    def on_validation_epoch_end(self):
        all_preds = torch.cat(self.validation_step_outputs, dim=0)
        roc_auc = roc_auc_score(all_preds[:, 1].int(), all_preds[:, 0])
        self.log('val_roc_auc', roc_auc, prog_bar=True)
        self.validation_step_outputs.clear()
        
    def on_test_epoch_end(self):
        all_preds = torch.cat(self.test_step_outputs, dim=0)
        roc_auc = roc_auc_score(all_preds[:, 1].int(), all_preds[:, 0])
        self.log('test_roc_auc', roc_auc, prog_bar=True)
        self.test_step_outputs.clear()

**Этот ноутбук - пример!**  
В реальности использовалась вся обучающая выборка без ограничений train_size=1000, val_size=100, test_size=100, а число воркеров устанавливалось в зависимости от количества доступных cpu.

In [6]:
base_path = 'C:/Users/User/Desktop/MIPT_Alpha'
data_path = os.path.join(base_path, 'data')
fmaster = FeaturesMaster(base_path=data_path)
features_names = fmaster.get_features()
datamodule = HistoryDataModule(
    data_path, batch_size=256, num_workers=0, features_names=features_names, train_size=1000, val_size=100, test_size=100
)

  mask |= (ar1 == a)


In [7]:
experiment_name = 'lstm_1'

params = {
    'optimizer': {'lr' : 0.001},
    'net': {
        'type': 'lstm',
        'dim': 64,
        'prelayer': fmaster.get_prelayer_params(),
        'lstm': {
            'hidden_size': 64,
            'num_layers': 1,
            'bidirectional': True,
            'dropout': 0.0
        },
        'squeezer': { 'squeezer_type': 'pooling', 'pooling_types': ['mean', 'max'] },
        'classifier': { 'num_layers': 2, 'hidden_size': 16, 'dropout': 0.05 }
    }
}

In [8]:
module = ScoringModule(params)
checkpoint_roc_auc_callback = ModelCheckpoint(
    monitor='val_roc_auc', mode='max', filename='roc_auc-{epoch}-{step}-{val_roc_auc:.4f}', save_top_k=-1,
    dirpath=os.path.join(base_path, 'logs', experiment_name, params['net']['type'])
)
checkpoint_loss_callback = ModelCheckpoint(
    monitor='val_loss', mode='min', filename='loss-{epoch}-{step}-{val_loss:.4f}', save_top_k=-1,
    dirpath=os.path.join(base_path, 'logs', experiment_name, params['net']['type'])
)
early_stopping_callback = EarlyStopping(monitor='val_roc_auc', mode='max', min_delta=0.0, patience=3, verbose=False)

In [9]:
trainer = Trainer(
    min_epochs=2,
    max_epochs=30,
    num_sanity_val_steps=0,
    check_val_every_n_epoch=1,
    
    accelerator='cpu',
    deterministic=True,
    callbacks=[
        checkpoint_roc_auc_callback,
        checkpoint_loss_callback,
        early_stopping_callback
    ]
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
trainer.fit(model=module, datamodule=datamodule)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name      | Type              | Params
------------------------------------------------
0 | net       | Net               | 81.8 K
1 | loss_func | BCEWithLogitsLoss | 0     
------------------------------------------------
81.8 K    Trainable params
0         Non-trainable params
81.8 K    Total params
0.327     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [11]:
roc_auc_dict = dict()
for path in glob.glob(os.path.join(base_path, 'logs', experiment_name, params['net']['type'] + '/*')):
    if 'roc_auc' in path:
        name = re.search(r'roc_auc-epoch=.+', path)[0]
        score = re.search(r'0\.\d{4}', path)[0]
        roc_auc_dict[name] = float(score)

In [12]:
best_name, best_score = max(roc_auc_dict.items(), key=lambda x: x[1])
print(best_name, best_score)

roc_auc-epoch=15-step=16-val_roc_auc=0.7388.ckpt 0.7388


In [13]:
datamodule.setup('test')
test_preds = trainer.predict(
    module,
    dataloaders=datamodule.test_dataloader(), return_predictions=True,
    ckpt_path=os.path.join(base_path, 'logs', experiment_name, params['net']['type'], best_name)
)
datamodule.teardown('test')

  mask |= (ar1 == a)
Restoring states from the checkpoint path at C:/Users/User/Desktop/MIPT_Alpha\logs\lstm_1\lstm\roc_auc-epoch=15-step=16-val_roc_auc=0.7388.ckpt
Loaded model weights from the checkpoint at C:/Users/User/Desktop/MIPT_Alpha\logs\lstm_1\lstm\roc_auc-epoch=15-step=16-val_roc_auc=0.7388.ckpt
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

In [14]:
test_preds = torch.cat(test_preds).flatten()
roc_auc_score(datamodule.targets[datamodule.test_idx], test_preds)

0.36979166666666663

In [15]:
with open(os.path.join(base_path, 'results/test_1.pkl'), 'wb') as f:
    pickle.dump(test_preds, f)

In [16]:
predictions = trainer.predict(
    module,
    datamodule=datamodule, return_predictions=True,
    ckpt_path=os.path.join(base_path, 'logs', experiment_name, params['net']['type'], best_name)
)

predictions = torch.cat(predictions).flatten()

  mask |= (ar1 == a)
Restoring states from the checkpoint path at C:/Users/User/Desktop/MIPT_Alpha\logs\lstm_1\lstm\roc_auc-epoch=15-step=16-val_roc_auc=0.7388.ckpt
Loaded model weights from the checkpoint at C:/Users/User/Desktop/MIPT_Alpha\logs\lstm_1\lstm\roc_auc-epoch=15-step=16-val_roc_auc=0.7388.ckpt
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

In [17]:
with open(os.path.join(base_path, 'results/predictions_1.pkl'), 'wb') as f:
    pickle.dump(predictions, f)