# Calibration

- input   0 0 1 0 0 2 2 2
- output  0 0 1 2 2 2 2 2

In [1]:
import sys

# Location of the raw competition data, relative to this notebook.
DATASET_PATH = '../../../feedback-prize-2021'

# The vendored transformers branch goes first on the search path;
# the project code directories are added at the end of the search path.
sys.path.insert(0, '../codes/new_transformers_branch/transformers/src')
sys.path.extend(['../codes', '..'])

In [2]:
import os
import os.path as osp

import re
import pickle
import random
import easydict
import copy

from glob import glob
from tqdm.auto import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from new_transformers import DebertaV2TokenizerFast

import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader

## Data

In [3]:
import h5py

# Open read-only: this notebook only reads from the file, and passing the mode
# explicitly avoids depending on h5py's version-specific default.
# The handle stays open because later cells read `data['num_tokens']`.
data = h5py.File('../data_file/deberta_spm_data_v2.h5py', 'r')

# Collapse the last axis of 'cbio_labels' to the index of its largest entry
# (presumably one-hot / per-class scores -- TODO confirm), then drop the final
# row; `token_n_list` is trimmed the same way in the next cell.
labels = np.argmax(data['cbio_labels'], -1)[:-1]

In [4]:
# First entry of every 'num_tokens' row, with the final row dropped so the
# array lines up with `labels`.
token_n_list = np.array([row[0] for row in data['num_tokens']][:-1])

In [5]:
seed = 0

with open('../data_file/data_splits.pickle', 'rb') as f:
    data_splits = pickle.load(f)

# Re-key the id -> row-index map by bare text id (file name without directory
# or extension).
with open('../data_file/id_to_ix_map.pickle', 'rb') as f:
    id_to_ix_map = {}
    for path, ix in pickle.load(f).items():
        file_name = path.split('/')[-1]
        id_to_ix_map[file_name.split('.')[0]] = ix

# The five folds of text ids used throughout this notebook.
folds = data_splits[seed][250]['normed']

train_idx, valid_idx = [], []
for val_fold in range(5):
    # Training rows: every fold except the held-out one, in fold order.
    train_fold_idx = []
    for other_fold in range(5):
        if other_fold == val_fold:
            continue
        train_fold_idx.extend(id_to_ix_map[text_id] for text_id in folds[other_fold])

    # Validation rows: the held-out fold.
    val_fold_idx = [id_to_ix_map[text_id] for text_id in folds[val_fold]]

    train_idx.append(train_fold_idx)
    valid_idx.append(val_fold_idx)


## Augmentation

### Randomly replacing a contiguous run of labels with category 0 (run length 1 - 5)

In [6]:
# Work on a private copy of the first document's labels.
token_n = token_n_list[0]
test = labels[0].copy()

# mark everything past the real tokens as padding
test[token_n:] = -1

# draw the run length first, then its start; the run is clipped so that it
# never reaches the last real token
zero_n = random.randint(1, 5)
zero_start_i = random.randint(1, token_n - 1)
zero_end_i = zero_start_i + zero_n
if zero_end_i > token_n - 1:
    zero_end_i = token_n - 1

# window shown in the before/after printout (10 labels of context each side)
print_start_i = zero_start_i - 10 if zero_start_i > 10 else 0
print_end_i = min(zero_end_i + 10, token_n - 1)

print(f'0 category will be added at {zero_start_i} index with length {zero_n}')
print(f'before {test[print_start_i:print_end_i]}')

# apply the augmentation in place
test[zero_start_i:zero_end_i] = 0

print(f'after  {test[print_start_i:print_end_i]}')

0 category will be added at 195 index with length 2
before [ 6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  0  0  9 10]
after  [ 6  6  6  6  6  6  6  6  6  6  0  0  6  6  6  6  6  6  0  0  9 10]


### Randomly replacing 30 - 100 individual labels with random categories

In [7]:
# Work on a private copy of the first document's labels.
token_n = token_n_list[0]
test = labels[0].copy()

# mark everything past the real tokens as padding
test[token_n:] = -1

# how many labels to overwrite, which positions (first and last real token
# excluded, no repeats), and the random category written at each position
replace_n = random.randint(30, 100)
replace_idx = np.random.choice(np.arange(1, token_n - 1), size=replace_n, replace=False)
replace_cats = np.random.choice(np.arange(15), size=replace_n, replace=True)

# window shown in the printout, centred on the first sampled position
anchor = replace_idx[0]
print_start_i = max(anchor - 10, 0)
print_end_i = min(anchor + 10, token_n - 1)

print(f'category replacement will be happend at {replace_n} times')
print(f'showing at index {print_start_i}')
print(f'before {test[print_start_i:print_end_i]}')

# apply the augmentation in place
test[replace_idx] = replace_cats

print(f'after  {test[print_start_i:print_end_i]}')

category replacement will be happend at 37 times
showing at index 112
before [6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
after  [ 6  6  6  6  6  6  6  6  6 14  0  4  6  6  6  6  6  6  6  6]


### Replace a starting (odd) category with the following even category (e.g. 122 -> 222)

In [8]:
# Work on a private copy of the first document's labels.
token_n = token_n_list[0]
test = labels[0].copy()

# mark everything past the real tokens as padding
test[token_n:] = -1

# positions that currently hold an odd category
odd_category = [1, 3, 5, 7, 9, 11, 13]
odd_cat_idx = np.flatnonzero(np.isin(test, odd_category))

# keep a random, non-empty subset of those positions
replace_n = random.randint(1, len(odd_cat_idx))
odd_cat_idx = np.random.choice(odd_cat_idx, size=replace_n, replace=False)

# window shown in the printout, centred on the first chosen position
first_i = odd_cat_idx[0]
print_start_i = max(first_i - 5, 0)
print_end_i = min(first_i + 5, token_n - 1)

print(f'total {replace_n} will be replaced')
print(f'example: category {test[first_i]} will be replaced to {test[first_i] + 1}')
print(f'before {test[print_start_i:print_end_i]}')

# odd category -> the even category right after it
test[odd_cat_idx] += 1

print(f'after  {test[print_start_i:print_end_i]}')

total 5 will be replaced
example: category 3 will be replaced to 4
before [2 2 2 2 0 3 4 4 4 4]
after  [2 2 2 2 0 4 4 4 4 4]


### Replace starting category to different category (ex: 122 -> 322)

In [9]:
# Work on a private copy of the first document's labels.
token_n = token_n_list[0]
test = labels[0].copy()

# mark everything past the real tokens as padding
test[token_n:] = -1

# positions that currently hold an odd category
odd_category = [1, 3, 5, 7, 9, 11, 13]
odd_cat_idx = np.flatnonzero(np.isin(test, odd_category))

# keep a random, non-empty subset of those positions
replace_n = random.randint(1, len(odd_cat_idx))
odd_cat_idx = np.random.choice(odd_cat_idx, size=replace_n, replace=False)

# for each chosen position pick a different odd category at random
replace_cats = np.array([
    random.choice([cat for cat in odd_category if cat != test[odd_cat_i]])
    for odd_cat_i in odd_cat_idx
])

# window shown in the printout, centred on the first chosen position
first_i = odd_cat_idx[0]
print_start_i = max(first_i - 5, 0)
print_end_i = min(first_i + 5, token_n - 1)

print(f'total {replace_n} will be replaced')
print(f'category {test[first_i]} will be replaced to {replace_cats[0]}')
print(f'before {test[print_start_i:print_end_i]}')

# apply the augmentation in place
test[odd_cat_idx] = replace_cats

print(f'after  {test[print_start_i:print_end_i]}')

total 4 will be replaced
category 3 will be replaced to 11
before [2 2 2 2 0 3 4 4 4 4]
after  [ 2  2  2  2  0 11  4  4  4  4]


## Dataset

In [10]:
class CategoryDataset(torch.utils.data.Dataset):
    """Yields (corrupted labels, clean labels) pairs for the calibration model.

    Each item is one fixed-length label sequence. Positions at or beyond the
    document's token count are set to the padding id 15 in both outputs; the
    input additionally goes through four random corruptions.

    Args:
        labels: indexable collection of integer NumPy label arrays, one per
            document (category ids 0-14).
        token_n_list: number of real (non-padding) tokens of each document,
            aligned with ``labels``.
    """

    def __init__(self, labels, token_n_list):
        self.labels = labels
        self.token_n_list = token_n_list

        # Odd ids are the categories that `b_to_i_label` bumps to the next
        # even id and that `replace_b_label` swaps among each other.
        self.odd_category = [1, 3, 5, 7, 9, 11, 13]

    def _odd_positions(self, label):
        """Return the indices of ``label`` that hold an odd category."""
        return np.flatnonzero(np.isin(label, self.odd_category))

    def replace_with_zero(self, label, token_n):
        """Overwrite a random run of 1-5 labels with category 0, in place.

        The run starts at a random index in ``[1, token_n - 1]`` and is
        clipped at ``token_n - 1``; if the start lands on ``token_n - 1`` the
        run is empty and nothing changes. Sequences with fewer than two
        tokens are returned untouched.
        """
        if token_n < 2:
            # random.randint(1, token_n - 1) would raise on an empty range
            return label

        zero_n = random.randint(1, 5)
        zero_start_i = random.randint(1, token_n - 1)
        zero_end_i = min(zero_start_i + zero_n, token_n - 1)

        label[zero_start_i:zero_end_i] = 0

        return label

    def b_to_i_label(self, label, token_n):
        """Add 1 to a random non-empty subset of the odd labels, in place.

        ``token_n`` is accepted for signature parity with the other
        augmentations and is not used.
        """
        odd_cat_idx = self._odd_positions(label)
        if len(odd_cat_idx) == 0:
            return label

        replace_n = random.randint(1, len(odd_cat_idx))
        odd_cat_idx = np.random.choice(odd_cat_idx, size=replace_n, replace=False)

        label[odd_cat_idx] += 1

        return label

    def replace_b_label(self, label, token_n):
        """Swap a random non-empty subset of odd labels for a different odd label.

        ``token_n`` is accepted for signature parity with the other
        augmentations and is not used.
        """
        odd_cat_idx = self._odd_positions(label)
        if len(odd_cat_idx) == 0:
            return label

        replace_n = random.randint(1, len(odd_cat_idx))
        odd_cat_idx = np.random.choice(odd_cat_idx, size=replace_n, replace=False)

        # For every chosen position, draw from the odd categories other than
        # the one currently there.
        replace_cats = np.array([
            random.choice([cat for cat in self.odd_category if cat != label[odd_cat_i]])
            for odd_cat_i in odd_cat_idx
        ])

        label[odd_cat_idx] = replace_cats

        return label

    def replace_with_random_label(self, label, token_n):
        """Overwrite up to 30-100 distinct positions with random categories 0-14.

        Positions are drawn without replacement from ``[1, token_n - 2]``.
        The number of replacements is capped at the number of available
        positions, so short sequences no longer make the sampler raise.
        """
        candidates = np.arange(1, token_n - 1)
        if candidates.size == 0:
            return label

        replace_n = min(random.randint(30, 100), candidates.size)
        replace_idx = np.random.choice(candidates, size=replace_n, replace=False)
        replace_cats = np.random.choice(range(0, 15), size=replace_n, replace=True)

        label[replace_idx] = replace_cats

        return label

    def __getitem__(self, idx):
        """Return ``(inputs, targets)`` for document ``idx``.

        ``targets`` is the padded clean sequence, ``inputs`` the same sequence
        after all four augmentations. Both are fresh arrays; the stored labels
        are never modified.
        """
        targets = np.array(self.labels[idx])  # np.array copies by default
        inputs = targets.copy()
        token_n = int(self.token_n_list[idx])

        # padding
        inputs[token_n:] = 15
        targets[token_n:] = 15

        # augmentation (applied to the input only, in this fixed order)
        inputs = self.replace_with_zero(inputs, token_n)
        inputs = self.b_to_i_label(inputs, token_n)
        inputs = self.replace_b_label(inputs, token_n)
        inputs = self.replace_with_random_label(inputs, token_n)

        return inputs, targets

    def __len__(self):
        """Number of documents in the dataset."""
        return len(self.labels)

## Model

In [11]:
class Callibration(nn.Module):
    """Bidirectional LSTM tagger mapping a label sequence to per-position logits.

    Args:
        hidden_size: embedding size and per-direction LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        device: kept for backward compatibility and stored on ``self.device``;
            tensors created by the model follow the model's parameters
            instead, so the module also works after ``.to(...)`` or on CPU.
        n_classes: vocabulary size of the embedding and number of output
            logits per position.
        padding_idx: id treated as padding by the embedding.
    """

    def __init__(self,
                 hidden_size=128,
                 n_layers=2,
                 device='cuda:0',
                 n_classes=16,
                 padding_idx=15):
        super().__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.device = device
        self.n_classes = n_classes
        self.padding_idx = padding_idx

        self.embedding = nn.Embedding(n_classes, hidden_size, padding_idx=padding_idx)
        self.lstm = nn.LSTM(hidden_size,
                            hidden_size,
                            n_layers,
                            bidirectional=True,
                            batch_first=True)
        # forward and backward hidden states are concatenated -> 2 * hidden_size
        self.fc = nn.Linear(hidden_size * 2, n_classes)

    def init_hidden(self, batch_size):
        """Return zero ``(h, c)`` states of shape ``(2 * n_layers, batch_size, hidden_size)``.

        The states are allocated with the dtype and device of the embedding
        weights, so they always match the model's current placement.
        """
        shape = (self.n_layers * 2, batch_size, self.hidden_size)
        weight = self.embedding.weight

        return (weight.new_zeros(shape), weight.new_zeros(shape))

    def forward(self, x):
        """Compute logits for a batch of label ids.

        Args:
            x: integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, n_classes)``.
        """
        batch_size, _ = x.size()  # also rejects inputs that are not 2-D
        out = self.embedding(x)

        # batch_first=True: the LSTM output is already (batch, seq_len, 2 * hidden)
        out, _ = self.lstm(out, self.init_hidden(batch_size))

        return self.fc(out)

## Train

In [12]:
# Mini-batch size used by the train / validation DataLoaders.
batch_size = 128
# Prefer the first GPU; fall back to CPU so the notebook still runs without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [13]:
# Directory that receives one checkpoint per fold.
save_path = '../result/callibration'

# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of a separate exists() test.
os.makedirs(save_path, exist_ok=True)

In [None]:
# 5-fold training: a fresh model per fold, 5 epochs each, checkpoint rewritten
# after every epoch (so the file on disk is the latest epoch of that fold).
# Uses `batch_size` from the configuration cell; the loop no longer overwrites it.
for fold in range(5):
    model = Callibration().to(device)
    # NOTE(review): no ignore_index is set, so padded positions (class 15)
    # contribute to the loss -- confirm this is intended.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_labels = copy.deepcopy(labels[train_idx[fold]])
    train_token_n_list = copy.deepcopy(token_n_list[train_idx[fold]])

    valid_labels = copy.deepcopy(labels[valid_idx[fold]])
    valid_token_n_list = copy.deepcopy(token_n_list[valid_idx[fold]])

    train_dataset = CategoryDataset(train_labels, train_token_n_list)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # The validation set goes through the same random corruption as training.
    valid_dataset = CategoryDataset(valid_labels, valid_token_n_list)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    for epoch in range(5):
        model.train()
        train_losses = []
        for step, (inputs, targets) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
            optimizer.zero_grad()

            inputs = inputs.to(device)
            targets = targets.to(device)

            # (batch, seq_len, classes) -> (batch * seq_len, classes) for the loss
            preds = model(inputs)
            preds = preds.reshape(-1, preds.size(-1))
            targets = targets.reshape(-1)

            loss = criterion(preds, targets)
            train_losses.append(loss.item())

            # running mean of the epoch so far, reported every 20 steps
            if (step + 1) % 20 == 0:
                print(f"[ TRAIN ] fold {fold + 1} epoch {epoch + 1} loss: {np.array(train_losses).mean():.4f}")

            loss.backward()
            optimizer.step()

        model.eval()
        valid_losses = []
        with torch.no_grad():
            for step, (inputs, targets) in tqdm(enumerate(valid_dataloader), total=len(valid_dataloader)):
                inputs = inputs.to(device)
                targets = targets.to(device)

                preds = model(inputs)
                preds = preds.reshape(-1, preds.size(-1))
                targets = targets.reshape(-1)

                loss = criterion(preds, targets)
                valid_losses.append(loss.item())

                # running mean of the epoch so far, reported every 10 steps
                if (step + 1) % 10 == 0:
                    print(f"[ VALID ] fold {fold + 1} epoch {epoch + 1} loss: {np.array(valid_losses).mean():.4f}")

        torch.save(model.state_dict(), osp.join(save_path, f'fold_{fold}_callibration.pt'))

  0%|          | 0/98 [00:00<?, ?it/s]

[ TRAIN ] fold 1 epoch 1 loss: 1.2897
[ TRAIN ] fold 1 epoch 1 loss: 0.6929
[ TRAIN ] fold 1 epoch 1 loss: 0.4773
[ TRAIN ] fold 1 epoch 1 loss: 0.3655


  0%|          | 0/25 [00:00<?, ?it/s]

[ VALID ] fold 1 epoch 1 loss: 0.0242
[ VALID ] fold 1 epoch 1 loss: 0.0232


  0%|          | 0/98 [00:00<?, ?it/s]

[ TRAIN ] fold 1 epoch 2 loss: 0.0214
[ TRAIN ] fold 1 epoch 2 loss: 0.0203
[ TRAIN ] fold 1 epoch 2 loss: 0.0194
[ TRAIN ] fold 1 epoch 2 loss: 0.0186


  0%|          | 0/25 [00:00<?, ?it/s]

[ VALID ] fold 1 epoch 2 loss: 0.0158
[ VALID ] fold 1 epoch 2 loss: 0.0152


  0%|          | 0/98 [00:00<?, ?it/s]

[ TRAIN ] fold 1 epoch 3 loss: 0.0143
[ TRAIN ] fold 1 epoch 3 loss: 0.0141
[ TRAIN ] fold 1 epoch 3 loss: 0.0137
[ TRAIN ] fold 1 epoch 3 loss: 0.0134


  0%|          | 0/25 [00:00<?, ?it/s]

[ VALID ] fold 1 epoch 3 loss: 0.0122
[ VALID ] fold 1 epoch 3 loss: 0.0120


  0%|          | 0/98 [00:00<?, ?it/s]

In [None]:
# Hand-made example: 15 real labels followed by padding (id 15) up to length 2048.
token_n = 15
inputs = torch.full((2048,), 15, dtype=torch.long)

inputs[:token_n] = torch.tensor([0, 3, 4, 4, 0, 4, 0, 4, 3, 5, 0, 0, 1, 4, 0])
inputs = inputs.to(device).unsqueeze(0)  # add the batch dimension

# Uses whichever `model` the training cell left in the kernel (its last fold).
# Inference only: switch to eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    output = model(inputs)
cal_output = output.argmax(-1)

In [None]:
# Display the 15 hand-written (non-padding) input labels.
inputs[:, :15]

In [None]:
# Display the model's predicted labels for the same 15 positions.
cal_output[:, :15]

In [None]:
# Same expression as two cells above: the first 15 input labels.
inputs[:, :15]

In [None]:
# Same expression as two cells above: the first 15 predicted labels.
cal_output[:, :15]