# Calibration

- input   0 0 1 0 0 2 2 2
- output  0 0 1 2 2 2 2 2

In [1]:
import sys

DATASET_PATH = ('../../../feedback-prize-2021')

sys.path.insert(0, '../codes/new_transformers_branch/transformers/src')
sys.path.append('../codes')
sys.path.append('..')

In [2]:
import os
import os.path as osp

import re
import pickle
import random
import easydict
import copy

from glob import glob
from tqdm.auto import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from new_transformers import DebertaV2TokenizerFast

import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader

## Data

In [3]:
import h5py

# Open the file read-only and say so explicitly instead of relying on the
# library default. The handle is intentionally left open: the next cell still
# reads `data['num_tokens']` from it.
data = h5py.File('../data_file/deberta_spm_data_v2.h5py', 'r')

# Collapse the last axis of `cbio_labels` with argmax, giving one integer
# category id per position.
labels = np.argmax(data['cbio_labels'], -1)
# The final row is dropped (token_n_list drops its last entry the same way).
# NOTE(review): presumably a placeholder/extra row in the file -- confirm.
labels = labels[:-1]

In [4]:
# Token count per document: the first element of every `num_tokens` row,
# with the final row dropped so the array lines up with `labels`.
token_n_list = np.array([row[0] for row in data['num_tokens']][:-1])

In [5]:
seed = 0

# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open('../data_file/data_splits.pickle', 'rb') as f:
    data_splits = pickle.load(f)

with open('../data_file/id_to_ix_map.pickle', 'rb') as f:
    raw_id_to_ix = pickle.load(f)

# Re-key the map by bare file stem: strip the directory part and everything
# after the first dot of the file name.
id_to_ix_map = {}
for path, ix in raw_id_to_ix.items():
    stem = path.split('/')[-1].split('.')[0]
    id_to_ix_map[stem] = ix

# Text ids of each of the 5 folds for the chosen seed.
fold_text_ids = data_splits[seed][250]['normed']

# train_idx[k] / valid_idx[k] hold the row indices used when fold k is held out.
train_idx = []
valid_idx = []
for val_fold in range(5):
    val_text_ids = fold_text_ids[val_fold]

    train_text_ids = []
    for fold in range(5):
        if fold != val_fold:
            train_text_ids.extend(fold_text_ids[fold])

    train_idx.append([id_to_ix_map[text_id] for text_id in train_text_ids])
    valid_idx.append([id_to_ix_map[text_id] for text_id in val_text_ids])


## Augmentation

### Adding +1 to a contiguous span of labels (length 1–20) to learn position bias

In [6]:
# Work on a private copy of the first document's labels.
token_n = token_n_list[0]
test = copy.deepcopy(labels[0])

# Everything past the real tokens is marked as padding.
test[token_n:] = -1

# Span whose labels get shifted by +1 (length is drawn first, then the start).
shift_n = random.randint(1, 20)
shift_start_i = random.randint(1, token_n - 1)
shift_end_i = min(shift_start_i + shift_n, token_n - 1)

# Window shown in the printout: the span plus up to 10 positions either side.
print_start_i = max(shift_start_i - 10, 0)
print_end_i = min(shift_end_i + 10, token_n - 1)
window = slice(print_start_i, print_end_i)

print(f'+1 will be added at {shift_start_i} index with length {shift_n}')
print(f'before {test[window]}')

# Augmentation: shift the span, then clamp to the valid category range.
# The clamp also turns the -1 padding marker into 0.
test[shift_start_i:shift_end_i] += 1
test = np.clip(test, 0, 14)

print(f'after  {test[window]}')

+1 will be added at 92 index with length 6
before [6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
after  [6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 6 6 6 6 6 6 6 6 6 6]


### Randomly replacing a contiguous span with category 0 (length 1–5)

In [7]:
# Work on a private copy of the first document's labels.
token_n = token_n_list[0]
test = copy.deepcopy(labels[0])

# Everything past the real tokens is marked as padding.
test[token_n:] = -1

# Span that will be overwritten with category 0 (length first, then start).
zero_n = random.randint(1, 5)
zero_start_i = random.randint(1, token_n - 1)
zero_end_i = min(zero_start_i + zero_n, token_n - 1)

# Window shown in the printout: the span plus up to 10 positions either side.
print_start_i = max(zero_start_i - 10, 0)
print_end_i = min(zero_end_i + 10, token_n - 1)
window = slice(print_start_i, print_end_i)

print(f'0 category will be added at {zero_start_i} index with length {zero_n}')
print(f'before {test[window]}')

# Augmentation: blank out the span.
test[zero_start_i:zero_end_i] = 0

print(f'after  {test[window]}')

0 category will be added at 45 index with length 5
before [2 2 2 2 2 0 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 7 8 8 8]
after  [2 2 2 2 2 0 3 4 4 4 0 0 0 0 0 4 4 4 4 4 4 7 8 8 8]


### Randomly replacing 30–100 individual positions with any category

In [8]:
# Work on a private copy of the first document's labels.
token_n = token_n_list[0]
test = copy.deepcopy(labels[0])

# Everything past the real tokens is marked as padding.
test[token_n:] = -1

# Pick 30-100 distinct positions (excluding the first and last token) and an
# independent random category in [0, 14] for each of them.
replace_n = random.randint(30, 100)
replace_idx = np.random.choice(range(1, token_n - 1), size=replace_n, replace=False)
replace_cats = np.random.choice(range(0, 15), size=replace_n, replace=True)

# Window shown in the printout: 10 positions either side of the first pick.
anchor_i = replace_idx[0]
print_start_i = max(anchor_i - 10, 0)
print_end_i = min(anchor_i + 10, token_n - 1)
window = slice(print_start_i, print_end_i)

print(f'category replacement will be happend at {replace_n} times')
print(f'showing at index {print_start_i}')
print(f'before {test[window]}')

# Augmentation: scatter the random categories.
test[replace_idx] = replace_cats

print(f'after  {test[window]}')

category replacement will be happend at 61 times
showing at index 37
before [2 2 2 0 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 7]
after  [ 2  2  2  0 14  4  2  4  4  4  5  9  4  6 12  4  0  4  4  7]


### Replace a starting (`B`, odd) category with its matching even category (e.g. 1 2 2 -> 2 2 2)

In [9]:
# Work on a private copy of the first document's labels.
token_n = token_n_list[0]
test = copy.deepcopy(labels[0])

# Everything past the real tokens is marked as padding.
test[token_n:] = -1

# Positions that currently hold an odd category.
odd_category = [1, 3, 5, 7, 9, 11, 13]
odd_cat_idx = np.flatnonzero(np.isin(test, odd_category))

# Choose a random non-empty subset of those positions.
replace_n = random.randint(1, len(odd_cat_idx))
odd_cat_idx = np.random.choice(odd_cat_idx, size=replace_n, replace=False)

# Window shown in the printout: 5 positions either side of the first pick.
first_i = odd_cat_idx[0]
print_start_i = max(first_i - 5, 0)
print_end_i = min(first_i + 5, token_n - 1)
window = slice(print_start_i, print_end_i)

print(f'total {replace_n} will be replaced')
print(f'example: category {test[first_i]} will be replaced to {test[first_i] + 1}')
print(f'before {test[window]}')

# Augmentation: odd category -> the even category right after it.
test[odd_cat_idx] += 1

print(f'after  {test[window]}')

total 4 will be replaced
example: category 5 will be replaced to 6
before [8 8 8 8 8 5 6 6 6 6]
after  [8 8 8 8 8 6 6 6 6 6]


### Replace a starting category with a different starting category (e.g. 1 2 2 -> 3 2 2)

In [10]:
# Work on a private copy of the first document's labels.
token_n = token_n_list[0]
test = copy.deepcopy(labels[0])

# Everything past the real tokens is marked as padding.
test[token_n:] = -1

# Positions that currently hold an odd category.
odd_category = [1, 3, 5, 7, 9, 11, 13]
odd_cat_idx = np.flatnonzero(np.isin(test, odd_category))

# Choose a random non-empty subset of those positions.
replace_n = random.randint(1, len(odd_cat_idx))
odd_cat_idx = np.random.choice(odd_cat_idx, size=replace_n, replace=False)

# For every chosen position draw a *different* odd category.
replace_cats = np.array([
    random.choice([cat for cat in odd_category if cat != test[odd_cat_i]])
    for odd_cat_i in odd_cat_idx
])

# Window shown in the printout: 5 positions either side of the first pick.
first_i = odd_cat_idx[0]
print_start_i = max(first_i - 5, 0)
print_end_i = min(first_i + 5, token_n - 1)
window = slice(print_start_i, print_end_i)

print(f'total {replace_n} will be replaced')
print(f'category {test[first_i]} will be replaced to {replace_cats[0]}')
print(f'before {test[window]}')

# Augmentation: overwrite with the newly drawn odd categories.
test[odd_cat_idx] = replace_cats

print(f'after  {test[window]}')

total 3 will be replaced
category 3 will be replaced to 13
before [2 2 2 2 0 3 4 4 4 4]
after  [ 2  2  2  2  0 13  4  4  4  4]


### Starting Point (`B` in BIO label) imputation

In [67]:
test = copy.deepcopy(labels[0])
token_n = token_n_list[0]

# padding
test[token_n:] = -1

# positions that currently hold an even category
even_category = [2, 4, 6, 8, 10, 12, 14]
even_cat_idx = [i for i, category in enumerate(test) if category in even_category]

if len(even_cat_idx) == 0:
    # Nothing to sample from; np.random.choice would raise on an empty list.
    replace_idx = np.array([], dtype=int)
else:
    max_n = max(1, len(even_cat_idx) // 20)
    replace_n = random.randint(1, max_n)
    even_cat_idx = np.random.choice(even_cat_idx, size=replace_n, replace=False)

    # Keep only positions whose left neighbour is not already `category - 1`.
    # Index 0 has no left neighbour; treat it as eligible instead of letting
    # `i - 1` wrap around to the last element of the array.
    replace_idx = np.array(
        [i for i in even_cat_idx if i == 0 or test[i] - 1 != test[i - 1]],
        dtype=int,
    )

if len(replace_idx) == 0:
    # Every sampled position was filtered out (or there was nothing to sample).
    print('no position qualified for replacement in this draw')
else:
    # place that will be printed
    print_start_i = max(replace_idx[0] - 5, 0)
    print_end_i = min(replace_idx[0] + 5, token_n - 1)

    # Report the number of positions actually changed, not the number sampled.
    print(f'total {len(replace_idx)} will be replaced')
    print(f'index {[replace_idx[0]]} will be replaced to {test[replace_idx[0]] - 1}')
    print(f'before {test[print_start_i:print_end_i]}')

    # augmentation: even category -> the odd category right before it
    test[replace_idx] -= 1

    print(f'after  {test[print_start_i:print_end_i]}')

total 13 will be replaced
index [253] will be replaced to 9
before [10 10 10 10 10 10 10 10 10 10]
after  [10 10 10 10 10  9 10 10 10 10]


## Dataset

In [132]:
class CategoryDataset(torch.utils.data.Dataset):
    """Pairs of (corrupted label sequence, target label sequence).

    Every item starts from one stored label row. With probability 0.99 the
    input copy is corrupted by a fixed chain of random augmentations; the
    target is only changed by `add_starting_point`, which edits input and
    target at the same positions. Positions from `token_n` onwards are set to
    15 (the padding id) in both arrays before they are returned.

    Args:
        labels: indexable collection of 1-D integer label arrays.
        token_n_list: number of real (non-padding) tokens for each row.
    """

    def __init__(self, labels, token_n_list):
        self.labels = labels
        self.token_n_list = token_n_list

        self.odd_category = [1, 3, 5, 7, 9, 11, 13]
        self.even_category = [2, 4, 6, 8, 10, 12, 14]

    @staticmethod
    def _random_span(token_n, min_len, max_len):
        """Draw a random half-open span ``(start, end)`` inside the sequence.

        Returns ``None`` when the sequence has fewer than 2 tokens, because
        ``random.randint(1, token_n - 1)`` would raise for an empty range.
        The length is drawn before the start position.
        """
        if token_n < 2:
            return None

        span_len = random.randint(min_len, max_len)
        start = random.randint(1, token_n - 1)
        end = min(start + span_len, token_n - 1)
        return start, end

    def position_bias(self, label, token_n):
        """Add 1 to a random span of 10-20 labels, then clamp to [0, 14].

        `label` is modified in place inside the span; the clamped array that
        is returned is a new array.
        """
        span = self._random_span(token_n, 10, 20)
        if span is not None:
            start, end = span
            label[start:end] += 1

        return np.clip(label, 0, 14)

    def replace_with_zero(self, label, token_n):
        """Overwrite a random span of 1-5 labels with category 0 (in place)."""
        span = self._random_span(token_n, 1, 5)
        if span is not None:
            start, end = span
            label[start:end] = 0

        return label

    def add_starting_point(self, label, target, token_n):
        """Turn a few even categories into the odd category right before them.

        Between 1 and 5 even-category positions are sampled (at most one per
        20 even positions, but always at least one). Positions whose left
        neighbour already holds `category - 1` are skipped. The same
        positions are decremented in BOTH `label` and `target`, in place.
        """
        even_cat_idx = np.flatnonzero(np.isin(label, self.even_category))
        if len(even_cat_idx) == 0:
            return label, target

        # max(1, ...) keeps randint's range non-empty when there are fewer
        # than 20 even positions.
        max_n = max(1, min(5, len(even_cat_idx) // 20))
        replace_n = random.randint(1, max_n)
        even_cat_idx = np.random.choice(even_cat_idx, size=replace_n, replace=False)

        # Index 0 has no left neighbour; treat it as eligible instead of
        # letting `i - 1` wrap around to the end of the array.
        replace_idx = [
            i for i in even_cat_idx
            if i == 0 or label[i] - 1 != label[i - 1]
        ]
        if len(replace_idx) == 0:
            return label, target

        replace_idx = np.array(replace_idx)

        # augmentation
        label[replace_idx] -= 1
        target[replace_idx] -= 1

        return label, target

    def b_to_i_label(self, label, token_n):
        """Add 1 to at most two randomly chosen odd-category positions (in place)."""
        odd_cat_idx = np.flatnonzero(np.isin(label, self.odd_category))
        if len(odd_cat_idx) == 0:
            return label

        replace_n = random.randint(1, len(odd_cat_idx))
        replace_n = min(2, replace_n)
        odd_cat_idx = np.random.choice(odd_cat_idx, size=replace_n, replace=False)

        label[odd_cat_idx] += 1

        return label

    def replace_b_label(self, label, token_n):
        """Swap at most two odd categories for a different odd category (in place)."""
        odd_cat_idx = np.flatnonzero(np.isin(label, self.odd_category))
        if len(odd_cat_idx) == 0:
            return label

        replace_n = random.randint(1, len(odd_cat_idx))
        replace_n = min(2, replace_n)
        odd_cat_idx = np.random.choice(odd_cat_idx, size=replace_n, replace=False)

        # For each chosen position draw any odd category except the current one.
        replace_cats = np.array([
            random.choice([cat for cat in self.odd_category if cat != label[odd_cat_i]])
            for odd_cat_i in odd_cat_idx
        ])

        # augmentation
        label[odd_cat_idx] = replace_cats

        return label

    def replace_with_random_label(self, label, token_n):
        """Overwrite 10-50 distinct interior positions with random categories.

        Candidate positions are ``1 .. token_n - 2``. For short sequences the
        number of replacements is capped at the number of candidates, since
        sampling without replacement cannot draw more than that.
        """
        candidates = np.arange(1, token_n - 1)
        if len(candidates) == 0:
            return label

        replace_n = min(random.randint(10, 50), len(candidates))
        replace_idx = np.random.choice(candidates, size=replace_n, replace=False)
        replace_cats = np.random.choice(range(0, 15), size=replace_n, replace=True)

        label[replace_idx] = replace_cats

        return label

    def __getitem__(self, idx):
        """Return ``(inputs, targets)`` for row `idx`, both padded with 15."""
        # Deep copies so the stored labels are never modified by augmentation.
        targets = copy.deepcopy(self.labels[idx])
        inputs = copy.deepcopy(targets)
        token_n = self.token_n_list[idx]

        # augmentation (skipped for roughly 1% of the samples)
        if random.random() < 0.99:
            inputs, targets = self.add_starting_point(inputs, targets, token_n)
            inputs = self.position_bias(inputs, token_n)
            inputs = self.replace_with_zero(inputs, token_n)
            inputs = self.b_to_i_label(inputs, token_n)
            inputs = self.replace_b_label(inputs, token_n)
            inputs = self.replace_with_random_label(inputs, token_n)

        # padding
        inputs[token_n:] = 15
        targets[token_n:] = 15

        return inputs, targets

    def __len__(self):
        """Number of label rows."""
        return len(self.labels)

## Model

In [133]:
class Callibration(nn.Module):
    """Bidirectional LSTM that maps a label sequence to per-position logits.

    Input ids in [0, 15] (15 is the padding id) are embedded, run through a
    bidirectional multi-layer LSTM and projected to 16 logits per position.

    Args:
        hidden_size: embedding size and LSTM hidden size per direction.
        n_layers: number of stacked LSTM layers.
        device: kept for backward compatibility and stored on the instance;
            the initial LSTM states are now created on the device of the
            tensor being processed, so the module also works on CPU or on a
            GPU other than this default.
    """

    def __init__(self,
                 hidden_size=128,
                 n_layers=2,
                 device='cuda:0'):
        super().__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.device = device

        self.embedding = nn.Embedding(16, hidden_size, padding_idx=15)
        self.lstm = nn.LSTM(hidden_size,
                            hidden_size,
                            n_layers,
                            bidirectional=True,
                            batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, 16)

    def init_hidden(self, batch_size, device=None, dtype=None):
        """Return zero-filled ``(h_0, c_0)`` for the bidirectional LSTM.

        Each tensor has shape ``(n_layers * 2, batch_size, hidden_size)``.
        When `device` / `dtype` are omitted they follow the embedding weights,
        i.e. wherever the module currently lives.
        """
        weight = self.embedding.weight
        device = weight.device if device is None else device
        dtype = weight.dtype if dtype is None else dtype

        shape = (self.n_layers * 2, batch_size, self.hidden_size)
        h = torch.zeros(shape, device=device, dtype=dtype)
        c = torch.zeros(shape, device=device, dtype=dtype)

        return (h, c)

    def forward(self, x):
        """Compute logits of shape ``(batch, seq_len, 16)`` for ids `x` of shape ``(batch, seq_len)``."""
        batch_size, seq_len = x.size()
        out = self.embedding(x)

        # Create the initial states next to the embedded input so that the
        # devices always match.
        hidden = self.init_hidden(batch_size, device=out.device, dtype=out.dtype)
        out, hidden = self.lstm(out, hidden)

        out = out.contiguous().view(batch_size, -1, self.hidden_size * 2)
        out = self.fc(out)

        return out

## Train

In [134]:
# Mini-batch size handed to the DataLoaders in the training cell.
batch_size = 32
# Device string used to place the model and every batch.
device = 'cuda:0'

In [135]:
# Directory that receives one checkpoint file per fold.
save_path = '../result/callibration_all'

# exist_ok=True creates the directory if needed without a separate
# check-then-create step.
os.makedirs(save_path, exist_ok=True)

In [137]:
# 5-fold training: for every fold a fresh model is trained for 7 epochs and
# its weights are written to `save_path` after each epoch (same file name per
# fold, so later epochs overwrite earlier ones).
for fold in range(5):
    model = Callibration().to(device)
    # NOTE(review): padded positions (target id 15) are part of the loss as a
    # 16th class; if that is not intended, pass ignore_index=15 here.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Learning rate is multiplied by 0.9 after every epoch.
    # NOTE(review): `verbose` is reported as deprecated by recent PyTorch
    # releases -- confirm against the installed version.
    scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                            lr_lambda=lambda epoch: 0.9 ** epoch,
                                            last_epoch=-1,
                                            verbose=True)

    train_labels = copy.deepcopy(labels[train_idx[fold]])
    train_token_n_list = copy.deepcopy(token_n_list[train_idx[fold]])

    valid_labels = copy.deepcopy(labels[valid_idx[fold]])
    valid_token_n_list = copy.deepcopy(token_n_list[valid_idx[fold]])

    train_dataset = CategoryDataset(train_labels, train_token_n_list)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # The validation set goes through the same random augmentation as the
    # training set, so validation loss varies from run to run.
    valid_dataset = CategoryDataset(valid_labels, valid_token_n_list)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    for epoch in range(7):
        model.train()
        train_losses = []
        for step, (inputs, targets) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
            optimizer.zero_grad()

            inputs = inputs.to(device)
            targets = targets.to(device)

            # Flatten (batch, seq, 16) -> (batch * seq, 16) without touching
            # the `batch_size` configuration variable.
            preds = model(inputs).reshape(-1, 16)
            targets = targets.reshape(-1)

            loss = criterion(preds, targets)
            train_losses.append(loss.item())

            # Running mean of the loss over the epoch so far.
            if (step + 1) % 20 == 0:
                print(f"[ TRAIN ] fold {fold + 1} epoch {epoch + 1} loss: {np.array(train_losses).mean():.4f}")

            loss.backward()
            optimizer.step()

        model.eval()
        valid_losses = []
        with torch.no_grad():
            for step, (inputs, targets) in tqdm(enumerate(valid_dataloader), total=len(valid_dataloader)):
                inputs = inputs.to(device)
                targets = targets.to(device)

                preds = model(inputs).reshape(-1, 16)
                targets = targets.reshape(-1)

                loss = criterion(preds, targets)
                valid_losses.append(loss.item())

                if (step + 1) % 10 == 0:
                    print(f"[ VALID ] fold {fold + 1} epoch {epoch + 1} loss: {np.array(valid_losses).mean():.4f}")

        scheduler.step()
        torch.save(model.state_dict(), osp.join(save_path, f'fold_{fold}_callibration.pt'))

Adjusting learning rate of group 0 to 1.0000e-03.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 1 epoch 1 loss: 1.1601
[ TRAIN ] fold 1 epoch 1 loss: 0.6316
[ TRAIN ] fold 1 epoch 1 loss: 0.4395
[ TRAIN ] fold 1 epoch 1 loss: 0.3382
[ TRAIN ] fold 1 epoch 1 loss: 0.2759
[ TRAIN ] fold 1 epoch 1 loss: 0.2336
[ TRAIN ] fold 1 epoch 1 loss: 0.2030
[ TRAIN ] fold 1 epoch 1 loss: 0.1798
[ TRAIN ] fold 1 epoch 1 loss: 0.1616
[ TRAIN ] fold 1 epoch 1 loss: 0.1469
[ TRAIN ] fold 1 epoch 1 loss: 0.1348
[ TRAIN ] fold 1 epoch 1 loss: 0.1247
[ TRAIN ] fold 1 epoch 1 loss: 0.1160
[ TRAIN ] fold 1 epoch 1 loss: 0.1086
[ TRAIN ] fold 1 epoch 1 loss: 0.1021
[ TRAIN ] fold 1 epoch 1 loss: 0.0964
[ TRAIN ] fold 1 epoch 1 loss: 0.0914
[ TRAIN ] fold 1 epoch 1 loss: 0.0869
[ TRAIN ] fold 1 epoch 1 loss: 0.0828


  0%|          | 0/99 [00:00<?, ?it/s]

[ VALID ] fold 1 epoch 1 loss: 0.0099
[ VALID ] fold 1 epoch 1 loss: 0.0097
[ VALID ] fold 1 epoch 1 loss: 0.0099
[ VALID ] fold 1 epoch 1 loss: 0.0099
[ VALID ] fold 1 epoch 1 loss: 0.0098
[ VALID ] fold 1 epoch 1 loss: 0.0098
[ VALID ] fold 1 epoch 1 loss: 0.0098
[ VALID ] fold 1 epoch 1 loss: 0.0098
[ VALID ] fold 1 epoch 1 loss: 0.0097
Adjusting learning rate of group 0 to 9.0000e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 1 epoch 2 loss: 0.0094
[ TRAIN ] fold 1 epoch 2 loss: 0.0093
[ TRAIN ] fold 1 epoch 2 loss: 0.0093
[ TRAIN ] fold 1 epoch 2 loss: 0.0092
[ TRAIN ] fold 1 epoch 2 loss: 0.0091
[ TRAIN ] fold 1 epoch 2 loss: 0.0090
[ TRAIN ] fold 1 epoch 2 loss: 0.0090
[ TRAIN ] fold 1 epoch 2 loss: 0.0089
[ TRAIN ] fold 1 epoch 2 loss: 0.0088
[ TRAIN ] fold 1 epoch 2 loss: 0.0087
[ TRAIN ] fold 1 epoch 2 loss: 0.0086
[ TRAIN ] fold 1 epoch 2 loss: 0.0086
[ TRAIN ] fold 1 epoch 2 loss: 0.0085
[ TRAIN ] fold 1 epoch 2 loss: 0.0084
[ TRAIN ] fold 1 epoch 2 loss: 0.0084
[ TRAIN ] fold 1 epoch 2 loss: 0.0084
[ TRAIN ] fold 1 epoch 2 loss: 0.0083
[ TRAIN ] fold 1 epoch 2 loss: 0.0083
[ TRAIN ] fold 1 epoch 2 loss: 0.0082


  0%|          | 0/99 [00:00<?, ?it/s]

[ VALID ] fold 1 epoch 2 loss: 0.0076
[ VALID ] fold 1 epoch 2 loss: 0.0073
[ VALID ] fold 1 epoch 2 loss: 0.0073
[ VALID ] fold 1 epoch 2 loss: 0.0074
[ VALID ] fold 1 epoch 2 loss: 0.0074
[ VALID ] fold 1 epoch 2 loss: 0.0074
[ VALID ] fold 1 epoch 2 loss: 0.0074
[ VALID ] fold 1 epoch 2 loss: 0.0074
[ VALID ] fold 1 epoch 2 loss: 0.0074
Adjusting learning rate of group 0 to 8.1000e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 1 epoch 3 loss: 0.0073
[ TRAIN ] fold 1 epoch 3 loss: 0.0073
[ TRAIN ] fold 1 epoch 3 loss: 0.0073
[ TRAIN ] fold 1 epoch 3 loss: 0.0072
[ TRAIN ] fold 1 epoch 3 loss: 0.0072
[ TRAIN ] fold 1 epoch 3 loss: 0.0072
[ TRAIN ] fold 1 epoch 3 loss: 0.0072
[ TRAIN ] fold 1 epoch 3 loss: 0.0072
[ TRAIN ] fold 1 epoch 3 loss: 0.0072
[ TRAIN ] fold 1 epoch 3 loss: 0.0072
[ TRAIN ] fold 1 epoch 3 loss: 0.0071
[ TRAIN ] fold 1 epoch 3 loss: 0.0071
[ TRAIN ] fold 1 epoch 3 loss: 0.0071
[ TRAIN ] fold 1 epoch 3 loss: 0.0071
[ TRAIN ] fold 1 epoch 3 loss: 0.0070
[ TRAIN ] fold 1 epoch 3 loss: 0.0070
[ TRAIN ] fold 1 epoch 3 loss: 0.0070
[ TRAIN ] fold 1 epoch 3 loss: 0.0070
[ TRAIN ] fold 1 epoch 3 loss: 0.0070


  0%|          | 0/99 [00:00<?, ?it/s]

[ VALID ] fold 1 epoch 3 loss: 0.0069
[ VALID ] fold 1 epoch 3 loss: 0.0065
[ VALID ] fold 1 epoch 3 loss: 0.0065
[ VALID ] fold 1 epoch 3 loss: 0.0066
[ VALID ] fold 1 epoch 3 loss: 0.0066
[ VALID ] fold 1 epoch 3 loss: 0.0066
[ VALID ] fold 1 epoch 3 loss: 0.0067
[ VALID ] fold 1 epoch 3 loss: 0.0067
[ VALID ] fold 1 epoch 3 loss: 0.0067
Adjusting learning rate of group 0 to 7.2900e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 1 epoch 4 loss: 0.0068
[ TRAIN ] fold 1 epoch 4 loss: 0.0068
[ TRAIN ] fold 1 epoch 4 loss: 0.0067
[ TRAIN ] fold 1 epoch 4 loss: 0.0067
[ TRAIN ] fold 1 epoch 4 loss: 0.0067
[ TRAIN ] fold 1 epoch 4 loss: 0.0067
[ TRAIN ] fold 1 epoch 4 loss: 0.0067
[ TRAIN ] fold 1 epoch 4 loss: 0.0066
[ TRAIN ] fold 1 epoch 4 loss: 0.0066
[ TRAIN ] fold 1 epoch 4 loss: 0.0066
[ TRAIN ] fold 1 epoch 4 loss: 0.0066
[ TRAIN ] fold 1 epoch 4 loss: 0.0066
[ TRAIN ] fold 1 epoch 4 loss: 0.0065
[ TRAIN ] fold 1 epoch 4 loss: 0.0065
[ TRAIN ] fold 1 epoch 4 loss: 0.0065
[ TRAIN ] fold 1 epoch 4 loss: 0.0065
[ TRAIN ] fold 1 epoch 4 loss: 0.0065
[ TRAIN ] fold 1 epoch 4 loss: 0.0065
[ TRAIN ] fold 1 epoch 4 loss: 0.0065


  0%|          | 0/99 [00:00<?, ?it/s]

[ VALID ] fold 1 epoch 4 loss: 0.0062
[ VALID ] fold 1 epoch 4 loss: 0.0061
[ VALID ] fold 1 epoch 4 loss: 0.0060
[ VALID ] fold 1 epoch 4 loss: 0.0062
[ VALID ] fold 1 epoch 4 loss: 0.0063
[ VALID ] fold 1 epoch 4 loss: 0.0064
[ VALID ] fold 1 epoch 4 loss: 0.0064
[ VALID ] fold 1 epoch 4 loss: 0.0064
[ VALID ] fold 1 epoch 4 loss: 0.0064
Adjusting learning rate of group 0 to 6.5610e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0064
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0063
[ TRAIN ] fold 1 epoch 5 loss: 0.0062


  0%|          | 0/99 [00:00<?, ?it/s]

[ VALID ] fold 1 epoch 5 loss: 0.0063
[ VALID ] fold 1 epoch 5 loss: 0.0063
[ VALID ] fold 1 epoch 5 loss: 0.0063
[ VALID ] fold 1 epoch 5 loss: 0.0063
[ VALID ] fold 1 epoch 5 loss: 0.0064
[ VALID ] fold 1 epoch 5 loss: 0.0064
[ VALID ] fold 1 epoch 5 loss: 0.0064
[ VALID ] fold 1 epoch 5 loss: 0.0064
[ VALID ] fold 1 epoch 5 loss: 0.0063
Adjusting learning rate of group 0 to 5.9049e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 1 epoch 6 loss: 0.0061
[ TRAIN ] fold 1 epoch 6 loss: 0.0063
[ TRAIN ] fold 1 epoch 6 loss: 0.0063
[ TRAIN ] fold 1 epoch 6 loss: 0.0062
[ TRAIN ] fold 1 epoch 6 loss: 0.0062
[ TRAIN ] fold 1 epoch 6 loss: 0.0062
[ TRAIN ] fold 1 epoch 6 loss: 0.0062
[ TRAIN ] fold 1 epoch 6 loss: 0.0061
[ TRAIN ] fold 1 epoch 6 loss: 0.0062
[ TRAIN ] fold 1 epoch 6 loss: 0.0061
[ TRAIN ] fold 1 epoch 6 loss: 0.0061
[ TRAIN ] fold 1 epoch 6 loss: 0.0061
[ TRAIN ] fold 1 epoch 6 loss: 0.0062
[ TRAIN ] fold 1 epoch 6 loss: 0.0062
[ TRAIN ] fold 1 epoch 6 loss: 0.0062
[ TRAIN ] fold 1 epoch 6 loss: 0.0062
[ TRAIN ] fold 1 epoch 6 loss: 0.0062
[ TRAIN ] fold 1 epoch 6 loss: 0.0062
[ TRAIN ] fold 1 epoch 6 loss: 0.0062


  0%|          | 0/99 [00:00<?, ?it/s]

[ VALID ] fold 1 epoch 6 loss: 0.0063
[ VALID ] fold 1 epoch 6 loss: 0.0062
[ VALID ] fold 1 epoch 6 loss: 0.0060
[ VALID ] fold 1 epoch 6 loss: 0.0061
[ VALID ] fold 1 epoch 6 loss: 0.0062
[ VALID ] fold 1 epoch 6 loss: 0.0062
[ VALID ] fold 1 epoch 6 loss: 0.0062
[ VALID ] fold 1 epoch 6 loss: 0.0063
[ VALID ] fold 1 epoch 6 loss: 0.0062
Adjusting learning rate of group 0 to 5.3144e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 1 epoch 7 loss: 0.0061
[ TRAIN ] fold 1 epoch 7 loss: 0.0061
[ TRAIN ] fold 1 epoch 7 loss: 0.0059
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060
[ TRAIN ] fold 1 epoch 7 loss: 0.0060


  0%|          | 0/99 [00:00<?, ?it/s]

[ VALID ] fold 1 epoch 7 loss: 0.0058
[ VALID ] fold 1 epoch 7 loss: 0.0058
[ VALID ] fold 1 epoch 7 loss: 0.0057
[ VALID ] fold 1 epoch 7 loss: 0.0059
[ VALID ] fold 1 epoch 7 loss: 0.0060
[ VALID ] fold 1 epoch 7 loss: 0.0061
[ VALID ] fold 1 epoch 7 loss: 0.0061
[ VALID ] fold 1 epoch 7 loss: 0.0061
[ VALID ] fold 1 epoch 7 loss: 0.0061
Adjusting learning rate of group 0 to 4.7830e-04.
Adjusting learning rate of group 0 to 1.0000e-03.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 2 epoch 1 loss: 1.3036
[ TRAIN ] fold 2 epoch 1 loss: 0.6989
[ TRAIN ] fold 2 epoch 1 loss: 0.4829
[ TRAIN ] fold 2 epoch 1 loss: 0.3708
[ TRAIN ] fold 2 epoch 1 loss: 0.3016
[ TRAIN ] fold 2 epoch 1 loss: 0.2549
[ TRAIN ] fold 2 epoch 1 loss: 0.2212
[ TRAIN ] fold 2 epoch 1 loss: 0.1957
[ TRAIN ] fold 2 epoch 1 loss: 0.1756
[ TRAIN ] fold 2 epoch 1 loss: 0.1595
[ TRAIN ] fold 2 epoch 1 loss: 0.1462
[ TRAIN ] fold 2 epoch 1 loss: 0.1350
[ TRAIN ] fold 2 epoch 1 loss: 0.1256
[ TRAIN ] fold 2 epoch 1 loss: 0.1175
[ TRAIN ] fold 2 epoch 1 loss: 0.1104
[ TRAIN ] fold 2 epoch 1 loss: 0.1041
[ TRAIN ] fold 2 epoch 1 loss: 0.0986
[ TRAIN ] fold 2 epoch 1 loss: 0.0936
[ TRAIN ] fold 2 epoch 1 loss: 0.0892


  0%|          | 0/98 [00:00<?, ?it/s]

[ VALID ] fold 2 epoch 1 loss: 0.0094
[ VALID ] fold 2 epoch 1 loss: 0.0092
[ VALID ] fold 2 epoch 1 loss: 0.0093
[ VALID ] fold 2 epoch 1 loss: 0.0093
[ VALID ] fold 2 epoch 1 loss: 0.0093
[ VALID ] fold 2 epoch 1 loss: 0.0092
[ VALID ] fold 2 epoch 1 loss: 0.0091
[ VALID ] fold 2 epoch 1 loss: 0.0092
[ VALID ] fold 2 epoch 1 loss: 0.0092
Adjusting learning rate of group 0 to 9.0000e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 2 epoch 2 loss: 0.0091
[ TRAIN ] fold 2 epoch 2 loss: 0.0091
[ TRAIN ] fold 2 epoch 2 loss: 0.0089
[ TRAIN ] fold 2 epoch 2 loss: 0.0089
[ TRAIN ] fold 2 epoch 2 loss: 0.0088
[ TRAIN ] fold 2 epoch 2 loss: 0.0087
[ TRAIN ] fold 2 epoch 2 loss: 0.0087
[ TRAIN ] fold 2 epoch 2 loss: 0.0086
[ TRAIN ] fold 2 epoch 2 loss: 0.0086
[ TRAIN ] fold 2 epoch 2 loss: 0.0085
[ TRAIN ] fold 2 epoch 2 loss: 0.0084
[ TRAIN ] fold 2 epoch 2 loss: 0.0084
[ TRAIN ] fold 2 epoch 2 loss: 0.0083
[ TRAIN ] fold 2 epoch 2 loss: 0.0083
[ TRAIN ] fold 2 epoch 2 loss: 0.0082
[ TRAIN ] fold 2 epoch 2 loss: 0.0082
[ TRAIN ] fold 2 epoch 2 loss: 0.0082
[ TRAIN ] fold 2 epoch 2 loss: 0.0081
[ TRAIN ] fold 2 epoch 2 loss: 0.0081


  0%|          | 0/98 [00:00<?, ?it/s]

[ VALID ] fold 2 epoch 2 loss: 0.0074
[ VALID ] fold 2 epoch 2 loss: 0.0072
[ VALID ] fold 2 epoch 2 loss: 0.0072
[ VALID ] fold 2 epoch 2 loss: 0.0072
[ VALID ] fold 2 epoch 2 loss: 0.0072
[ VALID ] fold 2 epoch 2 loss: 0.0072
[ VALID ] fold 2 epoch 2 loss: 0.0072
[ VALID ] fold 2 epoch 2 loss: 0.0073
[ VALID ] fold 2 epoch 2 loss: 0.0072
Adjusting learning rate of group 0 to 8.1000e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 2 epoch 3 loss: 0.0072
[ TRAIN ] fold 2 epoch 3 loss: 0.0072
[ TRAIN ] fold 2 epoch 3 loss: 0.0072
[ TRAIN ] fold 2 epoch 3 loss: 0.0072
[ TRAIN ] fold 2 epoch 3 loss: 0.0071
[ TRAIN ] fold 2 epoch 3 loss: 0.0071
[ TRAIN ] fold 2 epoch 3 loss: 0.0071
[ TRAIN ] fold 2 epoch 3 loss: 0.0070
[ TRAIN ] fold 2 epoch 3 loss: 0.0070
[ TRAIN ] fold 2 epoch 3 loss: 0.0070
[ TRAIN ] fold 2 epoch 3 loss: 0.0070
[ TRAIN ] fold 2 epoch 3 loss: 0.0070
[ TRAIN ] fold 2 epoch 3 loss: 0.0070
[ TRAIN ] fold 2 epoch 3 loss: 0.0069
[ TRAIN ] fold 2 epoch 3 loss: 0.0069
[ TRAIN ] fold 2 epoch 3 loss: 0.0069
[ TRAIN ] fold 2 epoch 3 loss: 0.0069
[ TRAIN ] fold 2 epoch 3 loss: 0.0069
[ TRAIN ] fold 2 epoch 3 loss: 0.0069


  0%|          | 0/98 [00:00<?, ?it/s]

[ VALID ] fold 2 epoch 3 loss: 0.0060
[ VALID ] fold 2 epoch 3 loss: 0.0060
[ VALID ] fold 2 epoch 3 loss: 0.0061
[ VALID ] fold 2 epoch 3 loss: 0.0063
[ VALID ] fold 2 epoch 3 loss: 0.0064
[ VALID ] fold 2 epoch 3 loss: 0.0064
[ VALID ] fold 2 epoch 3 loss: 0.0064
[ VALID ] fold 2 epoch 3 loss: 0.0065
[ VALID ] fold 2 epoch 3 loss: 0.0065
Adjusting learning rate of group 0 to 7.2900e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 2 epoch 4 loss: 0.0068
[ TRAIN ] fold 2 epoch 4 loss: 0.0067
[ TRAIN ] fold 2 epoch 4 loss: 0.0067
[ TRAIN ] fold 2 epoch 4 loss: 0.0066
[ TRAIN ] fold 2 epoch 4 loss: 0.0066
[ TRAIN ] fold 2 epoch 4 loss: 0.0066
[ TRAIN ] fold 2 epoch 4 loss: 0.0066
[ TRAIN ] fold 2 epoch 4 loss: 0.0066
[ TRAIN ] fold 2 epoch 4 loss: 0.0066
[ TRAIN ] fold 2 epoch 4 loss: 0.0066
[ TRAIN ] fold 2 epoch 4 loss: 0.0066
[ TRAIN ] fold 2 epoch 4 loss: 0.0066
[ TRAIN ] fold 2 epoch 4 loss: 0.0065
[ TRAIN ] fold 2 epoch 4 loss: 0.0065
[ TRAIN ] fold 2 epoch 4 loss: 0.0065
[ TRAIN ] fold 2 epoch 4 loss: 0.0065
[ TRAIN ] fold 2 epoch 4 loss: 0.0065
[ TRAIN ] fold 2 epoch 4 loss: 0.0065
[ TRAIN ] fold 2 epoch 4 loss: 0.0065


  0%|          | 0/98 [00:00<?, ?it/s]

[ VALID ] fold 2 epoch 4 loss: 0.0061
[ VALID ] fold 2 epoch 4 loss: 0.0061
[ VALID ] fold 2 epoch 4 loss: 0.0060
[ VALID ] fold 2 epoch 4 loss: 0.0063
[ VALID ] fold 2 epoch 4 loss: 0.0064
[ VALID ] fold 2 epoch 4 loss: 0.0064
[ VALID ] fold 2 epoch 4 loss: 0.0064
[ VALID ] fold 2 epoch 4 loss: 0.0064
[ VALID ] fold 2 epoch 4 loss: 0.0064
Adjusting learning rate of group 0 to 6.5610e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 2 epoch 5 loss: 0.0064
[ TRAIN ] fold 2 epoch 5 loss: 0.0063
[ TRAIN ] fold 2 epoch 5 loss: 0.0063
[ TRAIN ] fold 2 epoch 5 loss: 0.0063
[ TRAIN ] fold 2 epoch 5 loss: 0.0063
[ TRAIN ] fold 2 epoch 5 loss: 0.0063
[ TRAIN ] fold 2 epoch 5 loss: 0.0063
[ TRAIN ] fold 2 epoch 5 loss: 0.0063
[ TRAIN ] fold 2 epoch 5 loss: 0.0062
[ TRAIN ] fold 2 epoch 5 loss: 0.0062
[ TRAIN ] fold 2 epoch 5 loss: 0.0063
[ TRAIN ] fold 2 epoch 5 loss: 0.0063
[ TRAIN ] fold 2 epoch 5 loss: 0.0062
[ TRAIN ] fold 2 epoch 5 loss: 0.0062
[ TRAIN ] fold 2 epoch 5 loss: 0.0062
[ TRAIN ] fold 2 epoch 5 loss: 0.0062
[ TRAIN ] fold 2 epoch 5 loss: 0.0062
[ TRAIN ] fold 2 epoch 5 loss: 0.0062
[ TRAIN ] fold 2 epoch 5 loss: 0.0062


  0%|          | 0/98 [00:00<?, ?it/s]

[ VALID ] fold 2 epoch 5 loss: 0.0064
[ VALID ] fold 2 epoch 5 loss: 0.0061
[ VALID ] fold 2 epoch 5 loss: 0.0061
[ VALID ] fold 2 epoch 5 loss: 0.0062
[ VALID ] fold 2 epoch 5 loss: 0.0062
[ VALID ] fold 2 epoch 5 loss: 0.0062
[ VALID ] fold 2 epoch 5 loss: 0.0062
[ VALID ] fold 2 epoch 5 loss: 0.0063
[ VALID ] fold 2 epoch 5 loss: 0.0063
Adjusting learning rate of group 0 to 5.9049e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0060
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061
[ TRAIN ] fold 2 epoch 6 loss: 0.0061


  0%|          | 0/98 [00:00<?, ?it/s]

[ VALID ] fold 2 epoch 6 loss: 0.0065
[ VALID ] fold 2 epoch 6 loss: 0.0061
[ VALID ] fold 2 epoch 6 loss: 0.0061
[ VALID ] fold 2 epoch 6 loss: 0.0061
[ VALID ] fold 2 epoch 6 loss: 0.0061
[ VALID ] fold 2 epoch 6 loss: 0.0061
[ VALID ] fold 2 epoch 6 loss: 0.0062
[ VALID ] fold 2 epoch 6 loss: 0.0062
[ VALID ] fold 2 epoch 6 loss: 0.0062
Adjusting learning rate of group 0 to 5.3144e-04.


  0%|          | 0/390 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [104]:
STOP!!!

SyntaxError: invalid syntax (3246949926.py, line 1)

## Testing

In [128]:
checkpoint_dir = '../result/callibration_random_with_starting_1percent_clean/'

# os.listdir returns entries in arbitrary order; sort them so that index 3
# refers to the same file on every run and every machine.
callibration_checkpoints = sorted(os.listdir(checkpoint_dir))

callibration = Callibration()
callibration.eval().to(device)
# map_location puts the saved tensors on `device` regardless of the device
# they were saved from.
callibration.load_state_dict(
    torch.load(osp.join(checkpoint_dir, callibration_checkpoints[3]), map_location=device)
)

<All keys matched successfully>

In [129]:
# Hand-written noisy label sequence of 15 tokens; the rest of the 2048-long
# input is filled with the padding id 15.
token_n = 15
inputs = torch.full((2048,), 15.0)
inputs[:token_n] = torch.Tensor([0, 1, 4, 5, 3, 4, 4, 4, 3, 4, 4, 3, 4, 4, 0])

# Integer ids with a leading batch axis of size 1, on the model's device.
inputs = inputs.long().unsqueeze(0).to(device)

# Per-position logits, then the most likely category at every position.
output = callibration(inputs)
cal_output = output.argmax(-1)

In [130]:
inputs[:, :15]

tensor([[0, 1, 4, 5, 3, 4, 4, 4, 3, 4, 4, 3, 4, 4, 0]], device='cuda:0')

In [131]:
cal_output[:, :15]

tensor([[0, 3, 4, 4, 3, 4, 4, 4, 3, 4, 4, 3, 4, 4, 0]], device='cuda:0')

In [48]:
inputs[:, :15]

tensor([[0, 2, 4, 5, 3, 4, 4, 4, 3, 4, 4, 3, 4, 4, 0]], device='cuda:0')

In [49]:
cal_output[:, :15]

tensor([[0, 0, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0]], device='cuda:0')