# Training

- Environment Setting
- Argument Setting

## Environment Setting

In [10]:
import os
import sys

sys.path.insert(0, '/home/feedback/working/feedback/models_training/longformer/sumbission/codes')
sys.path.append('longformer/tvm/python/')
sys.path.append('longformer/')

In [9]:
import re
import random
import easydict

from random import shuffle
from tqdm import tqdm
from glob import glob

import numpy as np
import pandas as pd

import h5py
import ftfy
import dill as pickle
import wandb

import torch
from transformers import DebertaV2Model

# torch.use_deterministic_algorithms(True)
# from longformer.longformer import Longformer, LongformerConfig, RobertaModel
# from longformer.sliding_chunks import pad_to_window_size

In [23]:
def seed_everything(seed=0):
    """Seed every RNG used in this notebook and request deterministic cuDNN.

    Args:
        seed: integer seed applied to `random`, NumPy and PyTorch (CPU and
            all CUDA devices). Also exported as ``PYTHONHASHSEED``.

    Side effects:
        Sets the ``PYTHONHASHSEED`` and ``CUBLAS_WORKSPACE_CONFIG``
        environment variables and flips the global cuDNN flags.
    """
    # Environment variables first; they are independent of the RNG calls below.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'CUBLAS_WORKSPACE_CONFIG': ":4096:8",
    })

    # Python, NumPy, then PyTorch (CPU generator and every CUDA device).
    for seed_fn in (random.seed, np.random.seed,
                    torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Disable the cuDNN autotuner and force deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

**Why set `os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"`?**
- [torch.use_deterministic_algorithms](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html)
> **A handful of CUDA operations are nondeterministic if the CUDA version is 10.2 or greater**, unless the environment variable `CUBLAS_WORKSPACE_CONFIG=:4096:8` or `CUBLAS_WORKSPACE_CONFIG=:16:8` is set. See the CUDA documentation for more details: https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility If one of these environment variable configurations is not set, a RuntimeError will be raised from these operations when called with CUDA tensors:



## Text Load

In [38]:
# Map text id (file name up to its first '.') -> full contents of the .txt file.
all_texts = {}
for text_path in glob('../../../feedback-prize-2021/train/*.txt'):
    text_id = text_path.split('/')[-1].split('.')[0]
    with open(text_path, encoding='utf-8') as text_file:
        all_texts[text_id] = text_file.read()

## Argument Setting

In [33]:
# Single configuration object for the run; later cells read and extend it.
args = easydict.EasyDict({})

# version
args.dataset_version = 2
args.data_prefix = ''

# setting
args.seed = 0
args.label_names = ['None', 'Lead', 'Position', 'Evidence', 'Claim',
                    'Concluding Statement', 'Counterclaim', 'Rebuttal']

# hyperparameter
args.epochs = 9
args.batch_size = 8
args.lr = 32e-6
args.min_lr = 32e-6
args.label_smoothing = 0.1

args.weight_decay = 1e-2
args.weights_pow = 0.1
args.use_groupped_weights = False

args.global_attn = 0
args.extra_dense = False

# The original referenced a bare `batch_size`, which is not defined anywhere
# above and raises NameError on a fresh kernel.
# NOTE(review): assumes the intended value is args.batch_size -- confirm.
args.max_grad_norm = 35 * args.batch_size
args.start_eval_at = 3000
args.warmup_steps = 500
args.rce_weight = 0.1
args.ce_weight = 1 - args.rce_weight  # the two loss weights sum to 1

args.decay_bias = False


# gradient accumulation / checkpointing / evaluation cadence
args.grad_acc_steps = args.batch_size  # same bare-`batch_size` fix as above
args.grad_checkpt = True
args.min_len = 0
args.eval_interval = 200

In [34]:
# 1-based index of the GPU this notebook instance runs on; it also selects the
# validation fold and, for some GPUs, overrides the gradient-clipping norm.
args.gpu_n = 4

# Validation fold per GPU: GPUs 1 and 3 use fold 0, GPUs 2 and 4 use fold 1.
# Any other value leaves args.val_fold untouched.
if args.gpu_n in (1, 3):
    args.val_fold = 0
elif args.gpu_n in (2, 4):
    args.val_fold = 1

# GPUs 1 and 4 replace the default max_grad_norm with 1.0.
if args.gpu_n in (1, 4):
    args.max_grad_norm = 1.

# CUDA device ids are 0-based, hence the -1.
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_n - 1)

## Token Counts

In [18]:
# `pickle` is dill in this notebook; unpickling executes arbitrary code, so
# only load a file you produced yourself.
with open('../../token_counts.pickle', 'rb') as counts_file:
    groupped_token_counts, ungroupped_token_counts = pickle.load(counts_file)

# Choose which per-class token counts drive the loss weights.
counts = groupped_token_counts if args.use_groupped_weights else ungroupped_token_counts

# Per-class weight: (mean count / class count) raised to args.weights_pow.
token_weights = (counts.mean() / counts) ** args.weights_pow

In [21]:
# Show both per-class token-count arrays loaded from token_counts.pickle.
groupped_token_counts, ungroupped_token_counts

(array([ 549185.,  561242.,  561242.,  317823.,  317823., 4069479.,
        4069479.,  982228.,  982228.,  945061.,  945061.,  157527.,
         157527.,  138255.,  138255.]),
 array([ 549185.,    9305.,  551937.,   15419.,  302404.,   45702.,
        4023777.,   50206.,  932022.,   13505.,  931556.,    5817.,
         151710.,    4337.,  133918.]))

## Wandb

In [37]:
# Start the Weights & Biases run, then name it after the hyperparameters so
# runs can be told apart in the dashboard.
run = wandb.init(entity='ducky', project='feedback_debertav3_large')
run.name = (
    f'v3_fold{args.val_fold}'
    f'_minlr{args.min_lr}_maxlr{args.lr}'
    f'_wd{args.weight_decay}_warmup{args.warmup_steps}'
    f'_gradnorm{args.max_grad_norm}_biasdecay{args.decay_bias}'
    f'_ls{args.label_smoothing}_wp{args.weights_pow}'
    f'_data{args.dataset_version}_rce{args.rce_weight}'
)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [40]:
# Apply the configured seed (args.seed) to the Python, NumPy and PyTorch RNGs.
seed_everything(args.seed)

In [None]:
class TrainDataset(torch.utils.data.Dataset):
    """Training samples read from the pre-tokenized HDF5 file.

    Each item is ``(tokens, attention_mask, cbio_labels, label_mask, num_tokens)``.
    ``cbio_labels`` is label-smoothed with ``args.label_smoothing`` and
    ``label_mask`` carries a per-token loss weight looked up in the
    module-level ``token_weights`` array.

    Args:
        args: configuration object; ``dataset_version``, ``data_prefix`` and
            ``label_smoothing`` are read from it.
        ids: sequence of row indices into the HDF5 datasets.
    """

    def __init__(self, args, ids):
        self.args = args
        self.ids = ids
        # Read the version from `args` (the original used an undefined bare
        # `dataset_version`) and open read-only since nothing is written.
        self.data = h5py.File(f'../../deberta_spm_data_v{args.dataset_version}.h5py', 'r')

    def __len__(self):
        return len(self.ids)
    
    def __getitem__(self, idx):
        # The original indexed with `ix`, which is a loop variable assigned
        # further down and therefore raised UnboundLocalError here.
        row = self.ids[idx]
        
        tokens = self.data['tokens'][row]
        attention_mask = self.data['attention_masks'][row]
        num_tokens = self.data['num_tokens'][row, 0]
        
        smoothing = self.args.label_smoothing
        cbio_labels = self.data[f'{self.args.data_prefix}cbio_labels'][row]
        # Label smoothing over the 15 classes. Done out-of-place so that an
        # integer-typed label array is promoted to float instead of raising,
        # and so the stored array is never modified.
        cbio_labels = cbio_labels * (1 - smoothing) + smoothing / 15
        
        # NOTE(review): label_mask inherits attention_mask's dtype; if that is
        # an integer type the float weights below are truncated -- confirm.
        label_mask = np.zeros_like(attention_mask)
        argmax_labels = cbio_labels.argmax(-1)
        
        # Every token whose arg-max class is 1..14 gets that class's weight.
        for class_ix in range(1, 15):
            label_mask[argmax_labels == class_ix] = token_weights[class_ix]
        
        # Class-0 tokens are weighted only before position num_tokens - 1;
        # from there on they keep weight 0.
        zero_label_mask = argmax_labels == 0
        zero_label_mask[num_tokens - 1:] = False
        
        label_mask[zero_label_mask] = token_weights[0]
        # The first position never contributes to the loss.
        label_mask[0] = 0

        return tokens, attention_mask, cbio_labels, label_mask, num_tokens
    

In [None]:
class ValDataset(torch.utils.data.Dataset):
    """Validation samples plus the ground truth needed for scoring.

    Each item is ``(tokens, attention_mask, cbio_labels, label_mask,
    token_bounds, gt_dict, index_map, num_tokens)``:

    * ``gt_dict`` maps a category index (position in ``args.label_names``,
      index 0 skipped) to a list of ``(first, second)`` pairs taken from
      whatever ``split_predstring`` returns for each annotated row.
    * ``index_map`` has one entry per character of the text, starting at the
      first non-whitespace character: the 0-based index of the
      whitespace-delimited word that character belongs to (whitespace
      characters carry the index of the preceding word).

    Args:
        args: configuration object; ``dataset_version`` and ``label_names``
            are read from it.
        ids: sequence of row indices into the HDF5 datasets.
    """

    def __init__(self, args, ids):
        self.args = args
        self.ids = ids
        # Read the version from `args` (the original used an undefined bare
        # `dataset_version`) and open read-only since nothing is written.
        self.data = h5py.File(f'../../deberta_spm_data_v{args.dataset_version}.h5py', 'r')
        self.csv = pd.read_csv('../../train.csv')
        # Raw string: the non-raw '[\s\n]' relied on an invalid escape.
        # '\s' already matches '\n', so the character class was redundant.
        self.space_regex = re.compile(r'\s')
        
    def __len__(self):
        return len(self.ids)
    
    def __getitem__(self, ix):
        row = self.ids[ix]
        # NOTE(review): `val_files`, `all_texts`, `split_predstring` and
        # `token_weights` are module-level names defined in other cells;
        # `val_files` is presumably aligned index-for-index with `self.ids`
        # -- confirm against the cell that builds it.
        text_id = val_files[ix]
        text = all_texts[text_id]

        label_names = self.args.label_names
        sample_df = self.csv.loc[self.csv.id == text_id]
        gt_dict = {}
        # Index 0 of label_names is skipped.
        for cat_ix in range(1, len(label_names)):
            cat_entities = sample_df.loc[sample_df.discourse_type == label_names[cat_ix]]
            if len(cat_entities):
                gt_dict[cat_ix] = [(span[0], span[1])
                                   for span in cat_entities.predictionstring.map(split_predstring)]
        
        tokens = self.data['tokens'][row]
        attention_mask = self.data['attention_masks'][row]
        num_tokens = self.data['num_tokens'][row, 0]
        token_bounds = self.data['token_offsets'][row]

        cbio_labels = self.data['cbio_labels'][row]
        
        # NOTE(review): label_mask inherits attention_mask's dtype; if that is
        # an integer type the float weights below are truncated -- confirm.
        label_mask = np.zeros_like(attention_mask)
        argmax_labels = cbio_labels.argmax(-1)
        for cat_ix in range(1, 15):
            label_mask[argmax_labels == cat_ix] = token_weights[cat_ix]
        # Class-0 tokens are weighted only before position num_tokens - 1.
        zero_label_mask = argmax_labels == 0
        zero_label_mask[num_tokens - 1:] = False
        label_mask[zero_label_mask] = token_weights[0]
        # The first position never contributes.
        label_mask[0] = 0
        
        # Character -> word-index map, starting at the first non-whitespace
        # character. Counting the stripped prefix instead of
        # text.index(text.strip()[0]) gives the same offset and also handles
        # empty or whitespace-only text (index_map is then empty).
        index_map = []
        current_word = 0
        blank = False
        first_char = len(text) - len(text.lstrip())
        for char in text[first_char:]:
            if self.space_regex.match(char) is not None:
                blank = True
            elif blank:
                # First non-blank character after whitespace starts a new word.
                current_word += 1
                blank = False
            index_map.append(current_word)
        
        return tokens, attention_mask, cbio_labels, label_mask, token_bounds, gt_dict, index_map, num_tokens