# Training

- Environment Setting
- Argument Setting

## Environment Setting

In [1]:
import os
import os.path as osp
import sys

# Dataset directory, relative to the notebook's working directory.
DATASET_PATH = '../../../feedback-prize-2021'

# Import search path: the submission code directory is searched first,
# the two longformer directories are searched last.
# NOTE(review): the first entry is an absolute, machine-specific path.
sys.path[:0] = ['/home/feedback/working/feedback/models_training/longformer/sumbission/codes']
sys.path.extend(['longformer/tvm/python/', 'longformer/'])

In [2]:
import re
import random
import easydict

from random import shuffle
from tqdm import tqdm
from glob import glob

import numpy as np
import pandas as pd

import h5py
import ftfy
import dill as pickle
import wandb

import torch
from transformers import DebertaV2Model

# torch.use_deterministic_algorithms(True)
# from longformer.longformer import Longformer, LongformerConfig, RobertaModel
# from longformer.sliding_chunks import pad_to_window_size

In [3]:
def seed_everything(seed=0):
    """Seed every random number generator used here and make cuDNN deterministic.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and ``CUBLAS_WORKSPACE_CONFIG``, and puts cuDNN
    into deterministic, non-benchmarking mode.

    Args:
        seed: Seed value shared by all generators.
    """
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'CUBLAS_WORKSPACE_CONFIG': ":4096:8",
    })

    # Every generator receives the same seed.
    for seed_fn in (random.seed, np.random.seed,
                    torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

**Why set `os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"`?**
- [torch.use_deterministic_algorithms](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html)
> **A handful of CUDA operations are nondeterministic if the CUDA version is 10.2 or greater**, unless the environment variable `CUBLAS_WORKSPACE_CONFIG=:4096:8` or `CUBLAS_WORKSPACE_CONFIG=:16:8` is set. See the CUDA documentation for more details: https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility. If one of these environment variable configurations is not set, a RuntimeError will be raised from these operations when called with CUDA tensors: `torch.mm()`, `torch.mv()`, `torch.bmm()`.



## Text Load

In [38]:
# Map text id (file name up to the first '.') -> full text of the file.
# The files are read from DATASET_PATH/train; the previous pattern
# '/train/*.txt' pointed at the filesystem root and ignored DATASET_PATH.
# NOTE(review): confirm the training texts live under DATASET_PATH/train.
all_texts = {}
for text_path in glob(osp.join(DATASET_PATH, 'train', '*.txt')):
    text_id = osp.basename(text_path).split('.')[0]
    with open(text_path, encoding='utf-8') as f:
        all_texts[text_id] = f.read()

## Argument Setting

In [13]:
def init_args():
    args = easydict.EasyDict({})

    # version
    args.dataset_version = 2
    args.data_prefix = ''
    args.h5py_path = f'../../deberta_spm_data_v{args.dataset_version}.h5py'
    args.csv_path = f'../../train.csv'

    # setting
    args.seed = 0
    args.label_names = ['None', 'Lead', 'Position', 'Evidence', 'Claim',
                        'Concluding Statement', 'Counterclaim', 'Rebuttal']

    # hyperparameter
    args.epochs = 9
    args.batch_size = 8
    args.lr = 32e-6
    args.min_lr = 32e-6
    args.label_smoothing = 0.1

    args.weight_decay = 1e-2
    args.weights_pow = 0.1
    args.use_groupped_weights = False

    args.use_groupped_weights = False
    args.global_attn = 0
    args.extra_dense = False

    args.max_grad_norm = 35 * args.batch_size
    args.start_eval_at = 3000
    args.warmup_steps = 500
    args.rce_weight = 0.1
    args.ce_weight = 1 - args.rce_weight

    args.decay_bias = False

    # inference
    args.grad_acc_steps = args.batch_size
    args.grad_checkpt = True
    args.min_len = 0
    args.eval_interval = 200

    return args

In [14]:
# Build the configuration object that the cells below read and extend.
args = init_args()

In [15]:
# 1-based GPU number for this run; it selects both the preset below and the
# CUDA device (gpu_n - 1) made visible to this process.
args.gpu_n = 4

# Validation fold per GPU number: 1 and 3 use fold 0, 2 and 4 use fold 1.
# Any other number leaves `val_fold` unset.
if args.gpu_n in (1, 3):
    args.val_fold = 0
elif args.gpu_n in (2, 4):
    args.val_fold = 1

# GPUs 1 and 4 replace the max_grad_norm set in init_args() with 1.0.
if args.gpu_n in (1, 4):
    args.max_grad_norm = 1.

os.environ['CUDA_VISIBLE_DEVICES'] = f'{args.gpu_n - 1}'

## Token Counts

In [19]:
# NOTE: `pickle` is `dill` in this notebook; unpickling can execute arbitrary
# code, so only load a file that you produced yourself.
with open('../../token_counts.pickle', 'rb') as f:
    groupped_token_counts, ungroupped_token_counts = pickle.load(f)

# Choose which count array the weights are derived from.
counts = groupped_token_counts if args.use_groupped_weights else ungroupped_token_counts

# weight = (mean count / count) ** weights_pow, so entries with fewer tokens
# get larger weights; the exponent controls how strong that effect is.
args.token_weights = (counts.mean() / counts) ** args.weights_pow

In [16]:
# The weights are stored on `args`; a bare `token_weights` name is never
# defined in this notebook and fails on a fresh kernel.
groupped_token_counts, ungroupped_token_counts, args.token_weights

(array([ 549185.,  561242.,  561242.,  317823.,  317823., 4069479.,
        4069479.,  982228.,  982228.,  945061.,  945061.,  157527.,
         157527.,  138255.,  138255.]),
 array([ 549185.,    9305.,  551937.,   15419.,  302404.,   45702.,
        4023777.,   50206.,  932022.,   13505.,  931556.,    5817.,
         151710.,    4337.,  133918.]),
 array([0.99353973, 1.49377596, 0.99304323, 1.42020646, 1.0546257 ,
        1.27398286, 0.81412992, 1.26206447, 0.94235489, 1.43915525,
        0.94240202, 1.56562302, 1.12994078, 1.61227144, 1.1441243 ]))

## Wandb

In [17]:
# Start the W&B run, then name it after the settings that identify it.
run = wandb.init(entity='ducky', project='feedback_debertav3_large')
run.name = '_'.join([
    'v3',
    f'fold{args.val_fold}',
    f'minlr{args.min_lr}',
    f'maxlr{args.lr}',
    f'wd{args.weight_decay}',
    f'warmup{args.warmup_steps}',
    f'gradnorm{args.max_grad_norm}',
    f'biasdecay{args.decay_bias}',
    f'ls{args.label_smoothing}',
    f'wp{args.weights_pow}',
    f'data{args.dataset_version}',
    f'rce{args.rce_weight}',
])

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mducky[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [18]:
# Seed all random number generators with the configured seed.
seed_everything(args.seed)

In [11]:
class TrainDataset(torch.utils.data.Dataset):
    """Training samples read from the pre-tokenized HDF5 file.

    Each item is addressed by a row index taken from ``ids`` and consists of
    the token ids, the attention mask, label-smoothed labels, a per-token
    weight mask and the number of tokens of that row.
    """

    # Divisor for label smoothing and upper bound of the class loop.
    # NOTE(review): presumably the width of the label arrays and the length of
    # args.token_weights — confirm against the data file.
    NUM_CLASSES = 15

    def __init__(self, args, ids):
        """
        Args:
            args: Config providing ``h5py_path``, ``data_prefix``,
                ``label_smoothing`` and ``token_weights``.
            ids: Sequence of row indices into the HDF5 datasets.
        """
        self.args = args
        self.ids = ids
        # Read-only on purpose: this class never writes, and older h5py
        # releases default to a read/write-or-create mode when none is given.
        self.data = h5py.File(args.h5py_path, 'r')

    def __len__(self):
        """Number of samples, i.e. the number of row indices in ``ids``."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``(tokens, attention_mask, cbio_labels, label_mask, num_tokens)``.

        ``cbio_labels`` is label-smoothed; ``label_mask`` holds the class
        weight of each position's argmax label, with zeros at position 0 and
        at class-0 positions from index ``num_tokens - 1`` onwards.
        """
        text_id = self.ids[idx]

        # TODO: Add Tokenizer and directly tokenize for typo injections
        tokens = self.data['tokens'][text_id]
        attention_mask = self.data['attention_masks'][text_id]
        num_tokens = self.data['num_tokens'][text_id, 0]

        # label: smooth in place. Indexing an h5py dataset reads into a new
        # array, so the stored labels are not modified.
        cbio_labels = self.data[f'{self.args.data_prefix}cbio_labels'][text_id]
        cbio_labels *= (1 - self.args.label_smoothing)
        cbio_labels += self.args.label_smoothing / self.NUM_CLASSES

        # label mask
        # NOTE(review): the mask inherits attention_mask's dtype; if that is
        # an integer type the float weights below are truncated — confirm.
        label_mask = np.zeros_like(attention_mask)
        argmax_labels = cbio_labels.argmax(-1)

        for i in range(1, self.NUM_CLASSES):
            label_mask[argmax_labels == i] = self.args.token_weights[i]

        # Class 0 is weighted only before index num_tokens - 1, so the last
        # token and everything after it keep weight 0 unless labelled 1..14.
        zero_label_mask = argmax_labels == 0
        zero_label_mask[num_tokens - 1:] = False

        label_mask[zero_label_mask] = self.args.token_weights[0]
        # Position 0 never carries weight — presumably the leading special
        # token; TODO confirm.
        label_mask[0] = 0

        return tokens, attention_mask, cbio_labels, label_mask, num_tokens
    

In [None]:
class ValDataset(torch.utils.data.Dataset):
    """Validation samples: HDF5 arrays plus ground-truth spans and a char-to-word map.

    Besides the arrays stored in the HDF5 file, each item carries the
    ground-truth word spans read from the CSV and a per-character word index
    built from the raw text.

    NOTE: ``__getitem__`` looks the raw text up in the notebook-level
    ``all_texts`` dict, so that dict must be populated before items are read.
    """

    def __init__(self, args, ids, val_files):
        """
        Args:
            args: Config providing ``h5py_path``, ``csv_path``,
                ``label_names`` and ``token_weights``.
            ids: Sequence of row indices into the HDF5 datasets.
            val_files: Text ids aligned with ``ids``; used as keys into
                ``all_texts`` and matched against the CSV ``id`` column.
        """
        self.args = args
        self.ids = ids
        self.val_files = val_files

        # Read-only on purpose: this class never writes, and older h5py
        # releases default to a read/write-or-create mode when none is given.
        self.data = h5py.File(args.h5py_path, 'r')
        self.csv = pd.read_csv(args.csv_path)
        # Raw string so `\s` reaches the regex engine instead of being an
        # invalid Python string escape.
        self.space_regex = re.compile(r'[\s\n]')

    def __len__(self):
        """Number of samples, i.e. the number of row indices in ``ids``."""
        return len(self.ids)

    def split_predstring(self, x):
        """Return ``(first, last)`` integers of a whitespace-separated string."""
        vals = x.split()
        return int(vals[0]), int(vals[-1])

    def __getitem__(self, idx):
        """Return the arrays, ground truth and word map of one sample.

        Returns:
            tuple: ``(tokens, attention_mask, cbio_labels, label_mask,
            token_bounds, gt_dict, index_map, num_tokens)`` where ``gt_dict``
            maps a class index (position in ``args.label_names``) to a list of
            ``(first_word, last_word)`` tuples, and ``index_map[k]`` is the
            word number of the k-th character counted from the first
            non-whitespace character of the text.
        """
        text_id = self.ids[idx]
        file_id = self.val_files[idx]
        text = all_texts[file_id]

        # Ground-truth spans per class, skipping index 0 of label_names.
        # A plain boolean mask selects the rows of this text.
        gt_dict = {}
        sample_df = self.csv.loc[self.csv.id == file_id]
        for class_i in range(1, len(self.args.label_names)):
            class_name = self.args.label_names[class_i]
            class_entities = sample_df.loc[sample_df.discourse_type == class_name]
            if len(class_entities):
                gt_dict[class_i] = [self.split_predstring(x)
                                    for x in class_entities.predictionstring]

        tokens = self.data['tokens'][text_id]
        attention_mask = self.data['attention_masks'][text_id]
        num_tokens = self.data['num_tokens'][text_id, 0]
        token_bounds = self.data['token_offsets'][text_id]
        # NOTE(review): read without args.data_prefix and without label
        # smoothing — presumably intentional for validation; confirm.
        cbio_labels = self.data['cbio_labels'][text_id]

        # NOTE(review): the mask inherits attention_mask's dtype; if that is
        # an integer type the float weights below are truncated — confirm.
        label_mask = np.zeros_like(attention_mask)
        argmax_labels = cbio_labels.argmax(-1)
        for class_i in range(1, 15):
            label_mask[argmax_labels == class_i] = self.args.token_weights[class_i]

        # Class 0 is weighted only before index num_tokens - 1.
        zero_label_mask = argmax_labels == 0
        zero_label_mask[num_tokens - 1:] = False
        label_mask[zero_label_mask] = self.args.token_weights[0]
        # Position 0 never carries weight — presumably the leading special
        # token; TODO confirm.
        label_mask[0] = 0

        # Word number for every character after the leading whitespace.
        # The counter advances on the first non-space character following a
        # run of whitespace, so whitespace keeps the preceding word's number.
        # Empty or whitespace-only text yields an empty map.
        index_map = []
        current_word = 0
        blank = False
        first_char_ix = len(text) - len(text.lstrip())
        for char in text[first_char_ix:]:
            if self.space_regex.match(char) is not None:
                blank = True
            elif blank:
                current_word += 1
                blank = False
            index_map.append(current_word)

        return tokens, attention_mask, cbio_labels, label_mask, token_bounds, gt_dict, index_map, num_tokens