# Training

✅ **Needs to be checked**
- loss function (cross-entropy loss and RCE loss)
- collate function (is it properly implemented?)
    - the torch padding function was assumed, but it is not actually used
- entity_extraction function
- does DeBERTa accept longer sequences even though the config sets a sequence length of 512?

✅ **Needs to be added**
- make training depend on the available device
- converter from model output to prediction strings
- macro-F1 calculation from prediction strings
- training mode without gradient accumulation

---

- Environment Setting
- Argument Setting

## Environment Setting

In [1]:
import os
import os.path as osp
import sys

# Location of the feedback-prize-2021 dataset directory.
DATASET_PATH = '../../feedback-prize-2021'

# './codes' is placed first on the import path; the two Longformer
# directories are added at the end.
sys.path.insert(0, './codes')
sys.path.extend(['longformer/tvm/python/', 'longformer/'])

In [None]:
# Set to True to log this run to Weights & Biases; the wandb cell further
# down only calls wandb.init when this flag is True.
USE_WANDB = False

In [1]:
import re
import random
import easydict
import argparse

from random import shuffle
from tqdm import tqdm
from glob import glob

import numpy as np
import pandas as pd

import h5py
import ftfy
import dill as pickle
import wandb

import torch
from transformers import DebertaV2Model

# torch.use_deterministic_algorithms(True)
# from longformer.longformer import Longformer, LongformerConfig, RobertaModel
# from longformer.sliding_chunks import pad_to_window_size

In [3]:
def seed_everything(seed=0):
    """Seed all random number generators used here and force deterministic cuDNN.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy, and PyTorch
            (CPU and every CUDA device). It is also exported as
            ``PYTHONHASHSEED``.

    Side effects:
        Sets the ``PYTHONHASHSEED`` and ``CUBLAS_WORKSPACE_CONFIG`` environment
        variables and switches cuDNN to deterministic, non-benchmark mode.
    """
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'CUBLAS_WORKSPACE_CONFIG': ":4096:8",
    })

    # The individual seeding calls are independent of each other.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

**Why set `os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8"`?**
- [torch.use_deterministic_algorithms](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html)
> **A handful of CUDA operations are nondeterministic if the CUDA version is 10.2 or greater**, unless the environment variable `CUBLAS_WORKSPACE_CONFIG=:4096:8` or `CUBLAS_WORKSPACE_CONFIG=:16:8` is set. See the CUDA documentation for more details: https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility If one of these environment variable configurations is not set, a RuntimeError will be raised from these operations when called with CUDA tensors:



## Argument Setting

In [7]:
def get_config():
    """Build the training configuration namespace.

    The parser is evaluated against an empty argument list
    (``parse_args(args=[])``), so every option takes its default value; edit
    the defaults below to change a setting.

    Returns:
        argparse.Namespace: all parsed options plus the derived attributes
        ``device`` (torch.device), ``rank`` (int), ``world_size`` (int) and
        ``ddp`` (bool).

    Raises:
        AssertionError: in the distributed branch, when fewer CUDA devices are
            visible than ``local_rank`` requires or when ``batch_size`` is not
            a multiple of the world size.
    """
    def str_to_bool(value):
        """Parse a command-line boolean; plain ``type=bool`` treats 'False' as True."""
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('1', 'true', 't', 'yes', 'y'):
            return True
        if lowered in ('0', 'false', 'f', 'no', 'n'):
            return False
        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

    parser = argparse.ArgumentParser(description="use huggingface models")
    parser.add_argument("--wandb_user", default='ducky', type=str)
    parser.add_argument("--wandb_project", default='feedback_deberta_large', type=str)
    parser.add_argument("--dataset_path", default='../../feedback-prize-2021', type=str)
    parser.add_argument("--seed", default=0, type=int)
    parser.add_argument("--min_len", default=0, type=int)
    parser.add_argument("--wd", default=1e-2, type=float)
    parser.add_argument("--weights_pow", default=0.1, type=float)
    parser.add_argument("--use_groupped_weights", default=False, type=str_to_bool)
    # NOTE(review): declared as int but defaults to the bool False; left
    # unchanged so existing truthiness checks keep working.
    parser.add_argument("--global_attn", default=False, type=int)
    parser.add_argument("--label_smoothing", default=0.1, type=float)
    parser.add_argument("--extra_dense", default=False, type=str_to_bool)
    parser.add_argument("--epochs", default=9, type=int)
    parser.add_argument("--batch_size", default=4, type=int)
    parser.add_argument("--grad_acc_steps", default=2, type=int)
    parser.add_argument("--grad_checkpt", default=True, type=str_to_bool)
    parser.add_argument("--data_prefix", default='', type=str)
    parser.add_argument("--max_grad_norm", default=35 * 8, type=int)
    parser.add_argument("--start_eval_at", default=0, type=int)
    parser.add_argument("--lr", default=32e-6, type=float)
    parser.add_argument("--min_lr", default=32e-6, type=float)
    parser.add_argument("--dataset_version", default=2, type=int)
    parser.add_argument("--warmup_steps", default=500, type=int)
    parser.add_argument("--rce_weight", default=0.1, type=float)
    parser.add_argument("--ce_weight", default=0.9, type=float)
    parser.add_argument("--dropout_ratio", default=0.0, type=float)
    parser.add_argument("--decay_bias", default=False, type=str_to_bool)
    parser.add_argument("--val_fold", default=0, type=int)
    parser.add_argument("--num_worker", default=8, type=int)
    parser.add_argument("--model_name", default="microsoft/deberta-v3-large", type=str)
    parser.add_argument("--local_rank", type=int, default=-1, help="do not modify!")

    # Parse an explicit empty list so the notebook kernel's own argv is ignored.
    args = parser.parse_args(args=[])

    if args.local_rank != -1:
        print('[ DDP ] local rank', args.local_rank)

        # Validate the device index before selecting it, so a bad rank gives
        # this message rather than a failure inside set_device.
        assert torch.cuda.device_count() > args.local_rank, 'insufficient CUDA devices for DDP command'

        torch.cuda.set_device(args.local_rank)
        # Use the fully qualified module: no `dist` alias is imported here.
        torch.distributed.init_process_group(backend='nccl')
        args.device = torch.device("cuda", args.local_rank)
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()

        # checking settings for distributed training
        assert args.batch_size % args.world_size == 0, f'--batch_size {args.batch_size} must be multiple of world size'

        args.ddp = True
    else:
        # Fall back to the CPU when no CUDA device is available.
        args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        args.rank = 0
        # Keep the namespace shape identical to the distributed branch.
        args.world_size = 1
        args.ddp = False

    return args

In [8]:
# Build the run configuration; get_config() parses an empty argument list,
# so every option takes its default value.
args = get_config()

## Load data

In [12]:
from module.utils import get_token_weights
from module.utils import get_prepare_data
from module.utils import get_all_texts
from module.utils import get_id_to_ix_map
from module.utils import get_fold_data

In [13]:
def get_data_files(args):
    """Load the shared data artefacts and derive the train/validation split.

    Args:
        args: configuration namespace; reads ``use_groupped_weights``,
            ``weights_pow``, ``dataset_path``, ``seed`` and ``val_fold``.

    Returns:
        tuple: ``(all_texts, token_weights, data, csv, train_ids, val_ids,
        train_text_ids, val_text_ids)``. The ``*_text_ids`` are the ids stored
        in the fold table; the ``*_ids`` are the same ids translated through
        the id-to-index map.
    """
    token_weights = get_token_weights(args.use_groupped_weights, args.weights_pow)
    data = get_prepare_data()
    csv = pd.read_csv(osp.join(args.dataset_path, 'train.csv'))
    all_texts = get_all_texts(args)
    id_to_ix_map = get_id_to_ix_map()
    data_splits = get_fold_data()

    # NOTE(review): the meaning of the key 250 is not visible here — confirm
    # against get_fold_data().
    # text_id example `16585724607E`
    train_text_ids = []
    for fold_no in range(5):
        if fold_no == args.val_fold:
            continue  # the held-out fold is used for validation only
        train_text_ids.extend(data_splits[args.seed][250]['normed'][fold_no])
    val_text_ids = data_splits[args.seed][250]['normed'][args.val_fold]

    train_ids = []
    for tid in train_text_ids:
        train_ids.append(id_to_ix_map[tid])
    val_ids = []
    for tid in val_text_ids:
        val_ids.append(id_to_ix_map[tid])

    return (
        all_texts,
        token_weights,
        data,
        csv,
        train_ids,
        val_ids,
        train_text_ids,
        val_text_ids,
    )

## Wandb

In [None]:
if USE_WANDB:
    run = wandb.init(entity='ducky', project='feedback_debertav3_large')
    # The run name encodes the main hyper-parameters. Weight decay is read
    # from `args.wd`: get_config() defines `--wd`, and the namespace has no
    # `weight_decay` attribute.
    run.name = f'v3_fold{args.val_fold}_minlr{args.min_lr}_maxlr{args.lr}_wd{args.wd}_warmup{args.warmup_steps}_gradnorm{args.max_grad_norm}_biasdecay{args.decay_bias}_ls{args.label_smoothing}_wp{args.weights_pow}_data{args.dataset_version}_rce{args.rce_weight}'

In [18]:
# Seed the Python, NumPy and PyTorch RNGs from the configured seed.
seed_everything(args.seed)

## Dataset & Dataloader

In [11]:
class TrainDataset(torch.utils.data.Dataset):
    """Training samples read from a pre-tokenised HDF5 file.

    Each item is ``(tokens, attention_mask, cbio_labels, label_mask,
    num_tokens)``, where ``cbio_labels`` has label smoothing applied and
    ``label_mask`` holds a per-token weight.

    Args:
        args: configuration namespace; reads ``h5py_path``, ``data_prefix``,
            ``label_smoothing`` and ``token_weights``.
        ids: row indices into the HDF5 datasets, one per sample.
    """

    def __init__(self, args, ids):
        self.args = args
        self.ids = ids
        # Open read-only explicitly: h5py releases before 3.0 default to a
        # read/write mode that creates the file when the path is wrong.
        # NOTE(review): the handle is created in the parent process; confirm
        # this is safe with the DataLoader worker setup in use.
        self.data = h5py.File(args.h5py_path, 'r')

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` of ``self.ids``."""
        text_id = self.ids[idx]

        # TODO: Add Tokenizer and directly tokenize for typo injections
        tokens = self.data['tokens'][text_id]
        attention_mask = self.data['attention_masks'][text_id]
        num_tokens = self.data['num_tokens'][text_id, 0]

        # Label smoothing: scale the stored labels and add a uniform share.
        # assumes 15 label classes on the last axis — TODO confirm
        cbio_labels = self.data[f'{self.args.data_prefix}cbio_labels'][text_id]
        cbio_labels *= (1 - self.args.label_smoothing)
        cbio_labels += self.args.label_smoothing / 15

        # Per-token weight, looked up by the arg-max class of each token.
        # NOTE(review): label_mask inherits attention_mask's dtype; confirm it
        # is floating point, otherwise the weights are truncated.
        label_mask = np.zeros_like(attention_mask)
        argmax_labels = cbio_labels.argmax(-1)

        for i in range(1, 15):
            label_mask[argmax_labels == i] = self.args.token_weights[i]

        # Class 0 is weighted only before index num_tokens - 1; positions from
        # there on keep weight 0.
        zero_label_mask = argmax_labels == 0
        zero_label_mask[num_tokens - 1:] = False

        label_mask[zero_label_mask] = self.args.token_weights[0]
        # Position 0 never contributes (presumably the leading special
        # token — TODO confirm).
        label_mask[0] = 0

        return tokens, attention_mask, cbio_labels, label_mask, num_tokens
    

In [None]:
class ValDataset(torch.utils.data.Dataset):
    """Validation samples with ground-truth spans and a char-to-word index map.

    Each item is ``(tokens, attention_mask, cbio_labels, label_mask,
    token_bounds, gt_dict, index_map, num_tokens)``.

    Args:
        args: configuration namespace; reads ``h5py_path``, ``csv_path``,
            ``label_names`` and ``token_weights``.
        ids: row indices into the HDF5 datasets, one per sample.
        val_files: text ids aligned with ``ids``; used to look up the raw text
            and the annotation rows in the CSV.
        all_texts: optional mapping from text id to raw text. When omitted,
            the module-level ``all_texts`` is used, as before.
    """

    def __init__(self, args, ids, val_files, all_texts=None):
        self.args = args
        self.ids = ids
        self.val_files = val_files
        self.all_texts = all_texts

        # Open read-only explicitly: h5py releases before 3.0 default to a
        # read/write mode that creates the file when the path is wrong.
        self.data = h5py.File(args.h5py_path, 'r')
        self.csv = pd.read_csv(args.csv_path)
        # Raw string so `\s` is a regex escape, not an invalid string escape.
        self.space_regex = re.compile(r'[\s\n]')

    def __len__(self):
        return len(self.ids)

    def split_predstring(self, x):
        """Return the first and last word index of a space-separated prediction string."""
        vals = x.split()
        return int(vals[0]), int(vals[-1])

    def __getitem__(self, idx):
        """Return the validation sample at position ``idx``."""
        text_id = self.ids[idx]
        file_id = self.val_files[idx]
        # Prefer the texts given to the constructor; otherwise fall back to
        # the module-level `all_texts`.
        texts = self.all_texts if self.all_texts is not None else all_texts
        text = texts[file_id]

        # Ground truth: for each discourse class 1..7, the (first, last) word
        # index of every annotated span of this text.
        gt_dict = {}
        sample_df = self.csv.loc[self.csv.id == file_id]
        for class_i in range(1, 8):
            class_name = self.args.label_names[class_i]
            class_entities = sample_df.loc[sample_df.discourse_type == class_name]
            if len(class_entities):
                gt_dict[class_i] = [(x[0], x[1]) for x in class_entities.predictionstring.map(self.split_predstring)]

        tokens = self.data['tokens'][text_id]
        attention_mask = self.data['attention_masks'][text_id]
        num_tokens = self.data['num_tokens'][text_id, 0]
        token_bounds = self.data['token_offsets'][text_id]
        # NOTE(review): read without args.data_prefix, unlike TrainDataset —
        # confirm this is intentional.
        cbio_labels = self.data['cbio_labels'][text_id]

        # Per-token weight, looked up by the arg-max class of each token.
        label_mask = np.zeros_like(attention_mask)
        argmax_labels = cbio_labels.argmax(-1)
        for class_i in range(1, 15):
            label_mask[argmax_labels == class_i] = self.args.token_weights[class_i]

        # Class 0 is weighted only before index num_tokens - 1.
        zero_label_mask = argmax_labels == 0
        zero_label_mask[num_tokens - 1:] = False
        label_mask[zero_label_mask] = self.args.token_weights[0]
        # Position 0 never contributes (presumably the leading special
        # token — TODO confirm).
        label_mask[0] = 0

        # Map every character from the first non-whitespace one onwards to the
        # index of the whitespace-separated word it belongs to. Whitespace
        # keeps the index of the preceding word.
        index_map = []
        current_word = 0
        blank = False
        # Offset of the first non-whitespace character; equals len(text) for
        # empty or all-whitespace text, which yields an empty map.
        first_char = len(text) - len(text.lstrip())
        for char_ix in range(first_char, len(text)):
            if self.space_regex.match(text[char_ix]) is not None:
                blank = True
            elif blank:
                current_word += 1
                blank = False
            index_map.append(current_word)

        return tokens, attention_mask, cbio_labels, label_mask, token_bounds, gt_dict, index_map, num_tokens