# Online Tokenizer

> Literally, the title says it all.

In [1]:
import argparse
import re
import sys

import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForTokenClassification

from module.utils import get_data_files

sys.path.insert(0, './codes/new_transformers_branch/transformers/src')

In [2]:
def get_config():
    parser = argparse.ArgumentParser(description="use huggingface models")
    parser.add_argument("--dataset_path", default='../../feedback-prize-2021', type=str)
    parser.add_argument("--save_path", default='result', type=str)
    parser.add_argument("--seed", default=0, type=int)
    parser.add_argument("--min_len", default=0, type=int)
    parser.add_argument("--use_groupped_weights", default=False, type=bool)
    parser.add_argument("--global_attn", default=False, type=int)
    parser.add_argument("--epochs", default=9, type=int)
    parser.add_argument("--batch_size", default=4, type=int)
    parser.add_argument("--grad_acc_steps", default=2, type=int)
    parser.add_argument("--grad_checkpt", default=True, type=bool)
    parser.add_argument("--data_prefix", default='', type=str)
    parser.add_argument("--max_grad_norm", default=10.0, type=float)
    parser.add_argument("--start_eval_at", default=0, type=int)
    parser.add_argument("--weight_decay", default=1e-2, type=float)
    parser.add_argument("--weights_pow", default=0.1, type=float)
    parser.add_argument("--dataset_version", default=2, type=int)
    parser.add_argument("--decay_bias", default=False, type=bool)
    parser.add_argument("--val_fold", default=0, type=int)
    parser.add_argument("--num_worker", default=8, type=int)
    parser.add_argument("--local_rank", type=int, default=-1, help="do not modify!")
    parser.add_argument("--device", type=int, default=0, help="select the gpu device to train")

    # logging
    parser.add_argument("--wandb_user", default='ducky', type=str)
    parser.add_argument("--wandb_project", default='feedback_deberta_large', type=str)
    parser.add_argument("--wandb_comment", default="", type=str, help="comment will be added at the back of wandb project name")
    parser.add_argument("--print_acc", default=500, type=int, help="print accuracy of each class every `print_acc` steps")

    # optimizer
    parser.add_argument("--label_smoothing", default=0.1, type=float)
    parser.add_argument("--rce_weight", default=0.1, type=float)
    parser.add_argument("--ce_weight", default=0.9, type=float)
    parser.add_argument("--nesterov", default=True, type=bool, help="use nesterov for SGD")
    parser.add_argument("--momentum", default=0.9, type=float, help="momentum for SGD")

    # scheduler
    parser.add_argument("--lr", default=3e-5, type=float)
    parser.add_argument("--min_lr", default=1e-6, type=float)
    parser.add_argument("--warmup_steps", default=500, type=int)
    parser.add_argument("--gamma", default=0.8, type=float, help="gamma for cosine annealing warmup restart scheduler")
    parser.add_argument("--cycle_mult", default=1.0, type=float, help="cycle length adjustment for cosine annealing warmup restart scheduler")

    # model related arguments
    parser.add_argument("--model", default="microsoft/deberta-v3-large", type=str)
    parser.add_argument("--cnn1d", default=False, type=bool)
    parser.add_argument("--extra_dense", default= False, type=bool)
    parser.add_argument("--dropout_ratio", default=0.0, type=float)

    # swa
    parser.add_argument("--swa", action="store_true", help="use stochastic weight averaging")
    parser.add_argument("--swa_update_per_epoch", default=3, type=int)

    args = parser.parse_args([])

    if args.local_rank !=-1:
        print('[ DDP ] local rank', args.local_rank)
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend='nccl')
        args.device = torch.device("cuda", args.local_rank)
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()  

        # checking settings for distributed training
        assert args.batch_size % args.world_size == 0, f'--batch_size {args.batch_size} must be multiple of world size'
        assert torch.cuda.device_count() > args.local_rank, 'insufficient CUDA devices for DDP command'

        args.ddp = True
    else:
        args.device = torch.device("cuda", args.device)
        args.rank = -1
        args.ddp = False

    return args

In [3]:
# Build the default configuration, then load the dataset artifacts for it.
args = get_config()
# NOTE(review): what each returned item contains is defined by
# module.utils.get_data_files, which is not shown in this notebook.
all_texts, token_weights, data, csv, train_ids, val_ids, train_text_ids, val_text_ids = get_data_files(args)

In [4]:
text_id = train_text_ids[0]
text_id

'B72D0B4875B4'

In [5]:
text = all_texts[text_id]

## DebertaV3 Tokenizer

In [42]:
from new_transformers import DebertaV2TokenizerFast
from transformers import AutoTokenizer

In [13]:
# Fast tokenizer class imported from the local `new_transformers` package.
tokenizer = DebertaV2TokenizerFast.from_pretrained('microsoft/deberta-v3-large')
# Override the tokenizer's maximum sequence length setting.
tokenizer.model_max_length = 2048

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [44]:
# Same checkpoint loaded through the installed `transformers` AutoTokenizer, for comparison.
auto_tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')
# Use the same maximum sequence length setting as `tokenizer`.
auto_tokenizer.model_max_length = 2048

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
tokenizer(['a', 'b'])

{'input_ids': [[1, 266, 2], [1, 2165, 2]], 'token_type_ids': [[0, 0, 0], [0, 0, 0]], 'attention_mask': [[1, 1, 1], [1, 1, 1]]}

In [24]:
tokenizer('a\n')

{'input_ids': [1, 266, 507, 2], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [45]:
auto_tokenizer('a\n')

{'input_ids': [1, 266, 2], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [39]:
tokenizer('a\n')

{'input_ids': [1, 266, 507, 2], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [25]:
tokenizer('\n')

{'input_ids': [1, 507, 2], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [46]:
decoded_text = tokenizer.decode([12, 507, 16])
decoded_text

'<0x08> <0x0C>'

In [49]:
decoded_text

'<0x08> <0x0C>'

### Newline (`\n`) is dropped by the stock DebertaV3 tokenizer (`AutoTokenizer`)

In [None]:
# Swap every newline for a visible placeholder character before tokenizing,
# so line breaks are not lost when '\n' is dropped.
fix_text = lambda x: x.replace('\n', '‽')

# NOTE(review): `f` is not defined in any visible cell — presumably an open
# essay file handle; confirm before running.
text = fix_text(f.read().strip())

In [None]:
# Tokenize the text and also request the per-token character offsets.
tokenizer_outs = tokenizer(text, return_offsets_mapping=True)

# token replacement ‽ -> [MASK]
# NOTE(review): 126861 and 128000 are presumably the vocabulary ids of '‽'
# and [MASK] for this checkpoint — confirm against the tokenizer.
tokenizer_outs['input_ids'] = [x if x != 126861 else 128000 for x in tokenizer_outs['input_ids']]

In [None]:
# Convert a character span into a space-separated string of word indices.
# NOTE(review): `discourse_start`, `discourse_end` and `full_text` are not
# defined in any visible cell.
char_start = discourse_start
char_end = discourse_end
# First word index = number of whitespace-delimited words before the span.
word_start = len(full_text[:char_start].split())
# End index (exclusive) = start + number of words inside the span.
word_end = word_start + len(full_text[char_start:char_end].split())
# Clamp to the total number of words in the text.
word_end = min( word_end, len(full_text.split()) )
predictionstring = " ".join( [str(x) for x in range(word_start,word_end)] )

In [25]:
'  a'.split(' ')

['', '', 'a']

In [None]:
# Matches a single ASCII alphanumeric character. The upper-case range was
# written `A-z`, which also matched the punctuation between 'Z' and 'a'
# ('[', '\', ']', '^', '_' and the backtick).
regexp = re.compile('[0-9a-zA-Z]')

In [5]:
class FeedbackDataset(torch.utils.data.Dataset):
    """Work-in-progress dataset that tokenizes essay text on the fly.

    Args:
        text_ids: sequence of essay ids; its length is the dataset length.
        csv: annotation table, stored as-is.
        all_texts: mapping used to look up raw text, stored as-is.
        token_weights: per-class token weights, stored as-is.
        class_names: optional class-name sequence. When omitted, a
            module-level ``class_names`` is used if one exists (the original
            code read that global unconditionally), otherwise ``None``.
    """

    def __init__(
        self, text_ids, csv, all_texts, token_weights, class_names=None
    ):
        if class_names is None:
            # Backward compatibility with the earlier reliance on a global.
            class_names = globals().get("class_names")

        self.csv = csv
        self.all_texts = all_texts
        self.text_ids = text_ids
        self.class_names = class_names
        self.token_weights = token_weights

        # Raw string: in a normal string literal the backslash-s escape is
        # invalid and triggers a warning. The matched characters are the same.
        self.space_regex = re.compile(r"[\s\n]")

    def noise_injection(self, text):
        """Placeholder for text augmentation; not implemented yet.

        The original method had no body, which made the whole class a syntax
        error.
        """
        raise NotImplementedError("noise_injection is not implemented yet")

    def preprocess_text(self, text):
        """Strip surrounding whitespace and replace each newline with '‽'."""
        text = text.strip()

        # newline is removed from debertav3 tokenizer
        text = text.replace('\n', '‽')

        return text

    def forward(self, idx):
        """Look up the text id at ``idx``; currently returns ``None``.

        TODO: unfinished. ``torch.utils.data.Dataset`` indexes through
        ``__getitem__``, which this class does not define yet.
        """
        text_id = self.text_ids[idx]

    def __len__(self):
        """Return the number of text ids."""
        return len(self.text_ids)
    

In [50]:
csv

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...,...
144288,4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
144289,4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
144290,4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seek multiple opinions instead...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
144291,4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to help you make a...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [None]:
    def __getitem__(self, idx):
        """Assemble one validation sample.

        Returns an 8-tuple: ``(tokens, attention_mask, cbio_labels,
        class_weight, token_bounds, gt_dict, index_map, num_tokens)``.

        NOTE(review): the enclosing class is not part of this notebook; the
        attributes read here (``ids``, ``val_text_ids``, ``all_texts``,
        ``csv``, ``class_names``, ``data``, ``token_weights``,
        ``space_regex``) must be provided by it.
        """
        # Row index into the arrays stored in self.data.
        i = self.ids[idx]

        # load text data & text dataframe
        text_id = self.val_text_ids[idx]
        text = self.all_texts[text_id]
        sample_df = self.csv.query("id == @text_id")

        # load ground truth prediction string for f1macro metric
        # Classes 1..7 only; a class with no rows for this text gets no entry.
        gt_dict = {}
        for class_i in range(1, 8):
            class_name = self.class_names[class_i]
            class_df = sample_df.query("discourse_type == @class_name")
            if len(class_df):
                # NOTE(review): split_predstring is defined elsewhere; only its
                # first two items per annotation are kept — presumably the
                # first and last word index, confirm.
                gt_dict[class_i] = [
                    (x[0], x[1])
                    for x in class_df.predictionstring.map(split_predstring)
                ]

        # load valid data
        tokens = self.data["tokens"][i]
        attention_mask = self.data["attention_masks"][i]
        num_tokens = self.data["num_tokens"][i, 0]
        token_bounds = self.data["token_offsets"][i]
        cbio_labels = self.data["cbio_labels"][i]

        # class weight per token
        # Starts as zeros with the same shape and dtype as attention_mask.
        class_weight = np.zeros_like(attention_mask)
        argmax_labels = cbio_labels.argmax(-1)

        # Labels 1..14 take their weight directly from token_weights.
        for class_i in range(1, 15):
            class_weight[argmax_labels == class_i] = self.token_weights[class_i]

        # Label 0 is weighted only at positions before num_tokens - 1; later
        # positions keep weight 0 (presumably the final special token and
        # padding — confirm).
        class_none_index = argmax_labels == 0
        class_none_index[num_tokens - 1 :] = False
        class_weight[class_none_index] = self.token_weights[0]
        # The first position never contributes.
        class_weight[0] = 0

        # Character -> word index map, starting at the first non-whitespace
        # character of the text. A whitespace character keeps the index of the
        # word before it; the counter advances on the first non-whitespace
        # character after a run of whitespace.
        # NOTE: text.strip()[0] raises IndexError for empty or all-whitespace text.
        index_map = []
        current_word = 0
        blank = False
        for char_ix in range(text.index(text.strip()[0]), len(text)):
            if self.space_regex.match(text[char_ix]) is not None:
                blank = True
            elif blank:
                current_word += 1
                blank = False
            index_map.append(current_word)

        return (
            tokens,
            attention_mask,
            cbio_labels,
            class_weight,
            token_bounds,
            gt_dict,
            index_map,
            num_tokens,
        )


# Flag consumed by train_collate_fn: True until the first batch is collated.
first_batch = True


def train_collate_fn(ins):
    """Collate training samples into a tuple of batched tensors.

    The last element of every sample is its token count and is not returned.
    Every other field is truncated along its first axis to ``max_len`` and
    stacked across samples. The first call in a process uses a fixed
    ``max_len`` of 2048; later calls use the largest token count in the
    batch rounded up to a multiple of 8.
    """
    global first_batch
    if not first_batch:
        longest = max(sample[-1] for sample in ins)
        max_len = (longest + 7) // 8 * 8
    else:
        max_len = 2048
        first_batch = False

    field_count = len(ins[0]) - 1
    batch = []
    for field in range(field_count):
        pieces = [ins[row][field][None, :max_len] for row in range(len(ins))]
        batch.append(torch.from_numpy(np.concatenate(pieces)))
    return tuple(batch)


def val_collate_fn(ins):
    """Collate validation samples.

    The last three elements of each sample are passed through without
    batching into tensors: the third-from-last and second-from-last are
    gathered into plain lists, and the last (the token count) into a numpy
    array. Every earlier field is truncated along its first axis to the
    largest token count in the batch, rounded up to a multiple of 8, then
    stacked across samples into a tensor.
    """
    longest = max(sample[-1] for sample in ins)
    max_len = (longest + 7) // 8 * 8

    array_field_count = len(ins[0]) - 3
    tensors = []
    for field in range(array_field_count):
        pieces = [ins[row][field][None, :max_len] for row in range(len(ins))]
        tensors.append(torch.from_numpy(np.concatenate(pieces)))

    third_last = [sample[-3] for sample in ins]
    second_last = [sample[-2] for sample in ins]
    token_counts = np.array([sample[-1] for sample in ins])
    return tuple(tensors) + (third_last, second_last, token_counts)


def get_dataloader(
    args,
    train_ids,
    val_ids,
    data,
    csv,
    all_texts,
    val_text_ids,
    class_names,
    token_weights,
):
    """Create the training and validation data loaders.

    Args:
        args: configuration namespace; reads ``label_smoothing``,
            ``data_prefix``, ``batch_size`` and ``num_worker``.
        train_ids, val_ids, data, csv, all_texts, val_text_ids, class_names,
        token_weights: forwarded unchanged to ``TrainDataset`` /
            ``ValDataset`` (defined elsewhere).

    Returns:
        tuple: ``(train_dataloader, val_dataloader)``.
    """
    train_dataset = TrainDataset(
        train_ids, data, args.label_smoothing, token_weights, args.data_prefix
    )
    val_dataset = ValDataset(
        val_ids, data, csv, all_texts, val_text_ids, class_names, token_weights
    )

    worker_count = args.num_worker

    train_dataloader = DataLoader(
        train_dataset,
        collate_fn=train_collate_fn,
        batch_size=args.batch_size,
        num_workers=worker_count,
        shuffle=True,
    )
    val_dataloader = DataLoader(
        val_dataset,
        collate_fn=val_collate_fn,
        batch_size=args.batch_size,
        # Previously hard-coded to 8, which ignored --num_worker. The default
        # of --num_worker is 8, so default behaviour is unchanged.
        num_workers=worker_count,
        # DataLoader rejects persistent_workers=True when num_workers is 0.
        persistent_workers=worker_count > 0,
    )

    return train_dataloader, val_dataloader