References
- [Chris Mccormick finetuning BERT for SQUAD](https://colab.research.google.com/drive/16VjEulbATgok4mELTSaq7GTQdh3JGhGy#scrollTo=Xm1wTn09RAR7)
- [Discussion Regarding finetuning T5](https://github.com/huggingface/transformers/issues/4426) | [Exploring T5 by patil suraj](https://github.com/patil-suraj/exploring-T5)
    - [SQUAD QA finetuning for T5](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=KdmKlMkfcLa0)
    - [T5 finetuning for non extractive tasks](https://colab.research.google.com/drive/176NSaYjc2eeI-78oLH_F9-YV3po3qQQO?usp=sharing)
- [Google's T5 fine tuning example for QA](https://colab.research.google.com/github/google-research/text-to-text-transfer-transformer/blob/master/notebooks/t5-trivia.ipynb#scrollTo=6rU32DjyeLuL)

## Import Dataset

In [38]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [1]:
# https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
class dotdict(dict):
    """dot.notation access to dictionary attributes, as dict.key_name, not as dict["key_name"] """
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


In [2]:
import textwrap

# Wrap text to 80 characters.
wrapper = textwrap.TextWrapper(width=80) 


In [3]:
import yaml

# Read config.yaml file
with open("config.yaml") as infile:
    SAVED_CFG = yaml.load(infile, Loader=yaml.FullLoader)
    SAVED_CFG = dotdict(SAVED_CFG)

# arguments setting
data_args = dotdict(SAVED_CFG.data)
model_args = dotdict(SAVED_CFG.custom_model)
model_args

{'model_name_or_path': 'klue/roberta-base',
 'save_steps': 100,
 'num_train_epochs': 3,
 'learning_rate': 5e-05,
 'batch_size': 32,
 'warmup_steps': 300,
 'weight_decay': 0.01,
 'validation': False,
 'max_length': 512,
 'DEBUG': True}

In [4]:
class Metrics(object):
    def __init__(self):
        self.reset()
        
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

## Import from Fine_Tune_BERT_on_SQuAD_v1_1.ipynb

In [5]:
import torch
from datasets import load_from_disk
from torch.utils.data import (DataLoader, RandomSampler, TensorDataset)

datasets = load_from_disk(data_args.dataset_name)
train_dataset_from_huggingface = datasets['train']
valid_dataset_from_huggingface = datasets['validation']

In [6]:
train_dataset_from_huggingface[0]

{'title': '미국 상원',
 'context': '미국 상의원 또는 미국 상원(United States Senate)은 양원제인 미국 의회의 상원이다.\\n\\n미국 부통령이 상원의장이 된다. 각 주당 2명의 상원의원이 선출되어 100명의 상원의원으로 구성되어 있다. 임기는 6년이며, 2년마다 50개주 중 1/3씩 상원의원을 새로 선출하여 연방에 보낸다.\\n\\n미국 상원은 미국 하원과는 다르게 미국 대통령을 수반으로 하는 미국 연방 행정부에 각종 동의를 하는 기관이다. 하원이 세금과 경제에 대한 권한, 대통령을 포함한 대다수의 공무원을 파면할 권한을 갖고 있는 국민을 대표하는 기관인 반면 상원은 미국의 주를 대표한다. 즉 캘리포니아주, 일리노이주 같이 주 정부와 주 의회를 대표하는 기관이다. 그로 인하여 군대의 파병, 관료의 임명에 대한 동의, 외국 조약에 대한 승인 등 신속을 요하는 권한은 모두 상원에게만 있다. 그리고 하원에 대한 견제 역할(하원의 법안을 거부할 권한 등)을 담당한다. 2년의 임기로 인하여 급진적일 수밖에 없는 하원은 지나치게 급진적인 법안을 만들기 쉽다. 대표적인 예로 건강보험 개혁 당시 하원이 미국 연방 행정부에게 퍼블릭 옵션(공공건강보험기관)의 조항이 있는 반면 상원의 경우 하원안이 지나치게 세금이 많이 든다는 이유로 퍼블릭 옵션 조항을 제외하고 비영리건강보험기관이나 보험회사가 담당하도록 한 것이다. 이 경우처럼 상원은 하원이나 내각책임제가 빠지기 쉬운 국가들의 국회처럼 걸핏하면 발생하는 의회의 비정상적인 사태를 방지하는 기관이다. 상원은 급박한 처리사항의 경우가 아니면 법안을 먼저 내는 경우가 드물고 하원이 만든 법안을 수정하여 다시 하원에 되돌려보낸다. 이러한 방식으로 단원제가 빠지기 쉬운 함정을 미리 방지하는 것이다.날짜=2017-02-05',
 'question': '대통령을 포함한 미국의 행정부 견제권을 갖는 국가 기관은?',
 'id': 'mrc-1-000067',
 'answers': {'answer_start'

In [7]:
import pandas as pd
import numpy as np

def pull_out_dictionary(df_input: pd.DataFrame):
    """pull out str `{}` values from the pandas dataframe and shape it as a new column"""

    df = df_input.copy()

    # assign subject_entity and object_entity column values type as dictionary
    # df["answers"] = df["answers"].apply(lambda x: eval(x))
    
    df = df.assign(
        # subject_entity
        answer_start=lambda x: x["answers"].apply(lambda x: x["answer_start"]),
        text=lambda x: x["answers"].apply(lambda x: x["text"]),
    )

    # drop subject_entity and object_entity column
    df = df.drop(["answers"], axis=1)

    return df

def pull_out_list(df_input: pd.DataFrame):
    """ pull out single item out of the list """
    
    df = df_input.copy()

    df["answer_start"] = df["answer_start"].apply(lambda x: int(x[0]))
    df["text"] = df["text"].apply(lambda x: x[0])
    return df

In [8]:
""" Converting train and validation dataset to Pandas dataframe for convenience """

train_df = pull_out_dictionary(pd.DataFrame.from_records(datasets['train']))
val_df = pull_out_dictionary(pd.DataFrame.from_records(datasets['validation']))

train_df = pull_out_list(train_df)
val_df = pull_out_list(val_df)

display(train_df.head(2))
display(val_df.head(2))

Unnamed: 0,title,context,question,id,document_id,__index_level_0__,answer_start,text
0,미국 상원,미국 상의원 또는 미국 상원(United States Senate)은 양원제인 미국...,대통령을 포함한 미국의 행정부 견제권을 갖는 국가 기관은?,mrc-1-000067,18293,42,235,하원
1,인사조직관리,'근대적 경영학' 또는 '고전적 경영학'에서 현대적 경영학으로 전환되는 시기는 19...,현대적 인사조직관리의 시발점이 된 책은?,mrc-0-004397,51638,2873,212,《경영의 실제》


Unnamed: 0,title,context,question,id,document_id,__index_level_0__,answer_start,text
0,전효숙,"순천여자고등학교 졸업, 1973년 이화여자대학교를 졸업하고 1975년 제17회 사법...",처음으로 부실 경영인에 대한 보상 선고를 받은 회사는?,mrc-0-003264,9027,2146,284,한보철강
1,스토우빌선,요크 카운티 동쪽에 처음으로 여객 열차 운행이 시작한 시점은 1868년 토론토 & ...,스카버러 남쪽과 코보콘그 마을의 철도 노선이 처음 연장된 연도는?,mrc-0-004762,51765,3106,146,1871년


In [9]:
print(train_df.columns)

Index(['title', 'context', 'question', 'id', 'document_id',
       '__index_level_0__', 'answer_start', 'text'],
      dtype='object')


In [10]:
import pandas as pd
import numpy as np
import json

# load test dataset as dataframe
with open("/opt/ml/data/wikipedia_documents.json", "r", encoding="utf-8") as reader:
    input_data = json.load(reader)
test_df = pd.DataFrame(input_data).T
test_df.head(2)

Unnamed: 0,text,corpus_source,url,domain,title,author,html,document_id
0,"이 문서는 나라 목록이며, 전 세계 206개 나라의 각 현황과 주권 승인 정보를 개...",위키피디아,TODO,,나라 목록,,,0
1,이 목록에 실린 국가 기준은 1933년 몬테비데오 협약 1장을 참고로 하였다. 협정...,위키피디아,TODO,,나라 목록,,,1


## Load Tokenizer
Fixed: roberta not receiving sequence ids

In [11]:
from transformers import AutoModel, AutoTokenizer, AutoConfig

# load tokenizer and configuration according to the model (ex: klue/roberta-large)
if "roberta" in model_args.model_name_or_path:
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, 
        model_input_names = ["input_ids", "attention_mask"],
        use_fast=True # use rust based tokenizer    
    )
    print("sequence id not used:", model_args.model_name_or_path)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    print(model_args.model_name_or_path)

config = AutoConfig.from_pretrained(model_args.model_name_or_path)

sequence id not used: klue/roberta-base


In [14]:
# sample tokenization
tokens = tokenizer.tokenize(train_dataset_from_huggingface[1]['question'])
" ".join(tokens)

'현대 ##적 인사 ##조 ##직 ##관리 ##의 시발점 ##이 된 책 ##은 ?'

In [15]:
# test batch tokenization
# https://github.com/huggingface/transformers/issues/10297#issuecomment-783464293
sample_answer_token = ['크리스토', '##포', '알', '##하우스']
print(sample_answer_token)
print("Wrong Example:", tokenizer.encode(sample_answer_token, add_special_tokens=False, return_tensors='pt', is_split_into_words=True))
# apply int for torch Tensor
print("Correctly Encoded:" ,torch.IntTensor([tokenizer.convert_tokens_to_ids(sample_answer_token)]))

['크리스토', '##포', '알', '##하우스']
Wrong Example: tensor([[21533,     7,     7,  1862,  1381,     7,     7,  6634]])
Correctly Encoded: tensor([[21533,  2208,  1381, 17975]], dtype=torch.int32)


## Custom Dataset Class

In [27]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, df_dataset, tokenizer, is_training:bool=True, is_debbuging:bool=False):
        # initialize within the class from the input variables
        self.df_dataset = df_dataset
        self.tokenizer = tokenizer
        self.is_training = is_training
        self.is_debbuging = is_debbuging

        # intiailize from df_dataset's columns
        # ['title', 'context', 'question', 'id', 'document_id', '__index_level_0__', 'answer_start', 'text']
        self.title = df_dataset['title'] # title text
        self.context = df_dataset['context'] # context text
        self.question = df_dataset['question'] # question text
        self.id = df_dataset['id']
        self.document_id = df_dataset['document_id']
        self.answer_start = df_dataset['answer_start'] # answer index within context
        self.text = df_dataset['text'] # answer text
        
    def __len__(self):
        return len(self.df_dataset)
    
    def __getitem__(self, idx):
        item_context = self.context[idx] # context text
        item_question = self.question[idx] # question text
        item_id = self.id[idx]
        item_document_id = self.document_id[idx]
        item_answer_start = self.answer_start[idx] # answer index within context
        item_text = self.text[idx] # answer text
        
        # tokenize
        answer_tokens, masked_context = self.mask_context(item_question, item_answer_start, item_text, item_context)
        encoded_dict = self.tokenizer.encode_plus(
            item_question,
            masked_context,
            add_special_tokens = True,
            max_length=model_args.max_length,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        # get input id and attetnion mask from tokenized result
        input_ids = encoded_dict['input_ids']
        attention_mask = encoded_dict['attention_mask']
        
        # deepcopy input_ids from encoded_dict in order to notate answer in this tensor
        # input_ids_with_answer = input_ids.clone()
        
        # Restore the answer within the reference text. (Replace the `[MASK]` tokens
        # with the answer tokens)
        is_mask_token = (input_ids[0] == self.tokenizer.mask_token_id)
        mask_token_indices = is_mask_token.nonzero(as_tuple=False)[:, 0]

        # TODO: fix trunctation longer than 512
        # TODO: Get 512 tokens wrapping around the answer 
        if not len(mask_token_indices) == len(answer_tokens):
            print("mask token length not match")
            return None
        start_index = mask_token_indices[0]
        end_index = mask_token_indices[-1]
        answer_token_ids = self.tokenizer.convert_tokens_to_ids(
            answer_tokens,
        )
        
        # replace the `[MASK]` token ids with the answer token ids
        input_ids[0, start_index:end_index+1] = torch.IntTensor([answer_token_ids])

        # construct single item to return
        # encoded_dict["input_ids_with_answer"] = input_ids_with_answer
        encoded_dict["start_index"] = torch.IntTensor(int(start_index))
        encoded_dict["end_index"] = torch.IntTensor(int(end_index))

        if self.is_debbuging:
            print("- question:", item_question)
            print("- answer tokens:", answer_tokens)
            print("- tokenized answer token ids:", answer_token_ids)
        
        return encoded_dict

    def mask_context(self, item_question, item_answer_start, item_text, item_context):
        # mask the answer text with the mask token
        # print(item_text)
        answer_tokens = self.tokenizer.tokenize(item_text)
        # print(answer_tokens)
        str_mask = " ".join([self.tokenizer.mask_token]*len(answer_tokens))
        start_character_index = item_answer_start
        end_character_index = start_character_index + len(item_text)
        masked_context = \
            item_context[:start_character_index] + \
            str_mask + \
            item_context[end_character_index:]
        if self.is_debbuging:
            print("- masked context:", masked_context)
        return answer_tokens, masked_context

In [28]:
DEBUG_MODE = True
train_data = CustomDataset(train_df, tokenizer, is_debbuging=DEBUG_MODE)
dev_data = CustomDataset(val_df, tokenizer, is_debbuging=DEBUG_MODE)

sample_index = 1
sample_item = train_data[sample_index]

# print keys with index number
for index, key in enumerate(sample_item.keys()):
    print(index, "|", key, "|", sample_item[key].shape)

# check if torch tensor sample_item["input_ids"] matches with sample_item["input_ids_with_answer"]
# print(sample_item["input_ids"] == sample_item["input_ids_with_answer"])

- masked context: '근대적 경영학' 또는 '고전적 경영학'에서 현대적 경영학으로 전환되는 시기는 1950년대이다. 2차 세계대전을 마치고, 6.25전쟁의 시기로 유럽은 전후 재건에 집중하고, 유럽 제국주의의 식민지가 독립하여 아프리카, 아시아, 아메리카 대륙에서 신생국가가 형성되는 시기였고, 미국은 전쟁 이후 경제적 변화에 기업이 적응을 해야 하던 시기였다. 특히 1954년 피터 드러커의 저서 [MASK] [MASK] [MASK] [MASK] [MASK]는 현대적 경영의 기준을 제시하여서, 기존 근대적 인사조직관리를 넘어선 현대적 인사조직관리의 전환점이 된다. 드러커는 경영자의 역할을 강조하며 경영이 현시대 최고의 예술이자 과학이라고 주장하였고 , 이 주장은 21세기 인사조직관리의 역할을 자리매김했다.\n\n현대적 인사조직관리와 근대 인사조직관리의 가장 큰 차이는 통합이다. 19세기의 영향을 받던 근대적 경영학(고전적 경영)의 흐름은 기능을 강조하였지만, 1950년대 이후의 현대 경영학은 통합을 강조하였다. 기능이 분화된 '기계적인 기업조직' 이해에서 다양한 기능을 인사조직관리의 목적, 경영의 목적을 위해서 다양한 분야를 통합하여 '유기적 기업 조직' 이해로 전환되었다. 이 통합적 접근방식은 과정, 시스템, 상황을 중심으로 하는 인사조직관리 방식을 형성했다.
- question: 현대적 인사조직관리의 시발점이 된 책은?
- answer tokens: ['《', '경영', '##의', '실제', '》']
- tokenized answer token ids: [170, 3939, 2079, 3966, 171]
0 | input_ids | torch.Size([1, 336])
1 | attention_mask | torch.Size([1, 336])
2 | start_index | torch.Size([117])
3 | end_index | torch.Size([121])


In [29]:
DEBUG_MODE = False
train_data = CustomDataset(train_df, tokenizer, is_debbuging=DEBUG_MODE)
dev_data = CustomDataset(val_df, tokenizer, is_debbuging=DEBUG_MODE)

In [30]:
model_args.batch_size = 16
model_args

{'model_name_or_path': 'klue/roberta-base',
 'save_steps': 100,
 'num_train_epochs': 3,
 'learning_rate': 5e-05,
 'batch_size': 16,
 'warmup_steps': 300,
 'weight_decay': 0.01,
 'validation': False,
 'max_length': 512,
 'DEBUG': False}

In [31]:
# load the dataset
train_dataloader = torch.utils.data.DataLoader(
    train_data,
    batch_size=model_args.batch_size,
    shuffle=True,
    num_workers=4,
)

validation_dataloader = torch.utils.data.DataLoader(
    dev_data,
    batch_size=model_args.batch_size,
    shuffle=False,
    num_workers=4,
)

print('{:,} training batches & {:,} validation batches'.format(len(train_dataloader), len(validation_dataloader)))

247 training batches & 15 validation batches


## Load Model

In [32]:
from transformers import AutoModelForQuestionAnswering, AutoConfig

model_config = AutoConfig.from_pretrained(model_args.model_name_or_path)
model = AutoModelForQuestionAnswering.from_pretrained(
    model_args.model_name_or_path,
    config=model_config,
    )
model

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a d

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm):

In [33]:
model_args.DEBUG = False
device = torch.device('cuda:0' if torch.cuda.is_available() and model_args.DEBUG == False else 'cpu')
print(device)
!nvidia-smi

cuda:0
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Oct 22 02:57:59 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:00:05.0 Off |                  Off |
| N/A   34C    P0    36W / 250W |   1647MiB / 32510MiB |      0%      Default |
|                               |     

In [34]:
model.to(device)
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Oct 22 02:57:59 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:00:05.0 Off |                  Off |
| N/A   34C    P0    36W / 250W |   2101MiB / 32510MiB |      9%      Default |
|                               |            

## Define Optimizer and Scheduler

In [35]:
from adamp import AdamP

optimizer = AdamP(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [36]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = model_args.num_train_epochs

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

## Training Loop

In [40]:
import time
import wandb
import random
import numpy as np

import wandb 

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

# https://wandb.ai/happyface-boostcamp/KLUE-QA
wandb.init(project='KLUE-QA', entity='happyface-boostcamp')

for epoch_i in range(0, epochs):
    

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    
    print('Training {:,} batches...'.format(len(train_dataloader)))

    t0 = time.time()

    total_train_loss = 0

    model.train()

    num_batches = len(train_dataloader)

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch["input_ids"].to(device)
        b_input_mask = batch["attention_mask"].to(device)
        # b_seg_ids = batch["segment_ids"].to(device)
        b_start_pos = batch["start_index"].to(device)
        b_end_pos = batch["end_index"].to(device)

        # print(batch)

        model.zero_grad()        

        outputs = model(
            b_input_ids, 
            attention_mask=b_input_mask, 
            start_positions=b_start_pos,
            end_positions=b_end_pos,
            return_dict=True
        )

        loss = outputs.loss
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
        
    print("")
    print("Running Validation...")

    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0

    t0_val = time.time()

    pred_start, pred_end, true_start, true_end = [], [], [], []

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        b_input_ids = batch["input_ids"].to(device)
        b_input_mask = batch["attention_mask"].to(device)
        # b_seg_ids = batch["segment_ids"].to(device)
        b_start_pos = batch["start_index"].to(device)
        b_end_pos = batch["end_index"].to(device)

        print(batch)
        
        with torch.no_grad():        
            outputs = model(
                b_input_ids, 
                # token_type_ids=b_seg_ids, 
                attention_mask=b_input_mask,
                start_positions=b_start_pos,
                end_positions=b_end_pos,
                return_dict=True
            )

        loss = outputs.loss
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits        

        total_eval_loss += loss.item()

        start_logits = start_logits.detach().cpu().numpy()
        end_logits = end_logits.detach().cpu().numpy()
        
        b_start_pos = b_start_pos.to('cpu').numpy()
        b_end_pos = b_end_pos.to('cpu').numpy()

        answer_start = np.argmax(start_logits, axis=1)
        answer_end = np.argmax(end_logits, axis=1)

        pred_start.append(answer_start)
        pred_end.append(answer_end)
        true_start.append(b_start_pos)
        true_end.append(b_end_pos)

    pred_start = np.concatenate(pred_start, axis=0)
    pred_end = np.concatenate(pred_end, axis=0)
    true_start = np.concatenate(true_start, axis=0)
    true_end = np.concatenate(true_end, axis=0)
        
    num_start_correct = np.sum(pred_start == true_start)
    num_end_correct = np.sum(pred_end == true_end)

    total_correct = num_start_correct + num_end_correct
    total_indeces = len(true_start) + len(true_end)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = float(total_correct) / float(total_indeces)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # validation_time = format_time(time.time() - t0_val)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    # print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    wandb.log(
                    {
                        'train/loss':avg_train_loss, 
                        'train/learning_rate':optimizer.param_groups[0]['lr'], 
                        'eval/loss':avg_val_loss,
                        'eval/accuracy':avg_val_accuracy,
                        }
            )
    

print("")
print("Training complete!")

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: wandb version 0.12.5 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade



Training 247 batches...
mask token length not matchmask token length not match

mask token length not match
mask token length not match
mask token length not match

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 73, in default_collate
    return {key: default_collate([d[key] for d in batch]) for key in elem}
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 73, in <dictcomp>
    return {key: default_collate([d[key] for d in batch]) for key in elem}
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [1, 362] at entry 0 and [1, 385] at entry 1



mask token length not match
mask token length not match


In [None]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tue Oct 19 13:01:40 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.80.02    Driver Version: 450.80.02    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:00:05.0 Off |                  Off |
| N/A   56C    P0    49W / 250W |  23265MiB / 32510MiB |      0%      Default |
|                               |            

Evaluating on 14 test batches...
    DONE.

Evaluation took 7 seconds.



Samples w/ all answers clipped: 19 of 240 (7.92%)

    Additional clipped answers: 0 of 240


In [54]:
import time
import torch

# Track the time. Tokenizing all training valid_dataset_from_huggingface takes around 3 minutes.
t0 = time.time()

# Lists to store the encoded samples.
all_input_ids = []
attention_masks = []


print('Tokenizing {:,} valid_dataset_from_huggingface...'.format(len(valid_dataset_from_huggingface)))

# For each of the training valid_dataset_from_huggingface...
for (ex_num, ex) in enumerate(valid_dataset_from_huggingface):

    # =====================
    #   Progress Update
    # =====================

    # =============================
    #      Tokenize & Encode
    # =============================
    # Combine the question and the context strings, and tokenize them all 
    # together.
    # `encode_plus` will:    
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Place an `[SEP]` token between the question and reference text, and 
    #       and at the end of the reference text.
    #   (4) Map tokens to their IDs ("encode" the text)
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    #   (7) Create the list of segment IDs, indicating which tokens belong
    #       to the question vs. the context.
    #   (8) Casts everything as PyTorch tensors.

    encoded_dict = tokenizer.encode_plus(
        ex['question'], 
        ex['context'],
        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
        max_length = max_len,       # Pad & truncate all sentences.
        pad_to_max_length = True,
        truncation = True,
        return_attention_mask = True, # Construct attention masks.
        return_tensors = 'pt',        # Return pytorch tensors.
    )

    # Retrieve the encoded sequence.
    input_ids = encoded_dict['input_ids']

    # =============================
    #     Store Encoded Sample
    # =============================

    # Add the encoded sentence to the list.    
    all_input_ids.append(input_ids)

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])    


    # ^^^ Continue looping through all of the test samples. ^^^

# =========================
#        Wrap-Up
# =========================

# Convert the lists of tensors into 2D tensors.
all_input_ids = torch.cat(all_input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# We don't need the indeces to be tensors, since we're not doing training here.
# Convert the "labels" (the start and end indeces) into tensors.
#start_positions = torch.tensor(start_positions)
#end_positions = torch.tensor(end_positions)


Tokenizing 240 valid_dataset_from_huggingface...




In [2]:
import time
import numpy as np

# Prediction on test set

# Put model in evaluation mode
model.eval()

t0 = time.time()

# Tracking variables 
pred_start = []
pred_end = []

# Get the total number of test samples (not answers).
num_test_samples = all_input_ids.shape[0]

# We'll batch the samples to speed up processing. 
batch_size = 16

num_batches = int(np.ceil(num_test_samples / batch_size))

print('Evaluating on {:,} test batches...'.format(num_batches))

batch_num = 0

# Predict 
for start_i in range(0, num_test_samples, batch_size):
    
    # Report progress.
    if ((batch_num % 50) == 0) and not (batch_num == 0):

        # Calculate the time remaining based on our progress.
        batches_per_sec = (time.time() - t0) / batch_num
        remaining_sec = batches_per_sec * (num_batches - batch_num)
        # Report progress.
        print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}. Remaining: {:}'.format(batch_num, num_batches))

    # Calculate the ending index for this batch.
    # `end_i` is equal to the index of the last sample in the batch, +1.
    end_i = min(start_i + batch_size, num_test_samples)

    # Select our batch inputs (`b` stands for batch here).
    b_input_ids = all_input_ids[start_i:end_i, :]
    b_attn_masks = attention_masks[start_i:end_i, :]
    # b_seg_ids = segment_ids[start_i:end_i, :]   

    # Copy these to the GPU.
    b_input_ids = b_input_ids.to(device)
    b_attn_masks = b_attn_masks.to(device)
    # b_seg_ids = b_seg_ids.to(device)
    
    # Telling the model not to compute or store the compute graph, saving memory 
    # and speeding up prediction
    with torch.no_grad():
        
        # Forward pass, calculate logit predictions
        outputs = model(
            b_input_ids, 
            attention_mask=b_attn_masks,
            return_dict=True
            )
                        

    # Move logits to CPU.
    start_logits = outputs.start_logits.detach().cpu().numpy()
    end_logits = outputs.end_logits.detach().cpu().numpy()
    
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = np.argmax(start_logits, axis=1)
    answer_end = np.argmax(end_logits, axis=1)

    # Store predictions and true labels
    pred_start.append(answer_start)
    pred_end.append(answer_end)

    batch_num += 1

    # ^^^ Continue looping through the batches. ^^^

# Combine the results across the batches.
pred_start = np.concatenate(pred_start, axis=0)
pred_end = np.concatenate(pred_end, axis=0)

print('    DONE.')

print('\nEvaluation took {:.0f} seconds.'.format(time.time() - t0))

NameError: name 'model' is not defined

In [57]:
pred_start

array([253, 311, 170, 116, 213,  60, 125,  70, 259, 238,  44, 493,  21,
       154,  68,  56, 116, 389,  27, 255,  27,  63, 324, 244,  52, 264,
       510,  18,  80, 151,  51, 137,  28, 181, 344,  55, 122, 108,  33,
        85, 510, 470, 126, 232,  31, 130, 295,  42,  29, 275, 380,  80,
       225, 473, 277, 183, 152,  42,  50, 294, 278, 458,  67, 154, 322,
       190,   1, 302, 426,  58, 179, 256,  74, 378,  32, 142, 115,  49,
       173, 496, 208, 128, 134, 363,  47, 140, 275,  53,  41, 406, 282,
       393,  21, 218, 102, 173, 112, 110, 236, 263, 223, 111, 104,  29,
        30, 187, 122, 102, 154, 415, 106, 200, 328,  76,  18, 122,  52,
        30, 215,  24, 109, 104, 121,  85, 223, 284,   6,  79, 263,  32,
        11, 307, 283,   8,  58, 377,  76, 257,  39,  58,  48,  18,   1,
        26, 288,  23,  49,  47, 506, 337,  19, 104, 444, 309, 413,  21,
       510, 472, 363, 136, 211,  73,  65, 305, 285, 441, 426,  65,  81,
       490, 290, 417, 288,  58,  48,  21, 249, 107, 339,  74, 12

In [56]:

total_correct = 0

# For each test sample...
for i in range(0, len(pred_start)):

    match_options = []

    # For each of the three possible answers...
    for j in range (0, len(start_positions[i])):
    
        matches = 0

        # Add a point if the start indeces match.
        if pred_start[i] == start_positions[i][j]:
            matches += 1

        # Add a point if the end indeces match.
        if pred_end[i] == end_positions[i][j]:
            matches += 1

        # Store the total.
        match_options.append(matches)

    # Between the three possible answers, pick the one with the highest "score".
    total_correct += (max(match_options))

    # ^^^ Continue looping through test samples ^^^

total_indeces = len(pred_start) + len(pred_end)

print('Correctly predicted indeces: {:,} of {:,} ({:.2%})'.format(
    total_correct,
    total_indeces,
    float(total_correct) / float(total_indeces)
))


Correctly predicted indeces: 10 of 480 (2.08%)
