This notebook follows Chris McCormick's "Fine-Tuning BERT on SQuAD v1.1" notebook. Please refer to the original notebook here:
https://colab.research.google.com/drive/16VjEulbATgok4mELTSaq7GTQdh3JGhGy#scrollTo=Xm1wTn09RAR7

## Import from Baseline Code

In [1]:
# https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
class dotdict(dict):
    """A ``dict`` whose items can also be accessed as attributes.

    ``d.key`` reads ``d["key"]``, ``d.key = v`` stores ``d["key"] = v`` and
    ``del d.key`` removes ``d["key"]``.

    Reading a key that is not present through attribute access returns
    ``None`` (the ``dict.get`` default) instead of raising
    ``AttributeError``. Deleting a key that is not present raises
    ``KeyError``.
    """

    def __getattr__(self, name: str) -> object:
        # Only invoked when normal attribute lookup fails, so real dict
        # methods (``keys``, ``items``, ...) are still reachable.
        return dict.get(self, name)

    def __setattr__(self, name: str, value: object) -> None:
        # Attribute assignment always lands in the mapping itself.
        dict.__setitem__(self, name, value)

    def __delattr__(self, name: str) -> None:
        dict.__delitem__(self, name)


In [2]:
import logging
import os
import sys

from typing import List, Callable, NoReturn, NewType, Any
import dataclasses
from datasets import load_metric, load_from_disk, Dataset, DatasetDict

from transformers import AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer

from transformers import (
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    set_seed,
)

from tokenizers import Tokenizer
from tokenizers.models import WordPiece

from baseline.utils_qa import postprocess_qa_predictions, check_no_error
from baseline.trainer_qa import QuestionAnsweringTrainer
from baseline.retrieval import SparseRetrieval

import yaml

# Read config.yaml file.
# The encoding is given explicitly so the file is decoded as UTF-8 regardless
# of the platform's locale default.
with open("config.yaml", encoding="utf-8") as infile:
    # NOTE(review): FullLoader can construct more than plain scalars, lists
    # and dicts. If config.yaml only ever holds plain data, yaml.safe_load is
    # the safer choice -- confirm before switching.
    SAVED_CFG = yaml.load(infile, Loader=yaml.FullLoader)
# Wrap after the file is closed; only the top level becomes a dotdict, which is
# why the nested sections are wrapped again below.
SAVED_CFG = dotdict(SAVED_CFG)

# Argument setup: the `data` and `custom_model` sections of the config are
# exposed with attribute access. A key missing from a dotdict reads as None.
data_args = dotdict(SAVED_CFG.data)
model_args = dotdict(SAVED_CFG.custom_model)
training_args = TrainingArguments(
    output_dir="./results",  # output directory
    save_total_limit=5,  # value passed as the limit on saved checkpoints
    save_steps=model_args.save_steps,  # checkpoint saving interval, from config
    num_train_epochs=model_args.num_train_epochs,  # total number of training epochs
    learning_rate=model_args.learning_rate,  # learning rate
    per_device_train_batch_size=model_args.batch_size,  # batch size per device during training
    per_device_eval_batch_size=model_args.batch_size,  # batch size per device for evaluation
    warmup_steps=model_args.warmup_steps,  # number of warmup steps for the learning rate scheduler
    weight_decay=model_args.weight_decay,  # strength of weight decay
    logging_dir="./logs",  # directory for storing logs
    logging_steps=100,  # logging interval, in steps
    evaluation_strategy="steps",  # evaluation strategy to adopt during training
    # `no`: No evaluation during training.
    # `steps`: Evaluate every `eval_steps`.
    # `epoch`: Evaluate at the end of every epoch.
    eval_steps=500,  # evaluation interval, in steps
    # NOTE(review): with load_best_model_at_end=True, transformers may require
    # save_steps to be compatible with eval_steps -- confirm the value of
    # save_steps in config.yaml.
    load_best_model_at_end=True,
)


In [4]:
import torch
import torch.nn.functional as F
from transformers import BertModel, BertPreTrainedModel, AdamW,AutoTokenizer, TrainingArguments, get_linear_schedule_with_warmup
from datasets import load_metric, load_from_disk, Dataset, DatasetDict
from torch.utils.data import (DataLoader, RandomSampler, TensorDataset)
from tqdm import tqdm
import random

# Tokenizer and on-disk dataset are both selected by the loaded config.
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
datasets = load_from_disk(data_args.dataset_name)


def _encode(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Uses the ``max_length`` padding strategy with truncation enabled and
    returns PyTorch tensors. ``token_type_ids`` are not requested (the
    original author's note says this is for RoBERTa).
    """
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        return_token_type_ids=False,
    )


def _pair_dataset(passages, questions):
    """Bundle passage and question encodings into one ``TensorDataset``.

    Tensor order per example: passage ``input_ids``, passage
    ``attention_mask``, question ``input_ids``, question ``attention_mask``.
    """
    return TensorDataset(
        passages["input_ids"],
        passages["attention_mask"],
        questions["input_ids"],
        questions["attention_mask"],
    )


train_dataset = datasets["train"]
valid_dataset = datasets["validation"]

# Train split: encode questions and contexts, then build the loader.
# `train_dataset` is rebound from the raw split to the TensorDataset.
# The DataLoader is created with only a batch size (no shuffle or sampler).
q_seqs = _encode(train_dataset["question"])
p_seqs = _encode(train_dataset["context"])
train_dataset = _pair_dataset(p_seqs, q_seqs)
train_loader = DataLoader(train_dataset, batch_size=model_args.batch_size)

# Validation split: same steps. `q_seqs` and `p_seqs` are deliberately reused,
# so after this point they hold the validation encodings.
q_seqs = _encode(valid_dataset["question"])
p_seqs = _encode(valid_dataset["context"])
valid_dataset = _pair_dataset(p_seqs, q_seqs)
valid_loader = DataLoader(valid_dataset, batch_size=model_args.batch_size)


## Import from Fine_Tune_BERT_on_SQuAD_v1_1.ipynb