In [1]:
import pandas as pd
import os
import random
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from datasets import Dataset
from torch.nn.utils import clip_grad_norm_
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Hyper-parameters shared by the cells below.
args = dict(
    seed=42,                   # passed to seed_everything and train_test_split
    max_length=384,            # tokenizer max_length (inputs are padded to this size)
    stride=128,                # tokenizer stride used when a long context overflows
    epochs=5,                  # number of passes of the training loop
    batch_size=4,              # DataLoader batch size
    gradient_accumulation=8,   # batches accumulated before each optimizer step
    lr=2e-5,                   # learning rate handed to the optimizer constructor
    weight_decay=1e-2,         # weight decay handed to the optimizer constructor
)

In [4]:
def seed_everything(seed):
    """
    Seed every random number generator this notebook relies on.

    Sets PYTHONHASHSEED in the process environment, seeds `random`, NumPy and
    PyTorch (`torch.manual_seed` and `torch.cuda.manual_seed`), and switches
    cuDNN to deterministic mode with benchmarking disabled.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Trade cuDNN autotuning for repeatable kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [5]:
# Fix all RNGs before any data is sampled or any model weights are created.
seed_everything(seed=args["seed"])

In [6]:
# Tokenizer loaded from the local HerBERT checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("../models/herbert-base-cased")

# preprocess() puts the question first when padding is on the right,
# and the context first otherwise.
pad_on_right = (tokenizer.padding_side == "right")

In [7]:
def preprocess(examples):
    """
    Turn a batch of question/context pairs into fixed-length model features.

    Each (question, context) pair is tokenized to `args["max_length"]` tokens;
    pairs that do not fit are split into several overlapping features. For every
    feature the token indices of the first listed answer are stored under
    "start_positions" / "end_positions". Features whose example has an empty
    answer text, or whose window does not fully contain the answer, are labelled
    with the index of the CLS token instead.

    Relies on the notebook globals `tokenizer`, `pad_on_right` and `args`.

    Source: https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb
    """
    # Segment order depends on the padding side; the context is the segment
    # that is allowed to be truncated.
    if pad_on_right:
        first_segment, second_segment = examples["question"], examples["context"]
        truncation_mode = "only_second"
        context_segment_id = 1
    else:
        first_segment, second_segment = examples["context"], examples["question"]
        truncation_mode = "only_first"
        context_segment_id = 0

    tokenized_examples = tokenizer(
        first_segment,
        second_segment,
        truncation=truncation_mode,
        max_length=args["max_length"],
        stride=args["stride"],
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # These two entries are only needed to compute labels; drop them from the output.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for feature_idx, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][feature_idx]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(feature_idx)

        # One example can yield several features; look up the originating example.
        answers = examples["answers"][sample_mapping[feature_idx]]

        # Default label: CLS for "no answer inside this feature".
        answer_start_token = cls_index
        answer_end_token = cls_index

        if answers["answer_text"][0]:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["answer_text"][0])

            # First token of the context segment.
            left = 0
            while sequence_ids[left] != context_segment_id:
                left += 1

            # Last token of the context segment (skips trailing special/pad tokens).
            right = len(input_ids) - 1
            while sequence_ids[right] != context_segment_id:
                right -= 1

            answer_inside_window = offsets[left][0] <= start_char and offsets[right][1] >= end_char
            if answer_inside_window:
                # Walk inwards from both ends until the tokens bracket the answer span.
                while left < len(offsets) and offsets[left][0] <= start_char:
                    left += 1
                answer_start_token = left - 1

                while offsets[right][1] >= end_char:
                    right -= 1
                answer_end_token = right + 1

        start_positions.append(answer_start_token)
        end_positions.append(answer_end_token)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

In [8]:
class LinearWarmupInverseSqrtDecayScheduler():
    """
    Learning-rate schedule with a linear warmup followed by inverse-sqrt decay.

    With t the number of `step()` calls so far:

    * t = 0 (right after construction): lr = lr_initial
    * 1 <= t <= t_warmup:        lr rises linearly, reaching lr_peak at t_warmup
    * t_warmup < t <= t_decay:   lr = A / sqrt(t) + B, with A and B chosen so that
                                 lr(t_warmup) = lr_peak and lr(t_decay) = lr_final
    * t > t_decay:               lr = lr_final

    The current rate is written to every entry of `optimizer.param_groups`, both
    at construction time and on every `step()`. Writing it at construction matters
    when `optimizer.step()` is called before the first `scheduler.step()`:
    otherwise that first update would use the optimizer's own constructor learning
    rate instead of `lr_initial`.

    Args:
        optimizer: object exposing `param_groups`, a list of dicts with an "lr" key.
        lr_initial: learning rate before the first step (must be > 0).
        lr_peak: learning rate reached at the end of warmup (must be > 0).
        lr_final: learning rate held after `t_decay` steps (must be > 0).
        t_warmup: number of warmup steps (must be > 0).
        t_decay: step index at which the decay reaches `lr_final` (must be > 0).

    Raises:
        AssertionError: if any of the numeric arguments is not strictly positive.
    """

    def __init__(self, optimizer, lr_initial=1e-6, lr_peak=1e-4, lr_final=2e-5, t_warmup=1000, t_decay=10000):
        assert lr_initial > 0
        assert lr_peak > 0
        assert lr_final > 0
        assert t_warmup > 0
        assert t_decay > 0

        self.optimizer = optimizer
        self.lr_initial = lr_initial
        self.lr_peak = lr_peak
        self.lr_final = lr_final
        self.t_warmup = t_warmup
        self.t_decay = t_decay

        self.t = 0
        self.lr = lr_initial

        # Make the optimizer start from lr_initial rather than from whatever
        # learning rate it was constructed with.
        self._apply_lr()

    def _apply_lr(self):
        """Write the current learning rate into every optimizer parameter group."""
        for group in self.optimizer.param_groups:
            group["lr"] = self.lr

    def step(self):
        """Advance the schedule by one step and update the optimizer's learning rate."""
        self.t += 1

        if self.t <= self.t_warmup:
            # Straight line from lr_initial (t = 0) to lr_peak (t = t_warmup).
            A = (self.lr_peak - self.lr_initial) / self.t_warmup
            B = self.lr_initial
            self.lr = A * self.t + B
        elif self.t <= self.t_decay:
            # A / sqrt(t) + B through (t_warmup, lr_peak) and (t_decay, lr_final).
            # This branch is only reachable when t_decay > t_warmup, so the
            # denominator below cannot be zero.
            A = (self.lr_peak - self.lr_final) / (1 / math.sqrt(self.t_warmup) - 1 / math.sqrt(self.t_decay))
            B = self.lr_peak - A / math.sqrt(self.t_warmup)
            self.lr = A / math.sqrt(self.t) + B
        else:
            self.lr = self.lr_final

        self._apply_lr()

    def get_lr(self):
        """Return the learning rate most recently written to the optimizer."""
        return self.lr

In [9]:
# Load the training CSV, replace missing values with empty strings and keep a
# 1000-row subset. The subset is drawn with an explicit random_state so that
# re-running this cell on its own always yields the same rows instead of
# depending on the state of the global NumPy RNG.
df = (
    pd.read_csv("../data/SQuAD-PL/train.csv")
    .fillna("")
    .sample(1000, random_state=args["seed"])
    .reset_index(drop=True)
)

# 1 when the row has a non-empty answer text, 0 otherwise (used for stratification).
df["has_answer"] = df["answer_text"].apply(lambda x: int(len(x) > 0))

# Pack the answer into the {"answer_start": [...], "answer_text": [...]} layout
# that preprocess() reads. Columns are accessed by label: integer keys on a
# label-indexed row Series are deprecated in recent pandas releases.
df["answers"] = df[["answer_start", "answer_text"]].apply(
    lambda row: {"answer_start": [row["answer_start"]], "answer_text": [row["answer_text"]]},
    axis=1,
)

In [10]:
# Hold out 10% of the rows for validation, keeping the answerable/unanswerable
# ratio the same in both parts.
train_df, dev_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df["has_answer"],
    shuffle=True,
    random_state=args["seed"],
)

In [11]:
# Tokenize the training rows; every original column is dropped so only the
# features returned by preprocess() remain.
train_set = Dataset.from_pandas(train_df)
train_set = train_set.map(
    preprocess,
    batched=True,
    remove_columns=train_set.column_names,
)
train_loader = DataLoader(train_set, shuffle=True, batch_size=args["batch_size"])

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# Same preprocessing for the held-out rows; iteration order is kept fixed.
dev_set = Dataset.from_pandas(dev_df)
dev_set = dev_set.map(
    preprocess,
    batched=True,
    remove_columns=dev_set.column_names,
)
dev_loader = DataLoader(dev_set, shuffle=False, batch_size=args["batch_size"])

  0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
# Question-answering model initialised from the local HerBERT checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained("../models/herbert-base-cased").to(device=DEVICE)

# Use PyTorch's own AdamW: the implementation exported by `transformers` is
# deprecated in favour of it. eps=1e-6 is passed explicitly because that was the
# default of the transformers implementation (torch defaults to 1e-8).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=args["lr"],
    weight_decay=args["weight_decay"],
    eps=1e-6,
)

# NOTE(review): the scheduler runs with its default rates and step counts, so it
# overwrites args["lr"] in the optimizer once it steps — confirm this is intended.
scheduler = LinearWarmupInverseSqrtDecayScheduler(optimizer)

In [14]:
def _batch_loss(batch):
    """
    Run the QA model on one collated batch and return its loss tensor.

    `input_ids` and `attention_mask` arrive as a sequence of tensors that is
    stacked along dim=1 (presumably one tensor per token position, as produced by
    the default collate function for list-valued columns — TODO confirm).
    """
    return model(
        input_ids=torch.stack(batch["input_ids"], dim=1).to(device=DEVICE),
        attention_mask=torch.stack(batch["attention_mask"], dim=1).to(device=DEVICE),
        start_positions=batch["start_positions"].to(device=DEVICE),
        end_positions=batch["end_positions"].to(device=DEVICE),
    ).loss


def _mean_loss(loader, desc):
    """Return the average per-batch loss over `loader`, computed without gradients."""
    total = 0.0
    with torch.no_grad():
        for batch in tqdm(loader, desc=desc):
            total += _batch_loss(batch).item()
    # max(..., 1) keeps an empty loader at 0.0 instead of dividing by zero.
    return total / max(len(loader), 1)


accum_steps = args["gradient_accumulation"]
n_train_batches = len(train_loader)

pbar = tqdm(range(args["epochs"]))
best_loss = np.inf

for epoch in pbar:
    model = model.train()
    batch_pbar = tqdm(train_loader, desc="train")

    # Drop any gradients left over from an interrupted previous run of this cell.
    optimizer.zero_grad()

    for i, x in enumerate(batch_pbar):
        # Average (not sum) the gradients over the accumulation window so their
        # scale, and therefore the effect of clipping, does not grow with the
        # window size. The last window of an epoch may be shorter than the rest.
        window_start = i - i % accum_steps
        window_size = min(accum_steps, n_train_batches - window_start)
        (_batch_loss(x) / window_size).backward()

        if (i + 1) % accum_steps == 0 or i == n_train_batches - 1:
            clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

            batch_pbar.set_description("train lr: {}".format(scheduler.get_lr()))

    model = model.eval()
    train_loss = _mean_loss(train_loader, "eval train")
    dev_loss = _mean_loss(dev_loader, "eval dev")

    # Keep a copy of the weights with the lowest dev loss seen so far.
    if dev_loss < best_loss:
        best_loss = dev_loss
        model.save_pretrained("best")

    model.save_pretrained("epoch_{}".format(epoch))

    # One CSV record per epoch; the trailing newline keeps records on separate
    # lines (the file is opened in append mode, so re-runs add to it).
    with open("log.txt", "a") as f:
        f.write("{},{},{}\n".format(epoch, train_loss, dev_loss))

    pbar.set_description("train: {:.4f} dev: {:.4f} best: {:.4f}".format(train_loss, dev_loss, best_loss))

  0%|          | 0/5 [00:00<?, ?it/s]

train:   0%|          | 0/231 [00:00<?, ?it/s]

eval train:   0%|          | 0/231 [00:00<?, ?it/s]

eval dev:   0%|          | 0/26 [00:00<?, ?it/s]

train:   0%|          | 0/231 [00:00<?, ?it/s]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Miniconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\ADRIAN~1\AppData\Local\Temp/ipykernel_1980/1111489870.py", line 10, in <module>
    input_ids=torch.stack(x["input_ids"], dim=1).to(device=DEVICE),
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Miniconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2064, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Miniconda3\lib\site-packages\IPython\core\ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_of

TypeError: object of type 'NoneType' has no len()