# Intro

In [1]:
!pip install nlp transformers datasets wandb
!apt install git-lfs

Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.0 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 57.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.18.0-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 76.6 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.9-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 62.8 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 81.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 78.0 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-

In [2]:
## IMPORTS
import os
import sys
import logging
from dataclasses import dataclass, field
import json
from typing import Dict, List, Optional

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import datasets

from transformers import (LongformerModel, LongformerTokenizer, LongformerPreTrainedModel,
                          LongformerConfig, Trainer, TrainingArguments, EarlyStoppingCallback)
from transformers.models.longformer.modeling_longformer import LongformerQuestionAnsweringModelOutput
from transformers import LongformerForQuestionAnswering, LongformerTokenizerFast, EvalPrediction
from transformers import (
    HfArgumentParser,
    DataCollator,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [3]:
# MONITOR CPU and GPU

os.environ["WANDB_DISABLED"] = "true"

# os.environ["WANDB_DISABLED"] = "false"
# import wandb
# wandb.init()

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# GLOBAL VARIABLES

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

UNKNOWN = "unknown"

BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/seminar/finetuning-longformer-1024"

DATASET_TRAIN_PATH = "/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa_flat_train_df_tokenized_reduced_1024.pkl"
DATASET_TEST_PATH = "/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa_flat_val_df_tokenized_reduced_1024.pkl"
ANSWERS_PATH = "/content/drive/MyDrive/Colab Notebooks/seminar/answers/{file_name}"

MODEL_NAME = "allenai/longformer-base-4096"
MY_QA_MODEL_NAME = f"{MODEL_NAME}-finetuned-coqa-falttened"

SEED = 7

MAX_LENGTH = 1024

In [6]:
print(device)

cuda


In [7]:
print(device)

LOGS_DIR = os.path.join(BASE_DIR, "logs/")
MODEL_DIR = os.path.join(BASE_DIR, "model/")
OUTPUT_DIR = os.path.join(BASE_DIR, "output/")
TOKENIZER_DIR = os.path.join(BASE_DIR, "tokenizer/")
ANSWERS_DIR = os.path.join(BASE_DIR, "answers/")

DIRECTORIES = [LOGS_DIR, MODEL_DIR, OUTPUT_DIR, TOKENIZER_DIR]

for direc in DIRECTORIES:
    if not os.path.exists(direc):
        os.makedirs(direc)
        print(direc)

cuda


In [8]:
for direc in DIRECTORIES:
    if not os.path.exists(direc):
        os.makedirs(direc)

In [9]:
## HELPER FUNCTIONS

def _get_question_end_index(input_ids, sep_token_id):
    """
    Computes the index of the first occurrence of `sep_token_id`.
    """

    sep_token_indices = (input_ids == sep_token_id).nonzero()
    batch_size = input_ids.shape[0]

    assert sep_token_indices.shape[1] == 2, "`input_ids` should have two dimensions"
    assert (
        sep_token_indices.shape[0] == 3 * batch_size
    ), f"There should be exactly three separator tokens: {sep_token_id} in every sample for questions answering. You might also consider to set `global_attention_mask` manually in the forward function to avoid this error."
    return sep_token_indices.view(batch_size, 3, 2)[:, 0, 1]


def _compute_global_attention_mask(input_ids, sep_token_id, before_sep_token=True):
    """
    Computes global attention mask by putting attention on all tokens before `sep_token_id` if `before_sep_token is
    True` else after `sep_token_id`.
    """
    question_end_index = _get_question_end_index(input_ids, sep_token_id)
    question_end_index = question_end_index.unsqueeze(dim=1)  # size: batch_size x 1
    # bool attention mask with True in locations of global attention
    attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)
    if before_sep_token is True:
        attention_mask = (attention_mask.expand_as(input_ids) < question_end_index).to(torch.uint8)
    else:
        # last token is separation token and should not be counted and in the middle are two separation tokens
        attention_mask = (attention_mask.expand_as(input_ids) > (question_end_index + 1)).to(torch.uint8) * (
            attention_mask.expand_as(input_ids) < input_ids.shape[-1]
        ).to(torch.uint8)

    return attention_mask

# Prepare Data

In [10]:
df_train = pd.read_pickle(DATASET_TRAIN_PATH)
df_val = pd.read_pickle(DATASET_TEST_PATH)

In [11]:
df_train.head(1)

Unnamed: 0,index,id,turn_id,start_positions,end_positions,input_ids,attention_mask
0,0,3zotghdk5ibi9cex97fepx7jetpso7,1,364,369,"[0, 2765, 141, 203, 116, 2, 2, 1640, 16256, 43...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [12]:
dataset_train = datasets.Dataset.from_pandas(df_train)
dataset_val = datasets.Dataset.from_pandas(df_val)

In [13]:
columns = ['index', 'id', 'turn_id', 'start_positions', 'end_positions', 'input_ids', 'attention_mask']
dataset_train.set_format(type='torch', columns=columns)
dataset_val.set_format(type='torch', columns=columns)

In [14]:
dataset_train, dataset_val

(Dataset({
     features: ['index', 'id', 'turn_id', 'start_positions', 'end_positions', 'input_ids', 'attention_mask'],
     num_rows: 107286
 }), Dataset({
     features: ['index', 'id', 'turn_id', 'start_positions', 'end_positions', 'input_ids', 'attention_mask'],
     num_rows: 7918
 }))

In [15]:
# cach the dataset, so we can load it directly for training

# torch.save(dataset_train, 'train_data.pt')
# torch.save(dataset_val, 'valid_data.pt')

# Model

In [16]:
class MyLongformerForQuestionAnswering(LongformerPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', pad_to_max_length=True, max_length=MAX_LENGTH)

        self.longformer = LongformerModel.from_pretrained('allenai/longformer-base-4096', gradient_checkpointing=True)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        global_attention_mask=None,
        head_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if global_attention_mask is None:
            if input_ids is None:
                logger.warning(
                    "It is not possible to automatically generate the `global_attention_mask` because input_ids is None. Please make sure that it is correctly set."
                )
            else:
                # set global attention on question tokens automatically
                global_attention_mask = _compute_global_attention_mask(input_ids, self.config.sep_token_id)
 
        outputs = self.longformer(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            head_mask=head_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return LongformerQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            global_attentions=outputs.global_attentions,
        )

    def construct_answer(self, outputs, encoding):
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        all_tokens = self.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())

        start_index = torch.argmax(start_logits)

        end_sorted = torch.argsort(end_logits, descending=True).squeeze().tolist()
        for i in end_sorted:
            if i+1 > start_index:
                end_index = i+1
                break

        if start_index < end_index:
            answer_tokens = all_tokens[start_index:end_index]
            answer = self.tokenizer.decode(self.tokenizer.convert_tokens_to_ids(answer_tokens))
        else: # TODO: a good condition for unknown
            answer = UNKNOWN
        return answer

In [None]:
def test_my_longformer_for_question_answering():

    question, text = "Who was Ali?", "Ali was a nice programmer."

    encoding = model.tokenizer(question, text, return_tensors="pt", pad_to_max_length=True, max_length=MAX_LENGTH).to(device)

    outputs = model(**encoding)
    # outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)

    answer = model.construct_answer(outputs, encoding)
    print(answer)

test_my_longformer_for_question_answering()

# Training script

In [17]:
logger = logging.getLogger(__name__)

@dataclass
class MyDataCollator:
    def __call__(self, batch):
        return self.collate_batch(batch)

    def collate_batch(self, batch):
        """
        Take a list of samples from a Dataset and collate them into a batch.
        Returns:
            A dictionary of tensors
        """
        input_ids = torch.stack([example['input_ids'] for example in batch]) # TODO:.to(device)
        attention_mask = torch.stack([example['attention_mask'] for example in batch]) # TODO:.to(device)
        start_positions = torch.stack([example['start_positions'] for example in batch]) # TODO:.to(device)
        end_positions = torch.stack([example['end_positions'] for example in batch]) # TODO:.to(device)

        return {
            'input_ids': input_ids, 
            'start_positions': start_positions, 
            'end_positions': end_positions,
            'attention_mask': attention_mask
        }


In [18]:
tokenizer = LongformerTokenizerFast.from_pretrained(
    MODEL_NAME,
    pad_to_max_length=True, 
    max_length=MAX_LENGTH
)

model = MyLongformerForQuestionAnswering.from_pretrained(
    MODEL_NAME,
)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing MyLongformerForQuestionAnswering: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.w

In [19]:
model = model.to(device)

train_dataset = dataset_train
valid_dataset = dataset_val

In [20]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [25]:
set_seed(SEED)

training_args = TrainingArguments(
    MY_QA_MODEL_NAME,
    
    # evaluation_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps=100,
    prediction_loss_only=True,

    logging_strategy="steps",
    logging_steps=100,

    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    save_steps=100,
    load_best_model_at_end=True,

    num_train_epochs=2,

    weight_decay=0.01,
    push_to_hub=True
) 

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=MyDataCollator(),
    tokenizer=tokenizer
) 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
/content/allenai/longformer-base-4096-finetuned-coqa-falttened is already a clone of https://huggingface.co/alistvt/longformer-base-4096-finetuned-coqa-falttened. Make sure you pull the latest changes with `repo.git_pull()`.


In [26]:
trainer.train(resume_from_checkpoint=True)

# trainer.train()
trainer.push_to_hub()
# trainer.save_model()

# For convenience, we also re-save the tokenizer to the same directory,
# so that you can share your model easily on huggingface.co/models =)

tokenizer.save_pretrained('output/')


Loading model from allenai/longformer-base-4096-finetuned-coqa-falttened/checkpoint-1000).
The following columns in the training set  don't have a corresponding argument in `MyLongformerForQuestionAnswering.forward` and have been ignored: turn_id, id, index.
***** Running training *****
  Num examples = 107286
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 13412


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
100,0.0,No log


The following columns in the evaluation set  don't have a corresponding argument in `MyLongformerForQuestionAnswering.forward` and have been ignored: turn_id, id, index.
***** Running Evaluation *****
  Num examples = 7918
  Batch size = 16
Saving model checkpoint to allenai/longformer-base-4096-finetuned-coqa-falttened/checkpoint-100
Configuration saved in allenai/longformer-base-4096-finetuned-coqa-falttened/checkpoint-100/config.json
Model weights saved in allenai/longformer-base-4096-finetuned-coqa-falttened/checkpoint-100/pytorch_model.bin
tokenizer config file saved in allenai/longformer-base-4096-finetuned-coqa-falttened/checkpoint-100/tokenizer_config.json
Special tokens file saved in allenai/longformer-base-4096-finetuned-coqa-falttened/checkpoint-100/special_tokens_map.json


KeyError: ignored

In [27]:

# Evaluation
results = {}
if training_args.do_eval and training_args.local_rank in [-1, 0]:
    
    logger.info("*** Evaluate ***")

    eval_output = trainer.evaluate()

    output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(eval_output.keys()):
            logger.info("  %s = %s", key, str(eval_output[key]))
            writer.write("%s = %s\n" % (key, str(eval_output[key])))

    results.update(eval_output)
    
    print(results)


The following columns in the evaluation set  don't have a corresponding argument in `MyLongformerForQuestionAnswering.forward` and have been ignored: turn_id, id, index.
***** Running Evaluation *****
  Num examples = 7918
  Batch size = 16


Step,Training Loss,Validation Loss
100,0.0,No log


Step,Training Loss,Validation Loss
100,0.0,No log
100,0.0,No log


{}


# Test model with data

In [None]:
predictions = []

for index, item in tqdm(df_test.iterrows()):

    question, text = item["question"], item["story"]

    encoding = model.tokenizer(question, text, return_tensors="pt").to(device)

    outputs = model(**encoding)
    # outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)

    answer = model.construct_answer(outputs, encoding)

    predictions.append(
        {
            "id": item["id"],
            "turn_id": item["turn_id"],
            "answer": answer
        }
    )