# Intro

In [1]:
!pip install nlp transformers datasets wandb
!apt install git-lfs

Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 3.6 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 49.8 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.18.1-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 73.8 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.9-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 53.7 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 74.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

In [2]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [3]:
## IMPORTS
import os
import sys
import logging
from dataclasses import dataclass, field
import json
from typing import Dict, List, Optional

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import datasets

from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import default_data_collator, TrainingArguments, Trainer, EvalPrediction, set_seed


In [4]:
# MONITOR CPU and GPU

os.environ["WANDB_DISABLED"] = "true"

# os.environ["WANDB_DISABLED"] = "false"
# import wandb
# wandb.init()

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# GLOBAL VARIABLES

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

UNKNOWN = "unknown"
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/seminar/finetune-squad"

MODEL_NAME = "twmkn9/bert-base-uncased-squad2"

MY_MLM_MODEL_NAME = f"{MODEL_NAME}-pretrained-mlm-coqa-stories"
MY_QA_MODEL_NAME = f"{MODEL_NAME}-pretrain-finetuned-coqa-falt"
MY_MLM_MODEL_PATH = f"alistvt/{MY_MLM_MODEL_NAME}"

SEED = 7

MAX_LENGTH = 512

In [7]:
print(device)

LOGS_DIR = os.path.join(BASE_DIR, "logs/")
MODEL_DIR = os.path.join(BASE_DIR, "model/")
OUTPUT_DIR = os.path.join(BASE_DIR, "output/")
TOKENIZER_DIR = os.path.join(BASE_DIR, "tokenizer/")
ANSWERS_DIR = os.path.join(BASE_DIR, "answers/")

DIRECTORIES = [LOGS_DIR, MODEL_DIR, OUTPUT_DIR, TOKENIZER_DIR]

for direc in DIRECTORIES:
    if not os.path.exists(direc):
        os.makedirs(direc)
        print(direc)

cuda


# Prepare Data

In [8]:
from datasets import load_dataset
coqa = load_dataset("alistvt/coqa-flat")

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Using custom data configuration alistvt--coqa-flat-859610cee67df2fb


Downloading and preparing dataset None/None (download: 16.39 MiB, generated: 209.24 MiB, post-processed: Unknown size, total: 225.63 MiB) to /root/.cache/huggingface/datasets/parquet/alistvt--coqa-flat-859610cee67df2fb/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/16.1M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/alistvt--coqa-flat-859610cee67df2fb/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
coqa['train'].features

{'bad_turn': Value(dtype='bool', id=None),
 'filename': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'index': Value(dtype='int64', id=None),
 'input_text': Value(dtype='string', id=None),
 'name': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'source': Value(dtype='string', id=None),
 'span_end': Value(dtype='int64', id=None),
 'span_start': Value(dtype='int64', id=None),
 'span_text': Value(dtype='string', id=None),
 'story': Value(dtype='string', id=None),
 'turn_id': Value(dtype='int64', id=None)}

# Preprocessing

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["story"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        start_char = examples["span_start"][i]
        end_char = examples["span_start"][i] + len(examples["span_text"][i])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [11]:
tokenized_coqa = coqa.map(preprocess_function, batched=True, remove_columns=coqa["train"].column_names)

  0%|          | 0/109 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

# Model

In [18]:
model = AutoModelForQuestionAnswering.from_pretrained("alistvt/bert-squad-finetuned-coqa")
data_collator = default_data_collator

https://huggingface.co/alistvt/bert-squad-finetuned-coqa/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpea7df35e


Downloading:   0%|          | 0.00/675 [00:00<?, ?B/s]

storing https://huggingface.co/alistvt/bert-squad-finetuned-coqa/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/279c295b6ff7de05c51d2e4f9540d6dfe97bb0d9b49f738239054241ac46cbdb.e2da8c5176f5655042efe8a6c90357f0b61eb4bdb68e5cdecc54665ab19a938b
creating metadata file for /root/.cache/huggingface/transformers/279c295b6ff7de05c51d2e4f9540d6dfe97bb0d9b49f738239054241ac46cbdb.e2da8c5176f5655042efe8a6c90357f0b61eb4bdb68e5cdecc54665ab19a938b
loading configuration file https://huggingface.co/alistvt/bert-squad-finetuned-coqa/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/279c295b6ff7de05c51d2e4f9540d6dfe97bb0d9b49f738239054241ac46cbdb.e2da8c5176f5655042efe8a6c90357f0b61eb4bdb68e5cdecc54665ab19a938b
Model config BertConfig {
  "_name_or_path": "alistvt/bert-squad-finetuned-coqa",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dr

Downloading:   0%|          | 0.00/415M [00:00<?, ?B/s]

storing https://huggingface.co/alistvt/bert-squad-finetuned-coqa/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/aa80d46d3e83f533ef4dc89a022e3e75bf6ba39261f24b152c59869e9f767815.72b3902f10779ba7858d98207d2aa41d6e0c75226db983e7149655cda03a04e8
creating metadata file for /root/.cache/huggingface/transformers/aa80d46d3e83f533ef4dc89a022e3e75bf6ba39261f24b152c59869e9f767815.72b3902f10779ba7858d98207d2aa41d6e0c75226db983e7149655cda03a04e8
loading weights file https://huggingface.co/alistvt/bert-squad-finetuned-coqa/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/aa80d46d3e83f533ef4dc89a022e3e75bf6ba39261f24b152c59869e9f767815.72b3902f10779ba7858d98207d2aa41d6e0c75226db983e7149655cda03a04e8
All model checkpoint weights were used when initializing BertForQuestionAnswering.

All the weights of BertForQuestionAnswering were initialized from the model checkpoint at alistvt/bert-squad-finetuned-coqa.
If your task is similar to th

# Training script

In [19]:
model = model.to(device)
# set_seed(SEED)

In [20]:
training_args = TrainingArguments(
    "bert-squad-finetuned-coqa",
    
    # evaluation_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps=2000,

    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    save_steps=2000,
    load_best_model_at_end=True,

    num_train_epochs=2,

    weight_decay=0.01,
    push_to_hub=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_coqa['train'],
    eval_dataset=tokenized_coqa['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
/content/bert-squad-finetuned-coqa is already a clone of https://huggingface.co/alistvt/bert-squad-finetuned-coqa. Make sure you pull the latest changes with `repo.git_pull()`.


In [21]:
trainer.train()
# trainer.train()
trainer.push_to_hub()
# trainer.save_model()

***** Running training *****
  Num examples = 108647
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 13582


Step,Training Loss,Validation Loss
2000,2.3262,2.84288
4000,2.3328,2.841394
6000,2.3724,2.797816
8000,2.1817,2.859463
10000,2.2255,2.851664
12000,2.3237,2.803486


***** Running Evaluation *****
  Num examples = 7983
  Batch size = 16
Saving model checkpoint to bert-squad-finetuned-coqa/checkpoint-2000
Configuration saved in bert-squad-finetuned-coqa/checkpoint-2000/config.json
Model weights saved in bert-squad-finetuned-coqa/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in bert-squad-finetuned-coqa/checkpoint-2000/tokenizer_config.json
Special tokens file saved in bert-squad-finetuned-coqa/checkpoint-2000/special_tokens_map.json
tokenizer config file saved in bert-squad-finetuned-coqa/tokenizer_config.json
Special tokens file saved in bert-squad-finetuned-coqa/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 7983
  Batch size = 16
Saving model checkpoint to bert-squad-finetuned-coqa/checkpoint-4000
Configuration saved in bert-squad-finetuned-coqa/checkpoint-4000/config.json
Model weights saved in bert-squad-finetuned-coqa/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in bert-squad-finetuned-

Upload file pytorch_model.bin:   0%|          | 3.38k/415M [00:00<?, ?B/s]

Upload file runs/Jan26_19-51-41_59f8eff8939f/events.out.tfevents.1643226711.59f8eff8939f.78.0:  36%|###6      …

To https://huggingface.co/alistvt/bert-squad-finetuned-coqa
   0943f78..771f0cf  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Question Answering', 'type': 'question-answering'}}
To https://huggingface.co/alistvt/bert-squad-finetuned-coqa
   771f0cf..bd38b90  main -> main



'https://huggingface.co/alistvt/bert-squad-finetuned-coqa/commit/771f0cf033ef1c4aaee007b10f9ce95aa2e60070'

In [22]:
# Evaluation
results = {}
    
eval_output = trainer.evaluate()

output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(eval_output.keys()):
        print("  %s = %s"% (key, str(eval_output[key])))

results.update(eval_output)

print(results)


***** Running Evaluation *****
  Num examples = 7983
  Batch size = 16


***** Eval results *****
  epoch = 2.0
  eval_loss = 2.7978155612945557
  eval_runtime = 144.1292
  eval_samples_per_second = 55.388
  eval_steps_per_second = 3.462
{'eval_loss': 2.7978155612945557, 'eval_runtime': 144.1292, 'eval_samples_per_second': 55.388, 'eval_steps_per_second': 3.462, 'epoch': 2.0}


In [None]:
tokenizer.save_pretrained(MODEL_DIR)

tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/special_tokens_map.json


('/content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/tokenizer.json')

In [None]:
trainer.save_model(output_dir=MODEL_DIR)

Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/
Configuration saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/special_tokens_map.json


In [None]:
trainer.push_to_hub(MODEL_DIR)

Saving model checkpoint to bert-base-uncased-pretrained-mlm-coqa-stories-pretrain-finetuned-coqa-falttened
Configuration saved in bert-base-uncased-pretrained-mlm-coqa-stories-pretrain-finetuned-coqa-falttened/config.json
Model weights saved in bert-base-uncased-pretrained-mlm-coqa-stories-pretrain-finetuned-coqa-falttened/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-pretrained-mlm-coqa-stories-pretrain-finetuned-coqa-falttened/tokenizer_config.json
Special tokens file saved in bert-base-uncased-pretrained-mlm-coqa-stories-pretrain-finetuned-coqa-falttened/special_tokens_map.json


AttributeError: ignored

# Test model with data

In [23]:
def construct_answer(tokenizer, model, outputs, encoding):
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    all_tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())

    start_index = torch.argmax(start_logits)

    end_sorted = torch.argsort(end_logits, descending=True).squeeze().tolist()
    for i in end_sorted:
        if i+1 > start_index:
            end_index = i+1
            break

    if start_index < end_index:
        answer_tokens = all_tokens[start_index:end_index]
        answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))
    else: # TODO: a good condition for unknown
        answer = UNKNOWN
    return answer

In [None]:
def test_construct_answer():
    question, text = "who is Ali?", "Ali is a good student."

    encoding = tokenizer(question, text, return_tensors="pt").to(device)

    outputs = model(**encoding)
    # outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)

    answer = construct_answer(tokenizer, model, outputs, encoding)
    print(answer)

test_construct_answer()

ali is a good student


In [24]:
df_test = pd.read_pickle("/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa_flat_val_df.pkl")

predictions = []

for index, item in tqdm(df_test.iterrows()):
    try:
        question, text = item["question"], item["story"]

        encoding = tokenizer(question, text, return_tensors="pt").to(device)

        outputs = model(**encoding)
        # outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)

        answer = construct_answer(tokenizer, model, outputs, encoding)

        predictions.append(
            {
                "id": item["id"],
                "turn_id": item["turn_id"],
                "answer": answer
            }
        )
    except Exception as e:
        print(e)
        predictions.append(
            {
                "id": item["id"],
                "turn_id": item["turn_id"],
                "answer": UNKNOWN
            }
        )

0it [00:00, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


The size of tensor a (515) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (801) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (805) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (802) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (801) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (804) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (800) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (809) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (804) must match the size of tensor b (

In [28]:
with open("answers.json", 'w') as f:
    f.write(json.dumps(predictions))

In [26]:
import requests

evaluation_script = requests.get("https://nlp.stanford.edu/data/coqa/evaluate-v1.0.py").text
with open("evaluate-v1.0.py", 'w') as f:
    f.write(evaluation_script)

In [29]:
! python evaluate-v1.0.py --data-file "/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa-dev-v1.0.json" --pred-file "answers.json"

{
  "children_stories": {
    "em": 9.0,
    "f1": 32.9,
    "turns": 1425
  },
  "literature": {
    "em": 12.5,
    "f1": 31.9,
    "turns": 1630
  },
  "mid-high_school": {
    "em": 11.4,
    "f1": 31.5,
    "turns": 1653
  },
  "news": {
    "em": 14.2,
    "f1": 36.0,
    "turns": 1649
  },
  "wikipedia": {
    "em": 13.3,
    "f1": 38.6,
    "turns": 1626
  },
  "reddit": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "science": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "in_domain": {
    "em": 12.2,
    "f1": 34.2,
    "turns": 7983
  },
  "out_domain": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "overall": {
    "em": 12.2,
    "f1": 34.2,
    "turns": 7983
  }
}
