# Intro

In [1]:
!pip install nlp transformers datasets wandb
!apt install git-lfs

Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 7.0 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 77.3 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.18.1-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 78.8 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.9-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 61.9 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 80.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |███████████████████████████████

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [2]:
## IMPORTS
import os
import sys
import logging
from dataclasses import dataclass, field
import json
from typing import Dict, List, Optional

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import datasets

from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import default_data_collator, TrainingArguments, Trainer, EvalPrediction, set_seed


In [3]:
# MONITOR CPU and GPU

os.environ["WANDB_DISABLED"] = "true"

# os.environ["WANDB_DISABLED"] = "false"
# import wandb
# wandb.init()

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# GLOBAL VARIABLES

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

UNKNOWN = "unknown"
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/seminar/finetune-squad"

MODEL_NAME = "peggyhuang/bert-base-uncased-coqa"

MY_MLM_MODEL_NAME = f"{MODEL_NAME}-pretrained-mlm-coqa-stories"
MY_QA_MODEL_NAME = f"{MODEL_NAME}-pretrain-finetuned-coqa-falt"
MY_MLM_MODEL_PATH = f"alistvt/{MY_MLM_MODEL_NAME}"

SEED = 7

MAX_LENGTH = 512

In [5]:
print(device)

LOGS_DIR = os.path.join(BASE_DIR, "logs/")
MODEL_DIR = os.path.join(BASE_DIR, "model/")
OUTPUT_DIR = os.path.join(BASE_DIR, "output/")
TOKENIZER_DIR = os.path.join(BASE_DIR, "tokenizer/")
ANSWERS_DIR = os.path.join(BASE_DIR, "answers/")

DIRECTORIES = [LOGS_DIR, MODEL_DIR, OUTPUT_DIR, TOKENIZER_DIR]

for direc in DIRECTORIES:
    if not os.path.exists(direc):
        os.makedirs(direc)
        print(direc)

cuda


# Prepare Data

In [6]:
from datasets import load_dataset
coqa = load_dataset("alistvt/coqa-flat")

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Using custom data configuration alistvt--coqa-flat-859610cee67df2fb


Downloading and preparing dataset None/None (download: 16.39 MiB, generated: 209.24 MiB, post-processed: Unknown size, total: 225.63 MiB) to /root/.cache/huggingface/datasets/parquet/alistvt--coqa-flat-859610cee67df2fb/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/16.1M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/alistvt--coqa-flat-859610cee67df2fb/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
coqa['train'].features

{'bad_turn': Value(dtype='bool', id=None),
 'filename': Value(dtype='string', id=None),
 'id': Value(dtype='string', id=None),
 'index': Value(dtype='int64', id=None),
 'input_text': Value(dtype='string', id=None),
 'name': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'source': Value(dtype='string', id=None),
 'span_end': Value(dtype='int64', id=None),
 'span_start': Value(dtype='int64', id=None),
 'span_text': Value(dtype='string', id=None),
 'story': Value(dtype='string', id=None),
 'turn_id': Value(dtype='int64', id=None)}

# Preprocessing

In [14]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

loading file https://huggingface.co/peggyhuang/bert-base-uncased-coqa/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/fbc8ade909aca6e60cb518f6a852f0063dacb1a5031f7c5ff1b5026dc1753dca.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/peggyhuang/bert-base-uncased-coqa/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/b0cf3da1b35af76b3ec503bdf459a2c1f16cf21dfbc7b1ff4c0c941e4aedb9c2.f471bd2d72c48b932f7be40446896b7e97c3be406ee93abfb500399bc606c829
loading file https://huggingface.co/peggyhuang/bert-base-uncased-coqa/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/peggyhuang/bert-base-uncased-coqa/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/98b3fb7c60bbd5197227b8cc12b1a186406586dbf5d1089cb6705c43340d9ce5.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/

In [8]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["story"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        start_char = examples["span_start"][i]
        end_char = examples["span_start"][i] + len(examples["span_text"][i])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [9]:
tokenized_coqa = coqa.map(preprocess_function, batched=True, remove_columns=coqa["train"].column_names)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/109 [00:00<?, ?ba/s]

# Model

In [10]:
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
data_collator = default_data_collator

Downloading:   0%|          | 0.00/683 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/415M [00:00<?, ?B/s]

# Training script

In [11]:
model = model.to(device)
# set_seed(SEED)

In [12]:
training_args = TrainingArguments(
    'output',
    # evaluation_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps=1000,

    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    save_steps=1000,
    load_best_model_at_end=True,

    num_train_epochs=0.2,

    weight_decay=0.01,
    # push_to_hub=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_coqa['train'],
    eval_dataset=tokenized_coqa['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [13]:
trainer.train()
# trainer.train()
# trainer.push_to_hub()
# trainer.save_model()

***** Running training *****
  Num examples = 108647
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1359


Step,Training Loss,Validation Loss
1000,2.8268,3.012791


***** Running Evaluation *****
  Num examples = 7983
  Batch size = 16
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in output/checkpoint-1000/tokenizer_config.json
Special tokens file saved in output/checkpoint-1000/special_tokens_map.json


KeyboardInterrupt: ignored

In [26]:
# Evaluation
results = {}

eval_output = trainer.evaluate()

output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(eval_output.keys()):
        print("  %s = %s"% (key, str(eval_output[key])))

results.update(eval_output)

print(results)


***** Running Evaluation *****
  Num examples = 7983
  Batch size = 1


RuntimeError: ignored

In [None]:
tokenizer.save_pretrained(MODEL_DIR)

tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/special_tokens_map.json


('/content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/tokenizer.json')

In [None]:
trainer.save_model(output_dir=MODEL_DIR)

Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/
Configuration saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/seminar/pretrained-finetuning-bert/model/special_tokens_map.json


In [None]:
trainer.push_to_hub(MODEL_DIR)

Saving model checkpoint to bert-base-uncased-pretrained-mlm-coqa-stories-pretrain-finetuned-coqa-falttened
Configuration saved in bert-base-uncased-pretrained-mlm-coqa-stories-pretrain-finetuned-coqa-falttened/config.json
Model weights saved in bert-base-uncased-pretrained-mlm-coqa-stories-pretrain-finetuned-coqa-falttened/pytorch_model.bin
tokenizer config file saved in bert-base-uncased-pretrained-mlm-coqa-stories-pretrain-finetuned-coqa-falttened/tokenizer_config.json
Special tokens file saved in bert-base-uncased-pretrained-mlm-coqa-stories-pretrain-finetuned-coqa-falttened/special_tokens_map.json


AttributeError: ignored

# Test model with data

In [15]:
def construct_answer(tokenizer, model, outputs, encoding):
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    all_tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())

    start_index = torch.argmax(start_logits)

    end_sorted = torch.argsort(end_logits, descending=True).squeeze().tolist()
    for i in end_sorted:
        if i+1 > start_index:
            end_index = i+1
            break

    if start_index < end_index:
        answer_tokens = all_tokens[start_index:end_index]
        answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))
    else: # TODO: a good condition for unknown
        answer = UNKNOWN
    return answer

In [1]:
def test_construct_answer():
    question, text = "who is Ali?", "Ali is a good student."

    encoding = tokenizer(question, text, return_tensors="pt").to(device)

    outputs = model(**encoding)
    # outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)

    answer = construct_answer(tokenizer, model, outputs, encoding)
    print(answer)

test_construct_answer()

NameError: ignored

In [34]:
from transformers import pipeline
# question_answering = pipeline("question-answering", model=model.to("cpu"), tokenizer=tokenizer)
question_answering = pipeline("question-answering", model=model.to(device), tokenizer=tokenizer, device=0)

In [45]:
result = question_answering(question="Which name is also used to describe the Amazon rainforest in English?", context="""The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.""")
result



{'answer': 'Amazonia or the Amazon Jungle',
 'end': 230,
 'score': 0.12015396356582642,
 'start': 201}

In [64]:
predictions[1]['answer']

'in a barn near a farm house'

In [89]:
df_test = pd.read_pickle("/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa_flat_val_df.pkl")
ds_val = datasets.Dataset.from_pandas(df_test)

predictions = []
x = 0
for item in tqdm(ds_val):
    question, text = item["question"], item["story"]

    result = question_answering(question=question, context=text)

    answer = result['answer']
    
    predictions.append(
        {
            "id": item["id"],
            "turn_id": item["turn_id"],
            "answer": answer
        }
    )

  0%|          | 0/7983 [00:00<?, ?it/s]

  return array(a, dtype, copy=False, order=order)


In [90]:
with open("answers.json", 'w') as f:
    f.write(json.dumps(predictions))

In [73]:
# import requests

# evaluation_script = requests.get("http://downloads.cs.stanford.edu/nlp/data/coqa/drqa-pgnet-coqa-dev-hist1.txt.json").text
# with open("answers.json", 'w') as f:
#     f.write(evaluation_script)


In [70]:
import requests

evaluation_script = requests.get("https://nlp.stanford.edu/data/coqa/evaluate-v1.0.py").text
with open("evaluate-v1.0.py", 'w') as f:
    f.write(evaluation_script)

In [91]:
! python evaluate-v1.0.py --data-file "/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa-dev-v1.0.json" --pred-file "answers.json"

{
  "children_stories": {
    "em": 41.2,
    "f1": 65.3,
    "turns": 1425
  },
  "literature": {
    "em": 30.7,
    "f1": 53.5,
    "turns": 1630
  },
  "mid-high_school": {
    "em": 33.4,
    "f1": 57.6,
    "turns": 1653
  },
  "news": {
    "em": 32.0,
    "f1": 58.6,
    "turns": 1649
  },
  "wikipedia": {
    "em": 36.4,
    "f1": 65.3,
    "turns": 1626
  },
  "reddit": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "science": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "in_domain": {
    "em": 34.6,
    "f1": 59.9,
    "turns": 7983
  },
  "out_domain": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "overall": {
    "em": 34.6,
    "f1": 59.9,
    "turns": 7983
  }
}
