# Intro

In [1]:
!pip install nlp
!pip install transformers
!pip install datasets
!pip install wandb



In [2]:
## IMPORTS
import os
import sys
import logging
from dataclasses import dataclass, field
import json
from typing import Dict, List, Optional

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import datasets

from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import default_data_collator, TrainingArguments, Trainer, EvalPrediction, set_seed


In [3]:
# MONITOR CPU and GPU

os.environ["WANDB_DISABLED"] = "true"

# import wandb
# wandb.init()

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# GLOBAL VARIABLES

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

UNKNOWN = "unknown"
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert"

DATASET_TRAIN_DIR = "/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa_train_ds_512_tokenized"
DATASET_TEST_DIR = "/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa_val_ds_512_tokenized"

MODEL_NAME = "bert-base-uncased"

SEED = 7

MAX_LENGTH = 512

In [6]:
print(device)

LOGS_DIR = os.path.join(BASE_DIR, "logs/")
MODEL_DIR = os.path.join(BASE_DIR, "model/")
OUTPUT_DIR = os.path.join(BASE_DIR, "output/")
TOKENIZER_DIR = os.path.join(BASE_DIR, "tokenizer/")
ANSWERS_DIR = os.path.join(BASE_DIR, "answers/")

DIRECTORIES = [LOGS_DIR, MODEL_DIR, OUTPUT_DIR, TOKENIZER_DIR]

for direc in DIRECTORIES:
    if not os.path.exists(direc):
        os.makedirs(direc)
        print(direc)

cuda


# Prepare Data

In [7]:
ds_train = datasets.load_from_disk(DATASET_TRAIN_DIR)
ds_val = datasets.load_from_disk(DATASET_TEST_DIR)

ds_train, ds_val

# cache the dataset, so we can load it directly for training

torch.save(ds_train, 'train_data.pt')
torch.save(ds_val, 'valid_data.pt')

# Model

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
data_collator = default_data_collator

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

# Training script

In [9]:
model = model.to(device)
set_seed(SEED)

In [10]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    
    # evaluation_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps=1000,

    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    save_steps=1000,
    load_best_model_at_end=True,

    num_train_epochs=4,

    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# trainer.train(resume_from_checkpoint=True)
trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 108647
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 27164


Step,Training Loss,Validation Loss
1000,3.527,3.24143
2000,3.2892,3.019825
3000,3.1558,2.90574
4000,3.1042,2.841635
5000,3.0311,2.798681
6000,3.0081,2.793006
7000,2.7994,2.829924
8000,2.5476,2.808648


***** Running Evaluation *****
  Num examples = 7983
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-1000
Configuration saved in /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 7983
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-2000
Configuration saved in /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-20

Step,Training Loss,Validation Loss
1000,3.527,3.24143
2000,3.2892,3.019825
3000,3.1558,2.90574
4000,3.1042,2.841635
5000,3.0311,2.798681
6000,3.0081,2.793006
7000,2.7994,2.829924
8000,2.5476,2.808648
9000,2.5188,2.813808


Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-9000
Configuration saved in /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-9000/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-9000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-9000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/seminar/finetuning-bert/output/checkpoint-9000/special_tokens_map.json


In [None]:
# dmodel = AutoModelForQuestionAnswering.from_pretrained(OUTPUT_DIR)
# del dmodel

In [None]:
# set_seed(SEED)

# training_args = TrainingArguments(
#     output_dir="output",
#     overwrite_output_dir = True,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     num_train_epochs = 1,
#     per_device_train_batch_size = 16,
#     per_device_eval_batch_size=8,
#     gradient_accumulation_steps = 8,    
#     load_best_model_at_end=True,
#     eval_steps=500,
#     disable_tqdm = False, 
#     seed=0,
#     # warmup_steps=200,
#     weight_decay=0.01,
#     logging_steps = 4,
#     learning_rate = 1e-4,
#     logging_dir='logs/',
#     run_name = 'finetuning-longformer-on-coqa-flat-no-mlm',
#     do_train = True,
#     prediction_loss_only=True,
#     # fp16 = True,
# ) 

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset,
#     data_collator=MyDataCollator(),
#     # compute_metrics 
# ) 

In [None]:
# Evaluation
results = {}
    
eval_output = trainer.evaluate()

output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(eval_output.keys()):
        print("  %s = %s"% (key, str(eval_output[key])))

results.update(eval_output)

print(results)


***** Eval results *****
  epoch = 0.01
  eval_loss = 5.485827445983887
  eval_runtime = 143.7961
  eval_samples_per_second = 55.516
  eval_steps_per_second = 3.47


# Test model with data

In [None]:
def construct_answer(tokenizer, model, outputs, encoding):
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    all_tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())

    start_index = torch.argmax(start_logits)

    end_sorted = torch.argsort(end_logits, descending=True).squeeze().tolist()
    for i in end_sorted:
        if i+1 > start_index:
            end_index = i+1
            break

    if start_index < end_index:
        answer_tokens = all_tokens[start_index:end_index]
        answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))
    else: # TODO: a good condition for unknown
        answer = UNKNOWN
    return answer

In [None]:
def test_construct_answer():
    question, text = "who is Ali?", "Ali is a good student."

    encoding = tokenizer(question, text, return_tensors="pt").to(device)

    outputs = model(**encoding)
    # outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)

    answer = construct_answer(tokenizer, model, outputs, encoding)
    print(answer)

test_construct_answer()

ali is a good student.


In [None]:
predictions = []

for index, item in tqdm(df_test.iterrows()):

    question, text = item["question"], item["story"]

    encoding = tokenizer(question, text, return_tensors="pt").to(device)

    outputs = model(**encoding)
    # outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)

    answer = construct_answer(tokenizer, model, outputs, encoding)

    predictions.append(
        {
            "id": item["id"],
            "turn_id": item["turn_id"],
            "answer": answer
        }
    )