# Intro

In [None]:
!pip install nlp
!pip install transformers
!pip install datasets
!pip install wandb

In [None]:
# MONITOR CPU and GPU
import wandb
wandb.init()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
## IMPORTS
import os
import sys
import logging
from dataclasses import dataclass, field
import json
from typing import Dict, List, Optional

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import datasets

from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import default_data_collator, TrainingArguments, Trainer, EvalPrediction, set_seed


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# GLOBAL VARIABLES

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

UNKNOWN = "unknown"
BASE_PATH = "/content/drive/MyDrive/Colab Notebooks/seminar/pickle/11/"
DATASET_TRAIN_PATH = "/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa_train_ds_512_tokenized"
DATASET_TEST_PATH = "/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa_val_ds_512_tokenized"

ANSWERS_PATH = "/content/drive/MyDrive/Colab Notebooks/seminar/answers/{file_name}"

MODEL_NAME = "bert-base-uncased"

SEED = 7

max_length = 512

In [None]:
print(device)

cuda


In [None]:
LOGS_DIR = BASE_PATH + "logs/"
MODEL_DIR = BASE_PATH + "model/"
OUTPUT_DIR = BASE_PATH + "output/"
TOKENIZER_DIR = BASE_PATH + "tokenizer/"
DIRECTORIES = [LOGS_DIR, MODEL_DIR, OUTPUT_DIR, TOKENIZER_DIR]

In [None]:
for direc in DIRECTORIES:
    if not os.path.exists(direc):
        os.makedirs(direc)

# Prepare Data

In [None]:
ds_train = datasets.load_from_disk(DATASET_TRAIN_PATH)
ds_val = datasets.load_from_disk(DATASET_TEST_PATH)

In [None]:
ds_train, ds_val

(Dataset({
     features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions', 'token_type_ids'],
     num_rows: 108647
 }), Dataset({
     features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions', 'token_type_ids'],
     num_rows: 7983
 }))

In [None]:
# cache the dataset, so we can load it directly for training

# torch.save(dataset_train, 'train_data.pt')
# torch.save(dataset_val, 'valid_data.pt')

# Model

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
data_collator = default_data_collator

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

# Training script

In [None]:
model = model.to(device)
set_seed(SEED)

In [None]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [30]:
# trainer.train(resume_from_checkpoint=True)
trainer.train()

***** Running training *****
  Num examples = 108647
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 20373
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,2.9971,2.768387
2,2.6209,2.715732


Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/seminar/pickle/11/output/checkpoint-500
Configuration saved in /content/drive/MyDrive/Colab Notebooks/seminar/pickle/11/output/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/seminar/pickle/11/output/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/seminar/pickle/11/output/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/seminar/pickle/11/output/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/seminar/pickle/11/output/checkpoint-1000
Configuration saved in /content/drive/MyDrive/Colab Notebooks/seminar/pickle/11/output/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/seminar/pickle/11/output/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive

Epoch,Training Loss,Validation Loss
1,2.9971,2.768387
2,2.6209,2.715732
3,2.3394,2.765806




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=20373, training_loss=2.739230358213843, metrics={'train_runtime': 18755.562, 'train_samples_per_second': 17.378, 'train_steps_per_second': 1.086, 'total_flos': 8.516732618728858e+16, 'train_loss': 2.739230358213843, 'epoch': 3.0})

In [31]:
model.save_pretrained(MODEL_DIR + "3_epochs")

Configuration saved in /content/drive/MyDrive/Colab Notebooks/seminar/pickle/11/model/3_epochs/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/seminar/pickle/11/model/3_epochs/pytorch_model.bin


In [None]:
# set_seed(SEED)

# training_args = TrainingArguments(
#     output_dir="output",
#     overwrite_output_dir = True,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     num_train_epochs = 1,
#     per_device_train_batch_size = 16,
#     per_device_eval_batch_size=8,
#     gradient_accumulation_steps = 8,    
#     eval_steps=500,
#     disable_tqdm = False, 
#     seed=0,
#     load_best_model_at_end=True,
#     # warmup_steps=200,
#     weight_decay=0.01,
#     logging_steps = 4,
#     learning_rate = 1e-4,
#     logging_dir='logs/',
#     run_name = 'finetuning-longformer-on-coqa-flat-no-mlm',
#     do_train = True,
#     prediction_loss_only=True,
#     # fp16 = True,
# ) 

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset,
#     data_collator=MyDataCollator(),
#     # compute_metrics 
# ) 

In [None]:

# Evaluation
results = {}
if training_args.do_eval and training_args.local_rank in [-1, 0]:
    
    logger.info("*** Evaluate ***")

    eval_output = trainer.evaluate()

    output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(eval_output.keys()):
            logger.info("  %s = %s", key, str(eval_output[key]))
            writer.write("%s = %s\n" % (key, str(eval_output[key])))

    results.update(eval_output)
    
    print(results)


# Test model with data

In [None]:
predictions = []

for index, item in tqdm(df_test.iterrows()):

    question, text = item["question"], item["story"]

    encoding = model.tokenizer(question, text, return_tensors="pt").to(device)

    outputs = model(**encoding)
    # outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)

    answer = model.construct_answer(outputs, encoding)

    predictions.append(
        {
            "id": item["id"],
            "turn_id": item["turn_id"],
            "answer": answer
        }
    )