# Intro

In [1]:
!pip install nlp transformers datasets wandb
!apt install git-lfs

Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 53.5 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.18.0-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 55.9 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.9-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 39.2 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 46.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |███████████████████████████████

In [2]:
## IMPORTS
import os
import sys
import logging
from dataclasses import dataclass, field
import json
from typing import Dict, List, Optional

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import datasets

from transformers import (LongformerModel, LongformerTokenizer, LongformerPreTrainedModel, AutoTokenizer,
                          LongformerConfig, Trainer, TrainingArguments, EarlyStoppingCallback)
from transformers.models.longformer.modeling_longformer import LongformerQuestionAnsweringModelOutput
from transformers import LongformerForQuestionAnswering, LongformerTokenizerFast, EvalPrediction, AutoModelForQuestionAnswering

from transformers import (
    HfArgumentParser,
    DataCollator,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [3]:
# System (CPU/GPU) monitoring through Weights & Biases is switched off for this run.

os.environ["WANDB_DISABLED"] = "true"

# To turn it back on, set the variable to "false" and initialise wandb instead:
# os.environ["WANDB_DISABLED"] = "false"
# import wandb
# wandb.init()

In [4]:
# Mount Google Drive at /content/drive; every dataset path used by this notebook lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# GLOBAL VARIABLES

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

UNKNOWN = "unknown"

# Everything this notebook reads or writes on Drive sits below one project folder.
SEMINAR_DIR = "/content/drive/MyDrive/Colab Notebooks/seminar"
DATASET_DIR = SEMINAR_DIR + "/dataset"

BASE_DIR = SEMINAR_DIR + "/finetuning-longformer-1024"

# Pickled CoQA DataFrames: tokenized train/validation splits and the untokenized validation split.
DATASET_TRAIN_PATH = DATASET_DIR + "/coqa_flat_train_df_tokenized_reduced_1024.pkl"
DATASET_TEST_PATH = DATASET_DIR + "/coqa_flat_val_df_tokenized_reduced_1024.pkl"
DATAFRAME_TEST_UNTOKENIZED_PATH = DATASET_DIR + "/coqa_flat_val_df.pkl"

# Template: fill the placeholder with ANSWERS_PATH.format(file_name=...).
ANSWERS_PATH = SEMINAR_DIR + "/answers/{file_name}"

MODEL_NAME = "allenai/longformer-base-4096"
# NOTE(review): "falttened" looks like a typo for "flattened"; kept unchanged because
# the name may already be in use elsewhere - confirm before renaming.
MY_QA_MODEL_NAME = MODEL_NAME + "-finetuned-coqa-falttened"

SEED = 7

MAX_LENGTH = 1024

# Model

In [None]:
def qa(question,answer_text,model,tokenizer):
    """Extract the answer to ``question`` from ``answer_text``.

    The question/passage pair is tokenized and run through ``model``; the
    answer is decoded from the tokens between the highest-scoring start
    position and the highest-scoring end position (inclusive).

    Args:
        question: The question string.
        answer_text: The passage in which the answer is searched.
        model: Callable model whose output exposes ``start_logits`` and
            ``end_logits``; its inputs are moved to the module-level
            ``device``, so the model is expected to live there as well.
        tokenizer: Tokenizer that matches ``model``.

    Returns:
        The decoded answer string. It is empty when the predicted end
        position lies before the predicted start position.
    """
    # truncation="only_second" shortens only the passage (never the question)
    # when the pair exceeds the tokenizer's maximum length, instead of letting
    # an over-long input fail inside the model.
    inputs = tokenizer(
        question,
        answer_text,
        add_special_tokens=True,
        truncation="only_second",
        return_tensors="pt",
    ).to(device)
    input_ids = inputs["input_ids"][0].tolist()

    # Inference only: without no_grad every call would build an autograd graph
    # and hold on to its activations for nothing.
    with torch.no_grad():
        outputs = model(**inputs)

    # Most likely first token of the answer ...
    answer_start = int(torch.argmax(outputs.start_logits))
    # ... and most likely last token; +1 because the slice end is exclusive.
    answer_end = int(torch.argmax(outputs.end_logits)) + 1

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    )

    # NOTE(review): stripping "#" looks like a leftover from WordPiece ("##")
    # post-processing; it also deletes literal "#" characters from answers.
    # Kept so produced answers stay unchanged - confirm whether it is needed.
    return answer.replace("#", "")


In [None]:
# Tokenizer and question-answering model are loaded from one and the same
# checkpoint, so the name is spelled out only once.
QA_CHECKPOINT = "valhalla/longformer-base-4096-finetuned-squadv1"

tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

In [None]:
# Move the model to the selected device; qa() sends its tokenized inputs to the same device.
model = model.to(device)

# Test model with data

In [None]:
# Load the untokenized validation DataFrame from its pickle file.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
df_val = pd.read_pickle(DATAFRAME_TEST_UNTOKENIZED_PATH)
# head(0) displays the column names only, without printing any rows.
df_val.head(0)

Unnamed: 0,index,name,filename,id,source,story,turn_id,question,input_text,span_text,span_start,span_end,bad_turn


In [None]:
# Answer every validation question with the model and collect one record per
# row, in the layout that is written to answers.json afterwards.
predictions = []

# iterrows() is a generator without a length, so the row count is passed
# explicitly; otherwise tqdm can only show a running counter, not a progress
# bar with an ETA.
for _idx, item in tqdm(df_val.iterrows(), total=len(df_val), desc="Answering"):
    answer = qa(item["question"], item["story"], model, tokenizer)

    predictions.append(
        {
            "id": item["id"],
            "turn_id": item["turn_id"],
            "answer": answer,
        }
    )

0it [00:00, ?it/s]

In [None]:
# Persist the collected predictions as JSON; the evaluation script reads this file.
with open("answers.json", 'w') as answers_file:
    json.dump(predictions, answers_file)

# Evaluation


In [None]:
import requests

# Download the CoQA v1.0 evaluation script from nlp.stanford.edu and store it
# in the working directory.
EVAL_SCRIPT_URL = "https://nlp.stanford.edu/data/coqa/evaluate-v1.0.py"

# timeout: do not hang forever on a stalled connection.
response = requests.get(EVAL_SCRIPT_URL, timeout=60)
# Fail loudly on an HTTP error instead of saving the error page as a Python script.
response.raise_for_status()

evaluation_script = response.text
with open("evaluate-v1.0.py", 'w', encoding="utf-8") as f:
    f.write(evaluation_script)


In [None]:
# Score answers.json against the CoQA dev set with the downloaded evaluate-v1.0.py script.
! python evaluate-v1.0.py --data-file "/content/drive/MyDrive/Colab Notebooks/seminar/dataset/coqa-dev-v1.0.json" --pred-file "answers.json"

{
  "children_stories": {
    "em": 39.1,
    "f1": 48.6,
    "turns": 1425
  },
  "literature": {
    "em": 36.1,
    "f1": 44.2,
    "turns": 1630
  },
  "mid-high_school": {
    "em": 38.0,
    "f1": 47.9,
    "turns": 1653
  },
  "news": {
    "em": 42.6,
    "f1": 51.2,
    "turns": 1649
  },
  "wikipedia": {
    "em": 47.0,
    "f1": 56.1,
    "turns": 1626
  },
  "reddit": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "science": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "in_domain": {
    "em": 40.6,
    "f1": 49.6,
    "turns": 7983
  },
  "out_domain": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "overall": {
    "em": 40.6,
    "f1": 49.6,
    "turns": 7983
  }
}
