In [1]:
import os

In [2]:
from datasets import get_dataset_config_names, load_dataset

In [3]:
# Column names of a flattened question-answering table (presumably the SQuAD
# layout -- TODO confirm; the list is not referenced by the cells shown here).
qa_cols = [
    "title",
    "question",
    "answers.text",
    "answers.answer_start",
    "context",
]

In [4]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from transformers import *

# Load Test

In [5]:
# https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
# Read every test essay into a DataFrame with one row per file:
#   id   -> file name without the ".txt" extension
#   text -> full essay text
test_dir = '../input/feedback-prize-2021/test'
test_names, test_texts = [], []
# Sorted so the row order is reproducible (os.listdir order is arbitrary).
for f in sorted(os.listdir(test_dir)):
    essay_id, extension = os.path.splitext(f)
    if extension != '.txt':
        # Skip anything that is not an essay file (e.g. checkpoint folders).
        continue
    # The context manager closes the handle even if reading fails.
    with open(os.path.join(test_dir, f), 'r', encoding='utf-8') as essay_file:
        test_texts.append(essay_file.read())
    test_names.append(essay_id)
test = pd.DataFrame({'id': test_names, 'text': test_texts})

In [6]:
# Rename the essay-text column from "text" to "context", the column name that
# preprocess_examples reads when it builds (question, context) pairs.
test.columns = ["id", "context"]

In [7]:
# Discourse element types; each one is later used as the "question" that is
# asked against every essay.
labels = [
    'Lead',
    'Position',
    'Evidence',
    'Claim',
    'Concluding Statement',
    'Counterclaim',
    'Rebuttal',
]

In [8]:
# Pair every essay with every discourse label: attach the full label list to
# each row, then explode it so each (essay, label) combination gets its own row.
test['question'] = [labels] * len(test)
test = test.explode('question')

# Fine-tuned QA checkpoint. Despite the ".h5" suffix this path is used as a
# directory (config.json lives inside it).
# Hub alternative: "distilbert-base-cased-distilled-squad"
model_checkpoint = "../input/q-a-pytorch/model.h5"
config_model = "../input/q-a-pytorch/model.h5/config.json"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
# Integer id for each discourse label, numbered in listing order (0-6).
target_map = {
    label_name: label_id
    for label_id, label_name in enumerate((
        'Lead',
        'Position',
        'Evidence',
        'Claim',
        'Concluding Statement',
        'Counterclaim',
        'Rebuttal',
    ))
}


In [10]:
# Tokenization window used by preprocess_examples:
#   max_length -> length every feature is truncated/padded to
#   stride     -> value handed to the tokenizer's `stride` argument for
#                 overflowing features
max_length, stride = 512, 128
def preprocess_examples(examples):
    """Tokenize a batch of (question, context) pairs into fixed-length QA features.

    Every question is whitespace-stripped and paired with its context.
    Truncation is restricted to the second sequence (the context); text that
    does not fit is emitted as extra overflowing features, and every feature
    is padded to `max_length`.

    Relies on the notebook-level `tokenizer`, `max_length` and `stride`.

    Args:
        examples: Batch mapping with "question", "context" and "id" entries,
            each an indexable sequence holding one item per example.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" removed and an
        "example_id" list added. "example_id" holds, for each feature, the
        "id" of the example that feature was produced from. "offset_mapping"
        is returned exactly as the tokenizer produced it (question/padding
        positions are not masked out).
    """
    # strip removes leading and trailing whitespace
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One example can yield several features; sample_map[i] is the index of the
    # example that feature i came from.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    source_ids = examples["id"]
    inputs["example_id"] = [source_ids[sample_idx] for sample_idx in sample_map]
    return inputs

In [11]:
from datasets import Dataset
# Convert the exploded (essay, question) frame into a Hugging Face Dataset.
# The frame's index is carried along as the `__index_level_0__` column that
# shows up in the dataset summary.
dataset = Dataset.from_pandas(test)

In [12]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['id', 'context', 'question', '__index_level_0__'],
    num_rows: 35
})

In [13]:
# Tokenize the (question, context) rows in batches. Every original column is
# dropped so that only the fields returned by preprocess_examples remain.
# NOTE: despite its name, `train_dataset` holds the tokenized *test* features.
original_columns = dataset.column_names
train_dataset = dataset.map(
    preprocess_examples,
    batched=True,
    remove_columns=original_columns,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
from datasets import load_metric
# Load the "squad" metric (the recorded output shows it being downloaded).
# NOTE(review): `metric` is not used by any cell shown here -- confirm it is
# still needed before keeping this cell.
metric = load_metric("squad")

Downloading:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

In [15]:
import torch
from transformers import AutoModelForQuestionAnswering

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
# Load the fine-tuned question-answering model and switch it to evaluation
# mode; `eval()` returns the module itself, so the result is bound directly.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint).eval()
# Last expression of the cell: display the model architecture.
model

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            

In [17]:
from transformers import TrainingArguments

# Trainer configuration. In the cells shown, the Trainer is only used for
# `predict`, so the optimisation settings (learning rate, epochs, weight decay)
# are kept as-is but do not affect inference.
args = TrainingArguments(
    "distilbert-base-cased-distilled-squad",  # output directory
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed precision is only supported on CUDA devices; requesting it on a
    # CPU-only machine makes TrainingArguments raise, so enable it only when a
    # GPU is present.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

In [18]:
from transformers import Trainer

# Wrap the model in a Trainer so batched prediction can be run through
# `trainer.predict`. `train_dataset` holds the tokenized test features.
trainer_kwargs = {
    "model": model,
    "args": args,
    "train_dataset": train_dataset,
    "tokenizer": tokenizer,
}
trainer = Trainer(**trainer_kwargs)

Using amp fp16 backend


In [19]:
def get_predictions(start_logits, end_logits, offsets, context, question):
    """Decode the single most likely answer span from one feature's QA logits.

    The start token is the argmax of the start distribution and the end token
    is the argmax of the end distribution. The span score is the product of
    those two probabilities.

    Args:
        start_logits: Tensor of start-position logits for one feature
            (assumed 1-D over tokens -- TODO confirm against caller).
        end_logits: Tensor of end-position logits, same layout as
            `start_logits`.
        offsets: Sequence of offset mappings. Candidates are zipped with it,
            and since there is exactly one candidate only the first mapping is
            used. Each mapping is indexable by token position and yields a
            (start_char, end_char) pair.
        context: Text the character offsets refer to.
        question: Question/label for this feature; stored unchanged in the
            result.

    Returns:
        A list with one dict holding "answer", "start", "end", "score" and
        "question". The list is empty when `offsets` is empty. "answer" is
        `context[start:end]`, which is empty if the predicted end precedes the
        predicted start.
    """
    start_probabilities = torch.nn.functional.softmax(start_logits, dim=-1)
    end_probabilities = torch.nn.functional.softmax(end_logits, dim=-1)

    start_idx = start_probabilities.argmax().item()
    end_idx = end_probabilities.argmax().item()
    # Score the span that is actually returned: P(start) * P(end) at the chosen
    # positions (not the maximum of the element-wise product, which describes
    # a different, single-token span).
    score = start_probabilities[start_idx] * end_probabilities[end_idx]
    candidates = [(start_idx, end_idx, score)]

    results = []
    for (start_token, end_token, span_score), offset in zip(candidates, offsets):
        start_char, _ = offset[start_token]
        _, end_char = offset[end_token]
        results.append({
            "answer": context[start_char:end_char],
            "start": start_char,
            "end": end_char,
            "score": span_score,
            "question": question,
        })
    return results

In [20]:
def get_predictionstring(result, text=None):
    """Convert a character-level answer span into space-separated word indices.

    Args:
        result: Dict with at least "answer" (the predicted text) and "start"
            (character offset of the answer within the source text).
        text: Source text the offsets refer to. When omitted, the
            notebook-level `context` variable is used, which keeps existing
            one-argument calls working.

    Returns:
        "" for an empty answer. Otherwise the indices of the whitespace-
        separated words covered by the answer, each followed by a single space
        (e.g. "2 3 ").
    """
    pred = result['answer']
    if len(pred) == 0:
        return ""
    # `context` is only looked up when no text was passed explicitly.
    source_text = context if text is None else text
    prefix = source_text[:result['start']]
    start_index = len(prefix.split())
    # If the span begins in the middle of a word, that word has already been
    # counted as part of the prefix; step back so the index points at it.
    if prefix and not prefix[-1].isspace() and not pred[0].isspace():
        start_index -= 1
    end_index = start_index + len(pred.split())
    return "".join(str(i) + " " for i in range(start_index, end_index))

In [21]:
# Run batched inference over the tokenized test features. `outputs.predictions`
# is a pair of arrays (indexed as [0] = start logits, [1] = end logits by the
# draft decoding loop at the end of the notebook).
outputs = trainer.predict(train_dataset)

The following columns in the test set  don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping.
***** Running Prediction *****
  Num examples = 70
  Batch size = 8


In [22]:
# Inspect the raw prediction arrays returned by trainer.predict.
outputs.predictions

(array([[  2.324 ,   0.618 ,  -0.7573, ...,  -7.69  ,  -9.664 ,  -0.7573],
        [ 10.016 ,  -5.242 ,   0.4333, ..., -11.06  , -11.055 , -11.    ],
        [  3.164 ,  -4.562 ,  -5.438 , ...,  -9.375 ,  -8.75  ,  -4.055 ],
        ...,
        [ -7.004 , -10.55  , -11.09  , ..., -12.266 , -12.23  , -12.22  ],
        [ -7.535 , -10.266 , -10.98  , ..., -11.44  , -11.47  , -11.4   ],
        [ -8.66  , -10.016 , -10.01  , ..., -11.28  , -11.26  , -11.21  ]],
       dtype=float16),
 array([[  1.756  ,  -3.951  ,  -0.01736, ...,  -9.266  ,  -6.363  ,
          -0.01733],
        [ 10.45   ,  -7.254  ,  -3.514  , ..., -11.51   , -11.49   ,
         -11.555  ],
        [  2.584  ,  -8.43   ,  -8.55   , ...,  -6.11   ,  -8.945  ,
          -1.827  ],
        ...,
        [ -4.52   , -10.85   , -10.53   , ..., -10.65   , -10.75   ,
         -10.766  ],
        [ -6.723  ,  -8.66   ,  -8.63   , ..., -11.37   , -11.37   ,
         -11.42   ],
        [ -8.586  , -10.52   , -10.57   , ..., -11

In [23]:
from torch.utils.data import DataLoader

# from transformers import default_data_collator
# Two loaders that each yield one item per batch: the raw
# (id, context, question) rows and their tokenized features.
# NOTE(review): they appear intended to be iterated side by side -- confirm
# the two datasets line up one-to-one before relying on that.
loader_batch_size = 1
dataloader = DataLoader(dataset, batch_size=loader_batch_size)
tokenized_loader = DataLoader(train_dataset, batch_size=loader_batch_size)

In [24]:
# index = 0
# for batch, t_batch in zip(dataloader, tokenized_loader):
#     question = batch['question']
#     context = batch['context']
#     text_id = batch['id']
#     inputs, offset_mapping = t_batch["input_ids"], t_batch["offset_mapping"]
#     start_logits, end_logits = torch.Tensor(outputs.predictions[0][index]), torch.Tensor(outputs.predictions[1][index])
#     results = get_predictions(start_logits, end_logits, inputs, offset_mapping, context)