<a href="https://colab.research.google.com/github/bhattacharjee/mtu-nlp-assignment/blob/main/web-examples/bert_qa_rahul_agarwal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch -q -q -q
!pip install transformers -q -q -q
!pip install pytorch-nlp -q -q -q
!pip install datasets -q -q -q


In [2]:
from datasets import load_dataset, load_metric
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
#from datasets import load_dataset, load_metric
import random
from transformers import AutoTokenizer
import transformers
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
import torch
from transformers import default_data_collator
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Use cuda by default when a GPU is present; otherwise fall back to the CPU so the
# notebook still runs (setting a CUDA default tensor type without CUDA raises).
if torch.cuda.is_available():
    cuda = torch.device('cuda')
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    # The name `cuda` is kept for backward compatibility with any cell that refers to it.
    cuda = torch.device('cpu')



# Use the default BERT model fine-tuned on SQuAD for examples

In [None]:
import time

# The checkpoint name indicates a BERT-large model that has already been fine-tuned on SQuAD,
# so it is used here for question answering without any further training.
bert_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
model = AutoModelForQuestionAnswering.from_pretrained(bert_model_name)

text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""
questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]
for question in questions:
    # Time the whole round trip: tokenization, forward pass and decoding.
    t1 = time.monotonic()

    # Encode the (question, context) pair as a single sequence with special tokens.
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    # Inference only: no_grad avoids building the autograd graph for the forward pass.
    with torch.no_grad():
        pred = model(**inputs)
    answer_start_scores = pred['start_logits'][0]
    answer_end_scores = pred['end_logits'][0]

    # Most likely first answer token, and one past the most likely last answer token
    # (the +1 makes the end index exclusive so it can be used directly in a slice).
    answer_start = int(torch.argmax(answer_start_scores))
    answer_end = int(torch.argmax(answer_end_scores)) + 1

    # Turn the selected token ids back into a readable string.
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    )

    print(f"Time taken = {time.monotonic() - t1}")
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

# Load the SQUAD dataset

In [None]:
# Load the "squad" dataset; the result is indexed by split name further down
# (e.g. datasets["train"], and the "validation" split at training time).
# NOTE(review): this rebinds the name `datasets`, which the import cell bound to
# `torchvision.datasets` -- after this line `datasets` is the loaded dataset object.
datasets = load_dataset("squad")
def visualize(datasets, datatype = 'train', n_questions=10):
    """Print randomly chosen examples from one split of a question-answering dataset.

    Args:
        datasets: Mapping from split name to an indexable, sized collection of
            examples. Each example is read through its 'context', 'question' and
            'answers' keys; 'answers' is read through 'text' and 'answer_start'.
        datatype: Name of the split to sample from.
        n_questions: Number of examples to print. Indices are drawn with
            replacement, so the same example may be printed more than once.

    Returns:
        None. The examples are written to stdout, each followed by a dashed rule.
        Nothing is printed when the split is empty.
    """
    split = datasets[datatype]
    n = len(split)
    if n == 0:
        # random.choices raises IndexError on an empty population; there is nothing to show.
        return

    for i in random.choices(range(n), k=n_questions):
        # Fetch the row once instead of once per printed field.
        example = split[i]
        answers = example['answers']
        print(f"Context:{example['context']}")
        print(f"Question:{example['question']}")
        print(f"Answer:{answers['text']}")
        print(f"Answer Start in Text:{answers['answer_start']}")
        print("-"*100)
# Print a random sample of examples using the defaults of visualize
# (the 'train' split, 10 questions) as a quick sanity check of the data.
visualize(datasets)

# Preparing the dataset

In [7]:
# Dealing with long docs:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two parts of the context when splitting it

def prepare_train_features(examples):
    """Tokenize a batch of question/context pairs and label every feature with its answer span.

    Uses the module-level ``tokenizer``, ``max_length`` and ``doc_stride``. A long context
    is split into several overlapping features; each feature gets the token indices of the
    answer's first and last token, or the index of the CLS token when the example has no
    answer or the answer does not lie inside that feature's slice of the context.

    Args:
        examples: Batch with parallel "question", "context" and "answers" sequences.
            Each answers entry holds "text" and "answer_start" lists, of which only
            the first element is used.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and "offset_mapping"
        removed and "start_positions" / "end_positions" added (one entry per feature).
    """
    # Tokenize with truncation and padding, but keep the overflows using a stride, so one
    # example with a long context may yield several features whose contexts overlap.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Map from feature index to the index of the example it was cut from,
    # e.g. [0,1,2,2,2,3,...] when the third example was split into three features.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # Per feature, the (start_char, end_char) of every token in its original text.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Features without a usable answer are labelled with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # sequence_ids tells which tokens belong to the context (value 1).
        sequence_ids = tokenized_examples.sequence_ids(i)

        answers = examples["answers"][sample_mapping[i]]
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Character span of the answer inside the context.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token of the context within this feature.
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # Answer not fully contained in this feature's context window -> CLS label.
        if not (offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char):
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Walk inward to the answer's first and last token. Both scans are confined to the
        # context tokens: offsets of tokens outside the context (e.g. (0, 0) entries for
        # special/padding tokens) would otherwise satisfy the comparisons and push the
        # label past the context when the answer touches its edge.
        start_token = context_start
        while start_token <= context_end and offsets[start_token][0] <= start_char:
            start_token += 1
        start_positions.append(start_token - 1)

        end_token = context_end
        while end_token >= context_start and offsets[end_token][1] >= end_char:
            end_token -= 1
        end_positions.append(end_token + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

# Actually train the model

In [None]:
# Start from the plain (not QA-fine-tuned) BERT base checkpoint and fine-tune it on SQuAD.
model_checkpoint = "bert-base-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# prepare_train_features reads the global `tokenizer`, so it has to be rebound (above)
# before mapping. The original columns are dropped so only the tokenized features remain.
tokenized_datasets = datasets.map(prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)

args = TrainingArguments(
    "test-squad",  # output directory for checkpoints
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

data_collator = default_data_collator
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
# Save once, to the directory the evaluation cell loads from. (Previously the result of
# save_model -- None -- was fed back into a second save_model call, which wrote an extra
# copy into the default output directory.)
trainer.save_model("test-squad-trained")

# Using the fine-tuned model at inference time

In [None]:
# Reload the weights saved under "test-squad-trained" by the training cell; the global
# `tokenizer` (bound in the training cell) is reused for encoding and decoding.
model = AutoModelForQuestionAnswering.from_pretrained("test-squad-trained")
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch
"""
questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]
for question in questions:
    # Encode the (question, context) pair as a single sequence with special tokens.
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    # Inference only: no_grad avoids building the autograd graph for the forward pass.
    with torch.no_grad():
        pred = model(**inputs)
    answer_start_scores, answer_end_scores = pred['start_logits'][0], pred['end_logits'][0]

    # Most likely first answer token, and one past the most likely last answer token
    # (the +1 makes the end index exclusive so it can be used directly in a slice).
    answer_start = int(torch.argmax(answer_start_scores))
    answer_end = int(torch.argmax(answer_end_scores)) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")
