In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 68.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.3


In [None]:
import transformers
import pandas as pd
import numpy as np
import torch
import json
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
from sklearn.metrics import f1_score

In [None]:
import json
with open("/content/quote_to_squad_train.json", "r") as read_it:
     input = json.load(read_it)

list_mp = []
for i in range(len(input['data'])):
    temp = {
        "answers":
            {
                "text":"",
                "answer_start":0,
            },
        "context":"",
        "questions":""
        }
    temp["answers"]["text"] = input["data"][i]['paragraphs'][0]['qas'][0]["answers"][0]['text']
    temp["answers"]["answer_start"] = input["data"][i]['paragraphs'][0]['qas'][0]["answers"][0]['answer_start']
    temp["context"]=input["data"][i]['paragraphs'][0]['context']
    temp["questions"] = input["data"][i]['paragraphs'][0]['qas'][0]["question"]
    list_mp.append(temp)

In [None]:
with open("/content/gandhi.json", "r") as read_it:
     data = json.load(read_it)

list_mp_val = []
for key,val in data.items():
    temp = {
    "answers":
        {
            "text":"",
            "answer_start":0,
            "answer_end":0
        },
    "context":"",
    "questions":""
    }
    temp["answers"]["text"] = val["quote"]
    temp["answers"]["answer_start"] = val["start_index"]
    temp["answers"]["answer_end"] = val["end_index"]
    temp["questions"] = key
    temp["context"] = val["paragraph"]
    list_mp_val.append(temp)

In [None]:
len(list_mp)

10253

In [None]:
len(list_mp_val)

526

In [None]:
def read_quotus(list_mp):
    contexts = []
    questions = []
    answers = []
    for val in list_mp:
        contexts.append(val["context"])
        questions.append(val["questions"])
        answers.append(val["answers"])
    return contexts, questions, answers

In [None]:
train_contexts, train_questions, train_answers = read_quotus(list_mp)
val_contexts, val_questions, val_answers = read_quotus(list_mp_val)

In [None]:
# Import generic wrappers
from transformers import AutoTokenizer 


# Define the model repo
model_name = "SpanBERT/spanbert-base-cased" 

tokenizer = AutoTokenizer.from_pretrained(model_name,model_max_length=512)


train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

Downloading config.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

In [None]:
def add_end_idx(answers, contexts):
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        # we already know the start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        # ...however, sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] == gold_text:
            # if the answer is not off :)
            answer['answer_end'] = end_idx
        else:
            # this means the answer is off by 1-2 tokens
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n
add_end_idx(train_answers, train_contexts)

In [None]:
def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        # append start/end token position using char_to_token method
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        # end position cannot be found, char_to_token found space, so shift position until found
        shift = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# apply function to our data
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [None]:
import torch

class QuotusDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# build datasets for both our training and validation sets
train_dataset = QuotusDataset(train_encodings)
val_dataset = QuotusDataset(val_encodings)

In [None]:
from transformers import AutoModelForQuestionAnswering
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Downloading pytorch_model.bin:   0%|          | 0.00/205M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model
model.train()
# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = AdamW(model.parameters(), lr=5e-5)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(6):
    # set model to train mode
    model.train()
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # train model on batch and return outputs (incl. loss)
        outputs = model(input_ids, token_type_ids = token_type_ids,attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # extract loss
        loss = outputs[0]
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 641/641 [17:19<00:00,  1.62s/it, loss=0.158]
Epoch 1: 100%|██████████| 641/641 [17:25<00:00,  1.63s/it, loss=0.0207]
Epoch 2: 100%|██████████| 641/641 [17:25<00:00,  1.63s/it, loss=0.264]
Epoch 3: 100%|██████████| 641/641 [17:27<00:00,  1.63s/it, loss=0.00329]
Epoch 4: 100%|██████████| 641/641 [17:26<00:00,  1.63s/it, loss=0.0023]
Epoch 5: 100%|██████████| 641/641 [17:25<00:00,  1.63s/it, loss=0.00197]


In [None]:
model_path = 'models/spanbert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('models/spanbert-custom/tokenizer_config.json',
 'models/spanbert-custom/special_tokens_map.json',
 'models/spanbert-custom/vocab.txt',
 'models/spanbert-custom/added_tokens.json',
 'models/spanbert-custom/tokenizer.json')

In [None]:
# switch model out of training mode
model.eval()
# initialize validation set data loader
val_loader = DataLoader(val_dataset, batch_size=16)
# initialize list to store accuracies
acc = []
# loop through batches
for batch in val_loader:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        # we will use true positions for accuracy calc
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        #print(input_ids[0])
        # make predictions
        outputs = model(input_ids, token_type_ids = token_type_ids,attention_mask=attention_mask)
        # pull prediction tensors out and argmax to get predicted tokens
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # calculate accuracy for both and append to accuracy list
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
# calculate average accuracy in total
acc = sum(acc)/len(acc)
print(acc)

0.16084956727696187


In [None]:
num = 30

question = val_questions[num]
context = val_contexts[num]

inputs = tokenizer.encode_plus(question, context, return_tensors='pt')
inputs.to(device)
outputs = model(**inputs)

answer_start = torch.argmax(outputs[0])  
answer_end = torch.argmax(outputs[1])  

# answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))
answer=tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end + 1])
answer=tokenizer.convert_tokens_to_string(answer)

print("###" + question)
print("===" + context)
print(val_answers[num])
print(answer_start)
print(answer_end)
print("$$$Predicted Answer: " + str(answer))

###They have to fight until the end.
===That a State reported to be advanced should work in opposition to progressive opinion is a sad commentary on its "advanced" state. The progressives really won the battle from a moral standpoint. Whilst it is regrettable that 22 should have voted against the use by the so-called untouchables of the public road in question, it is consoling to find that 21 members vindicated the position taken by the Hindu reformers by voting in favour of their resolution. The saddest part of the letter, however, is the fact that the satyagrahis seem to be losing hope. I do not wonder. Theirs is the first experience of sustained satyagraha. Let me, however, assure them that victory is assured. For their cause is just, their means non-violent. Let them realize, too, that by their sufferings they have attracted the attention of the world. Whoever knew Vykom before the struggle commenced? They should also know that they are fighting an age-long superstition. What is a 

In [None]:
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))
def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [None]:
## Calculting average f1 score 
f1 = 0
for id in range(len(val_questions)):
    question = val_questions[id]
    context = val_contexts[id]

    inputs = tokenizer.encode_plus(question, context, return_tensors='pt',max_length=512)
    inputs.to(device)
    outputs = model(**inputs)

    # print(outputs.__dict__)
    answer_start = torch.argmax(outputs[0])  # get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(outputs[1]) + 1 

    predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

    f1 += compute_f1(predicted_answer, val_answers[id]['text'])
    
print(f1/len(val_questions) * 100)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


17.473691385271433


In [None]:
!pip install ml_metrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ml_metrics
  Downloading ml_metrics-0.1.4.tar.gz (5.0 kB)
Building wheels for collected packages: ml-metrics
  Building wheel for ml-metrics (setup.py) ... [?25l[?25hdone
  Created wheel for ml-metrics: filename=ml_metrics-0.1.4-py3-none-any.whl size=7845 sha256=23ef58810ca1320bc60ccb81d8ee2aa901c19f99ca8701b4b1f7a8be3d277879
  Stored in directory: /root/.cache/pip/wheels/56/41/5b/0c6d42b3604a5c823d8922564c4708f84962fa7f2f4facfa6d
Successfully built ml-metrics
Installing collected packages: ml-metrics
Successfully installed ml-metrics-0.1.4


In [None]:
import ml_metrics as metrics

In [None]:
answers_starts = []
answers_ends = []
actual_starts , actual_ends = [],[]
n = len(val_questions)
for id in range(n):
    question = val_questions[id]
    context = val_contexts[id]

    actual_starts.append([val_answers[id]["answer_start"]])
    actual_ends.append([val_answers[id]["answer_end"]])

    inputs = tokenizer.encode_plus(question, context, return_tensors='pt',max_length=512)
    inputs.to(device)
    outputs = model(**inputs)

    # print(outputs.__dict__)
    answer_start = torch.argmax(outputs[0])  # get the most likely beginning of answer with the argmax of the score
    #print(f"answer start {answer_start}")
    answers_starts.append(torch.topk(outputs[0], 5)[1][0])
    # print(f"predicted answer starts {answers_starts}")
    # print(f"actual answer starts {actual_starts}")
    answer_end = torch.argmax(outputs[1])
    #print(f"answer end {answer_end}")
    answers_ends.append(torch.topk(outputs[1], 5)[1][0])
    # print(f"answer ends {answers_ends}")
    # print(f"actual answer ends {actual_ends}")

In [None]:
metrics.mapk(actual_starts,answers_starts,5)

0.0025348542458808617

In [None]:
metrics.mapk(actual_ends,answers_ends,5)

0.0