In [None]:
!mkdir squad
# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

In [None]:
import json
from pathlib import Path

def read_squad(path):
    """Load a SQuAD-format JSON file into three parallel lists.

    The file must hold a top-level ``data`` list whose entries contain
    ``paragraphs``; each paragraph has a ``context`` string and a ``qas``
    list, and each QA entry has a ``question`` and a list of ``answers``.

    Args:
        path: Location of the JSON file (``str`` or ``pathlib.Path``).

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists with
        one entry per answer. A question with several answers is repeated
        once per answer; a question whose ``answers`` list is empty
        contributes no entries.
    """
    path = Path(path)
    with path.open('rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    # The parsed file is deliberately not printed: echoing the whole dataset
    # floods the cell output and bloats the saved notebook.
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

# The dataset files live on Google Drive, so the drive has to be mounted before
# they are read; on a fresh kernel the reads below otherwise fail because
# nothing exists under /content/drive yet.
from google.colab import drive
drive.mount('/content/drive')

# Both splits sit in the same folder; name it once.
DATA_DIR = Path('/content/drive/MyDrive/BERT/data')
train_contexts, train_questions, train_answers = read_squad(DATA_DIR / 'train_UK_2word_same_frst_word_30k_with_extra_symbols.json')
val_contexts, val_questions, val_answers = read_squad(DATA_DIR / 'val_UK_2word_same_frst_word_30k_with_extra_symbols.json')

In [None]:
def add_end_idx(answers, contexts):
    """Add an ``answer_end`` character offset to every answer, in place.

    For each answer the annotated ``answer_start`` is tried first, then the
    positions one and two characters earlier. The first position at which
    ``context[start:end]`` equals the answer text wins, and ``answer_start``
    and ``answer_end`` are set to that span.

    If none of the three positions matches, ``answer_start`` is left as
    annotated and ``answer_end`` is set to ``answer_start + len(text)``, so
    every answer is guaranteed to carry an ``answer_end`` key afterwards.

    Args:
        answers: List of dicts with ``text`` and ``answer_start`` keys;
            modified in place.
        contexts: List of context strings, parallel to ``answers``.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Sometimes SQuAD answers are off by a character or two, so look for
        # the text at the annotated offset and up to two characters earlier.
        matched_shift = 0  # fallback: keep the annotated offsets
        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            # A negative start would silently index from the end of the string.
            if shifted_start >= 0 and context[shifted_start:end_idx - shift] == gold_text:
                matched_shift = shift
                break

        answer['answer_start'] = start_idx - matched_shift
        answer['answer_end'] = end_idx - matched_shift

# Attach end offsets to both splits (the answer dicts are modified in place).
for _split_answers, _split_contexts in (
    (train_answers, train_contexts),
    (val_answers, val_contexts),
):
    add_end_idx(_split_answers, _split_contexts)

In [None]:
!pip install transformers

In [None]:
# Tokenization settings.
# NOTE(review): neither value is passed to the tokenizer calls visible in this
# notebook — confirm whether they are still needed.
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two parts of the context when splitting is needed.

In [None]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Both splits are encoded with the same options: every context is paired with
# its question, with truncation and padding switched on.
encode_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(train_contexts, train_questions, **encode_options)
val_encodings = tokenizer(val_contexts, val_questions, **encode_options)

In [None]:


def add_token_positions(encodings, answers):
    """Store token-level answer spans on ``encodings``, in place.

    For the i-th answer, ``encodings.char_to_token`` is asked for the token
    holding the ``answer_start`` character and the token holding the last
    answer character (``answer_end - 1``). Whenever it returns ``None``
    (the answer passage has been truncated), the module-level
    ``tokenizer.model_max_length`` is recorded instead.

    The two resulting lists are written to ``encodings`` under the keys
    ``start_positions`` and ``end_positions``.

    Args:
        encodings: Tokenizer output providing ``char_to_token`` and ``update``.
        answers: List of dicts with ``answer_start`` and ``answer_end`` keys,
            in the same order as the encoded examples.
    """
    def _token_or_sentinel(sample_idx, char_idx):
        # Map one character offset to its token, or to the sentinel on None.
        token_idx = encodings.char_to_token(sample_idx, char_idx)
        return tokenizer.model_max_length if token_idx is None else token_idx

    starts, ends = [], []
    for sample_idx, answer in enumerate(answers):
        starts.append(_token_or_sentinel(sample_idx, answer['answer_start']))
        ends.append(_token_or_sentinel(sample_idx, answer['answer_end'] - 1))

    encodings.update({'start_positions': starts, 'end_positions': ends})

# Add start/end token positions to the encodings of both splits.
for _split_encodings, _split_answers in (
    (train_encodings, train_answers),
    (val_encodings, val_answers),
):
    add_token_positions(_split_encodings, _split_answers)

In [None]:
# Sanity check: decode the first five encoded training examples back to text.
preview_ids = train_encodings["input_ids"][:5]
for token_ids in preview_ids:
    print(tokenizer.decode(token_ids))

In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Dataset view over a tokenizer encodings object.

    Indexing returns a dict with one tensor per key of the encodings, built
    from that key's entry at the requested index. The length is the number
    of entries in ``encodings.input_ids``.
    """

    def __init__(self, encodings):
        # Kept by reference; nothing is copied or converted up front.
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

# Wrap each split's encodings in a SquadDataset.
train_dataset, val_dataset = (
    SquadDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings)
)

In [None]:
val_dataset[:10]

In [None]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [None]:
# path_to_model ="model/pytorch_model.bin"
# # print(".............",path)
# config = DistilBertConfig.from_pretrained(path + "/config.json")
# # tokenizer = DistilBertTokenizer.from_pretrained(path, do_lower_case=self.do_lower_case)
# model = DistilBertForQuestionAnswering.from_pretrained(path_to_model, from_tf=False, config=config)

In [None]:

batch_size=8

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration; train and eval share the same per-device batch size.
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/BERT/test-squad3",
    num_train_epochs=12,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_steps=100000,
)

In [None]:
from transformers import default_data_collator

data_collator = default_data_collator

In [None]:
# Bundle the model, training configuration, data splits, collator and
# tokenizer into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

In [None]:

trainer.train()

In [None]:
trainer.save_model("/content/drive/MyDrive/BERT/model_UK_2word_first_word_same_50k_random_with_symbol_batch8_epoch20")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
os.listdir("/content/drive/MyDrive")

In [None]:
# context = "The US has passed the peak on new coronavirus cases, " \
#           "President Donald Trump said and predicted that some states would reopen this month. " \
#           "The US has over 637,000 confirmed Covid-19 cases and over 30,826 deaths, the highest for any country in the world."

# question = "What was President Donald Trump's prediction?"

In [None]:
trainer.predict(val_dataset)

In [None]:
import torch

# Take only the first evaluation batch; next(iter(...)) states that directly
# instead of starting a loop just to break out of it.
batch = next(iter(trainer.get_eval_dataloader()))
batch = {k: v.to(trainer.args.device) for k, v in batch.items()}

# Switch to inference mode so dropout cannot perturb the logits inspected below.
trainer.model.eval()
with torch.no_grad():
    output = trainer.model(**batch)
output.keys()

In [None]:
output.start_logits.shape, output.end_logits.shape

In [None]:
output.start_logits.argmax(dim=-1), output.end_logits.argmax(dim=-1)

In [None]:
n_best_size = 20

In [None]:
import numpy as np

# Logits of the first example in the batch, as NumPy arrays on the CPU.
start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()

# Indices of the n_best_size highest logits, best first.
start_indexes = np.argsort(start_logits)[::-1][:n_best_size].tolist()
end_indexes = np.argsort(end_logits)[::-1][:n_best_size].tolist()

# Score every (start, end) pair whose start does not come after its end.
# TODO: also check that the span lies inside the context, and recover the
# original substring for "text" (left empty for now).
valid_answers = [
    {
        "score": start_logits[start_index] + end_logits[end_index],
        "text": "",
    }
    for start_index in start_indexes
    for end_index in end_indexes
    if start_index <= end_index
]

In [None]:
# NOTE(review): as written this cell does not look runnable. `val_dataset` is a
# SquadDataset (a torch.utils.data.Dataset subclass that defines neither a
# `map` method nor a `column_names` attribute), and `prepare_validation_features`
# is not defined anywhere in the visible notebook. The call appears to expect a
# Hugging Face `datasets.Dataset` — confirm, then supply those pieces or drop
# the cell.
validation_features = val_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=val_dataset.column_names
)

In [None]:
raw_predictions = trainer.predict(val_dataset)

In [None]:
# Show a short preview rather than echoing every validation context.
val_contexts[:5]

In [None]:
# start_logits = output.start_logits[0].cpu().numpy()
# end_logits = output.end_logits[0].cpu().numpy()
# # offset_mapping = val_dataset[0]["offset_mapping"]
# # The first feature comes from the first example. For the more general case, we will need to be match the example_id to
# # an example index
# context = val_contexts

# # Gather the indices the best start/end logits:
# start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
# end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
# valid_answers = []
# for start_index in start_indexes:
#     for end_index in end_indexes:
#         # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
#         # to part of the input_ids that are not in the context.
#         # if (
#         #     start_index >= len(offset_mapping)
#         #     or end_index >= len(offset_mapping)
#         #     or offset_mapping[start_index] is None
#         #     or offset_mapping[end_index] is None
#         # ):
#         #     continue
#         # Don't consider answers with a length that is either < 0 or > max_answer_length.
#         if end_index < start_index or end_index - start_index + 1 > max_answer_length:
#             continue
#         if start_index <= end_index: # We need to refine that test to check the answer is inside the context
#             start_char = offset_mapping[start_index][0]
#             end_char = offset_mapping[end_index][1]
#             valid_answers.append(
#                 {
#                     "score": start_logits[start_index] + end_logits[end_index],
#                     "text": context[start_char: end_char]
#                 }
#             )

# valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
# valid_answers

In [None]:
# from torch.utils.data import DataLoader
# from transformers import AdamW

# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# model.to(device)
# model.train()

# train_loader = DataLoader(train_dataset,batch_size=2, shuffle=True)
# print(train_loader)
# optim = AdamW(model.parameters(), lr=5e-5)
# print("len:",len(train_loader))
# for epoch in range(1,2):
#     print("epoch:",epoch)
#     i=0
#     for batch in train_loader:

#         # print(i)
#         # i+=1
#         optim.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         start_positions = batch['start_positions'].to(device)
#         end_positions = batch['end_positions'].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
#         loss = outputs[0]
#         loss.backward()
#         optim.step()

# model.eval()


In [None]:
model.save_pretrained("squad")

In [None]:
!pip install transformers


In [None]:
from transformers import BertConfig, BertModel,TFBertModel

In [None]:
# ./squad was written by save_pretrained on the DistilBERT question-answering
# model, so its config.json has to be read with the DistilBERT config class.
from transformers import DistilBertConfig
config = DistilBertConfig.from_json_file('./squad/config.json')

In [None]:
# ./squad holds a PyTorch DistilBERT question-answering checkpoint. Load it with
# the matching TensorFlow class, point at the directory (so config.json is found
# next to the weights) and pass from_pt=True to convert the PyTorch weights.
from transformers import TFDistilBertForQuestionAnswering
model = TFDistilBertForQuestionAnswering.from_pretrained('./squad', from_pt=True)

# New Section

# New Section