In [1]:
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    print(squad_dict)
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('data/trainUK!_2000.json')
val_contexts, val_questions, val_answers = read_squad('data/valUK!_2000.json')

{'version': 'v2.0', 'data': [{'title': 'ryshbdwiwn', 'paragraphs': [{'qas': [{'question': 'Claims Paid TP?', 'id': '6xfkvbv', 'answers': [{'text': '71842', 'answer_start': 18}], 'is_impossible': 'false'}, {'question': 'Claims Paid FT?', 'id': '7rmsbtrv', 'answers': [{'text': '4489595092', 'answer_start': 29}], 'is_impossible': 'false'}, {'question': 'Claims Paid AD?', 'id': '6sjlgsfi', 'answers': [{'text': '43778', 'answer_start': 45}], 'is_impossible': 'false'}, {'question': 'Claims Outstanding TP?', 'id': '5xehidbzqc', 'answers': [{'text': '646083377', 'answer_start': 77}], 'is_impossible': 'false'}, {'question': 'Claims Outstanding FT?', 'id': '7wqbabff', 'answers': [{'text': '45085', 'answer_start': 92}], 'is_impossible': 'false'}, {'question': 'Claims Outstanding AD?', 'id': '5lz', 'answers': [{'text': '67', 'answer_start': 103}], 'is_impossible': 'false'}], 'context': 'Claims Paid TP : 71842 FT : 4489595092 AD : 43778 ! Claims Outstanding TP : 646083377 FT : 45085 AD : 67 ! '}, {

{'version': 'v2.0', 'data': [{'title': 'flv', 'paragraphs': [{'qas': [{'question': 'Claims Paid FT?', 'id': '9qmktseocbr', 'answers': [{'text': '4129782387', 'answer_start': 18}, {'text': '4129782387', 'answer_start': 18}, {'text': '4129782387', 'answer_start': 18}], 'is_impossible': 'false'}, {'question': 'Claims Paid TP?', 'id': '4inxtdregaj', 'answers': [{'text': '93', 'answer_start': 34}, {'text': '93', 'answer_start': 34}, {'text': '93', 'answer_start': 34}], 'is_impossible': 'false'}, {'question': 'Claims Paid AD?', 'id': '6erekdmjzs', 'answers': [{'text': '47296305', 'answer_start': 42}, {'text': '47296305', 'answer_start': 42}, {'text': '47296305', 'answer_start': 42}], 'is_impossible': 'false'}, {'question': 'Claims Outstanding FT?', 'id': '8sitwao', 'answers': [{'text': '0155644703', 'answer_start': 77}, {'text': '0155644703', 'answer_start': 77}, {'text': '0155644703', 'answer_start': 77}], 'is_impossible': 'false'}, {'question': 'Claims Outstanding TP?', 'id': '6gejfidem', 

In [2]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [3]:
!pip install transformers



You should consider upgrading via the 'c:\users\ashwin\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.





In [4]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

In [5]:
from transformers import (WEIGHTS_NAME, DistilBertConfig,DistilBertForQuestionAnswering, DistilBertTokenizer)
from transformers import DistilBertTokenizerFast

In [6]:

path ="model"
tokenizer = DistilBertTokenizerFast.from_pretrained(path)

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [8]:
train_encodings 


{'input_ids': [[101, 4447, 3825, 1056, 2361, 1024, 6390, 2620, 20958, 3027, 1024, 4008, 2620, 2683, 28154, 12376, 2683, 2475, 4748, 1024, 4724, 2581, 2581, 2620, 999, 4447, 5151, 1056, 2361, 1024, 4185, 16086, 2620, 22394, 2581, 2581, 3027, 1024, 10332, 27531, 4748, 1024, 6163, 999, 102, 4447, 3825, 1056, 2361, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 4447, 3825, 1056, 2361, 1024, 6390, 2620, 20958, 3027, 1024, 4008, 2620, 2683, 28154, 12376, 2683, 2475, 4748, 1024, 4724, 2581, 2581, 2620, 999, 4447, 5151, 1056, 2361, 1024, 4185, 16086, 2620, 22394, 2581, 2581, 3027, 1024, 10332, 27531, 4748, 1024, 6163, 999, 102, 4447, 3825, 3027, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 4447, 3825, 1056, 2361, 1024, 6390, 2620, 20958, 3027, 1024, 4008, 2620, 2683, 28154, 12376, 2683, 2475, 4748, 1024, 4724, 2581, 2581, 2620, 999, 4447, 5151, 1056, 2361, 1024, 4185, 16086, 2620, 22394, 2581, 2581, 3027, 1024, 10332, 27531, 4748, 1024, 6163, 999, 102, 4447, 3825, 4748, 1029, 102, 0, 0, 0, 0,

In [9]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [10]:
for x in train_encodings["input_ids"][:5]:
    print(tokenizer.decode(x))

[CLS] claims paid tp : 71842 ft : 4489595092 ad : 43778! claims outstanding tp : 646083377 ft : 45085 ad : 67! [SEP] claims paid tp? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] claims paid tp : 71842 ft : 4489595092 ad : 43778! claims outstanding tp : 646083377 ft : 45085 ad : 67! [SEP] claims paid ft? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] claims paid tp : 71842 ft : 4489595092 ad : 43778! claims outstanding tp : 646083377 ft : 45085 ad : 67! [SEP] claims paid ad? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] claims paid tp : 71842 ft : 4489595092 ad : 43778! claims outstanding tp : 646083377 ft : 45085 ad : 67! [SEP] claims outstanding tp? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] claims paid tp : 71842 ft : 4489595092 ad : 43778! claims outstanding tp : 646083377 ft : 45085 ad : 67! [SEP] claims outstanding ft? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [11]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [12]:
path_to_model ="model/pytorch_model.bin"
# print(".............",path)
config = DistilBertConfig.from_pretrained(path + "/config.json")
# tokenizer = DistilBertTokenizer.from_pretrained(path, do_lower_case=self.do_lower_case)
model = DistilBertForQuestionAnswering.from_pretrained(path_to_model, from_tf=False, config=config)

In [13]:
batch_size=2

In [18]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"test_transferUK",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    save_steps=10000,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
)

In [19]:
from transformers import default_data_collator

data_collator = default_data_collator

In [20]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


In [18]:
trainer.save_model("test-squad-trained5")