In [47]:
# from huggingface_hub import notebook_login
# notebook_login()

from datasets import load_dataset
import pandas as pd

# Work on the first 5,000 examples of the SQuAD training split to keep the run small.
squad = load_dataset("squad", split="train[:5000]")
# Hold out 20% as a test split. The seed is fixed so that Restart & Run All
# reproduces the same partition (and therefore the same downstream rows).
squad = squad.train_test_split(test_size=0.2, seed=42)


def get_start_end(train):
    """Compute character-level answer spans for every row of ``train``.

    Only the first entry of each row's ``answers['answer_start']`` and
    ``answers['text']`` lists is used.

    Args:
        train: Any mapping/frame whose ``'answers'`` entry iterates over dicts
            with ``'answer_start'`` and ``'text'`` list fields.

    Returns:
        dict with two parallel lists:
            ``"start"`` - the answer's starting character offset.
            ``"end"``   - ``start`` plus the length of the whitespace-stripped
                          answer text, i.e. an exclusive end offset.
    """
    # Collect (start offset, stripped answer length) once per row.
    spans = [
        (record['answer_start'][0], len(record['text'][0].strip()))
        for record in train['answers']
    ]

    return {
        "start": [begin for begin, _ in spans],
        "end": [begin + length for begin, length in spans],
    }

In [43]:
# Materialise the training split as a pandas DataFrame (one column per dataset field).
data_dict = squad["train"].to_dict()
df = pd.DataFrame.from_dict(data_dict)

# Carve fixed-size working sets out of the frame: rows 0-999 for training and
# rows 1000-1499 for validation. `.copy()` detaches them from `df`.
# NOTE(review): validation rows come from the *train* split; squad["test"] is
# not used in the cells visible here - confirm that this is intended.
train = df.iloc[:1000, :].copy()
val = df.iloc[1000:1500, :].copy()

In [48]:
# Character-level answer spans ({"start": [...], "end": [...]}) for each working set,
# computed in the same order as before: train first, then val.
train_answers, val_answers = (get_start_end(frame) for frame in (train, val))

In [3]:
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the checkpoint with a question-answering head from a local path.
# The recorded output below warns that 'qa_outputs.weight' and 'qa_outputs.bias'
# were newly initialised, so the model must be fine-tuned before use.
model = AutoModelForQuestionAnswering.from_pretrained("models/RoBERT.pth")
# Matching tokenizer, also loaded from a local path.
tokenizer = AutoTokenizer.from_pretrained("models/Tokenizer_RoBERT.pth/")

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at models/RoBERT.pth and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [93]:
# Encode (context, question) pairs - context is the FIRST sequence, question the second.
# Sequences are truncated to at most 512 tokens and padded to a common length.
questions = list(map(str.strip, train["question"]))
context = list(map(str.strip, train["context"]))
train_encodings = tokenizer(
    context,
    questions,
    max_length=512,
    truncation=True,
    padding=True,
)

# Same encoding settings for the validation rows. `questions` and `context`
# are rebound here, so after this cell they hold the validation texts.
questions = list(map(str.strip, val["question"]))
context = list(map(str.strip, val["context"]))
val_encodings = tokenizer(
    context,
    questions,
    max_length=512,
    truncation=True,
    padding=True,
)

In [94]:
# Sanity check: token index that char_to_token reports for the first training
# example's answer start character (batch index 0).
train_encodings.char_to_token(0, train['answers'][0]['answer_start'][0])

296

In [95]:
# Raw character offset of the first training example's answer, for comparison
# with the token index looked up from it.
train['answers'][0]['answer_start'][0]

750

In [96]:
# Sanity check: token index reported for the first example's answer end offset.
# NOTE(review): get_start_end computes 'end' as start + len(text), i.e. one past
# the last answer character, so this may resolve to the token *after* the answer.
train_encodings.char_to_token(0, train_answers['end'][0])

301

In [115]:
def add_token_positions(encodings, answers, ignore_index=512):
    """Attach token-level answer spans to ``encodings`` in place.

    For every example ``i`` the character span in ``answers`` is converted to
    token indices with ``encodings.char_to_token(i, char_index)`` and stored
    under the ``'start_positions'`` / ``'end_positions'`` keys.

    Args:
        encodings: Tokenizer output exposing ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``.
        answers: Dict with parallel ``'start'`` and ``'end'`` lists of
            character offsets. ``'end'`` is treated as *exclusive*
            (``start + len(text)``, as produced by ``get_start_end``), so the
            last answer character is ``end - 1``.
        ignore_index: Label written when a position cannot be mapped to a
            token (for example when the answer was truncated away). The
            default of 512 equals the ``max_length`` used for tokenization.
            NOTE(review): this relies on the QA model clamping out-of-range
            positions and excluding them from the loss - confirm against the
            model's documentation.

    Returns:
        None. ``encodings`` is mutated.
    """
    start_positions = []
    end_positions = []

    for i, (start_char, end_char) in enumerate(zip(answers['start'], answers['end'])):
        start = encodings.char_to_token(i, start_char)
        if start is None:
            start = ignore_index

        # Resolve the LAST character of the answer (end_char is exclusive).
        # If that character has no token, step back one character at a time,
        # but never past the answer start. The bound keeps the search from
        # running forever and from latching onto a token outside the answer
        # when the answer has been truncated out of the encoding.
        end = None
        for char_index in range(end_char - 1, start_char - 1, -1):
            end = encodings.char_to_token(i, char_index)
            if end is not None:
                break
        if end is None:
            end = ignore_index

        start_positions.append(start)
        end_positions.append(end)

    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })
    

In [116]:
# Mutates both encodings in place: each gains 'start_positions' and
# 'end_positions' entries derived from the character-level answer spans.
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [117]:
# Confirm that the label fields were added alongside the tokenizer outputs.
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [118]:
# Number of token ids in the first encoded training example (after padding/truncation).
len(train_encodings['input_ids'][0])

512

In [119]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one encoded example at a time.

    ``encodings`` must expose an ``input_ids`` attribute (used for the length)
    and an ``items()`` method yielding ``(field name, per-example sequence)``
    pairs.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One dataset item per row of input ids.
        return len(self.encodings.input_ids)

    def __getitem__(self, x):
        # Build a {field name: tensor} dict from the x-th entry of every field.
        item = {}
        for name, column in self.encodings.items():
            item[name] = torch.tensor(column[x])
        return item

In [120]:
# Smoke test: wrap the training encodings, fetch the first item and list its fields.
next(iter(SquadDataset(train_encodings))).keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [121]:
# Wrap each set of encodings so a DataLoader can index into it.
train_dataset, val_dataset = SquadDataset(train_encodings), SquadDataset(val_encodings)

In [122]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

In [123]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Move the weights to the chosen device and switch to training mode
# before handing the parameters to the optimizer.
model.to(device)
model.train()

# NOTE(review): AdamW here is the one imported from `transformers`, which the
# library has deprecated in favour of `torch.optim.AdamW` - consider migrating.
optim = AdamW(model.parameters(), lr=5e-5)



In [124]:
# Batches of 2 examples, reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
# NOTE(review): the validation loader is shuffled too, and is not consumed by
# any cell visible here - confirm whether shuffling is intended for evaluation.
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=True)

In [125]:
# Fine-tune for three passes over the training loader.
for epoch in range(3):
    loop = tqdm(train_loader)
    for batch in loop:
        # Drop gradients accumulated by the previous step.
        optim.zero_grad()

        # Move every tensor the model needs onto the training device.
        input_ids, attention_mask, start_positions, end_positions = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'start_positions', 'end_positions')
        )

        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )

        # The first output is used as the loss (labels were passed to the model).
        loss = outputs[0]
        loss.backward()
        optim.step()

        # Show the epoch number and the latest batch loss on the progress bar.
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|█████████████████████████████████████████████████████████████| 500/500 [00:46<00:00, 10.69it/s, loss=4.5]
Epoch 1: 100%|████████████████████████████████████████████████████████████| 500/500 [00:46<00:00, 10.64it/s, loss=3.79]
Epoch 2: 100%|████████████████████████████████████████████████████████████| 500/500 [00:47<00:00, 10.62it/s, loss=2.87]
