In [1]:
import torch
import json
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.optim import AdamW
import jiwer
from sklearn.metrics import f1_score

# Set device to GPU if available, otherwise use CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def load_data_from_json(filepath):
    with open(filepath, 'rb') as file:
        dataset = json.load(file)
    passages, questions, answers = [], [], []

    for item in dataset['data']:
        for paragraph in item['paragraphs']:
            context = paragraph['context'].lower()
            for qa in paragraph['qas']:
                question = qa['question'].lower()
                for answer in qa['answers']:
                    passages.append(context)
                    questions.append(question)
                    answers.append(answer)

    return passages, questions, answers

def calculate_answer_bounds(answers, passages):
    for ans, passage in zip(answers, passages):
        ans['text'] = ans['text'].lower()
        ans['answer_end'] = ans['answer_start'] + len(ans['text'])

def process_and_tokenize(passages, questions, answers, tokenizer, max_len):
    tokenized_data = tokenizer(questions, passages, max_length=max_len, padding="max_length", truncation=True)
    start_positions, end_positions = [], []

    for i, (answer, passage) in enumerate(zip(answers, passages)):
        tokens = tokenized_data['input_ids'][i]
        start, end = answer['answer_start'], answer['answer_end']
        
        start_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(passage[:start]))
        end_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(passage[:end]))

        if not start_ids or not end_ids:
            start_positions.append(0)
            end_positions.append(0)
            continue

        midpoint = (start_ids[0] + end_ids[0]) // 2
        start_segment = max(0, min(midpoint - max_len // 2, len(tokens) - max_len))

        start_positions.append(start - start_segment)
        end_positions.append(end - start_segment)

    tokenized_data.update({'start_positions': start_positions, 'end_positions': end_positions})
    return tokenized_data

class CustomQADataset(Dataset):
    def __init__(self, tokenized_data):
        self.data = tokenized_data

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.data.items()}

    def __len__(self):
        return len(self.data['input_ids'])

# Load and process data for training and validation
train_passages, train_questions, train_answers = load_data_from_json('spoken_train-v1.1.json')
val_passages, val_questions, val_answers = load_data_from_json('spoken_test-v1.1.json')

# Calculate boundaries for answers in training and validation sets
calculate_answer_bounds(train_answers, train_passages)
calculate_answer_bounds(val_answers, val_passages)

MAX_TOKEN_LENGTH = 512
MODEL_NAME = "distilbert-base-uncased"

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# Tokenize data
train_encodings = process_and_tokenize(train_passages, train_questions, train_answers, tokenizer, MAX_TOKEN_LENGTH)
val_encodings = process_and_tokenize(val_passages, val_questions, val_answers, tokenizer, MAX_TOKEN_LENGTH)

train_dataset = CustomQADataset(train_encodings)
val_dataset = CustomQADataset(val_encodings)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1)

# Load model and optimizer
model = DistilBertForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training function
def train(model, data_loader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader, desc='Training'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_pos = batch['start_positions'].to(device)
        end_pos = batch['end_positions'].to(device)

        output = model(input_ids=input_ids, attention_mask=attention_mask, start_positions=start_pos, end_positions=end_pos)
        loss = output.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

# Validation function with F1 calculation
def validate(model, data_loader):
    model.eval()
    true_labels, predictions = [], []

    for batch in tqdm(data_loader, desc='Validation'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        true_start = batch['start_positions'].to(device)
        true_end = batch['end_positions'].to(device)
        
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        pred_start = torch.argmax(outputs.start_logits, dim=1)
        pred_end = torch.argmax(outputs.end_logits, dim=1)

        predictions.extend([(pred_start[i].item(), pred_end[i].item()) for i in range(len(true_start))])
        true_labels.extend([(true_start[i].item(), true_end[i].item()) for i in range(len(true_start))])

    # Compute F1 score
    true_start_labels = [x[0] for x in true_labels]
    true_end_labels = [x[1] for x in true_labels]
    pred_start_labels = [x[0] for x in predictions]
    pred_end_labels = [x[1] for x in predictions]

    f1_start = f1_score(true_start_labels, pred_start_labels, average="macro")
    f1_end = f1_score(true_end_labels, pred_end_labels, average="macro")
    overall_f1 = (f1_start + f1_end) / 2

    return overall_f1

# Training loop for 3 epochs
NUM_EPOCHS = 3
for epoch in range(NUM_EPOCHS):
    avg_train_loss = train(model, train_loader, optimizer)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Average Training Loss: {avg_train_loss}")

# Calculate final F1 score on validation set
final_f1_score = validate(model, val_loader)
print(f"Final F1 Score: {final_f1_score}")

  torch.utils._pytree._register_pytree_node(
Token indices sequence length is longer than the specified maximum sequence length for this model (593 > 512). Running this sequence through the model will result in indexing errors
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Training: 100%|██████████| 2320/2320 [09:01<00:00,  4.28it/s]


Epoch 1/3, Average Training Loss: 5.914451896938784


Training: 100%|██████████| 2320/2320 [09:03<00:00,  4.27it/s]


Epoch 2/3, Average Training Loss: 5.288564804607424


Training: 100%|██████████| 2320/2320 [09:03<00:00,  4.27it/s]


Epoch 3/3, Average Training Loss: 4.688990122079849


Validation: 100%|██████████| 15875/15875 [01:44<00:00, 152.05it/s]

Final F1 Score: 0.005956339899499583



