Bonus Attempted
1. Automatic learning rate decay
2. Changing doc stride - preprocessing
3. Automatic mixed precision
4. Gradient accumulation
5. Post processing
6. Other pretraining models

In [1]:
import torch
import csv
import json
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from transformers import BertForQuestionAnswering, BertTokenizer
import torch.optim.lr_scheduler as lr_scheduler
import warnings
from transformers import logging

warnings.filterwarnings("ignore")
logging.set_verbosity_error()

In [None]:
def preprocess_data(squad_file_path):
    with open(file_path, 'r') as f:
        squad_dict = json.load(f)
    squad_data = squad_dict['data']
    
    rows = []
    for article in squad_data:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                answer = qa['answers'][0]['text']
                answer_start = qa['answers'][0]['answer_start']
                answer_end = answer_start + len(answer)
                rows.append([str(context), str(question), str(answer), answer_start, answer_end])
    return rows


def save_to_csv(file_path, rows):
    row_count = 0
    with open(file_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['context', 'question', 'answer', 'answer_start', 'answer_end'])
        for row in rows:
            try:
                writer.writerow(row)
            except:
                row_count = row_count + 1

In [None]:
squad_train_data_rows = preprocess_squad_data('spoken_train-v1.1.json')
save_to_csv('squad_train_data.csv', squad_train_data_rows)

squad_test_data_rows = preprocess_squad_data('spoken_test-v1.1.json')
save_to_csv('squad_test_data.csv', test_data_rows)

In [6]:
print(torch.cuda.is_available())

True


# DataLoading

In [2]:
batch_size = 8
num_epochs = 3
learning_rate = 2e-5

In [3]:
# Define the dataset
class Dataset_Squad(Dataset):
    def __init__(self, squad_file_path):
        self.contexts = []
        self.questions = []
        self.answers = []
        self.answer_starts = []
        self.answer_ends = []

        with open(squad_file_path, 'r', encoding='cp1252') as f:
            reader = csv.reader(f)
            next(reader)
            for row in reader:
                self.contexts.append(row[0])
                self.questions.append(row[1])
                self.answers.append(row[2])
                self.answer_starts.append(int(row[3]))
                self.answer_ends.append(int(row[4]))

    
    def __len__(self):
        return len(self.contexts)

    
    def __getitem__(self, idx):
        return {
            'context': self.contexts[idx],
            'question': self.questions[idx],
            'answer': self.answers[idx],
            'answer_start': self.answer_starts[idx],
            'answer_end': self.answer_ends[idx]
        }

In [4]:
train_dataset = Dataset_Squad('squad_train_data.csv')
print(f'Length of Train dataset {len(train_dataset)}')

test_dataset = Dataset_Squad('squad_test_data.csv')
print(f'Length of Test dataset {len(test_dataset)}')

Length of Train dataset 37079
Length of Test dataset 5351


In [5]:
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
print(f'Length of Train dataloader {len(train_data_loader)}')

test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
print(f'Length of Train dataloader {len(test_data_loader)}')

Length of Train dataloader 4635
Length of Train dataloader 669


# Train function

In [6]:
from torch.cuda.amp import autocast, GradScaler

def model_train(model, data_loader, optimizer, device, accumulation_steps):
    model.train()
    model.to(device)
    total_loss = 0
    
    scaler = GradScaler()  # initialize the GradScaler object
    batch_counter = 0

    for data in data_loader:
        # Move data to device
        inputs = tokenizer(
            data['context'],
            data['question'],
            return_tensors='pt',
            padding=True,
            truncation=True,
            stride=128,
            max_length=512
        )

        inputs = {key: val.to(device) for key, val in inputs.items()}
        
        start_positions = data['answer_start'].to(device)
        end_positions = data['answer_end'].to(device)

        # Clear gradients
        optimizer.zero_grad()
        
        with autocast():  # enable automatic mixed precision
            outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
        
        scaler.scale(loss).backward()  # scale the loss and perform backward pass
        
        batch_counter += 1
        if batch_counter % accumulation_steps == 0:
            scaler.step(optimizer)  # update the model weights
            scaler.update()  # update the GradScaler for the next iteration
            optimizer.zero_grad()  # clear gradients
            
        loss_val = loss.item()
        
        if str(loss_val) == 'nan':
            loss_val = 0

        total_loss += loss_val

    if batch_counter % accumulation_steps != 0:
        scaler.step(optimizer)  # update the model weights
        scaler.update()  # update the GradScaler for the next iteration
        optimizer.zero_grad()  # clear gradients
    
    return total_loss / len(data_loader)

# Test function

In [7]:
def model_test(model, data_loader, optimizer, device):    
    valid_loss = 0.0
    model.eval()
    model.to(device)
    
    scaler = GradScaler()  # initialize the GradScaler object
    
    for data in data_loader:
        # Move data to device
        inputs = tokenizer(
            data['context'],
            data['question'],
            return_tensors='pt',
            padding=True,
            truncation=True,
            stride=128,
            max_length=512
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}
        
        start_positions = data['answer_start'].to(device)
        end_positions = data['answer_end'].to(device)

        # Clear gradients
        optimizer.zero_grad()
        
        with autocast():  # enable automatic mixed precision
            outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
        
        loss_val = loss.item()
        
        if str(loss_val) == 'nan':
            loss_val = 0
        
        valid_loss += loss_val

    return valid_loss / len(data_loader)


# Prediction

In [24]:
def predict_result(model, tokenizer, context, question):
    # Tokenize inputs
    inputs = tokenizer(context, question, return_tensors='pt', padding=True, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    input_ids = inputs['input_ids'].squeeze()
    
    # Forward pass
    output = model(**inputs)

    # Get predicted answer
    start_idx = torch.argmax(output.start_logits)
    end_idx = torch.argmax(output.end_logits) + 1
    
    # what if end index < start index
    if end_idx < start_idx:
        # Swap the indices if end_idx is less than start_idx
        start_idx, end_idx = end_idx, start_idx
    
    answer = tokenizer.decode(input_ids[start_idx:end_idx])
    return answer

# Training & Evaluation - BERT Model

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_length=512)
bert_model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Initialize the optimizer
optimizer = torch.optim.AdamW(bert_model.parameters(), lr=learning_rate)

# Define learning rate scheduler
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

cuda


In [10]:
bert_model = bert_model.to(device)
for epoch in range(num_epochs):
    train_loss = model_train(bert_model, train_data_loader, optimizer, device, accumulation_steps=2)
    test_loss = model_test(bert_model, test_data_loader, optimizer, device)
    
    print(f'Epoch {epoch+1} ---> train loss {train_loss} ---> test loss {test_loss}')
    
    # Update learning rate
    scheduler.step(test_loss)

Epoch 1 ---> train loss 5.3849138663681835 ---> test loss 5.4383845165349625
Epoch 2 ---> train loss 5.307152661587942 ---> test loss 5.3353072385260525
Epoch 3 ---> train loss 5.089821841312972 ---> test loss 5.265667626319623


In [37]:
def predict_answer(model, tokenizer, context, question):
    # Tokenize inputs
    #inputs = tokenizer(context, question, return_tensors='pt', padding=True, truncation=True)
    inputs = tokenizer(
            context,
            question,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512
        )
    inputs = {key: val.to(device) for key, val in inputs.items()}
    input_ids = inputs['input_ids'].squeeze()
    
    # Forward pass
    output = model(**inputs)

    # Get predicted answer
    start_idx = torch.argmax(output.start_logits)
    end_idx = torch.argmax(output.end_logits) + 1
    
    if end_idx < start_idx:
        # Swap the indices if end_idx is less than start_idx
        start_idx, end_idx = end_idx, start_idx
    
    answer = tokenizer.decode(input_ids[start_idx:end_idx])
    return answer

In [44]:
context = "The quick brown fox jumps over the lazy dog."
question = "What does the fox jump over?"
answer = predict_answer(bert_model, tokenizer, context, question)
print(answer) 

the lazy dog


In [43]:
context = '''this main building and the library collection was entirely destroyed by a fire in april eighteen 
             seventy nine and the school closed immediately and students were sent home. the university founder 
             f r. soaring and the president at the time the rent. william corby immediately plan for the 
             rebuilding of the structures that have housed virtually the entire university. 
             construction was started on the seventeenth of may and by the incredible zeal of a administrator 
             and workers the building was completed before the fall semester of eighteen seventy nine. 
             the library collection was also rebuilt in spain housed in the new main building for years 
             afterwards. around the time of the fire the music hall was opened. eventually becoming known as 
             washington hall a hosted placing musical act put on by the school. by eighteen eighty as 
             science program was established at the university and a science hall today lafferty in student 
             center was built in eighteen eighty three. the hall housed multiple classrooms in science labs 
             needed for early research at the university.'''

question = "What was the music hall at Notre Dame called?"

answer = predict_answer(bert_model, tokenizer, context, question)

print(answer) 

washington


# Training & Evaluation - DistilBert Model

In [40]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

# Initialize the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased', max_length=512)
distilbert_model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased')

# Initialize the optimizer
optimizer = torch.optim.AdamW(distilbert_model.parameters(), lr=learning_rate)

# Define learning rate scheduler
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

distilbert_model = distilbert_model.to(device)

for epoch in range(num_epochs):
    train_loss = model_train(distilbert_model, train_data_loader, optimizer, device, accumulation_steps=2)
    test_loss = model_test(distilbert_model, test_data_loader, optimizer, device)
    print(f'Epoch {epoch+1} ---> train loss {train_loss} ---> test loss {test_loss}')

    scheduler.step(test_loss)


Epoch 1 ---> train loss 5.510384187986421 ---> test loss 5.601940179263173
Epoch 2 ---> train loss 5.566739558144809 ---> test loss 5.58954277665804
Epoch 3 ---> train loss 5.526833188829555 ---> test loss 5.491155148978012


In [45]:
context = "The quick brown fox jumps over the lazy dog."
question = "What does the fox jump over?"
answer = predict_answer(distilbert_model, tokenizer, context, question)
print(answer) 

the lazy dog


In [46]:
context = '''this main building and the library collection was entirely destroyed by a fire in april eighteen 
             seventy nine and the school closed immediately and students were sent home. the university founder 
             f r. soaring and the president at the time the rent. william corby immediately plan for the 
             rebuilding of the structures that have housed virtually the entire university. 
             construction was started on the seventeenth of may and by the incredible zeal of a administrator 
             and workers the building was completed before the fall semester of eighteen seventy nine. 
             the library collection was also rebuilt in spain housed in the new main building for years 
             afterwards. around the time of the fire the music hall was opened. eventually becoming known as 
             washington hall a hosted placing musical act put on by the school. by eighteen eighty as 
             science program was established at the university and a science hall today lafferty in student 
             center was built in eighteen eighty three. the hall housed multiple classrooms in science labs 
             needed for early research at the university.'''

question = "What was the music hall at Notre Dame called?"

answer = predict_answer(distilbert_model, tokenizer, context, question)

print(answer) 

washington
