In [112]:
from transformers import BertTokenizer, BertForQuestionAnswering
from torch.utils.data import DataLoader
from datasets import load_dataset
import torch
from torch.optim import AdamW

In [133]:
device = "cuda" if torch.cuda.is_available() else "cpu"
kwargs = {'num_workers': 1, 'pin_memory': True} if device=='cuda' else {}

# Define model name and path
model_name = "bert-base-uncased"  # Adjust as needed

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Load SpokenSQUAD dataset
squad_dataset = load_dataset("squad", name="spoken_squad")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Found cached dataset parquet (/Users/andrewwright/.cache/huggingface/datasets/parquet/plain_text-57edf78d6033ac9a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [134]:
#squad_dataset["train"]

In [135]:
# Preprocess data function (example)
def preprocess_function(examples):
  #ids = examples["id"]
  question = examples["question"]
  context = examples["context"]
  #print(examples["answers"][0]["answer_start"])
  answer_start = [] 
  answer_end =[]
  for _, ans in enumerate(examples["answers"]):
    #print(ans["answer_start"])
    ans_start = ans["answer_start"][0]
    answer_start.append(ans_start)
    ans_end = ans_start + len(ans["text"][0].split())
    answer_end.append(ans_end)
  #print(answer_start)
  #print(examples["answers"][0]["text"])
  #print(answer_start[0])
  #print(int(answer_start[0]) + len(examples["answers"][0]["text"][0].split()))
  #answer_end =   answer_start + len(examples["answers"][0]["text"][0].split())  # Assuming single answer
  
  # Tokenize and convert to tensors
  encoding = tokenizer(question, context, padding="max_length", truncation=True , return_tensors="pt")
  start_positions = torch.tensor([answer_start], dtype=torch.long)
  #print(start_positions.shape)
  #print(start_positions.squeeze(0).shape)
  end_positions = torch.tensor([answer_end], dtype=torch.long)
  
  return {
      "input_ids": encoding["input_ids"],
      "attention_mask": encoding["attention_mask"],
      "start_positions": start_positions.squeeze(0),
      "end_positions": end_positions.squeeze(0),
  }

In [136]:
# Preprocess training and validation data
train_dataset = squad_dataset["train"].map(preprocess_function, batched=True)
validation_dataset = squad_dataset["validation"].map(preprocess_function, batched=True)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8, **kwargs)
validation_dataloader = DataLoader(validation_dataset, batch_size=8, **kwargs)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [161]:
# Define optimizer and learning rate scheduler (adjust parameters as needed)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
def train(epochs, device):
  for epoch in range(epochs):
    model.train()  # Set model to training mode
    for batch in train_dataloader:
      # Access data from batch
      input_ids = batch["input_ids"]
      print(input_ids)
      attention_mask = batch["attention_mask"]
      start_positions = batch["start_positions"]
      end_positions = batch["end_positions"]
      model_args = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "start_positions": start_positions,
        "end_positions": end_positions
    }
      # Forward pass
      outputs = model(**model_args)
      loss = outputs.loss  # Access loss from model outputs

      # Backward pass and optimize
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()
    
    # Perform validation after each epoch (optional)
    evaluate(validation_dataloader)

# Evaluation function
def evaluate(dataloader, device):
  model.eval()  # Set model to evaluation mode
  
  # Initialize variables for keeping track of metrics
  f1 = 0
  exact_match = 0
  total = 0
  
  for batch in dataloader:
    # Access data from batch
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    start_positions = batch["start_positions"]
    end_positions = batch["end_positions"]
    
    # Forward pass
    with torch.no_grad():  # Disable gradient calculation for evaluation
      outputs = model(**batch)
    
    # Extract predictions
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Implement your answer prediction logic here (e.g., beam search)
    # This example uses greedy decoding (replace with your preferred method)
    predicted_start_positions = torch.argmax(start_logits, dim=-1)
    predicted_end_positions = torch.argmax(end_logits, dim=-1)

    # Calculate metrics (replace with your preferred evaluation method)
    for i in range(len(batch)):
      # Assuming single answer per example
      predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(batch["input_ids"][i][predicted_start_positions[i]:predicted_end_positions[i]+1]))
      reference_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(batch["context"][i][start_positions[i]:end_positions[i]+1]))

      # Update metrics based on your chosen evaluation method (e.g., SQuAD)
      # Here, a simple string matching example
      if predicted_answer == reference_answer:
        exact_match += 1
      total += 1

  # Calculate F1 score (modify based on your chosen evaluation method)
  if total > 0:
    f1 = exact_match / total

  # Print evaluation metrics (modify to print desired metrics)
  print(f"F1 Score: {f1:.4f}, Exact Match: {exact_match}/{total}")

In [162]:
# Train and evaluate the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to GPU if available
train(epochs=1, device = device)  # Adjust number of epochs
evaluate(validation_dataloader, device = device)

# Save the fine-tuned model
model.save_pretrained("my_squad_bert_model")

id
title
context
question
answers
input_ids
attention_mask
start_positions
end_positions
[tensor([101, 101, 101, 101, 101, 101, 101, 101]), tensor([2000, 2054, 1996, 2054, 2054, 2043, 2129, 2054]), tensor([ 3183,  2003, 13546,  2003,  7719,  2106,  2411,  2003]), tensor([2106, 1999, 1997, 1996, 2006, 1996, 2003, 1996]), tensor([ 1996,  2392,  1996, 24665,  2327, 24105, 10289,  3679]), tensor([ 6261,  1997,  6730, 23052,  1997,  2932,  8214,  3076]), tensor([2984, 1996, 2540, 2012, 1996, 1997, 1005, 3259]), tensor([ 9382, 10289,  2012, 10289,  2364, 10289,  1055,  2012]), tensor([ 3711,  8214, 10289,  8214,  2311,  8214,  1996, 10289]), tensor([ 1999,  2364,  8214,  1029,  2012,  4088, 26536,  8214]), tensor([ 8517,  2311,  2003,   102, 10289,  4640, 17420,  2170]), tensor([1999, 1029, 3875, 6549, 8214, 1029, 2405, 1029]), tensor([10223,   102,  2000,  2135,  1029,   102,  1029,   102]), tensor([26371,  6549,  2029,  1010,   102,  2004,   102,  2004]), tensor([2605, 2135, 3252, 1996, 65

AttributeError: 'list' object has no attribute 'size'

In [148]:
type(model._parameters)

collections.OrderedDict