In [2]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import BertForQuestionAnswering

In [None]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking')

In [3]:
from transformers import BertTokenizer, BertForQuestionAnswering
from torch.utils.data import DataLoader
from datasets import load_dataset
import torch

# Define model name and path
model_name = "bert-base-uncased"  # Adjust as needed

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Load SpokenSQUAD dataset
squad_dataset = load_dataset("squad", name="spoken_squad")

# Preprocess data function (example)
def preprocess_function(examples):
  question = examples["question"]
  context = examples["context"]
  answer_start = examples["answers"]["answer_start"][0]  # Assuming single answer
  answer_end = examples["answers"]["answer_end"][0]  # Assuming single answer
  
  # Tokenize and convert to tensors
  encoding = tokenizer(question, context, return_tensors="pt")
  start_positions = torch.tensor([answer_start], dtype=torch.long)
  end_positions = torch.tensor([answer_end], dtype=torch.long)
  
  return {
      "input_ids": encoding["input_ids"],
      "attention_mask": encoding["attention_mask"],
      "start_positions": start_positions,
      "end_positions": end_positions,
  }

# Preprocess training and validation data
train_dataset = squad_dataset["train"].map(preprocess_function, batched=True)
validation_dataset = squad_dataset["validation"].map(preprocess_function, batched=True)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8)
validation_dataloader = DataLoader(validation_dataset, batch_size=8)

# Define optimizer and learning rate scheduler (adjust parameters as needed)
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
def train(epochs):
  for epoch in range(epochs):
    model.train()  # Set model to training mode
    for batch in train_dataloader:
      # Access data from batch
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      start_positions = batch["start_positions"].to(device)
      end_positions = batch["end_positions"].to(device)

      # Forward pass
      outputs = model(**batch)
      loss = outputs.loss  # Access loss from model outputs

      # Backward pass and optimize
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()
    
    # Perform validation after each epoch (optional)
    evaluate(validation_dataloader)

# Evaluation function
def evaluate(dataloader):
  model.eval()  # Set model to evaluation mode
  
  # Initialize variables for keeping track of metrics
  f1 = 0
  exact_match = 0
  total = 0
  
  for batch in dataloader:
    # Access data from batch
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    start_positions = batch["start_positions"].to(device)
    end_positions = batch["end_positions"].to(device)

    # Forward pass
    with torch.no_grad():  # Disable gradient calculation for evaluation
      outputs = model(**batch)
    
    # Extract predictions
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Implement your answer prediction logic here (e.g., beam search)
    # This example uses greedy decoding (replace with your preferred method)
    predicted_start_positions = torch.argmax(start_logits, dim=-1)
    predicted_end_positions = torch.argmax(end_logits, dim=-1)

    # Calculate metrics (replace with your preferred evaluation method)
    for i in range(len(batch)):
      # Assuming single answer per example
      predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(batch["input_ids"][i][predicted_start_positions[i]:predicted_end_positions[i]+1]))
      reference_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(batch["context"][i][start_positions[i]:end_positions[i]+1]))

      # Update metrics based on your chosen evaluation method (e.g., SQuAD)
      # Here, a simple string matching example
      if predicted_answer == reference_answer:
        exact_match += 1
      total += 1

  # Calculate F1 score (modify based on your chosen evaluation method)
  if total > 0:
    f1 = exact_match / total

  # Print evaluation metrics (modify to print desired metrics)
  print(f"F1 Score: {f1:.4f}, Exact Match: {exact_match}/{total}")

# Train and evaluate the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move model to GPU if available
train(epochs=1)  # Adjust number of epochs
evaluate(validation_dataloader)

# Save the fine-tuned model
model.save_pretrained("my_squad_bert_model")


Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading and preparing dataset None/plain_text to /Users/andrewwright/.cache/huggingface/datasets/parquet/plain_text-57edf78d6033ac9a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /Users/andrewwright/.cache/huggingface/datasets/parquet/plain_text-57edf78d6033ac9a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

TypeError: list indices must be integers or slices, not str