<a href="https://colab.research.google.com/github/akshaya-bharadhwaj/J008-SNLP-Labs/blob/master/J008_Question_Answering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Download the SQuAD archive (stanford-question-answering-dataset.zip) with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured beforehand — confirm.
!kaggle datasets download -d stanfordu/stanford-question-answering-dataset

Dataset URL: https://www.kaggle.com/datasets/stanfordu/stanford-question-answering-dataset
License(s): CC-BY-SA-4.0
Downloading stanford-question-answering-dataset.zip to /content
 80% 7.00M/8.73M [00:01<00:00, 8.03MB/s]
100% 8.73M/8.73M [00:01<00:00, 5.87MB/s]


In [None]:
!unzip stanford-question-answering-dataset.zip

Archive:  stanford-question-answering-dataset.zip
  inflating: dev-v1.1.json           
  inflating: train-v1.1.json         


In [None]:
import os

# Print the full path of every file found anywhere beneath /kaggle/input.
walked_tree = os.walk('/kaggle/input')
for folder, _subdirs, names in walked_tree:
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

In [None]:
import json

# Load the SQuAD v1.1 train and dev splits extracted into the working directory.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default.
with open("train-v1.1.json", encoding="utf-8") as f:
    train = json.load(f)

with open('dev-v1.1.json', encoding="utf-8") as f:
    dev = json.load(f)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset, load_metric, Dataset


In [None]:
# Preprocess the data

# Tokenizer for the DistilBERT checkpoint; shared by preprocess_function, the Trainer and answer_question.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def preprocess_function(question, context, answer_start_char, answer_end_char):
    """Tokenize one (question, context) pair and label the answer span in tokens.

    Relies on the module-level ``tokenizer`` and on ``config["max_length"]``;
    both are looked up when the function is called, not when it is defined.

    Args:
        question: Question text, passed to the tokenizer as the first sequence.
        context: Passage text, passed as the second sequence (the only one
            that may be truncated).
        answer_start_char: Index in ``context`` of the first answer character.
        answer_end_char: Index in ``context`` one past the last answer character.

    Returns:
        The tokenizer output (with ``offset_mapping`` removed) plus two integer
        entries, ``start_positions`` and ``end_positions``: the indices of the
        first and last context token overlapping the answer characters. Both
        are 0 when no context token overlaps the answer, e.g. because the
        answer was truncated away. An answer that is only partly inside the
        truncated context is labelled with the part that survived.
    """
    inputs = tokenizer(
        question,
        context,
        max_length=config["max_length"],
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offsets = inputs.pop("offset_mapping")
    sequence_ids = inputs.sequence_ids()

    # Context tokens are the ones whose sequence id is 1. Keep, in order, those
    # whose character span overlaps [answer_start_char, answer_end_char).
    # Matching on overlap rather than on the exact first/last character keeps
    # the labels on real answer tokens even when a boundary character is
    # whitespace or lies beyond the truncated context; previously the end label
    # fell back to index max_length - 1 (a padding/separator position).
    answer_tokens = [
        token_pos
        for token_pos, (seq_id, (char_start, char_end)) in enumerate(zip(sequence_ids, offsets))
        if seq_id == 1
        and char_start < char_end  # skip zero-width spans
        and char_start < answer_end_char
        and char_end > answer_start_char
    ]

    if answer_tokens:
        start_pos, end_pos = answer_tokens[0], answer_tokens[-1]
    else:
        # Nothing of the answer is present in the tokenized context: point both
        # labels at index 0 so the pair stays consistent.
        start_pos, end_pos = 0, 0

    inputs["start_positions"] = start_pos
    inputs["end_positions"] = end_pos

    return inputs

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Preprocessing the dataset

def preprocess_data(examples):
    """Flatten a SQuAD-style dict into one preprocessed example per answer.

    Walks ``examples['data']`` -> ``'paragraphs'`` -> ``'qas'`` -> ``'answers'``
    and calls ``preprocess_function`` once for every answer, passing the
    question, the paragraph context and the answer's character span
    (``answer_start`` up to ``answer_start + len(text)``).

    Returns:
        A list with the ``preprocess_function`` results, in document order.
    """

    def _answer_spans():
        # Lazily yield (question, context, start_char, end_char) per answer.
        for article in examples['data']:
            for paragraph in article['paragraphs']:
                passage = paragraph['context']
                for qa in paragraph['qas']:
                    query = qa['question']
                    for answer in qa['answers']:
                        begin = answer['answer_start']
                        yield query, passage, begin, begin + len(answer['text'])

    return [preprocess_function(*span) for span in _answer_spans()]



In [None]:
# Hyperparameter dictionary for the notebook.
# NOTE(review): in the cells visible here only "max_length" is read (by
# preprocess_function); the TrainingArguments cell hard-codes its own learning
# rate, batch size and epoch count — confirm which values are intended.
config = {
    "max_length": 384,  # Tokenized (question + context) length used for truncation and padding
    "doc_stride": 128,   # This is often used for splitting long contexts
    "batch_size": 16,    # Adjust based on your hardware capacity
    "epochs": 3,         # Number of training epochs
    "learning_rate": 3e-5, # Learning rate for the optimizer
}

In [None]:
# Tokenize and label both splits; each result is a list with one entry per answer.
preprocessed_train_data = preprocess_data(train)
preprocessed_dev_data = preprocess_data(dev)

In [None]:
# Wrap the preprocessed examples in `Dataset` objects (via a pandas DataFrame) for the Trainer.
train_dataset = Dataset.from_pandas(pd.DataFrame(preprocessed_train_data))
dev_dataset = Dataset.from_pandas(pd.DataFrame(preprocessed_dev_data))

In [None]:
# Load DistilBERT with a question-answering head. Per the load message below, the
# `qa_outputs` weights are newly initialized, so the model must be trained before use.
model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Token Level IoU
from transformers import EvalPrediction

def compute_token_level_iou(eval_pred: "EvalPrediction"):
    """Mean token-level IoU between predicted and gold answer spans.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids``.
            ``predictions`` holds the start logits and end logits (in that
            order), each of shape ``(n_samples, seq_len)``. ``label_ids`` is
            either a ``(start_positions, end_positions)`` tuple/list of
            per-sample arrays, or a single ``(n_samples, 2)`` array whose
            columns are start and end.

    Returns:
        ``{"token_level_IoU": value}`` where ``value`` is the average over all
        samples of ``intersection / union`` of the inclusive token ranges
        ``[pred_start, pred_end]`` and ``[true_start, true_end]``; 0.0 when
        there are no samples.

    Raises:
        ValueError: If predictions and labels cover different sample counts.
    """
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids

    # Most likely token index for the start and the end of each answer.
    start_logits, end_logits = predictions[0], predictions[1]
    pred_starts = np.argmax(start_logits, axis=1)
    pred_ends = np.argmax(end_logits, axis=1)

    if isinstance(labels, (tuple, list)):
        # A (starts, ends) pair: each element already holds one value per
        # sample. Stacking the pair and slicing columns would mix samples up
        # and silently score only the first two of them.
        true_starts = np.asarray(labels[0]).reshape(-1)
        true_ends = np.asarray(labels[1]).reshape(-1)
    else:
        labels = np.asarray(labels)
        true_starts, true_ends = labels[:, 0], labels[:, 1]

    if not (len(pred_starts) == len(pred_ends) == len(true_starts) == len(true_ends)):
        raise ValueError(
            "predictions and labels disagree on the number of samples: "
            f"{len(pred_starts)}, {len(pred_ends)}, {len(true_starts)}, {len(true_ends)}"
        )

    if len(pred_starts) == 0:
        # Averaging nothing would produce NaN; report a plain zero instead.
        return {"token_level_IoU": 0.0}

    # Spans are inclusive on both ends, hence the +1 when measuring lengths.
    intersection = np.maximum(
        0,
        np.minimum(pred_ends, true_ends) - np.maximum(pred_starts, true_starts) + 1,
    )
    union = np.maximum(pred_ends, true_ends) - np.minimum(pred_starts, true_starts) + 1

    # Samples with a non-positive union score 0.
    iou_scores = np.zeros(len(union), dtype=float)
    has_union = union > 0
    iou_scores[has_union] = intersection[has_union] / union[has_union]

    return {"token_level_IoU": float(iou_scores.mean())}

In [None]:
from transformers import TrainingArguments, Trainer

# Define training arguments: evaluate once per epoch and write results to ./results.
# NOTE(review): learning rate, batch sizes and epoch count are hard-coded here and
# differ from the `config` dictionary (3e-5 / 3 epochs there) — confirm which is intended.
# NOTE(review): `evaluation_strategy` (and `tokenizer=` below) may be deprecated in
# newer transformers releases — verify against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)

# Initialize the Trainer: trains on the train split, evaluates on the dev split
# and reports the custom token-level IoU metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_token_level_iou,  # Pass the custom metric function here
)




In [None]:
# Fine-tune the model; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Token Level Iou
1,1.2707,1.276737,0.5


TrainOutput(global_step=5475, training_loss=1.5753566905679224, metrics={'train_runtime': 4339.9017, 'train_samples_per_second': 20.185, 'train_steps_per_second': 1.262, 'total_flos': 8583810682277376.0, 'train_loss': 1.5753566905679224, 'epoch': 1.0})

In [None]:
# Inference pipeline

def answer_question(question, context):
    """Return the answer span the module-level ``model`` predicts for a question.

    Args:
        question: Question text.
        context: Passage in which the answer is looked up.

    Returns:
        The predicted answer as a string rebuilt from the selected tokens
        (tokenizer-normalized text, not a slice of the original ``context``).
    """
    # Run on the GPU when one is available, otherwise on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Inference must not use training-time behaviour such as dropout.
    model.eval()

    # Truncate only the context so an over-long passage cannot exceed the
    # maximum input length; the question is always kept whole.
    inputs = tokenizer(
        question,
        context,
        truncation="only_second",
        return_tensors='pt',
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]

    answer_start = int(torch.argmax(start_logits))
    # Pick the best end at or after the chosen start. Taking the two argmaxes
    # independently can put the end before the start, which yields an empty
    # answer. The +1 turns the inclusive end index into a slice bound.
    answer_end = answer_start + int(torch.argmax(end_logits[answer_start:])) + 1

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs.input_ids[0][answer_start:answer_end])
    )

    return answer

# Smoke-test the inference helper on a small hand-written question/context pair.
question = "What is the capital of France?"
context = "France is a country in Europe. The capital of France is Paris."
print(answer_question(question, context))


paris
