# Environment Setup

## Install & Import necessary libraries

In [2]:
# !pip install transformers datasets torch
# !pip install gensim sentence-transformers
# !pip install transformers datasets torch
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [3]:
import json
import requests
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline,BertTokenizer, BertForQuestionAnswering, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
from torch.cuda.amp import autocast
#from datasets import load_dataset
from datasets import Dataset, load_dataset
from google.colab import drive
import numpy as np
import random




# Load and Explore the Dataset

## Download dataset and save

In [4]:
# URL to the dataset
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"

# Download the dataset
response = requests.get(url)
data = response.json()

# Save the dataset locally (optional)
with open("train-v2.0.json", "w") as f:
    json.dump(data, f)

print("Dataset downloaded and saved locally.")

Dataset downloaded and saved locally.


## Load dataset object

In [5]:
# Load the dataset
with open("train-v2.0.json", "r") as f:
    squad_data = json.load(f)

# Check the structure of the dataset
print(f"Dataset keys: {squad_data.keys()}")

# Explore the dataset structure
data = squad_data['data']
print(f"Number of articles: {len(data)}")
print(f"Structure of data: {data[0]}")
print(f"First article title: {data[0]['title']}")
print(f"First paragraph: {data[0]['paragraphs'][0]['context']}")


Dataset keys: dict_keys(['version', 'data'])
Number of articles: 442
Structure of data: {'title': 'Beyoncé', 'paragraphs': [{'qas': [{'question': 'When did Beyonce start becoming popular?', 'id': '56be85543aeaaa14008c9063', 'answers': [{'text': 'in the late 1990s', 'answer_start': 269}], 'is_impossible': False}, {'question': 'What areas did Beyonce compete in when she was growing up?', 'id': '56be85543aeaaa14008c9065', 'answers': [{'text': 'singing and dancing', 'answer_start': 207}], 'is_impossible': False}, {'question': "When did Beyonce leave Destiny's Child and become a solo singer?", 'id': '56be85543aeaaa14008c9066', 'answers': [{'text': '2003', 'answer_start': 526}], 'is_impossible': False}, {'question': 'In what city and state did Beyonce  grow up? ', 'id': '56bf6b0f3aeaaa14008c9601', 'answers': [{'text': 'Houston, Texas', 'answer_start': 166}], 'is_impossible': False}, {'question': 'In which decade did Beyonce become famous?', 'id': '56bf6b0f3aeaaa14008c9602', 'answers': [{'tex

## Preprocess the Data
Extract questions, contexts, and answers

In [6]:
# Initialize lists to store features and labels
contexts = []
questions = []
answers = []
is_impossible = []

# Extract features and labels
for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']  # The context paragraph
        for qa in paragraph['qas']:
            question = qa['question']   # The question text
            impossible = qa['is_impossible']
            if qa['is_impossible']:
                answer = ""  # No answer for unanswerable questions
            else:
                answer = qa['answers'][0]['text']  # Take the first answer

            # Append to respective lists
            contexts.append(context)
            questions.append(question)
            answers.append(answer)
            is_impossible.append(impossible)

print(f"Number of examples: {len(contexts)}")
print(f"Sample context: {contexts[0]}")
print(f"Sample question: {questions[0]}")
print(f"Sample answer: {answers[0]}")


Number of examples: 130319
Sample context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Sample question: When did Beyonce start becoming popular?
Sample answer: in the late 1990s


## Analyze the Data
Check the distribution of data, e.g., how many questions are unanswerable

In [7]:
num_unanswerable = sum(1 for ans in answers if ans == "")
print(f"Total questions: {len(questions)}")
print(f"Total contexts: {len(contexts)}")
print(f"Number of answerable questions: {len(answers) - num_unanswerable}")
print(f"Number of unanswerable questions: {num_unanswerable}")


Total questions: 130319
Total contexts: 130319
Number of answerable questions: 86821
Number of unanswerable questions: 43498


## Use a Pre-trained Model
Utilize a pre-trained transformer model for QA

In [8]:
# Load pre-trained tokenizer and model
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer_default = AutoTokenizer.from_pretrained(model_name)
model_default = AutoModelForQuestionAnswering.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

## Prepare QA Pipeline

In [9]:
# QA pipeline
qa_pipeline_default = pipeline("question-answering", model=model_default, tokenizer=tokenizer_default)

Device set to use cuda:0


## Test the system with a sample

In [11]:
def get_answer_without_fine_tune(question, context):
    """
    Get the answer to a question based on the provided context.

    Args:
        question (str): The input question.
        context (str): The context paragraph.

    Returns:
        str: The predicted answer.
    """
    result = qa_pipeline_default(question=question, context=context)
    print(f"Question: {question}")
    print(f"Answer: {result['answer']}")
    print(f"Score: {round(result['score'], 4)}")
    return result['answer']

In [51]:
questions_ask = [
    "What is statistics?",
    "Where can descriptive statistics be used?",
    "How to draw meaningful conclusions?",
    "When did Beyonce start becoming popular?"
]

context_ask = """Statistics is the discipline that concerns the collection, organization, analysis, interpretation,
and presentation of data. In applying statistics to a scientific, industrial, or societal problem, it is
conventional to begin with a statistical population or a statistical model to be studied. Populations can be
diverse groups of people or objects such as 'all people living in a country' or 'every atom composing a crystal.'
Statistics deals with all aspects of data, including the planning of data collection in terms of the design of
surveys and experiments. When census data cannot be collected, statisticians collect data by developing specific
experiment designs and survey samples. Representative sampling assures that inferences and conclusions can reasonably
extend from the sample to the population as a whole. Descriptive statistics summarize and visualize data using indices
such as the mean or standard deviation. Inferential statistics involves drawing conclusions from data that are subject
to random variation (e.g., observational errors or sampling variation). Initial requirements of inferential statistics
are that the sampling method is representative of the population being studied and that the sample sizes are sufficient."""

# Generate answers
for question in questions_ask:
    answer = get_answer_without_fine_tune(question, context_ask)
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print("-" * 80)


Question: What is statistics?
Answer: concerns the collection, organization, analysis, interpretation,
and presentation of data
Score: 0.0365
Question: What is statistics?
Answer: concerns the collection, organization, analysis, interpretation,
and presentation of data
--------------------------------------------------------------------------------
Question: Where can descriptive statistics be used?
Answer: indices
such as the mean or standard deviation
Score: 0.2074
Question: Where can descriptive statistics be used?
Answer: indices
such as the mean or standard deviation
--------------------------------------------------------------------------------
Question: How to draw meaningful conclusions?
Answer: Inferential statistics
Score: 0.7877
Question: How to draw meaningful conclusions?
Answer: Inferential statistics
--------------------------------------------------------------------------------
Question: When did Beyonce start becoming popular?
Answer: begin with a statistical populat

# Sentence Embedding Using SentenceTransformers
We will use pre-trained models for sentence embedding.

## Load a pre-trained model for sentence embeddings

In [13]:
# Load a pre-trained model for sentence embeddings
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2', device='cuda', cache_folder='./cache')
#sentence_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda', cache_folder='./cache')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Generate embeddings for contexts, questions, and answers

In [14]:
# Generate embeddings for contexts, questions, and answers
context_embeddings = sentence_model.encode(contexts, batch_size=32, show_progress_bar=True,cconvert_to_tensor=True)
print(f"Context Embedding Shape: {context_embeddings.shape}")

question_embeddings = sentence_model.encode(questions, batch_size=32, show_progress_bar=True, convert_to_tensor=True)
print(f"Question Embedding Shape: {question_embeddings.shape}")

answer_embeddings = sentence_model.encode(answers, batch_size=32, show_progress_bar=True, cconvert_to_tensor=True)
print(f"Answer Embedding Shape: {answer_embeddings.shape}")

Batches:   0%|          | 0/4073 [00:00<?, ?it/s]

Context Embedding Shape: (130319, 384)


Batches:   0%|          | 0/4073 [00:00<?, ?it/s]

Question Embedding Shape: torch.Size([130319, 384])


Batches:   0%|          | 0/4073 [00:00<?, ?it/s]

Answer Embedding Shape: (130319, 384)


# Fine tune BERT model with this dataset.


## Preprocess the Dataset
Transform the dataset to a format compatible with BERT, including tokenization.

In [96]:
# Combine contexts and questions into a Dataset
dataset_modified = Dataset.from_dict({
    "context": contexts,
    "question": questions,
    "answers": [{"text": ans} for ans in answers],
    "is_impossible": is_impossible
})

In [16]:
def extract_start_end_positions(context, answer):
    """
    Extracts the start and end positions of the answer within the context.
    If no answer, return -1 for both start and end positions.
    """
    if answer["text"] == "":
        return -1, -1
    start_idx = context.find(answer["text"])
    if start_idx == -1:
        return -1, -1  # Answer not found in context
    end_idx = start_idx + len(answer["text"])
    return start_idx, end_idx


In [107]:
from transformers import AutoTokenizer, DistilBertTokenizer, DistilBertForQuestionAnswering
#tokenizer_bert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)

tokenizer_bert = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
model_bert = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [108]:
def preprocess_function(examples):
    # Tokenize inputs (DistilBERT does not use token_type_ids)
    tokenized_inputs = tokenizer_eval(
        examples["question"],
        examples["context"],
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True
    )

    # Extract start and end positions
    start_positions = []
    end_positions = []

    for i, (context, answer) in enumerate(zip(examples["context"], examples["answers"])):
        start_pos, end_pos = extract_start_end_positions(context, answer)

        if start_pos != -1 and end_pos != -1:
            # Use offset mapping to find token positions
            offsets = tokenized_inputs["offset_mapping"][i]
            start_token_pos = None
            end_token_pos = None

            for idx, (start, end) in enumerate(offsets):
                if start <= start_pos < end:
                    start_token_pos = idx
                if start < end_pos <= end:
                    end_token_pos = idx
                    break

            if start_token_pos is None or end_token_pos is None:
                start_token_pos = 0
                end_token_pos = 0
        else:
            start_token_pos = 0
            end_token_pos = 0  # Default to no answer

        start_positions.append(start_token_pos)
        end_positions.append(end_token_pos)

    # Add positions to the tokenized inputs
    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions

    # Remove offset mapping before returning
    tokenized_inputs.pop("offset_mapping", None)

    return tokenized_inputs


In [106]:
# Tokenize and preprocess dataset
from datasets import load_dataset
tokenized_datasets = dataset_modified.map(preprocess_function, batched=True)

# Print example
print(tokenized_datasets[0])

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

KeyboardInterrupt: 

## Fine-Tune BERT
Using the transformers library to fine-tune BERT for QA.

In [45]:
# Load pre-trained BERT model for Question Answering
#model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_finetuned_squad",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True  # Enable mixed precision
)

# Initialize Trainer
trainer = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,  # Use the same dataset for demonstration purposes
    tokenizer=tokenizer_bert
)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [None]:
# Train the model
#trainer.train()

## Evaluate the Model
Evaluate the fine-tuned model on the validation dataset.

In [20]:
# Evaluate the model
results = trainer.evaluate()
print(f"Validation Results: {results}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Validation Results: {'eval_loss': 6.0038604736328125, 'eval_model_preparation_time': 0.0027, 'eval_runtime': 390.9634, 'eval_samples_per_second': 333.328, 'eval_steps_per_second': 20.833}


## Save the Model
Save the fine-tuned model for later use.

In [21]:
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
model_bert.save_pretrained("/content/drive/My Drive/Colab Notebooks/fine_tuned_model_bert_pretrain_qa")
tokenizer_bert.save_pretrained("/content/drive/My Drive/Colab Notebooks/fine_tuned_model_bert_qa")
trainer.save_model("/content/drive/My Drive/Colab Notebooks/fine_tuned_model_bert_trained_qa")

# Test the Model
Test the fine-tuned model with a sample question and context.

## Load the Fine-Tuned Model

In [23]:
# Load the fine-tuned model and tokenizer
model_path = "/content/drive/My Drive/Colab Notebooks/fine_tuned_model_bert_trained_qa"
tokenizer_path = "/content/drive/My Drive/Colab Notebooks/fine_tuned_model_bert_qa"
tokenizer_loaded = AutoTokenizer.from_pretrained(tokenizer_path)
model_bert_loaded = AutoModelForQuestionAnswering.from_pretrained(model_path)

# Initialize a QA pipeline
qa_pipeline_bert_loaded = pipeline("question-answering", model=model_bert_loaded, tokenizer=tokenizer_loaded)


Device set to use cuda:0


## Function for Question Answering

In [54]:
def get_answer_after_BERT_fine_tune(question, context):
    """
    Get the answer to a question based on the provided context.

    Args:
        question (str): The input question.
        context (str): The context paragraph.

    Returns:
        str: The predicted answer.
    """
    #result = qa_pipeline_bert_loaded(question=question, context=context)
    result = qa_pipeline_default(question=question, context=context)
    print(f"Question: {question}")
    print(f"Answer: {result['answer']}")
    print(f"Score: {round(result['score'], 4)}")
    return result['answer']

## Set the context and ask the question

In [53]:
questions_postTraining = [
    "What is statistics?",
    "Where can descriptive statistics be used?",
    "How to draw meaningful conclusions?"
]

context_postTraining = """Statistics is the discipline that concerns the collection, organization, analysis, interpretation,
and presentation of data. In applying statistics to a scientific, industrial, or societal problem, it is
conventional to begin with a statistical population or a statistical model to be studied. Populations can be
diverse groups of people or objects such as 'all people living in a country' or 'every atom composing a crystal.'
Statistics deals with all aspects of data, including the planning of data collection in terms of the design of
surveys and experiments. When census data cannot be collected, statisticians collect data by developing specific
experiment designs and survey samples. Representative sampling assures that inferences and conclusions can reasonably
extend from the sample to the population as a whole. Descriptive statistics summarize and visualize data using indices
such as the mean or standard deviation. Inferential statistics involves drawing conclusions from data that are subject
to random variation (e.g., observational errors or sampling variation). Initial requirements of inferential statistics
are that the sampling method is representative of the population being studied and that the sample sizes are sufficient."""

# Generate answers
for question in questions_postTraining:
    answer = get_answer_after_BERT_fine_tune(question, context_postTraining)
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print("-" * 80)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Question: What is statistics?
Answer: concerns the collection, organization, analysis, interpretation,
and presentation of data
Score: 0.0365
Question: What is statistics?
Answer: concerns the collection, organization, analysis, interpretation,
and presentation of data
--------------------------------------------------------------------------------
Question: Where can descriptive statistics be used?
Answer: indices
such as the mean or standard deviation
Score: 0.2074
Question: Where can descriptive statistics be used?
Answer: indices
such as the mean or standard deviation
--------------------------------------------------------------------------------
Question: How to draw meaningful conclusions?
Answer: Inferential statistics
Score: 0.7877
Question: How to draw meaningful conclusions?
Answer: Inferential statistics
--------------------------------------------------------------------------------


# Use the evaluation script to evaluate finetuned model

## Download evaluation script and save in drive for further usages

In [55]:

# Mount Google Drive
drive.mount('/content/drive')

# URL of the file to be downloaded
url = "https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/"

# Specify the destination path in Google Drive
destination_path = "/content/drive/My Drive/Colab Notebooks/evaluate-v2.0.py"

# Download the file and save it
try:
    response = requests.get(url)
    response.raise_for_status()
    with open(destination_path, 'wb') as f:
        f.write(response.content)
    print(f"File successfully downloaded and saved to {destination_path}")
except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File successfully downloaded and saved to /content/drive/My Drive/Colab Notebooks/evaluate-v2.0.py


## Generate Predictions from the Model

In [56]:
# Generate predictions from the validation dataset
def generate_predictions(model, tokenizer, dataset):
    predictions = {}
    for example in dataset:
        context = example['context']
        question = example['question']
        id_ = example['id']

        # Generate prediction
        inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=384)
        outputs = model(**inputs)
        answer_start = outputs.start_logits.argmax()
        answer_end = outputs.end_logits.argmax()

        # Decode the predicted answer
        pred_ids = inputs.input_ids[0][answer_start:answer_end + 1]
        prediction = tokenizer.decode(pred_ids, skip_special_tokens=True)

        predictions[id_] = prediction
    return predictions


## Load validation dataset

In [57]:
# Load validation dataset
#from datasets import load_dataset

validation_data = load_dataset("squad_v2", split="validation")

README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

## Generate predictions and save

In [60]:
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering

# Load the tokenizer and model
tokenizer_eval = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model_eval = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [98]:
# Generate predictions
# Take only the first 10 records
validation_data_subset = validation_data.select(range(10))
validation_predictions = generate_predictions(model_eval, tokenizer_eval, validation_data_subset)

# Save predictions to a JSON file
file_path = "/content/drive/My Drive/Colab Notebooks/predictions.json"

with open(file_path, "w") as f:
    json.dump(validation_predictions, f)

print(f"Predictions saved to {file_path}")

Predictions saved to /content/drive/My Drive/Colab Notebooks/predictions.json


### Generate predictions

In [90]:
predictions = {}
for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            qid = qa['id']

            # Check if 'answers' exists and is not empty
            if 'answers' in qa and qa['answers']:
                # Mock prediction: choose a random answer or leave it empty with 20% chance
                predicted_answer = random.choice(qa['answers'])['text'] if random.random() > 0.2 else ""
            else:
                # If there are no answers (unanswerable question), set prediction as empty
                predicted_answer = ""

            predictions[qid] = predicted_answer

# Save mock predictions to 'pred.json'
with open('pred.json', 'w') as f:
    json.dump(predictions, f, indent=2)

print(f"Saved predictions for {len(predictions)} questions.")


Saved predictions for 130319 questions.


## Prepare the Ground Truth

In [63]:
# Extract ground truth from validation dataset
ground_truth = {example['id']: example['answers'] for example in validation_data}

# Save ground truth to a JSON file
file_path = "/content/drive/My Drive/Colab Notebooks/ground_truth.json"

with open(file_path, "w") as f:
    json.dump(ground_truth, f)

print(f"Ground Truth saved to {file_path}")


Ground Truth saved to /content/drive/My Drive/Colab Notebooks/ground_truth.json


## Prepare no-answer probabilities dataset

In [109]:
na_probabilities = {}
for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            qid = qa['id']
            # Random probability between 0.0 and 1.0
            na_probabilities[qid] = random.random()

# Save random no-answer probabilities to na_prob.json
with open('na_prob.json', 'w') as f:
    json.dump(na_probabilities, f, indent=2)


KeyError: "Column data not in the dataset. Current columns in the dataset: ['context', 'question', 'answers', 'is_impossible']"

## Run the Evaluation Script

In [89]:
# Navigate to the folder containing the script and JSON files
%cd /content/drive/My Drive/Colab Notebooks/

# Run the Python script with the arguments
!python evaluate-v2.0.py data.json pred.json --out-file eval_results.json --na-prob-file na_prob.json
#!python evaluate-v2.0.py train-v2.0.json pred.json

print("Evaluation results saved to eval_results.json")
print("No-answer probabilities saved to na_prob.json")


/content/drive/My Drive/Colab Notebooks
Evaluation results saved to eval_results.json
No-answer probabilities saved to na_prob.json


## Load evealuation result and print

In [86]:
# Load eval_results.json
with open('eval_results.json', 'r') as eval_file:
    eval_results = json.load(eval_file)

print("Eval Results:")
print(json.dumps(eval_results, indent=2))

Eval Results:
{
  "exact": 86.70723378785902,
  "f1": 86.70723378785902,
  "total": 130319,
  "HasAns_exact": 80.04745395699197,
  "HasAns_f1": 80.04745395699197,
  "HasAns_total": 86821,
  "NoAns_exact": 100.0,
  "NoAns_f1": 100.0,
  "NoAns_total": 43498,
  "best_exact": 86.70723378785902,
  "best_exact_thresh": 0.9999818084079438,
  "best_f1": 86.70723378785902,
  "best_f1_thresh": 0.9999818084079438
}


## Load no-answer probabilities and print to 10 record

In [87]:
# Load na_prob.json
with open('na_prob.json', 'r') as na_prob_file:
    na_prob_data = json.load(na_prob_file)

print("NA Probabilities:")

# Sort the `na_prob_data` by the probability values in descending order
sorted_na_prob = sorted(na_prob_data.items(), key=lambda x: x[1], reverse=True)

# Select the top 100
top_100_na_prob = sorted_na_prob[:10]



print(json.dumps(top_100_na_prob, indent=2))


NA Probabilities:
[
  [
    "5ad1672c645df0001a2d19f2",
    0.9999929958456031
  ],
  [
    "570afdc16b8089140040f68b",
    0.9999891422757239
  ],
  [
    "56d9aa93dc89441400fdb6c4",
    0.9999818084079438
  ],
  [
    "572a2c616aef05140015533b",
    0.9999748153783831
  ],
  [
    "5a7fc8178f0597001ac000c4",
    0.999970781919306
  ],
  [
    "56ddec6666d3e219004dae0f",
    0.9999657670284877
  ],
  [
    "5728c9a54b864d1900164e29",
    0.9999655387772897
  ],
  [
    "572cad3cdfb02c14005c6bf9",
    0.9999533767760633
  ],
  [
    "5723b340f6b826140030fc80",
    0.9999492784871367
  ],
  [
    "5ace5b7132bba1001ae4a3b5",
    0.9999431532763136
  ]
]


#  Using GloVe Embeddings for Vectorization

## Download the GloVe embeddings

In [None]:
# Destination folder in Google Drive
folder = "/content/drive/My Drive/Colab Notebooks/Glove/"
source_file = folder+"glove.6B.100d.txt"  # Path to the 100-dimensional embeddings
destination_file = folder+"glove.6B.60d.txt"  # Save truncated 60d embeddings here

# Truncate to 60 dimensions
with open(source_file, "r") as src, open(destination_file, "w") as dest:
    for line in src:
        values = line.split()
        word = values[0]  # Word/token
        vectors = values[1:61]  # First 60 dimensions
        dest.write(f"{word} {' '.join(vectors)}\n")

print(f"Truncated embeddings saved to {destination_file}")

Truncated embeddings saved to /content/drive/My Drive/Colab Notebooks/Glove/glove.6B.60d.txt


## Load the GloVe Embeddings into Memory

In [None]:

# Load GloVe 60d embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# File path to the GloVe 60d embeddings
glove_file_path = '/content/drive/My Drive/Colab Notebooks/Glove/glove.6B.60d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path)
print(f"Loaded {len(glove_embeddings)} word vectors.")


Loaded 400000 word vectors.


## Replace BERT's Embedding Layer with GloVe Embeddings

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from torch import nn

# Load BERT tokenizer
tokenizer_Glove = BertTokenizer.from_pretrained('bert-base-uncased')

# Create a custom embedding layer that uses GloVe embeddings
class GloVeBERTEmbeddingLayer(nn.Module):
    def __init__(self, glove_embeddings, tokenizer):
        super(GloVeBERTEmbeddingLayer, self).__init__()
        self.tokenizer_Glove = tokenizer
        self.glove_embeddings = glove_embeddings

        # Create a tensor for GloVe embeddings for the known words
        self.embedding_dim = 60  # GloVe 60d
        self.word_to_idx = tokenizer.get_vocab()
        self.embedding_matrix = self.create_embedding_matrix()

    def create_embedding_matrix(self):
        # Create an embedding matrix where the index corresponds to the tokenizer's word
        embedding_matrix = np.zeros((len(self.word_to_idx), self.embedding_dim))
        for word, idx in self.word_to_idx.items():
            glove_vector = self.glove_embeddings.get(word, None)
            if glove_vector is not None:
                embedding_matrix[idx] = glove_vector
        return torch.tensor(embedding_matrix, dtype=torch.float)

    def forward(self, input_ids):
        # Look up GloVe embeddings for the input tokens
        embeddings = torch.nn.functional.embedding(input_ids, self.embedding_matrix)
        return embeddings

# Initialize the custom embedding layer with GloVe
custom_embedding_layer = GloVeBERTEmbeddingLayer(glove_embeddings, tokenizer_Glove)

# Test the custom embedding layer with a simple input
test_text = "Statistics is the discipline of data."
inputs = tokenizer_Glove(test_text, return_tensors="pt", truncation=True, max_length=128)

# Get GloVe embeddings for the input
input_ids = inputs['input_ids']
glove_embeddings_output = custom_embedding_layer(input_ids)
print(glove_embeddings_output.shape)  # Should be (batch_size, sequence_length, embedding_dim)


torch.Size([1, 9, 60])


## Integrate with BERT Model

In [None]:
class GloVeBERTModel(nn.Module):
    def __init__(self, glove_embeddings, tokenizer):
        super(GloVeBERTModel, self).__init__()
        self.tokenizer_Glove = tokenizer
        self.custom_embedding_layer = GloVeBERTEmbeddingLayer(glove_embeddings, tokenizer_Glove)
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, input_ids, attention_mask=None):
        # Use custom GloVe embeddings
        custom_embeddings = self.custom_embedding_layer(input_ids)

        # Pass the custom embeddings to BERT
        outputs = self.bert_model(inputs_embeds=custom_embeddings, attention_mask=attention_mask)
        return outputs

# Initialize the custom model
model = GloVeBERTModel(glove_embeddings, tokenizer_Glove)

# Test the forward pass
outputs = model(input_ids)
print(outputs.last_hidden_state.shape)  # Should return shape (batch_size, sequence_length, hidden_dim)


RuntimeError: The size of tensor a (60) must match the size of tensor b (768) at non-singleton dimension 2