## Importing Modules

In [1]:
import os
import pathlib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer, TFAutoModel
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer
import torch
from transformers import DefaultDataCollator

if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

  from .autonotebook import tqdm as notebook_tqdm



GPU is available!


In [2]:
MAIN_PATH = str(pathlib.Path().resolve())
DATASET_PATH = MAIN_PATH + '\\datasets'
MODEL_PATH = MAIN_PATH + '\\models'

In [3]:
models = os.listdir(MODEL_PATH)
models

['bert-base-cased',
 'bert-base-multilingual-cased',
 'bert-base-uncased',
 'bert-large-cased',
 'bert-large-uncased',
 'flan-t5-base',
 'flan-t5-large',
 'flan-t5-small',
 'gemma-2-2b',
 'gpt2',
 'gpt2-large',
 'gpt2-medium',
 'LM-Studio-0.3.4-Setup.exe',
 'Qwen2.5-0.5B',
 'Qwen2.5-1.5B',
 'Qwen2.5-3B',
 'Qwen2.5-7B-Instruct-GPTQ-Int4']

In [4]:
model_path = MODEL_PATH + '\\' + models[2]
model_path

'D:\\Python\\LLM_Environment\\models\\bert-base-uncased'

In [5]:
model = AutoModel.from_pretrained(model_path)

## Import Dataset

In [6]:
# Load the dataset
squad = load_dataset("squad")

Using the latest cached version of the dataset since squad couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'plain_text' at C:\Users\422in\.cache\huggingface\datasets\squad\plain_text\0.0.0\7b6d24c440a36b6815f21b70d25016731768db1f (last modified on Wed Oct  9 21:46:34 2024).


In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoModelForQuestionAnswering.from_pretrained(model_path)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at D:\Python\LLM_Environment\models\bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def prepare_train_features(examples):
    # Tokenize our examples with truncation and padding, but keep the overflows using a stride.
    # This results in one example possible giving several features when a context is long,
    # each of those features having a context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",  # truncate context, not the question
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context.
    # This will help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [9]:
# Apply the function to our data
tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad["train"].column_names)

Map: 100%|██████████| 87599/87599 [00:29<00:00, 2940.71 examples/s]
Map: 100%|██████████| 10570/10570 [00:03<00:00, 2928.80 examples/s]


In [10]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [11]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 88524
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 10784
    })
})

In [12]:
args = TrainingArguments(
    f"finetune-BERT-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)
     



In [13]:
data_collator = DefaultDataCollator()

In [15]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"].select(range(10000)),
    eval_dataset=tokenized_datasets["validation"].select(range(1000)),
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [16]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
 27%|██▋       | 500/1875 [04:30<12:36,  1.82it/s]

{'loss': 2.2879, 'grad_norm': 19.72804069519043, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.8}


                                                  
 33%|███▎      | 625/1875 [05:52<11:30,  1.81it/s]

{'eval_loss': 1.4439274072647095, 'eval_runtime': 11.4693, 'eval_samples_per_second': 87.189, 'eval_steps_per_second': 5.493, 'epoch': 1.0}


 53%|█████▎    | 1000/1875 [09:21<08:07,  1.80it/s] 

{'loss': 1.1675, 'grad_norm': 12.791193008422852, 'learning_rate': 9.333333333333334e-06, 'epoch': 1.6}


                                                   
 67%|██████▋   | 1250/1875 [11:54<05:46,  1.80it/s]

{'eval_loss': 1.3677297830581665, 'eval_runtime': 11.6322, 'eval_samples_per_second': 85.968, 'eval_steps_per_second': 5.416, 'epoch': 2.0}


 80%|████████  | 1500/1875 [14:16<03:39,  1.71it/s]

{'loss': 0.9056, 'grad_norm': 16.688323974609375, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.4}


                                                   
100%|██████████| 1875/1875 [18:04<00:00,  1.73it/s]

{'eval_loss': 1.4073059558868408, 'eval_runtime': 12.2582, 'eval_samples_per_second': 81.578, 'eval_steps_per_second': 5.139, 'epoch': 3.0}
{'train_runtime': 1084.4167, 'train_samples_per_second': 27.665, 'train_steps_per_second': 1.729, 'train_loss': 1.31505927734375, 'epoch': 3.0}





TrainOutput(global_step=1875, training_loss=1.31505927734375, metrics={'train_runtime': 1084.4167, 'train_samples_per_second': 27.665, 'train_steps_per_second': 1.729, 'total_flos': 5879177026560000.0, 'train_loss': 1.31505927734375, 'epoch': 3.0})

In [17]:
instance = squad['train'][20]
context = instance['context']
question = instance['question']

In [18]:
context

"All of Notre Dame's undergraduate students are a part of one of the five undergraduate colleges at the school or are in the First Year of Studies program. The First Year of Studies program was established in 1962 to guide incoming freshmen in their first year at the school before they have declared a major. Each student is given an academic advisor from the program who helps them to choose classes that give them exposure to any major in which they are interested. The program also includes a Learning Resource Center which provides time management, collaborative learning, and subject tutoring. This program has been recognized previously, by U.S. News & World Report, as outstanding."

In [19]:
instance['answers']

{'text': ['Learning Resource Center'], 'answer_start': [496]}

In [20]:
given_answer = instance['answers']['text'][0]  # Assuming the first answer is the correct one
given_answer_start = instance['answers']['answer_start'][0]
given_answer, given_answer_start

('Learning Resource Center', 496)

In [21]:
# Tokenize the data
inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [23]:
inputs = {k: v.to(device) for k, v in inputs.items()}

In [24]:
# Get model's output
with torch.no_grad():
    output = model(**inputs)

In [25]:
# Get the predicted answer
start_idx = torch.argmax(output.start_logits)
end_idx = torch.argmax(output.end_logits)

predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))

In [26]:
predicted_answer, start_idx, end_idx, start_idx.item(), end_idx.item()

('learning resource center',
 tensor(111, device='cuda:0'),
 tensor(113, device='cuda:0'),
 111,
 113)

In [27]:
correct = (predicted_answer.lower() == given_answer.lower())
evaluation = 'Correct' if correct else f'Incorrect (Predicted: {predicted_answer}, Given: {given_answer})'

print(evaluation)

Correct


In [28]:
# Function to evaluate a single instance
def evaluate_instance(instance, device):
    context = instance['context']
    question = instance['question']
    given_answer = instance['answers']['text'][0]  # Assuming the first answer is the correct one

    # Tokenize the data
    inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)

    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Apply the BERT model
    with torch.no_grad():  # No need to calculate gradients
        output = model(**inputs)

    # Get the predicted answer
    start_idx = torch.argmax(output.start_logits)
    end_idx = torch.argmax(output.end_logits)
    predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))

    return predicted_answer.lower() == given_answer.lower()

In [29]:
from tqdm import tqdm

In [30]:
# Evaluate the a number of instances
correct_count = 0
total_count = 100

for i in tqdm(range(total_count)):
    correct_count += evaluate_instance(squad['train'][i], device)

100%|██████████| 100/100 [00:01<00:00, 77.89it/s]


In [31]:
# Calculate and output the accuracy
accuracy = correct_count / total_count
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 67.00%
