<a href="https://colab.research.google.com/github/addaia/TechnicalProject/blob/main/dataset_accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#!pip install datasets

In [36]:
from datasets import load_dataset, Dataset
import re
import random
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    XLNetLMHeadModel
)
import torch
import pandas as pd

In [12]:
# load the GSM8K dataset
gsm8k = load_dataset("openai/gsm8k", "main")

# preprocess dataset:
#   question -- same
#   answer ---- number (only)
def preprocess_gsm8k(dataset):
    preprocessed_data = []
    for example in dataset:
        question = example["question"]
        answer = example["answer"]
        extracted_answer = re.findall(r"####\s*(\d+)", answer)
        if extracted_answer:
            formatted_answer = extracted_answer[-1]
            preprocessed_data.append({"question": question, "answer": formatted_answer})
    return preprocessed_data

# preprocess and turn to HUGGING FACE datasets
train_data = preprocess_gsm8k(gsm8k["train"])
test_data  = preprocess_gsm8k(gsm8k["test"])

train_dataset = Dataset.from_list(train_data)
eval_dataset  = Dataset.from_list(test_data)

In [39]:
# VAR 1: subset of data (random)
def random_sample_dataset(dataset, fraction=0.1, seed=42):
    random.seed(seed)
    sample_size = int(len(dataset) * fraction)
    indices = random.sample(range(len(dataset)), sample_size)
    return [dataset[i] for i in indices]

train_data_sampled = random_sample_dataset(train_data)
test_data_sampled  = random_sample_dataset(test_data)

train_dataset_sampled = Dataset.from_list(train_data_sampled)
eval_dataset_sampled  = Dataset.from_list(test_data_sampled)

# VAR 2: answers from 0 to 10
def filter_dataset_range(dataset, min_val=0, max_val=10):
    filtered_data = []
    for example in dataset:
         try:
             answer_val = int(example["answer"])
         except ValueError:
             continue
         if min_val <= answer_val <= max_val:
              filtered_data.append(example)
    return filtered_data

train_data_filtered = filter_dataset_range(train_data)
test_data_filtered  = filter_dataset_range(test_data)

train_dataset_filtered = Dataset.from_list(train_data_filtered)
eval_dataset_filtered  = Dataset.from_list(test_data_filtered)

# define tests cases of dataset
dataset_variants = {
    "Full": test_data,
    "Sampled": test_data_sampled,
    "Filtered": test_data_filtered
}

In [42]:
# format prompt to try and force model to answer in one way
def format_prompt(example):
    return f"Question: {example['question']}\nAnswer:"

# list of models chosen
model_names = [
    "distilgpt2",
    "arnir0/Tiny-LLM",
    "xlnet/xlnet-base-cased"
]

# help with comp
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# store
results = []

# test accuracy for all 3 models and 3 datasets types
for model_name in model_names:

    # load tokenizer and config
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name)

    # load correct model class
    if config.is_encoder_decoder:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    elif config.model_type == "xlnet":
        model = XLNetLMHeadModel.from_pretrained(model_name)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name)
    model.eval()

    # add dedicated pad to avoid warning
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        model.resize_token_embeddings(len(tokenizer))

    model.to(device)

    # evaluate
    for variant_name, dataset_variant in dataset_variants.items():
        correct = 0
        total = len(dataset_variant)

        for example in dataset_variant:
            prompt = format_prompt(example)
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            outputs = model.generate(
                **inputs,
                max_new_tokens=5,
                do_sample=False,
                eos_token_id=tokenizer.eos_token_id
            )
            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # extract part after answer
            answer_text = generated_text.split("Answer:")[-1].strip()
            numbers = re.findall(r"(-?\d+(?:\.\d+)?)", answer_text) # check for numbers

            if numbers:
                predicted = numbers[-1]  # take the last full number
            else:
                predicted = None

            try:
                if predicted is not None and abs(float(predicted) - float(example["answer"])) < 1e-5:
                    correct += 1
            except ValueError:
                pass  # do not update correct counter if not true

        accuracy = correct / total if total > 0 else 0
        results.append({"Model": model_name, "Dataset": variant_name, "Accuracy": round(accuracy, 4)})



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Evaluation Results (Accuracy):
Dataset                 Filtered    Full  Sampled
Model                                            
arnir0/Tiny-LLM           0.0169  0.0084   0.0153
distilgpt2                0.0339  0.0099   0.0305
xlnet/xlnet-base-cased    0.0068  0.0030   0.0000


EVALUATION ACCURACY

In [44]:
df = pd.DataFrame(results)
pivot_df = df.pivot(index="Model", columns="Dataset", values="Accuracy")
print(pivot_df)

Dataset                 Filtered    Full  Sampled
Model                                            
arnir0/Tiny-LLM           0.0169  0.0084   0.0153
distilgpt2                0.0339  0.0099   0.0305
xlnet/xlnet-base-cased    0.0068  0.0030   0.0000
