<a href="https://colab.research.google.com/github/addaia/TechnicalProject/blob/main/sft_33.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [3]:
from datasets import load_dataset, Dataset
import re
import random
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    XLNetLMHeadModel,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    DataCollatorForLanguageModeling,
    TrainerCallback, TrainerState, TrainerControl
)

import torch
import pandas as pd

In [5]:
# load the GSM8K dataset
gsm8k = load_dataset("openai/gsm8k", "main")

# preprocess dataset:
#   question -- same
#   answer ---- number (only)
def preprocess_gsm8k(dataset):
    processed = []
    for example in dataset:
        question = example["question"]
        answer_text = example["answer"]
        # Split into reasoning and final answer based on the delimiter
        parts = answer_text.split("####")
        if len(parts) >= 2:
            reasoning = parts[0].strip()
            final_answer = parts[-1].strip()
            processed.append({
                "question": question,
                "reasoning": reasoning,
                "answer": final_answer  # the final numeric answer
            })
    return processed

# preprocess and turn to HUGGING FACE datasets
train_data = preprocess_gsm8k(gsm8k["train"])
test_data  = preprocess_gsm8k(gsm8k["test"])

train_dataset = Dataset.from_list(train_data)
eval_dataset  = Dataset.from_list(test_data)

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [6]:
# VAR 1: subset of data (random)
def random_sample_dataset(dataset, fraction=0.1, seed=42):
    random.seed(seed)
    sample_size = int(len(dataset) * fraction)
    indices = random.sample(range(len(dataset)), sample_size)
    return [dataset[i] for i in indices]

train_data_sampled = random_sample_dataset(train_data)
test_data_sampled  = random_sample_dataset(test_data)

train_dataset_sampled = Dataset.from_list(train_data_sampled)
eval_dataset_sampled  = Dataset.from_list(test_data_sampled)

# VAR 2: answers from 0 to 10
def filter_dataset_range(dataset, min_val=0, max_val=10):
    filtered_data = []
    for example in dataset:
         try:
             answer_val = int(example["answer"])
         except ValueError:
             continue
         if min_val <= answer_val <= max_val:
              filtered_data.append(example)
    return filtered_data

train_data_filtered = filter_dataset_range(train_data)
test_data_filtered  = filter_dataset_range(test_data)

train_dataset_filtered = Dataset.from_list(train_data_filtered)
eval_dataset_filtered  = Dataset.from_list(test_data_filtered)

# define tests cases of dataset
dataset_variants = {
    "Full": test_data,
    "Sampled": test_data_sampled,
    "Filtered": test_data_filtered
}

# list of models chosen
model_names = [
    "distilgpt2",
    "arnir0/Tiny-LLM",
    "xlnet/xlnet-base-cased"
]

# format prompt to try and force model to answer in one way -- include chain of thought
def format_train(example):
    return (
        f"Question: {example['question']}\n"
        f"Chain-of-thought: {example['reasoning']}\n"
        f"Answer: {example['answer']}"
    )



In [None]:
# define
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# select model and dataset
selected_model = "distilgpt2"
dataset_variant = "sampled"

# get dataset
if dataset_variant == "full":
    train_ds = train_dataset
    eval_ds = eval_dataset
elif dataset_variant == "sampled":
    train_ds = train_dataset_sampled
    eval_ds = eval_dataset_sampled
elif dataset_variant == "filtered":
    train_ds = train_dataset_filtered
    eval_ds = eval_dataset_filtered

# tokeniser and confi
tokenizer = AutoTokenizer.from_pretrained(selected_model)
config = AutoConfig.from_pretrained(selected_model)

if config.is_encoder_decoder:
    model = AutoModelForSeq2SeqLM.from_pretrained(selected_model)
elif config.model_type == "xlnet":
    model = XLNetLMHeadModel.from_pretrained(selected_model)
else:
    model = AutoModelForCausalLM.from_pretrained(selected_model)
model.eval()

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

model.to(device)

# prep data
def format_train(example):
    return (
        f"Question: {example['question']}\n"
        f"Chain-of-thought: {example['reasoning']}\n"
        f"Answer: {example['answer']}"
    )


def tokenise_func(example):
    text = format_train(example)
    return tokenizer(text, truncation=True, max_length=256)


# prep correclty
train_dataset_hf = Dataset.from_list(train_ds)
eval_dataset_hf = Dataset.from_list(eval_ds)

train_ds_tokenised = train_dataset_hf.map(tokenise_func, batched=False)
eval_ds_tokenised = eval_dataset_hf.map(tokenise_func, batched=False)


data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# compute accuracy metric to check how many are correct
def compute_accuracy(model, dataset, tokenizer):
    model.eval()
    correct = 0
    total = len(dataset)
    for example in dataset:
        prompt = f"Question: {example['question']}\nAnswer:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id
        )
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer_text = generated_text.split("Answer:")[-1].strip()
        numbers = re.findall(r"(-?\d+(?:\.\d+)?)", answer_text)
        predicted = numbers[-1] if numbers else None
        try:
            if predicted is not None and abs(float(predicted) - float(example["answer"])) < 1e-5:
                correct += 1
        except ValueError:
            pass
    return correct / total if total > 0 else 0


# custom callback that logs epoch 0 (initial evaluation) and then at each epoch end.
class EvalAccuracyCallback(TrainerCallback):
    def __init__(self, train_dataset, eval_dataset, tokenizer):
        self.train_dataset = train_dataset  # non-tokenised lists
        self.eval_dataset = eval_dataset
        self.tokenizer = tokenizer
        self.epochs = [0]
        self.train_accuracies = []
        self.eval_accuracies = []

    def on_train_begin(self, args, state, control, **kwargs):
        model = kwargs["model"]
        init_train_acc = compute_accuracy(model, self.train_dataset, self.tokenizer)
        init_eval_acc = compute_accuracy(model, self.eval_dataset, self.tokenizer)
        self.train_accuracies.append(init_train_acc)
        self.eval_accuracies.append(init_eval_acc)
        print(f"Epoch 0: Train Accuracy = {init_train_acc:.4f}, Eval Accuracy = {init_eval_acc:.4f}")

    def on_epoch_end(self, args, state, control, **kwargs):
        model = kwargs["model"]
        train_acc = compute_accuracy(model, self.train_dataset, self.tokenizer)
        eval_acc = compute_accuracy(model, self.eval_dataset, self.tokenizer)
        epoch_num = int(state.epoch)
        self.epochs.append(epoch_num)
        self.train_accuracies.append(train_acc)
        self.eval_accuracies.append(eval_acc)
        print(f"Epoch {epoch_num}: Train Accuracy = {train_acc:.4f}, Eval Accuracy = {eval_acc:.4f}")

# args
training_args = TrainingArguments(
    output_dir="./sft_model",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
)


# train -- sft
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_tokenised,
    eval_dataset=eval_ds_tokenised,
    data_collator=data_collator,
    callbacks=[EvalAccuracyCallback(train_ds, eval_ds, tokenizer)]
)

trainer.train()
print("final accuracy", compute_accuracy(model, eval_ds, tokenizer))

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/747 [00:00<?, ? examples/s]

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter: