In [1]:
# @title 1. Install Dependencies
!pip install -q datasets transformers accelerate evaluate sentencepiece

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
# @title 2. Imports and Device Check
import torch, os, re
import numpy as np
from pathlib import Path
from datasets import load_dataset
from transformers import (
    AutoTokenizer, T5ForConditionalGeneration,
    DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
)

print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
os.environ["WANDB_DISABLED"] = "true"

Device: Tesla T4


In [4]:
# @title 3. Configuration
dataset_name = "lighteval/MATH-Hard"     # or "hendrycks/competition_math"
model_name = "google/byt5-base"
output_dir = "./byt5_math_proof_output"

num_train_epochs = 10
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
num_think_tokens = 1      # adds <think> tokens for reasoning
max_input_length = 512
max_target_length = 512
self_consistency_k = 0    # >0 enables majority-vote inference


In [5]:
# @title 4. Load Dataset
def load_math_dataset(name, split="train", max_examples=None):
    ds = load_dataset(name, split=split)
    if max_examples:
        ds = ds.select(range(max_examples))
    print(ds)
    return ds

train_dataset = load_math_dataset(dataset_name, split="train[:50%]")
test_dataset  = load_math_dataset(dataset_name, split="test[:1%]")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset({
    features: ['problem', 'level', 'type', 'solution'],
    num_rows: 1152
})
Dataset({
    features: ['problem', 'level', 'type', 'solution'],
    num_rows: 13
})


In [6]:
# @title 5. Preprocessing Functions
def build_input(problem):
    return (
        "Solve this problem step by step. "
        "Show reasoning clearly and put the final answer inside \\boxed{...}.\n\n"
        f"Problem: {problem}\n\nSolution:"
    )


def preprocess_examples(examples, tokenizer, max_input_length, max_target_length, num_think_tokens=0):
    inputs  = [build_input(p) for p in examples["problem"]]

    examples["solution"] = [s.replace("\\\\boxed", "\\boxed") for s in examples["solution"]]
    targets = examples["solution"]  # full LaTeX solution with \boxed{...}


    if num_think_tokens > 0:
        prefix = "<think>" * num_think_tokens
        targets = [prefix + t for t in targets]

    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=max_target_length, truncation=True, padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [7]:
# @title 6. Load Tokenizer & Model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

if num_think_tokens > 0 and "<think>" not in tokenizer.get_vocab():
    tokenizer.add_tokens(["<think>"])
    model.resize_token_embeddings(len(tokenizer))


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [8]:
# @title 7. Tokenize Datasets
def preprocess_fn(batch):
    return preprocess_examples(batch, tokenizer, max_input_length, max_target_length, num_think_tokens)

tokenized_train = train_dataset.map(preprocess_fn, batched=True, remove_columns=train_dataset.column_names)
tokenized_test  = test_dataset.map(preprocess_fn,  batched=True, remove_columns=test_dataset.column_names)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


Map:   0%|          | 0/1152 [00:00<?, ? examples/s]



In [9]:
# @title 8. Evaluation Metrics
def extract_boxed_answer(text: str):
    # matches \boxed{ ... } (raw string in HF JSON often has '\\boxed{')
    m = re.findall(r"\\boxed\{([^}]*)\}", text)
    return m[-1].strip() if m else None

def final_answer_accuracy(preds, refs):
    total, hits = 0, 0
    for p, r in zip(preds, refs):
        p_box, r_box = extract_boxed_answer(p), extract_boxed_answer(r)
        # Count only if reference actually has a boxed answer
        # (MATH should, but this guards weird cases)
        if r_box is None:
            continue
        total += 1
        if p_box is not None and p_box == r_box:
            hits += 1
    return {"final_answer_acc": (hits / total) if total else 0.0}

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple): preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    acc = final_answer_accuracy(decoded_preds, decoded_labels)
    return acc


In [10]:
# @title 9. Training Arguments & Trainer
args_1 = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    predict_with_generate=True,
    logging_dir=Path(output_dir) / "logs",
    logging_steps=100,
    # evaluation_strategy="epoch",
    save_total_limit=2,
    # evaluation_strategy=  # ← removed for compatibility
    # If you want periodic saving/logging by steps instead of epochs, you can add:
    # save_steps=500,
    # logging_strategy="steps",
)

# ================================================================================
args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    learning_rate=6e-4,           # T5/ByT5 like a higher LR than BERT
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",

    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16, # effective batch 32
    dataloader_num_workers=2,

    predict_with_generate=True,
    generation_max_length=512,

    logging_dir=Path(output_dir) / "logs",
    logging_steps=100,
    # evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=2,
    bf16=torch.cuda.is_available(),   # or fp16=True if no bf16
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


In [None]:
# @title 10. Train & Evaluate
train = True
evaluate = True

if train:
    print("🚀 Starting fine-tuning...")
    trainer.train()
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

if evaluate:
    print("📈 Evaluating...")
    results = trainer.evaluate()
    print("Results:", results)


🚀 Starting fine-tuning...


Step,Training Loss
100,2.2114
200,0.8037
300,0.69
400,0.6124
500,0.5596
600,0.5267


In [None]:
# @title 11. Self-Consistency Evaluation
if self_consistency_k > 0:
    from collections import Counter
    print(f"Running self-consistency with k={self_consistency_k}")
    model.eval()
    preds, refs = [], []
    for ex in test_dataset:
        q, ref = ex["problem"], ex["solution"]
        input_ids = tokenizer(q, return_tensors="pt").input_ids
        candidates = []
        for _ in range(self_consistency_k):
            out = model.generate(input_ids, do_sample=True, top_p=0.9, max_length=max_target_length)
            text = tokenizer.decode(out[0], skip_special_tokens=True)
            candidates.append(text)
        preds.append(Counter(candidates).most_common(1)[0][0])
        refs.append(ref)
    acc = final_answer_accuracy(preds, refs)
    print("Self-consistency final-answer Acc:", acc)


In [None]:
# @title 12. Quick Inference
# Inference (device-safe)
# --- generation config ---
gen_kwargs = dict(
    max_new_tokens=512,
    num_beams=4,                # or do_sample=True, top_p=0.9 for self-consistency
    length_penalty=0.8,         # discourage overly long rambles
    no_repeat_ngram_size=3,
)

# --- inference ---
question = "Compute the derivative of x^3 + 2x^2 + 5x + 7."
inp = build_input(question)  # <-- use your build_input() if you added it
inputs = tokenizer(inp, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

with torch.no_grad():
    output_ids = model.generate(**inputs, **gen_kwargs)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))




In [None]:
print(train_dataset[0].keys())
print(train_dataset[0]["problem"])
print(train_dataset[0]["solution"])
