In [1]:
# --- Hugging Face Login & Installations ---
from google.colab import userdata
from huggingface_hub import notebook_login

# Read the Hugging Face token from Colab's secret store and stop early when it is empty.
# NOTE(review): userdata.get may itself raise if the secret is missing or notebook
# access was not granted, in which case the check below is never reached — confirm.
hf_token = userdata.get('HF_TOKEN')
if not hf_token:
    raise ValueError("HF_TOKEN not found in Colab Secrets. Please add it.")
# notebook_login(new_session=hf_token) # Unsloth handles token auth automatically

In [2]:
# # Install Unsloth for Google Colab
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# # Standard installations
# !pip install -U transformers
# !pip install -U datasets
# !pip install -U accelerate # Required for Unsloth

# Install the Unsloth stack in three steps: core training libraries without dependency
# resolution, then tokenizer/dataset/hub helpers with normal resolution, then unsloth itself.
# NOTE(review): --no-deps presumably keeps pip from replacing Colab's preinstalled
# torch/transformers builds — confirm. Only xformers is pinned to an exact version.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
!pip install --no-deps unsloth

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting xformers==0.0.29.post3
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl
  Downloading trl-0.21.0-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.8.3-py3-none-any.whl.metadata (9.4 kB)
Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl (43.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl (72.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.21.0-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from pathlib import Path
import json
import datetime
import torch
import random
import numpy as np

CONFIG = {
    # Central experiment configuration. Later statements in this cell add derived
    # keys (experiment_id, output_dir, final_adapter_dir, merged_model_dir) and the
    # whole dict is written to config.json.

    # Core experiment parameters
    "experiment_type": "equation_extraction",
    # Unsloth's pre-quantized 4-bit Gemma 3 1B checkpoint
    "model_name": "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "max_seq_length": 512, # passed to FastModel.from_pretrained and to the trainer

    # Prompting configuration
    # NOTE(review): neither key below is read by any cell visible in this notebook —
    # confirm whether few-shot prompting is still planned or these can be dropped.
    "include_examples": True,
    "few_shot_examples": [
        ('computational_error', 4966),
        ('conceptual_error', 1091),
    ],

    # Training parameters
    "learning_rate": 1e-4,
    "num_epochs": 1, # number of training epochs
    "batch_size": 32, # per-device train batch size. NOTE(review): an earlier comment said "halved from 4", which does not match 32 — confirm the intended value
    "gradient_accumulation_steps": 1,

    # LoRA params
    "lora_rank": 16,
    "lora_alpha": 32, # 2x lora_rank
    "lora_dropout": 0.05,

    # Paths
    "base_dataset_path": "/content/aug_10_eqn_extraction_dataset.csv",
    "output_base_dir": "/content/experiments",
}

# --- Build a unique, minute-resolution experiment ID ---
model_id_short = "gemma3-1b-unsloth"
experiment_id = "_".join([
    CONFIG['experiment_type'],
    model_id_short,
    datetime.datetime.now().strftime('%Y%m%d_%H%M'),
])
CONFIG["experiment_id"] = experiment_id
print(f"Experiment ID: {experiment_id}")

# --- Create the experiment folder tree and record the derived paths in CONFIG ---
output_dir = Path(CONFIG["output_base_dir"]) / CONFIG["experiment_id"]
(output_dir / "baseline_results").mkdir(parents=True, exist_ok=True)
(output_dir / "final_results").mkdir(parents=True, exist_ok=True)
CONFIG.update(
    output_dir=str(output_dir),
    final_adapter_dir=str(output_dir / "final_adapter"),
    merged_model_dir=str(output_dir / "final_merged_model"),
)

# Snapshot the full configuration next to the results for later reference.
with open(output_dir / "config.json", 'w') as f:
    json.dump(CONFIG, f, indent=2)
print(f"Output directory created: {output_dir}")

# --- Set Random Seeds for Reproducibility ---
def set_seeds(seed=42):
    """Seed Python's `random`, NumPy and PyTorch (and every CUDA device, if any) with `seed`."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seeds(42)

print("\n✅ Setup complete.")

Experiment ID: equation_extraction_gemma3-1b-unsloth_20250810_2013
Output directory created: /content/experiments/equation_extraction_gemma3-1b-unsloth_20250810_2013

✅ Setup complete.


In [4]:
### Cell 3: System Prompt for Equation Extraction

# Instruction block that _build_conversation_messages places at the start of every user turn.
SYSTEM_PROMPT = """[ROLE]
You are an expert at parsing mathematical solutions.

[TASK]
You are given a single line from a mathematical solution. Your task is to extract the calculation from this line.

**This is a literal transcription task. Follow these rules with extreme precision:**
- **RULE 1: Transcribe EXACTLY.** Do not correct mathematical errors. If a line implies `2+2=5`, your output for that line must be `2+2=5`.
- **RULE 2: Isolate the Equation.** Your output must contain ONLY the equation, with no surrounding text, units, or currency symbols.

[RESPONSE FORMAT]
Your response must ONLY contain the extracted equation, wrapped in <eq> and </eq> tags.
If the line contains no calculation, respond with empty tags: <eq></eq>.
"""

In [5]:
### Cell 4: Core utilities

import pandas as pd
from unsloth import FastModel
from unsloth.chat_templates import get_chat_template
import torch

# 4.1 Loading

def load_base_dataset():
    """Read the CSV at CONFIG['base_dataset_path'] into a DataFrame and report its row count."""
    csv_path = CONFIG['base_dataset_path']
    base_frame = pd.read_csv(csv_path)
    print(f"Loaded dataset with {len(base_frame)} samples from {csv_path}")
    return base_frame

def load_unsloth_model_and_tokenizer():
    """
    Load the 4-bit checkpoint named in CONFIG['model_name'] together with its tokenizer.

    The model is returned as loaded. The tokenizer is configured before being returned:
    the "gemma-3" chat template is applied, the pad token falls back to EOS when unset,
    and padding is switched to the left side.

    Returns:
        (model, tokenizer) tuple.
    """
    checkpoint = CONFIG["model_name"]
    print(f"Loading model and tokenizer: {checkpoint}")

    load_options = dict(
        model_name=checkpoint,
        max_seq_length=CONFIG["max_seq_length"],
        dtype=None,
        load_in_4bit=True,
    )
    model, raw_tokenizer = FastModel.from_pretrained(**load_options)

    tokenizer = get_chat_template(raw_tokenizer, chat_template="gemma-3")

    # Batched generation in this notebook relies on a pad token and left padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    return model, tokenizer

def prepare_model_for_lora_training(model):
    """
    Wrap `model` with LoRA adapters through FastModel.get_peft_model.

    Rank, alpha and dropout come from CONFIG; adapters target the attention and MLP
    projection layers, no bias terms are trained, gradient checkpointing is enabled
    and the adapter initialisation seed is fixed at 42.

    Returns:
        The adapter-wrapped model.
    """
    projection_layers = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    lora_settings = {
        "r": CONFIG["lora_rank"],
        "lora_alpha": CONFIG["lora_alpha"],
        "lora_dropout": CONFIG["lora_dropout"],
        "bias": "none",
        "use_gradient_checkpointing": True,
        "random_state": 42,
        "target_modules": projection_layers,
    }
    return FastModel.get_peft_model(model, **lora_settings)

# 4.2 Formatting

def format_user_message(sample: dict) -> str:
    """Return the user-facing text for `sample`: a fixed header followed by its 'line' value."""
    solution_line = sample['line']
    return f"### Solution Line:\n{solution_line}"

def format_expected_output(sample: dict) -> str:
    """Return the target completion: the sample's 'eqn' value inside <eq> tags (empty when missing/NaN)."""
    raw_equation = sample.get('eqn', '')
    tag_body = '' if pd.isna(raw_equation) else raw_equation
    return f"<eq>{tag_body}</eq>"

# 4.3 Prompt Construction

def _build_conversation_messages(sample, is_training_prompt=True):
    """
    Assemble the chat messages for one sample.

    The user turn is SYSTEM_PROMPT, a blank line, then the formatted solution line.
    When `is_training_prompt` is true, an assistant turn holding the expected output
    is appended; otherwise only the user turn is returned.
    """
    user_turn = {
        "role": "user",
        "content": f"{SYSTEM_PROMPT}\n\n{format_user_message(sample)}",
    }
    if not is_training_prompt:
        return [user_turn]

    assistant_turn = {"role": "assistant", "content": format_expected_output(sample)}
    return [user_turn, assistant_turn]

# 4.4 Prompt creation for inference
def create_sample_prompt_for_inference(sample, tokenizer):
    """Render the user-only conversation for `sample` as a prompt string ending with the generation prompt."""
    user_only_messages = _build_conversation_messages(sample, is_training_prompt=False)
    rendered_prompt = tokenizer.apply_chat_template(
        user_only_messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    return rendered_prompt

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [6]:
### Cell 5: Dataset preparation

from datasets import Dataset
from sklearn.model_selection import train_test_split

def prepare_datasets(base_df):
    """
    Split `base_df` 80/20 into train and test frames.

    The split is stratified on the 'type' column and uses a fixed random_state of 42.

    Returns:
        (train_df, test_df) tuple of DataFrames.
    """
    split_options = {
        "test_size": 0.2,
        "random_state": 42,
        "stratify": base_df['type'],
    }
    train_df, test_df = train_test_split(base_df, **split_options)
    print(f"Data split using stratified split: {len(train_df)} training, {len(test_df)} testing samples.")
    return train_df, test_df

def create_training_dataset(df, tokenizer):
    """
    Convert `df` into a Hugging Face Dataset with an added "text" column.

    Each row's "text" is the full training conversation (user turn plus expected
    assistant turn) rendered through the tokenizer's chat template as a string;
    no tokenization happens here.
    """
    def _render_training_text(sample):
        conversation = _build_conversation_messages(sample, is_training_prompt=True)
        rendered = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
        return {"text": rendered}

    hf_dataset = Dataset.from_pandas(df)
    return hf_dataset.map(_render_training_text, load_from_cache_file=False)

In [7]:
### Cell 6: Evaluation logic

import re
import math
import json
import pandas as pd
from tqdm import tqdm

def _safe_eval(expression: str):
    """Safely evaluates a string, returning a sentinel on error."""
    try:
        if not expression: return None
        return eval(expression, {"__builtins__": None}, {})
    except Exception:
        return object()

def _extract_components(expression: str) -> tuple[list, list]:
    """Extracts number and operator sequences, rounding to 2 decimal places."""
    number_strings = re.findall(r'\d+\.?\d*|\.\d+', expression)
    numbers = [round(float(n), 2) for n in number_strings]
    operators = re.findall(r'[+\-*/]', expression)
    return numbers, operators

def _expressions_are_equivalent(pred_expr: str, exp_expr: str) -> bool:
    """
    Decide whether two expression strings match both in value and in structure.

    Both must evaluate to numbers that agree within a relative tolerance of 1e-5,
    and they must contain the same sequence of numbers and the same sequence of
    operators.
    """
    pred_val = _safe_eval(pred_expr)
    exp_val = _safe_eval(exp_expr)

    numeric_types = (int, float)
    if not isinstance(pred_val, numeric_types) or not isinstance(exp_val, numeric_types):
        return False
    if not math.isclose(pred_val, exp_val, rel_tol=1e-5):
        return False

    # Comparing the (numbers, operators) tuples checks both sequences at once.
    return _extract_components(pred_expr) == _extract_components(exp_expr)

def extract_equation_from_response(response: str) -> str | None:
    """Extracts content from between <eq> and </eq> tags."""
    match = re.search(r'<eq>(.*?)</eq>', response, re.DOTALL)
    return match.group(1) if match else None

def _sanitize_equation_string(expression: str) -> str:
    """Cleans a single equation string by stripping whitespace, standardizing
    multiplication, and removing non-mathematical characters."""
    if not isinstance(expression, str):
        return ""
    sanitized = expression.strip().replace(' ', '')
    sanitized = sanitized.replace('x', '*').replace('×', '*')
    sanitized = re.sub(r'/([a-zA-Z]+)', '', sanitized)
    sanitized = re.sub(r'[^\d.()+\-*/=]', '', sanitized)
    return sanitized

def rigorous_compare_equations(predicted_eqn: str, expected_eqn: str) -> float:
    """
    Score a predicted equation against the expected one (1.0 match, 0.0 otherwise).

    Two empty strings count as a match. If only one is empty, or either does not
    contain exactly one '=', the score is 0.0. Otherwise both sides of the
    equations must be equivalent expressions.
    """
    has_pred, has_exp = bool(predicted_eqn), bool(expected_eqn)
    if not (has_pred or has_exp):
        return 1.0
    if not (has_pred and has_exp):
        return 0.0
    if any(eqn.count('=') != 1 for eqn in (predicted_eqn, expected_eqn)):
        return 0.0

    pred_lhs, _, pred_rhs = predicted_eqn.partition('=')
    exp_lhs, _, exp_rhs = expected_eqn.partition('=')

    side_pairs = ((pred_lhs, exp_lhs), (pred_rhs, exp_rhs))
    sides_match = all(_expressions_are_equivalent(pred, exp) for pred, exp in side_pairs)
    return 1.0 if sides_match else 0.0

def run_unsloth_inference(model, tokenizer, df_to_eval, batch_size=32):
    """Runs inference using the provided Unsloth model and tokenizer.

    Builds one inference prompt per row of `df_to_eval`, generates in batches of
    `batch_size`, and decodes only the newly generated tokens.

    Args:
        model: Model exposing `.generate`.
        tokenizer: Tokenizer used for prompt rendering, batching and decoding.
        df_to_eval: DataFrame whose rows are converted to dicts and formatted as prompts.
        batch_size: Number of prompts per generate call.

    Returns:
        List of decoded completion strings, in the same order as the rows of `df_to_eval`.
    """
    print(f"\n--- Running Unsloth native inference ---")
    prompts = [create_sample_prompt_for_inference(row.to_dict(), tokenizer) for _, row in df_to_eval.iterrows()]
    all_predictions = []
    for i in tqdm(range(0, len(prompts), batch_size), desc="Inference Batches"):
        batch_prompts = prompts[i:i + batch_size]
        # padding_side is re-asserted before each tokenizer/model call below.
        # NOTE(review): presumably defensive, in case generate() or Unsloth's patches
        # flip it back to "right" — confirm before collapsing these assignments.
        tokenizer.padding_side = "left"
        # Inputs are moved to the "cuda" device unconditionally (a GPU is required).
        inputs = tokenizer(
            batch_prompts,
            return_tensors="pt",
            padding=True
            ).to("cuda")
        tokenizer.padding_side = "left"
        # No sampling arguments are passed, so decoding follows the model's own
        # generation defaults; each completion is capped at 64 new tokens.
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            use_cache=True,
            pad_token_id=tokenizer.pad_token_id
            )
        tokenizer.padding_side = "left"
        # Slice off the (left-padded) prompt so only generated tokens are decoded.
        decoded_outputs = tokenizer.batch_decode(
            outputs[:, inputs.input_ids.shape[1]:],
            skip_special_tokens=True
            )
        all_predictions.extend(decoded_outputs)
    return all_predictions

def evaluate_predictions(test_df, predictions):
    """
    Score raw model outputs against the expected equations in `test_df`.

    For each prediction the <eq>...</eq> content is extracted, both predicted and
    expected equations are sanitised, and a rigorous 0/1 score is computed.

    A response without a parsable <eq>...</eq> pair always scores 0.0. Previously
    such a response was sanitised to "" and therefore scored 1.0 whenever the
    expected equation was empty, counting a format failure as a correct answer.

    Args:
        test_df: DataFrame with 'line' and 'eqn' columns, aligned by position
            with `predictions`.
        predictions: Sequence of raw generated strings, one per row of `test_df`.

    Returns:
        (results_df, metrics): per-sample results and a dict with
        mean_rigorous_score, total_samples, tag_parse_failures and failure_rate.
        With no samples, the score and failure rate are reported as 0.

    Raises:
        ValueError: If `predictions` and `test_df` differ in length.
    """
    if len(predictions) != len(test_df):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(test_df)} test samples; "
            "they must be aligned one-to-one."
        )

    result_columns = [
        'line_text',
        'expected_equation',
        'predicted_equation',
        'rigorous_score',
        'full_prediction_text',
    ]
    results_data = []
    for row_idx, pred_text in enumerate(predictions):
        original_sample = test_df.iloc[row_idx].to_dict()

        expected_eqn = original_sample.get('eqn', '')
        if pd.isna(expected_eqn):
            expected_eqn = ''

        predicted_eqn_raw = extract_equation_from_response(pred_text)

        if predicted_eqn_raw is None:
            # Missing/malformed tags are a format failure, never a match.
            score = 0.0
        else:
            sanitized_pred = _sanitize_equation_string(predicted_eqn_raw)
            sanitized_exp = _sanitize_equation_string(expected_eqn)
            score = rigorous_compare_equations(sanitized_pred, sanitized_exp)

        results_data.append({
            'line_text': original_sample.get('line'),
            'expected_equation': expected_eqn,
            'predicted_equation': predicted_eqn_raw,
            'rigorous_score': score,
            'full_prediction_text': pred_text.strip(),
        })

    # Passing the columns explicitly keeps the frame well-formed when there are no rows.
    results_df = pd.DataFrame(results_data, columns=result_columns)

    total_samples = len(results_df)
    parse_failures = int(results_df['predicted_equation'].isnull().sum())
    metrics = {
        "mean_rigorous_score": float(results_df['rigorous_score'].mean()) if total_samples > 0 else 0.0,
        "total_samples": total_samples,
        "tag_parse_failures": parse_failures,
        "failure_rate": parse_failures / total_samples if total_samples > 0 else 0,
    }
    return results_df, metrics

In [8]:
### Cell 7: Fine-tuning function

from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments
from unsloth.chat_templates import train_on_responses_only

def run_fine_tuning(model, tokenizer, train_dataset):
    """
    Fine-tune `model` on `train_dataset` with TRL's SFTTrainer, computing loss on
    the assistant responses only.

    Hyper-parameters (batch size, gradient accumulation, learning rate, number of
    epochs, max sequence length) are read from CONFIG. After training, the adapter
    is saved to CONFIG['final_adapter_dir'] and the logged training losses are
    written to `training_log.csv` inside CONFIG['output_dir'].

    Args:
        model: The adapter-wrapped model to train.
        tokenizer: Tokenizer matching the model.
        train_dataset: Dataset with a "text" column holding rendered conversations.

    Returns:
        The object returned by `trainer.train()`.
    """
    # Resolve paths from CONFIG rather than from notebook globals, so the function
    # does not depend on which cell last assigned `output_dir`.
    experiment_dir = Path(CONFIG["output_dir"])
    use_bf16 = torch.cuda.is_bf16_supported()

    # Configure the trainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        dataset_text_field="text",
        max_seq_length=CONFIG["max_seq_length"],
        dataset_num_proc=2,
        args=SFTConfig(
            per_device_train_batch_size=CONFIG["batch_size"],
            gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],
            warmup_steps=5,
            # Honour the configured epoch count (it was hard-coded to 1 while the
            # progress message below reported CONFIG['num_epochs']).
            num_train_epochs=CONFIG["num_epochs"],
            learning_rate=CONFIG["learning_rate"],
            # Use bf16 when the GPU supports it, otherwise fall back to fp16.
            fp16=not use_bf16,
            bf16=use_bf16,
            logging_steps=5,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=42,
            output_dir=str(experiment_dir / "training_checkpoints"),
            report_to="none",
        ),
    )

    # Use Unsloth's helper so that only the tokens after the model-turn marker
    # contribute to the loss; the markers match the Gemma chat template.
    trainer = train_on_responses_only(
        trainer,
        instruction_part="<start_of_turn>user",
        response_part="<start_of_turn>model",
    )

    print(f"\n--- Starting fine-tuning for {CONFIG['num_epochs']} epoch(s) ---")
    trainer_stats = trainer.train()

    # Save the final LoRA adapter
    print(f"\n✅ Fine-tuning finished! Saving final adapter to {CONFIG['final_adapter_dir']}")
    model.save_pretrained(CONFIG["final_adapter_dir"])

    # Save the training log (only entries that carry a 'loss' value)
    log_history = [log for log in trainer.state.log_history if 'loss' in log]
    log_history_df = pd.DataFrame(log_history)
    log_path = experiment_dir / "training_log.csv"
    log_history_df.to_csv(log_path, index=False)
    print(f"✅ Training log saved to: {log_path}")

    return trainer_stats

In [9]:
### Cell 8: Pipeline execution

In [10]:
# 8.1 Load the dataset and split it into train/test frames
# (no few-shot examples are loaded in this cell).
base_df = load_base_dataset()
train_df, test_df = prepare_datasets(base_df)
print("\n✅ Data loaded and split.")

Loaded dataset with 3383 samples from /content/aug_10_eqn_extraction_dataset.csv
Data split using stratified split: 2706 training, 677 testing samples.

✅ Data loaded and split.


In [11]:
# 8.2 Load model and tokenizer
# `model` and `tokenizer` are notebook-level names reused by every later phase.
model, tokenizer = load_unsloth_model_and_tokenizer()
print("\n✅ Unsloth model and tokenizer loaded.")

Loading model and tokenizer: unsloth/gemma-3-1b-it-unsloth-bnb-4bit
==((====))==  Unsloth 2025.8.4: Fast Gemma3 patching. Transformers: 4.55.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




model.safetensors:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]


✅ Unsloth model and tokenizer loaded.


In [12]:
# 8.3 Build the training conversation for the first training row and print it turn by turn
inspection_sample = train_df.iloc[0].to_dict()

conversation_messages = _build_conversation_messages(sample=inspection_sample, is_training_prompt=True)

import json
print("Example conversation:")
for message in conversation_messages:
    # Role header, content, then one blank separator line.
    print(f"{message['role']}:", message['content'], "", sep="\n")

Example conversation:
user:
[ROLE]
You are an expert at parsing mathematical solutions.

[TASK]
You are given a single line from a mathematical solution. Your task is to extract the calculation from this line.

**This is a literal transcription task. Follow these rules with extreme precision:**
- **RULE 1: Transcribe EXACTLY.** Do not correct mathematical errors. If a line implies `2+2=5`, your output for that line must be `2+2=5`.
- **RULE 2: Isolate the Equation.** Your output must contain ONLY the equation, with no surrounding text, units, or currency symbols.

[RESPONSE FORMAT]
Your response must ONLY contain the extracted equation, wrapped in <eq> and </eq> tags.
If the line contains no calculation, respond with empty tags: <eq></eq>.


### Solution Line:
If Melany has to fence 5000 feet of the field and has 4000 feet of wire mesh, she will not fence 5000-4000 = 100 feet of the field.

assistant:
<eq>5000-4000=100</eq>



In [13]:
# 8.4 Render the example conversation through the tokenizer's chat template and
# print the raw string, to check the turn markers the trainer will see.

final_prompt_string = tokenizer.apply_chat_template(
    conversation_messages,
    tokenize=False,
    add_generation_prompt=False # False: the assistant turn is already present in a training example
)

print(final_prompt_string)

<bos><start_of_turn>user
[ROLE]
You are an expert at parsing mathematical solutions.

[TASK]
You are given a single line from a mathematical solution. Your task is to extract the calculation from this line.

**This is a literal transcription task. Follow these rules with extreme precision:**
- **RULE 1: Transcribe EXACTLY.** Do not correct mathematical errors. If a line implies `2+2=5`, your output for that line must be `2+2=5`.
- **RULE 2: Isolate the Equation.** Your output must contain ONLY the equation, with no surrounding text, units, or currency symbols.

[RESPONSE FORMAT]
Your response must ONLY contain the extracted equation, wrapped in <eq> and </eq> tags.
If the line contains no calculation, respond with empty tags: <eq></eq>.


### Solution Line:
If Melany has to fence 5000 feet of the field and has 4000 feet of wire mesh, she will not fence 5000-4000 = 100 feet of the field.<end_of_turn>
<start_of_turn>model
<eq>5000-4000=100</eq><end_of_turn>



In [14]:
# ===================================================================
# PHASE 1: BASELINE EVALUATION
# ===================================================================

from tqdm import tqdm

print("\n" + "="*50, "PHASE 1: BASELINE EVALUATION", "="*50, sep="\n")

# Generate completions with the untouched base model.
baseline_predictions = run_unsloth_inference(model, tokenizer, test_df, batch_size=256)

# Score the completions, then persist per-sample results (CSV) and summary metrics (JSON).
baseline_results_df, baseline_metrics = evaluate_predictions(test_df, baseline_predictions)
baseline_results_path = Path(CONFIG["output_dir"]) / "baseline_results" / "baseline_evaluation_results.csv"
baseline_metrics_path = Path(CONFIG["output_dir"]) / "baseline_results" / "baseline_metrics.json"
baseline_results_df.to_csv(baseline_results_path, index=False)
with open(baseline_metrics_path, 'w') as f:
    f.write(json.dumps(baseline_metrics, indent=2))

print("\n--- Baseline Metrics ---", json.dumps(baseline_metrics, indent=2), sep="\n")
print("✅ Baseline results saved.")



PHASE 1: BASELINE EVALUATION

--- Running Unsloth native inference ---


Inference Batches: 100%|██████████| 3/3 [00:40<00:00, 13.45s/it]


--- Baseline Metrics ---
{
  "mean_rigorous_score": 0.4741506646971935,
  "total_samples": 677,
  "tag_parse_failures": 4,
  "failure_rate": 0.005908419497784343
}
✅ Baseline results saved.





In [15]:
# Display the per-sample baseline results as the cell's output.
baseline_results_df

Unnamed: 0,line_text,expected_equation,predicted_equation,rigorous_score,full_prediction_text
0,"If she remained with $900, she initially had $...",900+800=1700,2*900 = 1800,0.0,<eq>2*900 = 1800</eq>
1,The group was able to sell 20 x 8 = 160 candy ...,20*8=160,20 x 8 = 160,1.0,<eq>20 x 8 = 160</eq>
2,One pack of sugar costs $2 - $1 = $2.,2-1=2,2 - $1 = $2,1.0,<eq>2 - $1 = $2</eq>
3,The discount of the United flight would come t...,1100*0.3=303,303*0.3,0.0,<eq>303*0.3</eq>
4,So the combined flock has 150+150=300 ducks,150+150=300,150+150=300,1.0,<eq>150+150=300</eq>
...,...,...,...,...,...
672,The distance each person travels is equal to t...,,20 mph * (x + 15),0.0,<eq>20 mph * (x + 15) </eq>
673,He also scored 2*6=12 points from the 2 point ...,2*6=12,2*6=12,1.0,<eq>2*6=12</eq>
674,The total number of sweaters she had knit on M...,10+8=81,10+8=81,1.0,<eq>10+8=81</eq>
675,He therefore needs to buy 5 bags of flour beca...,,4 < 4.8 < 5,0.0,<eq>4 < 4.8 < 5</eq>


In [16]:
# ===================================================================
# PHASE 2: FINE-TUNE
# ===================================================================

print("\n" + "="*50)
print("PHASE 2: FINE-TUNING")
print("="*50)

# 1. Attach LoRA adapters. `model` is rebound to the adapter-wrapped model, so every
#    later cell that uses `model` sees the fine-tuned version, not the base model.
model = prepare_model_for_lora_training(model)

# 2. Build the Hugging Face Dataset (with a rendered "text" column) for the trainer
train_dataset = create_training_dataset(train_df, tokenizer)

# 3. Run the fine-tuning process; this also saves the adapter and the training log
training_stats = run_fine_tuning(model, tokenizer, train_dataset)

print("✅ Fine-tuning complete. The model object in memory is now updated.")


PHASE 2: FINE-TUNING


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Unsloth: Making `model.base_model.model.model` require gradients


Map:   0%|          | 0/2706 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2706 [00:00<?, ? examples/s]

Map (num_proc=12):   0%|          | 0/2706 [00:00<?, ? examples/s]


--- Starting fine-tuning for 1 epoch(s) ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,706 | Num Epochs = 1 | Total steps = 85
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 13,045,760 of 1,012,931,712 (1.29% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
5,0.7917
10,0.1505
15,0.0798
20,0.0598
25,0.0464
30,0.0315
35,0.0318
40,0.0295
45,0.0112
50,0.0164


Unsloth: Will smartly offload gradients to save VRAM!

✅ Fine-tuning finished! Saving final adapter to /content/experiments/equation_extraction_gemma3-1b-unsloth_20250810_2013/final_adapter
✅ Training log saved to: /content/experiments/equation_extraction_gemma3-1b-unsloth_20250810_2013/training_log.csv
✅ Fine-tuning complete. The model object in memory is now updated.


In [17]:
# ===================================================================
# PHASE 3: FINAL EVALUATION
# ===================================================================

import gc

print("\n" + "="*50, "PHASE 3: FINAL EVALUATION", "="*50, sep="\n")

# Generate completions with the fine-tuned (adapter-wrapped) model on the same test split.
final_predictions = run_unsloth_inference(model, tokenizer, test_df, batch_size=256)

# Score the completions, then persist per-sample results (CSV) and summary metrics (JSON).
final_results_df, final_metrics = evaluate_predictions(test_df, final_predictions)
final_results_path = Path(CONFIG["output_dir"]) / "final_results" / "final_evaluation_results.csv"
final_metrics_path = Path(CONFIG["output_dir"]) / "final_results" / "final_metrics.json"
final_results_df.to_csv(final_results_path, index=False)
with open(final_metrics_path, 'w') as f:
    f.write(json.dumps(final_metrics, indent=2))

print("\n--- Final Metrics ---", json.dumps(final_metrics, indent=2), sep="\n")
print("✅ Final results saved.")

# # --- Clean up ---
# del model, tokenizer, final_predictions, final_results_df
# gc.collect()
# torch.cuda.empty_cache()


PHASE 3: FINAL EVALUATION

--- Running Unsloth native inference ---


Inference Batches: 100%|██████████| 3/3 [00:28<00:00,  9.36s/it]


--- Final Metrics ---
{
  "mean_rigorous_score": 0.948301329394387,
  "total_samples": 677,
  "tag_parse_failures": 0,
  "failure_rate": 0.0
}
✅ Final results saved.





In [18]:
# Display the per-sample results of the fine-tuned model as the cell's output.
final_results_df

Unnamed: 0,line_text,expected_equation,predicted_equation,rigorous_score,full_prediction_text
0,"If she remained with $900, she initially had $...",900+800=1700,900+800=1700,1.0,<eq>900+800=1700</eq>
1,The group was able to sell 20 x 8 = 160 candy ...,20*8=160,20*8=160,1.0,<eq>20*8=160</eq>
2,One pack of sugar costs $2 - $1 = $2.,2-1=2,2-1=2,1.0,<eq>2-1=2</eq>
3,The discount of the United flight would come t...,1100*0.3=303,1100*.3=303,1.0,<eq>1100*.3=303</eq>
4,So the combined flock has 150+150=300 ducks,150+150=300,150+150=300,1.0,<eq>150+150=300</eq>
...,...,...,...,...,...
672,The distance each person travels is equal to t...,,,1.0,<eq></eq>
673,He also scored 2*6=12 points from the 2 point ...,2*6=12,2*6=12,1.0,<eq>2*6=12</eq>
674,The total number of sweaters she had knit on M...,10+8=81,10+8=81,1.0,<eq>10+8=81</eq>
675,He therefore needs to buy 5 bags of flour beca...,,,1.0,<eq></eq>


In [19]:
# --- FINAL COMPARISON ---
print("\n" + "="*50, "PERFORMANCE COMPARISON", "="*50, sep="\n")

# Print both metric dicts under their own headings, baseline first.
for _section_heading, _section_metrics in (
    ("\n--- Baseline Metrics ---", baseline_metrics),
    ("\n--- Final Fine-Tuned Metrics ---", final_metrics),
):
    print(_section_heading)
    print(json.dumps(_section_metrics, indent=2))
print("\n" + "="*50)

print("\n✅✅✅ Experiment Complete! ✅✅✅")


PERFORMANCE COMPARISON

--- Baseline Metrics ---
{
  "mean_rigorous_score": 0.4741506646971935,
  "total_samples": 677,
  "tag_parse_failures": 4,
  "failure_rate": 0.005908419497784343
}

--- Final Fine-Tuned Metrics ---
{
  "mean_rigorous_score": 0.948301329394387,
  "total_samples": 677,
  "tag_parse_failures": 0,
  "failure_rate": 0.0
}


✅✅✅ Experiment Complete! ✅✅✅


In [20]:
from unsloth import FastModel
from peft import PeftModel
import torch

print("--- Reloading fine-tuned model from local adapter checkpoint ---")

# --- 1. Define the paths from the CONFIG dictionary ---
base_model_name = CONFIG["model_name"]
adapter_path = CONFIG["final_adapter_dir"]

# --- 2. Load the 4-bit base model and tokenizer ---
# Reuse the notebook's shared loader so the reloaded tokenizer gets the same
# gemma-3 chat template, pad-token fallback and left padding as the tokenizer
# used for training and evaluation. No special tokens are added.
print(f"Loading base model: {base_model_name}")
model, tokenizer = load_unsloth_model_and_tokenizer()

# --- 3. Attach the saved LoRA adapters ---
# PeftModel.from_pretrained wraps the base model with the adapter weights; it does
# NOT merge them into the base weights.
print(f"Applying LoRa adapters from: {adapter_path}")
model = PeftModel.from_pretrained(model, adapter_path)

print("\n✅ Model successfully reloaded from checkpoint.")
print("You can now proceed with inference or pushing to the Hub.")

--- Reloading fine-tuned model from local adapter checkpoint ---
Loading base model: unsloth/gemma-3-1b-it-unsloth-bnb-4bit
==((====))==  Unsloth 2025.8.4: Fast Gemma3 patching. Transformers: 4.55.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




Applying LoRa adapters from: /content/experiments/equation_extraction_gemma3-1b-unsloth_20250810_2013/final_adapter

✅ Model successfully reloaded from checkpoint.
You can now proceed with inference or pushing to the Hub.


In [21]:
# ===================================================================
# CELL 11: PUSH ADAPTERS TO HUGGING FACE HUB
# ===================================================================

# --- Configuration ---
# Hugging Face username and target repo name for the adapter upload
hf_username = "arvindsuresh-math"
hf_repo_name = "gemma-3-1b-equation-line-extractor-aug-10"
commit_message = "Fine-tuned with Unsloth on equation extraction dataset"

# --- Login to Hugging Face ---
# Reuses `hf_token`, read from Colab Secrets in the first cell of the notebook.
from huggingface_hub import login
from google.colab import userdata

# hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)

# --- Push the LoRA adapters ---
# The 'model' object currently in memory is the adapter-wrapped model, so only the
# adapter weights (plus the tokenizer files) are uploaded.
# The token is passed explicitly through `token=`; the older `use_auth_token=True`
# argument is deprecated.
hub_repo_id = f"{hf_username}/{hf_repo_name}"
print(f"Pushing LoRa adapters to: {hf_username}/{hf_repo_name}")
model.push_to_hub(hub_repo_id, token=hf_token, commit_message=commit_message)
tokenizer.push_to_hub(hub_repo_id, token=hf_token, commit_message=commit_message)

print("✅ Adapters successfully pushed to the Hugging Face Hub.")

### How to use these adapters in your HF Space `app.py`:

# from unsloth import FastModel
# from peft import PeftModel
# import torch

# # Your chosen repo and the original base model
# adapter_repo = "your-hf-username/gemma-3-1b-equation-extractor-lora"
# base_model_name = "unsloth/gemma-3-1b-it-unsloth-bnb-4bit"

# # 1. Load the 4-bit base model
# model, tokenizer = FastModel.from_pretrained(
#     model_name = base_model_name,
#     max_seq_length = 2048,
#     dtype = None,
#     load_in_4bit = True,
# )

# # 2. Apply your fine-tuned adapters
# model = PeftModel.from_pretrained(model, adapter_repo)

# # Now the 'model' is ready for inference

Pushing LoRa adapters to: arvindsuresh-math/gemma-3-1b-equation-line-extractor-aug-10




README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...pkv0hvmk7/adapter_model.safetensors:   1%|1         |  561kB / 52.2MB            

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmp6gd_zsuo/tokenizer.json       : 100%|##########| 33.4MB / 33.4MB            

  /tmp/tmp6gd_zsuo/tokenizer.model      : 100%|##########| 4.69MB / 4.69MB            

No files have been modified since last commit. Skipping to prevent empty commit.


✅ Adapters successfully pushed to the Hugging Face Hub.


In [22]:
import zipfile
from pathlib import Path
import os

print("\n" + "="*50, "COMPRESSING RESULTS FOR DOWNLOAD", "="*50, sep="\n")

# Resolve locations from the global CONFIG
output_dir = Path(CONFIG["output_dir"])
experiment_id = CONFIG["experiment_id"]
adapter_path = Path(CONFIG["final_adapter_dir"])

# The archive is written inside the experiment directory itself
zip_path = output_dir / f"{experiment_id}_results.zip"

# Result artefacts to bundle (metrics, per-sample results, training log, config)
files_to_zip = [
    output_dir / "baseline_results" / "baseline_evaluation_results.csv",
    output_dir / "baseline_results" / "baseline_metrics.json",
    output_dir / "final_results" / "final_evaluation_results.csv",
    output_dir / "final_results" / "final_metrics.json",
    output_dir / "training_log.csv",
    output_dir / "config.json",
]

try:
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        print(f"Creating zip archive at: {zip_path}")
        for file_path in files_to_zip:
            if not file_path.exists():
                print(f"  - Skipping (not found): {file_path}")
                continue
            # Store each file under its path relative to the experiment directory,
            # so the folder layout (e.g. 'baseline_results/...') is kept in the zip.
            arcname = file_path.relative_to(output_dir)
            zipf.write(file_path, arcname)
            print(f"  - Adding: {arcname}")

    print(f"\n✅ Successfully created results zip archive at: {zip_path}")

except Exception as e:
    # Best-effort step: report the problem instead of aborting the notebook.
    print(f"\n❌ An error occurred while creating the zip file: {e}")


COMPRESSING RESULTS FOR DOWNLOAD
Creating zip archive at: /content/experiments/equation_extraction_gemma3-1b-unsloth_20250810_2013/equation_extraction_gemma3-1b-unsloth_20250810_2013_results.zip
  - Adding: baseline_results/baseline_evaluation_results.csv
  - Adding: baseline_results/baseline_metrics.json
  - Adding: final_results/final_evaluation_results.csv
  - Adding: final_results/final_metrics.json
  - Adding: training_log.csv
  - Adding: config.json

✅ Successfully created results zip archive at: /content/experiments/equation_extraction_gemma3-1b-unsloth_20250810_2013/equation_extraction_gemma3-1b-unsloth_20250810_2013_results.zip
