Install dependencies and import necessary libraries

In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install unsloth
# Get latest Unsloth
!pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [69]:
from sympy import symbols, sympify, N
import re as regex
from unsloth import FastLanguageModel
import torch
from google.colab import drive
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset
from transformers import AutoTokenizer
from datasets import load_dataset

All of the Sympy functions for parsing the formulas are below:

In [65]:
def insert_spaces(formula):
    return regex.sub(r'([(),])', r' \1 ', formula).replace("  ", " ").strip()

def remove_const(expression):
    return regex.sub(r'const_([-0-9_.]+)', r'\1', expression)

ops = ['add', 'subtract', 'multiply', 'divide', 'power', 'sqrt', 'log', 'choose', 'speed',
       'volume_rectangular_prism', 'square_area', 'circle_area', 'circumface']

def fuse_operator_parens(expression, operators = ['add', 'subtract', 'multiply', 'divide', 'power', 'sqrt', 'log', 'choose', 'speed',
       'volume_rectangular_prism', 'square_area', 'circle_area', 'circumface']):
    for op in operators:
        expression = regex.sub(rf'\b{op}\s*\(', f'{op}(', expression)
    return expression

In [66]:
# Define known symbols
const_100 = symbols('const_100')

def evaluate_functional_expression(expr_str):
    stack = []
    num_buffer = ""
    i = 0
    while i < len(expr_str):
        char = expr_str[i]

        # Accumulate alphanumeric + underscores + decimals
        if char.isalnum() or char == '.':
            num_buffer += char
        elif char == "_":
            num_buffer += '.'
        elif char == "(":
            if num_buffer:
                if num_buffer.startswith("const_"):
                    const_value = num_buffer.replace("const_", "").replace("_", ".")
                    stack.append(const_value)
                else:
                    stack.append(num_buffer)
                num_buffer = ""

        elif char == "," or char == ")":
            if num_buffer:
                if num_buffer.startswith("const_"):
                    const_value = num_buffer.replace("const_", "").replace("_", ".")
                    stack.append(const_value)
                else:
                    stack.append(num_buffer)
                num_buffer = ""

            if char == ")":
                args = []
                known_funcs = {"add", "subtract", "multiply", "divide", "power", "sqrt", "log", "choose", "speed", "volume_rectangular_prism", "square_area", "circle_area", "circumface"}
                while stack and stack[-1] not in known_funcs:
                    args.append(stack.pop())
                args.reverse()

                if stack:
                    func = stack.pop()
                    if func == "add":
                        result = f"({args[0]} + {args[1]})"
                    elif func == "subtract":
                        result = f"({args[0]} - {args[1]})"
                    elif func == "multiply":
                        result = f"({args[0]} * {args[1]})"
                    elif func == "divide":
                        result = f"({args[0]} / {args[1]})"
                    elif func == "power":
                        result = f"({args[0]} ** {args[1]})"
                    elif func == "sqrt":
                        result = f"sqrt({args[0]})"
                    elif func == "log":
                        result = f"log({args[0]})"
                    elif func == "choose":
                        result = f"(({args[0]}!) / ({args[1]}! * ({args[0]} - {args[1]})!))"
                    elif func == "speed":
                        result = f"({args[0]} / {args[1]})"
                    elif func == "volume_rectangular_prism":
                        result = f"({args[0]} * {args[1]} * {args[2]})"
                    elif func == "square_area":
                        result = f"({args[0]} ** 2)"
                    elif func == "circle_area":
                        result = f"pi * ({args[0]} ** 2)"
                    elif func == "circumface":
                        result = f"2 * pi * {args[0]}"
                    else:
                        result = ""
                    stack.append(result)

        i += 1

    return stack[0] if stack else ""



def check_answer_numeric(input):
  math_expr = evaluate_functional_expression(input)
  sympy_expr = sympify(math_expr, locals={'const_100': 100})
  return N(sympy_expr.simplify())

def safe_check_answer_numeric(x):
    try:
        return check_answer_numeric(x)
    except Exception as e:
        return None

## Training the model

This is the training setup and loop for the tinyllama model. We ended up stopping the training loop in the middle and restarting it which is why there are 2 separate trainer.train() calls.

In [67]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
CSV_PATH = "/content/drive/MyDrive/train.csv"
MAX_SEQ_LENGTH = 2048
# 1. Load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/tinyllama-bnb-4bit",
    max_seq_length=2048,
    load_in_4bit=True,
)

# === PREPARE LoRA ===
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=["q_proj", "v_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=42,
)

# === LOAD DATASET ===
dataset = load_dataset('csv', data_files='/content/drive/MyDrive/train_answerextracted.csv', split='train')

# === FORMAT PROMPTS ===
math_prompt = """### Math Word Problem:
{}

### Extract the formula used to solve it:
{}"""

def formatting_prompts_func(examples):
    problems = examples["Problem"]
    formulas = examples["annotated_formula"]
    texts = [
        math_prompt.format(p, f) + tokenizer.eos_token
        for p, f in zip(problems, formulas)
    ]
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

# === TOKENIZE with truncation ===
tokenized = dataset.map(
    lambda x: tokenizer(x["text"], truncation=True, max_length=MAX_SEQ_LENGTH),
    batched=True,
    remove_columns=dataset.column_names
)

# === TRAINING ARGS ===
training_args = TrainingArguments(
    output_dir="./formula-model",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    num_train_epochs=3,
    logging_steps=10,
    fp16=False,
    bf16=True,
    save_strategy="epoch",
    report_to="none",
)

# === TRAINER ===
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized,
    max_seq_length=MAX_SEQ_LENGTH,
    packing=False,
    args=training_args,
)

# === START TRAINING ===
trainer.train()

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/29837 [00:00<?, ? examples/s]

Map:   0%|          | 0/29837 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 29,837 | Num Epochs = 3 | Total steps = 11,190
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 9,011,200/4,000,000,000 (0.23% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.7648
20,2.7896
30,2.686
40,2.5289
50,2.5432
60,2.3516
70,2.3604
80,2.1017
90,1.976
100,1.9691


KeyboardInterrupt: 

In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 29,837 | Num Epochs = 1 | Total steps = 3,729
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 25,231,360/4,000,000,000 (0.63% trained)


Step,Training Loss
1,1.1088
2,1.2477
3,1.1831
4,1.1075
5,0.9311
6,1.3025
7,1.0677
8,1.0469
9,1.007
10,1.0163


KeyboardInterrupt: 

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

5034.6413 seconds used for training.
83.91 minutes used for training.
Peak reserved memory = 13.508 GB.
Peak reserved memory for training = 12.629 GB.
Peak reserved memory % of max memory = 91.592 %.
Peak reserved memory for training % of max memory = 85.632 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [None]:
FastLanguageModel.for_inference(model)

prompt = """### Math Word Problem:
there are 10 girls and 20 boys in a classroom . what is the ratio of girls to boys ?

### Extract the formula used to solve it:
"""

inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    use_cache=True,
    temperature=0.0
)

decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
formula = decoded.split("Extract the formula used to solve it:")[-1].strip()

print("🧮 Extracted formula:", formula)


🧮 Extracted formula: divide( 10 , 20 )


In [None]:
FastLanguageModel.for_inference(model)

prompt = """### Math Word Problem:
a right triangle is inscribed in a circle . the legs of the triangle have lengths 6 and 8 . what is the diameter of the circle ?

### Extract the formula used to solve it:
"""

inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    use_cache=True,
    temperature=0.0
)

decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
formula = decoded.split("Extract the formula used to solve it:")[-1].strip()

print("🧮 Extracted formula:", formula)


🧮 Extracted formula: divide( 8 , 2 )


 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    math_prompt.format(
        "the banker ' s gain of a certain sum due 3 years hence at 10 % per annum is rs . 36 . what is the present worth ?", # instruction
        "a ) rs . 400 , b ) rs . 300 , c ) rs . 500 , d ) rs . 350 , e ) none of these", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<s> ### Math Word Problem:
the banker ' s gain of a certain sum due 3 years hence at 10 % per annum is rs . 36 . what is the present worth ?

### Extract the formula used to solve it:
a ) rs . 400 , b ) rs . 300 , c ) rs . 500 , d ) rs . 350 , e ) none of these</s>


## Saving and Loading the Model

In [None]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving


('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.model',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

In [19]:
# Load tokenizer from saved directory
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/lora_model_updated")

# Load base model (must match the one used during training)
base_model_name = "unsloth/tinyllama-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = base_model_name,
    max_seq_length = 2048,
    dtype = None,          # or "auto"
    load_in_4bit = True,
)

# Load the LoRA weights
model.load_adapter("/content/drive/MyDrive/lora_model_updated")  # This merges the saved LoRA adapter into the model

==((====))==  Unsloth 2025.5.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Testing Normalized Levenshtein and Accuracy

In [12]:
def output_formula_pretrained(model, problem):
    FastLanguageModel.for_inference(model)

    input_text = math_prompt.format(problem, "")
    inputs = tokenizer([input_text], return_tensors="pt").to("cuda")

    output_ids = model.generate(**inputs, max_new_tokens=128)

    # Slice out only the newly generated tokens
    generated_ids = output_ids[0][inputs['input_ids'].shape[1]:]
    output_str = tokenizer.decode(generated_ids, skip_special_tokens=True)

    return output_str


In [58]:
import pandas as pd
from difflib import SequenceMatcher
import regex as re
test_df = pd.read_csv('/content/drive/MyDrive/test_answerextracted.csv')

test_df['annotated_formula'] = test_df['annotated_formula'].apply(insert_spaces)
test_df['annotated_formula'] = test_df['annotated_formula'].apply(remove_const)
test_df['annotated_formula'] = test_df['annotated_formula'].apply(lambda x: fuse_operator_parens(x, ops))

part_df = test_df.sample(n=20)


def normalized_levenshtein(pred, truth):
    ratio = SequenceMatcher(None, pred, truth).ratio()
    return ratio

part_df['prediction'] = part_df['Problem'].apply(lambda x: output_formula_pretrained(model, x))
part_df['score'] = part_df.apply(lambda x: normalized_levenshtein(x['prediction'], x['annotated_formula']), axis=1)
print(part_df['score'].mean())

0.5306555295514903


In [59]:
part_df

Unnamed: 0,Problem,Rationale,options,correct,annotated_formula,linear_formula,category,answer_raw,answer_numeric,prediction,score
600,average of 15 results is 43 . if the average o...,option ' c ',"a ) 41 , b ) 39 , c ) 43 , d ) 45 , e ) 47",c,"subtract( multiply( 15 , 43 ) , add( multiply(...","multiply(n0,n1)|multiply(n2,n3)|multiply(n2,n5...",general,43,43.0,"add( multiply( 43 , 7 ) , multiply( 41 , 7 ) )",0.634921
812,"p and q started a business investing rs . 48,0...","""p : q = 48000 : 24000 = 2 : 1 . answer : c""","a ) 2 : 6 , b ) 2 : 3 , c ) 2 : 1 , d ) 17 : 9...",c,"divide( add( multiply( add( add( 2 , 3 ) , 3 )...","add(n2,const_3)|add(#0,const_3)|multiply(n2,#0...",gain,2 : 1,2.0,"divide( multiply( 48 , 2 ) , multiply( 24 , 2 ) )",0.416667
2816,how many seconds will a train 100 meters long ...,"""d = 100 + 150 = 250 s = 36 * 5 / 18 = 10 mps ...","a ) 2 , b ) 28 , c ) 25 , d ) 99 , e ) 12",c,"divide( add( 150 , 100 ) , multiply( 36 , 0_27...","add(n0,n1)|multiply(n2,const_0_2778)|divide(#0...",physics,25,25.0,"divide( 150 , multiply( 36 , 0_2778 ) )",0.857143
2389,"if n is a prime number greater than 17 , what ...","""there are several algebraic ways to solve thi...","a ) 0 , b ) 1 , c ) 2 , d ) 3 , e ) 5",b,"subtract( power( add( 17 , 2 ) , 2 ) , multipl...","add(n0,n1)|multiply(n2,const_4)|power(#0,n1)|s...",general,1,1.0,"divide( 12 , 2 )",0.293333
1378,a certain ball team has an equal number of rig...,"""say the total number of players is 18 , 9 rig...","a ) 1 / 3 , b ) 5 / 1 , c ) 5 / 7 , d ) 7 / 5 ...",b,"divide( subtract( divide( 1 , 2 ) , subtract( ...","divide(const_1,const_2)|divide(const_1,const_3...",general,5 / 1,5.0,"divide( 1 , add( divide( 1 , 3 ) , divide( 1 ,...",0.054237
1281,"a train 300 m long , running with a speed of 5...","""speed = 54 * 5 / 18 = 15 m / sec time taken =...","a ) 17 sec , b ) 16 sec , c ) 20 sec , d ) 14 ...",c,"multiply( divide( 300 , multiply( 54 , 1000 ) ...","multiply(n1,const_1000)|divide(n0,#0)|multiply...",physics,20 sec,20.0,"divide( 300 , multiply( 54 , 0_2778 ) )",0.715789
303,john and steve are speed walkers in a race . j...,"""let t be the time that john spent for his fin...","a ) 13 seconds , b ) 17 seconds , c ) 24 secon...",c,"divide( add( divide( multiply( 3.7 , add( 10 ,...","add(n0,n3)|subtract(n1,n2)|multiply(n2,#0)|div...",physics,24 seconds,24.0,"divide( 2 , subtract( divide( 10 , 4.2 ) , div...",0.435294
2486,albert is 2 times mary ’ s age and 4 times as ...,"""a = 2 m = m + 12 m = 12 a = 24 a = 4 b , and ...","a ) 6 , b ) 12 , c ) 10 , d ) 15 , e ) 18",a,"divide( multiply( 2 , 12 ) , 4 )","multiply(n0,n2)|divide(#0,n1)|",general,6,6.0,"multiply( divide( subtract( 12 , 4 ) , 2 ) , 2 )",0.5
834,a retailer marks her goods in such a way that ...,let cost price = x profit = y selling price = ...,"a ) 66.67 % , b ) 33.33 % , c ) 40 % , d ) 25 ...",a,"multiply( subtract( divide( 50 , subtract( 50 ...","subtract(n0,n1)|divide(n0,#0)|subtract(#1,cons...",gain,66.67 %,66.67,"divide( subtract( 100 , 50 ) , subtract( 100 ,...",0.644628
784,a bus 75 m long is running with a speed of 21 ...,"""speed of bus relative to woman = 21 + 3 = 24 ...","a ) 5.75 , b ) 7.62 , c ) 11.25 , d ) 4.25 , e...",c,"divide( divide( multiply( 75 , 3600 ) , add( 2...","add(n1,n2)|multiply(n0,const_3600)|divide(#1,#...",physics,11.25,11.25,"divide( multiply( 75 , 21 ) , 3 )",0.639175


In [60]:
import numpy as np
import numbers
test_ans = part_df['prediction'].apply(safe_check_answer_numeric)

test_np = np.array(test_ans)
test_np_float = np.array([float(p) if (p is not None and isinstance(p, numbers.Number)) else np.nan for p in test_np], dtype=np.float64)
print(len(test_np_float))
ans_np = np.array(part_df['answer_numeric'])
print(test_np_float)
print(ans_np)
matches = np.isclose(test_np_float, ans_np, rtol=1e-4, atol=1e-6)
accuracy = np.sum(matches) / len(ans_np)

part_df['ans_predicted'] = test_np_float

print(f"Accuracy: {accuracy:.2%}")

20
[ 5.88000000e+02  2.00000000e+00  1.49988001e+01  6.00000000e+00
  1.50000000e+00  1.99984001e+01  1.44827586e+00  8.00000000e+00
  6.25000000e-01  5.25000000e+02  5.00000000e+01  6.66666667e-01
  8.00000000e+01  4.00000000e+00  1.50000000e+01  2.20000000e+01
  8.75000000e+01  3.30000000e+01 -1.66666667e-01  3.23333333e+00]
[4.30000000e+01 2.00000000e+00 2.50000000e+01 1.00000000e+00
 5.00000000e+00 2.00000000e+01 2.40000000e+01 6.00000000e+00
 6.66700000e+01 1.12500000e+01 2.70000000e+01 4.00000000e-01
 9.00000000e+01 4.62962963e-03 5.50000000e+00 9.00000000e+01
 2.50000000e+02 8.00000000e+01 1.80000000e+03 1.30000000e+01]
Accuracy: 10.00%
