In [None]:
!pip install lightning



In [None]:
import pandas as pd
import numpy as np
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import lightning as pl
from sklearn.model_selection import train_test_split

from torch.optim import AdamW
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

def set_seed(seed):
  """Seed Python's `random`, NumPy and PyTorch (and every CUDA device, when available)."""
  for seeder in (random.seed, np.random.seed, torch.manual_seed):
    seeder(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:
class T5FineTuner(pl.LightningModule):
  """Lightning wrapper that fine-tunes a T5 conditional-generation model.

  The module keeps the train/validation datasets it is given and builds its
  own dataloaders, optimizer and linear warmup/decay schedule from `hparams`.

  Args:
    hparams: namespace read for model_name_or_path, tokenizer_name_or_path,
      learning_rate, weight_decay, adam_epsilon, warmup_steps,
      train_batch_size, eval_batch_size, num_train_epochs,
      gradient_accumulation_steps and n_gpu.
    train_data: dataset whose items are dicts with "source_ids",
      "source_mask", "target_ids" and "target_mask".
    val_data: dataset with the same item layout, used for validation.
  """

  def __init__(self, hparams, train_data, val_data):
    super(T5FineTuner, self).__init__()
    self.save_hyperparameters(hparams)
    self.train_dataset = train_data
    self.val_dataset = val_data
    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.model.train()
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

  def is_logger(self):
    """Return True when this process has global rank 0 (or lower)."""
    return self.trainer.global_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    """Delegate to the wrapped T5 model and return its output unchanged."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    """Compute the loss for one batch, ignoring padded target positions."""
    labels = batch["target_ids"].clone()
    # -100 marks positions that must not contribute to the loss.
    labels[labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch["target_mask"],
    )

    # The first element of the model output is the loss when labels are given.
    return outputs[0]

  def training_step(self, batch, batch_idx):
    """Return the training loss for `batch` and log it as an epoch-level metric."""
    # Go through _step so padded target positions are masked exactly as in
    # validation_step. Feeding raw target_ids as labels makes the loss also
    # score pad tokens, which leaves train_loss and val_loss incomparable.
    loss = self._step(batch)

    self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Return {"val_loss": loss} for `batch` and log the value as "val_loss"."""
    self.model.eval()
    loss = self._step(batch)
    self.log("val_loss", loss)
    return {"val_loss": loss}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    # Parameters whose name contains one of these fragments get no weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    # Kept on the instance because train_dataloader() builds the scheduler from it.
    self.opt = optimizer
    return [optimizer]

  def optimizer_step(self, epoch=None, batch_idx=None, optimizer=None, optimizer_closure=None,):
    """Run one optimizer step, clear gradients, then advance the LR schedule."""
    optimizer.step(optimizer_closure)
    optimizer.zero_grad()
    self.lr_scheduler.step()

  def get_tqdm_dict(self):
    """Return the average loss (3 decimals) and last learning rate for a progress bar."""
    # NOTE(review): reads `self.trainer.avg_loss`, which nothing in this file
    # sets - confirm the attribute exists before relying on this method.
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the shuffled training dataloader and, as a side effect, the LR scheduler."""
    train_dataset = self.train_dataset
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    # Total number of optimizer steps: batches per epoch, divided by the
    # accumulation factor, times the number of epochs.
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    # Relies on configure_optimizers() having run first so that self.opt exists.
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Build the (unshuffled) validation dataloader."""
    val_dataset = self.val_dataset
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Reports `trainer.callback_metrics` after validation and after testing."""

  # Keys excluded from the report.
  _SKIPPED_KEYS = ("log", "progress_bar")

  def _metric_lines(self, trainer):
    """Return one "name = value\\n" line per reported metric, sorted by name."""
    results = trainer.callback_metrics
    return [
        "{} = {}\n".format(key, str(results[key]))
        for key in sorted(results)
        if key not in self._SKIPPED_KEYS
    ]

  def on_validation_end(self, trainer, pl_module):
    """Log and print every callback metric once validation finishes."""
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      for line in self._metric_lines(trainer):
        logger.info(line)
        print(line)

  def on_test_end(self, trainer, pl_module):
    """Log every callback metric and write them to <output_dir>/test_results.txt."""
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      output_dir = pl_module.hparams.output_dir
      # The directory may not exist yet (e.g. when no checkpoint was written),
      # in which case open() below would fail.
      os.makedirs(output_dir, exist_ok=True)

      # Log and save results to file
      output_test_results_file = os.path.join(output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for line in self._metric_lines(trainer):
          logger.info(line)
          writer.write(line)

class PredictionCallback(pl.Callback):
    """Prints the model's prediction for a fixed example at the end of every training epoch.

    Args:
        tokenizer: tokenizer used to encode `example_text` and decode the output.
        example_text: the input text to generate a prediction for.
    """

    def __init__(self, tokenizer, example_text):
        self.tokenizer = tokenizer
        self.example_text = example_text

    def on_train_epoch_end(self, trainer, pl_module):
        """Generate from `example_text` with beam search and print the decoded result."""
        print(f"\n[Callback ✅] Epoch {trainer.current_epoch} — running prediction...\n")

        # Remember the current mode so the model is handed back to the trainer
        # unchanged; leaving it in eval mode would disable dropout for whatever
        # training follows this hook.
        was_training = pl_module.model.training
        pl_module.model.eval()
        try:
            input_ids = self.tokenizer(
                self.example_text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=100
            ).input_ids.to(pl_module.device)

            with torch.no_grad():
                output_ids = pl_module.model.generate(
                    input_ids=input_ids,
                    max_length=30,
                    do_sample=False,
                    num_beams=4,
                    early_stopping=True
                )
        finally:
            pl_module.model.train(was_training)

        decoded = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print(f"\n[🔎 EPOCH {trainer.current_epoch}] Prediction: {decoded}\n")


In [None]:
# Load the train / validation / test splits from CSV files in the working directory.
train_df = pd.read_csv('train_answerextracted.csv')
val_df = pd.read_csv('validation_answerextracted.csv')
test_df = pd.read_csv('test_answerextracted.csv')

def insert_spaces(formula):
    """Surround every '(', ')' and ',' with single spaces and trim the ends.

    Args:
        formula: formula string such as "add(1,2)".

    Returns:
        The formula with exactly one space around each bracket/comma, e.g.
        "add ( 1 , 2 )".
    """
    padded = re.sub(r'([(),])', r' \1 ', formula)
    # Collapse whole runs of spaces. A single str.replace("  ", " ") pass
    # leaves a double space behind whenever three or more spaces are adjacent
    # (e.g. when the input already had a space between two brackets).
    return re.sub(r' {2,}', ' ', padded).strip()

def remove_const(expression):
    """Drop the `const_` prefix from numeric constants, e.g. "const_100" -> "100"."""
    const_pattern = re.compile(r'const_([-0-9_.]+)')
    return const_pattern.sub(lambda match: match.group(1), expression)

# Operator names whose opening parenthesis is glued back onto the name.
ops = [
    'add', 'subtract', 'multiply', 'divide', 'power', 'sqrt', 'log', 'choose', 'speed',
    'volume_rectangular_prism', 'square_area', 'circle_area', 'circumface',
]

def fuse_operator_parens(expression, operators):
    """Remove whitespace between each listed operator name and its '('.

    Example: "add ( 1 , 2 )" becomes "add( 1 , 2 )" when "add" is in `operators`.
    Names that are not listed are left untouched.
    """
    def _fuse_one(text, name):
        return re.sub(rf'\b{name}\s*\(', f'{name}(', text)

    fused = expression
    for name in operators:
        fused = _fuse_one(fused, name)
    return fused

# Normalise the target formulas of the train and validation splits:
# space out brackets/commas, strip `const_` prefixes, then re-attach each
# known operator name to its opening parenthesis.
for split_df in (train_df, val_df):
    split_df['annotated_formula'] = (
        split_df['annotated_formula']
        .apply(insert_spaces)
        .apply(remove_const)
        .apply(lambda formula: fuse_operator_parens(formula, ops))
    )

In [None]:
# Tokenizer used by the following cells for length filtering, dataset encoding and inference.
tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')

class SATDataset(Dataset):
  """Pre-tokenized (problem, formula) pairs for seq2seq training.

  Every row of `data` is encoded once at construction time; inputs are padded
  or truncated to `max_len` tokens and targets to `target_max_len` tokens.

  Args:
    tokenizer: object exposing `batch_encode_plus` (e.g. a T5 tokenizer).
    data: DataFrame with a "Problem" column (input text) and an
      "annotated_formula" column (target text).
    max_len: fixed token length of the encoded inputs.
    target_max_len: fixed token length of the encoded targets.
  """

  def __init__(self, tokenizer, data,  max_len=100, target_max_len=30):
    self.data_column = "Problem"
    self.class_column = "annotated_formula"
    self.data = data

    self.max_len = max_len
    self.target_max_len = target_max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []

    self._build()

  def __len__(self):
    return len(self.inputs)

  def __getitem__(self, index):
    """Return the encoded ids and attention masks of example `index` as 1-D tensors."""
    # Each encoding was produced for a batch of one, so drop that leading axis.
    source_ids = self.inputs[index]["input_ids"].squeeze(0)
    target_ids = self.targets[index]["input_ids"].squeeze(0)

    src_mask    = self.inputs[index]["attention_mask"].squeeze(0)
    target_mask = self.targets[index]["attention_mask"].squeeze(0)

    return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

  def _build(self):
    """Tokenize every row of `self.data` into `self.inputs` / `self.targets`."""
    # Iterate by position so the frame does not need a 0..n-1 index; label
    # lookups with .loc fail (or pick the wrong rows) after filtering unless
    # the index was reset first.
    problems = self.data[self.data_column]
    formulas = self.data[self.class_column]
    for input_, target in zip(problems, formulas):
      input_ = input_ + ' '
      target = target + " "

      tokenized_inputs = self.tokenizer.batch_encode_plus(
          [input_], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
      )
      tokenized_targets = self.tokenizer.batch_encode_plus(
          [target], max_length=self.target_max_len, padding="max_length", truncation=True, return_tensors="pt"
      )
      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
def filter_by_token_length(df, tokenizer, max_target_tokens=30, max_source_tokens=100):
    """Keep only rows whose formula and problem fit the given token budgets.

    Adds a `count` column (token count of "annotated_formula") and a `count2`
    column (token count of "Problem"), drops rows exceeding either limit and
    returns a new frame with a fresh 0..n-1 index. The input frame is not
    modified.

    Args:
        df: DataFrame with "annotated_formula" and "Problem" text columns.
        tokenizer: object exposing `encode(text, truncation=False)`.
        max_target_tokens: largest allowed token count for the formula.
        max_source_tokens: largest allowed token count for the problem.
    """
    def n_tokens(text):
        return len(tokenizer.encode(text, truncation=False))

    # .assign builds new frames, so no column is ever written into a filtered
    # slice (which is what triggers pandas' SettingWithCopyWarning).
    kept = df.assign(count=df["annotated_formula"].apply(n_tokens))
    kept = kept[kept["count"] <= max_target_tokens]
    kept = kept.assign(count2=kept["Problem"].apply(n_tokens))
    kept = kept[kept["count2"] <= max_source_tokens]
    return kept.reset_index(drop=True)


train_df = filter_by_token_length(train_df, tokenizer)
val_df = filter_by_token_length(val_df, tokenizer)
test_df = filter_by_token_length(test_df, tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (2797 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Tokenize the filtered train / validation frames up front and report how many examples each holds.
train_dataset = SATDataset(tokenizer, train_df)
print(len(train_dataset))
val_dataset = SATDataset(tokenizer, val_df)
print(len(val_dataset))

15165
2296


In [None]:
# Hyperparameters handed to T5FineTuner (as a Namespace) and reused for the Trainer below.
args_dict = {
    "model_name_or_path": 'google/flan-t5-large',
    "tokenizer_name_or_path": 'google/flan-t5-large',
    "max_seq_length": 100,
    "learning_rate": 3e-5,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "num_train_epochs": 8,
    "gradient_accumulation_steps": 2,
    "n_gpu": 1,
    "early_stop_callback": False,
    "seed": 42,
    "output_dir": "t5_sat_generator",
}
args = argparse.Namespace(**args_dict)

# Checkpoints are ranked by validation loss (lower is better); the latest one is always kept too.
checkpoint_callback = pl.pytorch.callbacks.ModelCheckpoint(
    dirpath=args.output_dir,
    filename="checkpoint",
    monitor="val_loss",
    mode="min",
    save_top_k=5,
    save_last=True,
)

# Keyword arguments for pl.Trainer.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "accelerator": "gpu",
    "devices": 1,
    "max_epochs": args.num_train_epochs,
    "precision": 32,
    "gradient_clip_val": 1.0,
    "log_every_n_steps": 10,
}

In [None]:
# Release GPU memory held by a previous run before a new model is built.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
torch.cuda.reset_accumulated_memory_stats()
# `model` and `trainer` only exist when a training cell already ran in this
# session; drop them if present instead of failing with NameError on a fresh
# kernel (plain `del` raises when the name is undefined).
for _stale_name in ("model", "trainer"):
    globals().pop(_stale_name, None)
torch.cuda.empty_cache()

In [None]:
# Wrap the model, tokenizer and prepared datasets in the LightningModule.
model = T5FineTuner(args, train_dataset, val_dataset)

# Print metrics after every validation run and save checkpoints ranked by val_loss.
train_params["callbacks"] = [LoggingCallback(), checkpoint_callback]

trainer = pl.Trainer(**train_params)
trainer.fit(model)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:lightning.pytorch.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for p

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


val_loss = tensor(3.4432, device='cuda:0')



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

val_loss = tensor(0.5252, device='cuda:0')



Validation: |          | 0/? [00:00<?, ?it/s]

train_loss = tensor(1.9749, device='cuda:0')

val_loss = tensor(0.4248, device='cuda:0')



Validation: |          | 0/? [00:00<?, ?it/s]

train_loss = tensor(0.3790, device='cuda:0')

val_loss = tensor(0.3743, device='cuda:0')



Validation: |          | 0/? [00:00<?, ?it/s]

train_loss = tensor(0.3246, device='cuda:0')

val_loss = tensor(0.3406, device='cuda:0')



Validation: |          | 0/? [00:00<?, ?it/s]

train_loss = tensor(0.2910, device='cuda:0')

val_loss = tensor(0.3189, device='cuda:0')



Validation: |          | 0/? [00:00<?, ?it/s]

train_loss = tensor(0.2700, device='cuda:0')

val_loss = tensor(0.3026, device='cuda:0')



Validation: |          | 0/? [00:00<?, ?it/s]

train_loss = tensor(0.2548, device='cuda:0')

val_loss = tensor(0.2955, device='cuda:0')



Validation: |          | 0/? [00:00<?, ?it/s]

train_loss = tensor(0.2455, device='cuda:0')

val_loss = tensor(0.2924, device='cuda:0')



INFO: `Trainer.fit` stopped: `max_epochs=8` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=8` reached.


In [None]:
print(model.trainer.callback_metrics)

{'val_loss': tensor(0.2924), 'train_loss': tensor(0.2396)}


In [None]:
loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
it = iter(loader)
batch = next(it)
model.model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [None]:
def output_formula(model, tokenizer, problem):
    """Generate the formula string for one problem with the fine-tuned model.

    Args:
        model: wrapper exposing the underlying seq2seq model as `model.model`
            (here: the T5FineTuner instance).
        tokenizer: tokenizer used to encode `problem` and decode the output.
        problem: the problem text.

    Returns:
        The decoded generation with special tokens removed.
    """
    generator = model.model
    # Follow the device the weights currently live on instead of assuming CUDA.
    input_ids = tokenizer(
        problem, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).input_ids.to(generator.device)

    # Generate in eval mode so dropout cannot perturb the output, then hand the
    # model back in whatever mode it was in.
    was_training = generator.training
    generator.eval()
    try:
        with torch.no_grad():
            output_ids = generator.generate(
                input_ids,
                min_length=5,
                max_length=1000,
                do_sample=False,
                num_beams=2,
                early_stopping=False,
            )
    finally:
        generator.train(was_training)

    classification = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return classification

In [None]:
easy = 'the sum of the fourth and twelfth term of an arithmetic progression is 30 . what is the sum of the first 12 terms of the arithmetic progression ?'

output_formula(model, tokenizer, easy)

'divide( add( 30 , 12 ) , 12 )'

In [None]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# model.model.push_to_hub("andrewyw/mathsolver")
# model.tokenizer.push_to_hub("andrewyw/mathsolver")

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/andrewyw/mathsolver/commit/fb589bd0a69206cbdc4584f912cfccfb0cfb6ee9', commit_message='Upload tokenizer', commit_description='', oid='fb589bd0a69206cbdc4584f912cfccfb0cfb6ee9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/andrewyw/mathsolver', endpoint='https://huggingface.co', repo_type='model', repo_id='andrewyw/mathsolver'), pr_revision=None, pr_num=None)

In [None]:
model.model.push_to_hub("andrewyw/mathsolverprelim")
model.tokenizer.push_to_hub("andrewyw/mathsolverprelim")

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/andrewyw/mathsolverprelim/commit/84258f850ac31edd2c0a078f695e026fa36889d4', commit_message='Upload tokenizer', commit_description='', oid='84258f850ac31edd2c0a078f695e026fa36889d4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/andrewyw/mathsolverprelim', endpoint='https://huggingface.co', repo_type='model', repo_id='andrewyw/mathsolverprelim'), pr_revision=None, pr_num=None)

In [None]:
# Apply to the test targets the same three normalisation steps used for the train/validation targets.
test_df['annotated_formula'] = test_df['annotated_formula'].apply(insert_spaces)
test_df['annotated_formula'] = test_df['annotated_formula'].apply(remove_const)
test_df['annotated_formula'] = test_df['annotated_formula'].apply(lambda x: fuse_operator_parens(x, ops))

In [None]:
from difflib import SequenceMatcher

part_df = test_df.sample(n=100, random_state=42)


def normalized_levenshtein(pred, truth):
    """Return the similarity of two strings as a float in [0, 1] (1.0 = identical).

    The value is `difflib.SequenceMatcher.ratio()`, i.e. a matching-blocks
    ratio rather than a true Levenshtein distance.
    """
    return SequenceMatcher(a=pred, b=truth).ratio()

# Generate a formula for every sampled problem, score it against the reference formula,
# and print the mean similarity over the sample.
part_df['prediction'] = part_df['Problem'].apply(lambda x: output_formula(model, tokenizer, x))
part_df['score'] = part_df.apply(lambda x: normalized_levenshtein(x['prediction'], x['annotated_formula']), axis=1)
print(part_df['score'].mean())

0.6855179651602796


In [None]:
closeish_df = part_df[part_df['score'] >= 0.8][['Problem', 'annotated_formula', 'prediction', 'score']]

In [None]:
len(part_df[part_df['score'] >= 0.95])

15

In [None]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

closeish_df

Unnamed: 0,Problem,annotated_formula,prediction,score
411,"solution x is 10 percent alcohol by volume , and solution y is 30 percent alcohol by volume . how many milliliters of solution y must be added to 150 milliliters of solution x to create a solution that is 25 percent alcohol by volume ?","multiply( divide( subtract( 25 , 10 ) , subtract( 30 , 25 ) ) , 150 )","multiply( divide( subtract( 25 , 10 ) , subtract( 30 , 25 ) ) , 150 )",1.0
1050,"two cyclist start on a circular track from a given point but in opposite direction with speeds of 7 m / s and 8 m / s . if the circumference of the circle is 360 meters , after what time will they meet at the starting point ?","divide( 360 , add( 8 , 7 ) )","divide( 360 , multiply( 8 , 7 ) )",0.819672
1444,an amount at compound interest sums to rs . 17640 / - in 2 years and to rs . 20286 / - in 3 years at the same rate of interest . find the rate percentage ?,"multiply( divide( subtract( 20286 , 17640 ) , 17640 ) , 100 )","multiply( divide( subtract( 20286 , 17640 ) , 17640 ) , 100 )",1.0
1244,a can complete a certain job in 16 days . b is 60 % more efficient than a . in how many days can b complete the same job ?,"divide( multiply( 16 , 60 ) , 100 )","divide( multiply( 16 , 60 ) , 100 )",1.0
629,"after 6 games , team b had an average of 65 points per game . if it got only 47 points in game 7 , how many more points does it need to score to get its total above 500 ?","subtract( 500 , add( multiply( 6 , 65 ) , 47 ) )","subtract( 500 , multiply( 6 , 47 ) )",0.857143
727,a train running at the speed of 110 km / hr crosses a pole in 9 sec . what is the length of the train ?,"multiply( divide( multiply( 110 , 1000 ) , 3600 ) , 9 )","multiply( divide( multiply( 110 , 1000 ) , 3600 ) , 9 )",1.0
864,"if a train , travelling at a speed of 90 kmph , crosses a pole in 6 sec , then the length of train is ?","multiply( multiply( 90 , 0_2778 ) , 6 )","multiply( divide( 6 , multiply( 90 , 0_2778 ) ) , 3600 )",0.821053
1631,"the average weight of 7 persons increases by 1.5 kg . if a person weighing 65 kg is replaced by a new person , what could be the weight of the new person ?","add( 65 , multiply( 7 , 1.5 ) )","add( multiply( 7 , 1.5 ) , 65 )",0.83871
453,how long does a truck of 200 m long traveling at 60 kmph takes to cross a bridge of 180 m in length ?,"divide( add( 200 , 180 ) , multiply( 60 , 0_2778 ) )","divide( add( 200 , 180 ) , multiply( 60 , 0_2778 ) )",1.0
584,"an agent , gets a commission of 2.5 % on the sales of cloth . if on a certain day , he gets rs . 12.50 as commission , the cloth sold through him on that day is worth","divide( 12.5 , divide( 2.5 , 100 ) )","divide( 12.50 , divide( 2.5 , 100 ) )",0.986301


In [None]:
part_df[['annotated_formula', 'prediction', 'score']]

Unnamed: 0,annotated_formula,prediction,score
1535,"divide( add( multiply( 2 , 285 ) , 5 ) , 3 )","subtract( 285 , multiply( 5 , 3 ) )",0.493827
983,"add( divide( 25 , 10 ) , 1 )","multiply( subtract( 25 , 1 ) , 1 )",0.571429
432,"multiply( divide( 80 , multiply( 400 , 2 ) ) , 100 )","multiply( divide( 80 , multiply( 400 , 2 ) ) , 100 )",0.981132
962,"subtract( add( add( 24 , 22 ) , 5 ) , 29 )","subtract( add( add( 24 , 22 ) , 5 ) , 29 )",0.976744
976,"multiply( 4 , 4 )","multiply( power( 3 , 3 ) , 2 )",0.638298
1492,"floor ( divide( 21 , 10 ) )","floor ( divide( 21 , 10 ) )",0.981818
1434,"multiply( multiply( 19 , 2 ) , divide( 19 , 2 ) )","add( lvt ( 19 , 2 ) , 2 )",0.526316
936,"divide( subtract( subtract( 280 , 180 ) , 36 ) , 2 )","divide( subtract( subtract( 280 , 180 ) , 36 ) , 2 )",0.981132
1537,"multiply( divide( 16 , 16 ) , 16 )","multiply( divide( 16 , 16 ) , 16 )",0.985507
566,"divide( 90 , add( 84 , add( 3 , 3 ) ) )","multiply( 500 , divide( 2 , add( 90 , 90 ) ) )",0.574713


In [None]:
from sympy import symbols, sympify, N

# Define known symbols
const_100 = symbols('const_100')

def evaluate_functional_expression(expr_str):
    """Translate a prefix formula into a parenthesised infix string.

    Example: "add( 8 , add( 4 , 2 ) )" -> "(8 + (4 + 2))".

    Only the binary operators add, subtract, multiply and divide are
    translated. In operands, a leading "const_" is dropped and remaining
    underscores are read as decimal points ("0_2778" -> "0.2778",
    "const_100" -> "100"); a leading minus sign is kept.

    Args:
        expr_str: the prefix formula. Whitespace is ignored.

    Returns:
        The infix expression, or "" when nothing could be translated.

    Raises:
        IndexError: if a supported operator is closed with fewer than two
            arguments.
    """
    infix_symbols = {"add": "+", "subtract": "-", "multiply": "*", "divide": "/"}
    stack = []
    token = ""

    def flush():
        """Push the pending token (if any) onto the stack and reset it."""
        nonlocal token
        if token:
            # The prefix must be stripped before underscores are converted,
            # otherwise "const_100" turns into "const.100" and never resolves.
            literal = token[len("const_"):] if token.startswith("const_") else token
            stack.append(literal.replace("_", "."))
            token = ""

    for char in expr_str:
        # '-' is part of a token so negative literals keep their sign.
        if char.isalnum() or char in "._-":
            token += char
        elif char == "(":
            flush()
        elif char == "," or char == ")":
            flush()

            if char == ")":
                # Collect everything pushed since the nearest supported operator.
                args = []
                while stack and stack[-1] not in infix_symbols:
                    args.append(stack.pop())
                args.reverse()

                if stack:
                    symbol = infix_symbols[stack.pop()]
                    stack.append(f"({args[0]} {symbol} {args[1]})")

    return stack[0] if stack else ""



def check_answer_numeric(input):
  """Convert a prefix formula to infix, parse it with sympy and return the simplified result."""
  infix_expression = evaluate_functional_expression(input)
  parsed = sympify(infix_expression, locals={'const_100': 100})
  return parsed.simplify()

def safe_check_answer_numeric(x):
    """Best-effort wrapper around check_answer_numeric: returns None when evaluation raises."""
    try:
        value = check_answer_numeric(x)
    except Exception:
        # Formulas that cannot be evaluated are reported as None.
        value = None
    return value

In [None]:
part_df['annotated_formula'][0]

'subtract( divide( multiply( add( 100 , 31.1 ) , 100 ) , subtract( 100 , 5 ) ) , 100 )'

In [None]:
check_answer_numeric('add( 8 , add( 4 , 2 ) )')

(8 + (4 + 2))


14

In [None]:
# Evaluate each formula to a number; formulas that cannot be evaluated become None.
# Note: this evaluates the reference `annotated_formula` column, not `prediction`.
test_ans = part_df['annotated_formula'].apply(safe_check_answer_numeric)

test_np = np.array(test_ans)
# Turn the sympy results into floats, mapping None to NaN.
test_np_float = np.array([float(N(p)) if p is not None else np.nan for p in test_np], dtype=np.float64)

ans_np = np.array(part_df['answer_numeric'])

# Compare against the expected numeric answers within a small tolerance.
matches = np.isclose(test_np_float, ans_np, rtol=1e-4, atol=1e-6)
accuracy = np.sum(matches) / len(ans_np)

print(f"Accuracy: {accuracy:.2%}")

Accuracy: 46.00%


array([95/3, 38, 48, 14, 10, 15, 975, 40, None, 23, 22/25,
       50.9968802495800, 15, 9, 10, 716, 7, 36/31, 100.000000000000,
       2298.47494553377, None, 149/5, 1/36, 9.34579439252336, 200/7,
       54000, 50, -1700/27, 1095, 1800, 5.25000000000000, 441/5, 3, 8,
       None, -179, None, 24, 445, 21, 72, 3.00000000000000, 678, 28, 26,
       100/9, 250.032000000000, 8000, 175, 1/6, 12, 90, 300, None, 4, 7,
       6, 260, 720/13, None, 444.600000000000, None, -6, None, None,
       111/7, None, 870, 1000/7, None, 50, -3, 38.0000000000000, None, 2,
       1/3, 15, None, 6.17093489663684, 3/2, 7/4, None, 1540, 10, 4/3,
       None, 1, 52, 620, 6, 27, 83, 200/3, 54, 7/15, 14, 89.0000000000000,
       36, 250.000000000000, None], dtype=object)

In [None]:
ans_np

array([3.16700000e+01, 3.80000000e+01, 4.90000000e+01, 1.40000000e+01,
       1.00000000e+01, 1.50000000e+01, 3.15000000e+02, 4.00000000e+01,
       3.00000000e+00, 2.30000000e+01, 8.80000000e-01, 5.10000000e+01,
       1.60000000e+01, 9.00000000e+00, 5.00000000e+00, 7.30000000e+02,
       7.00000000e+00, 1.16129032e+00, 1.27000000e+02, 2.00000000e+00,
       1.07500000e+04, 2.98000000e+01, 6.60000000e+01, 8.75000000e+02,
       2.85700000e+01, 5.40000000e+01, 5.00000000e+01, 6.30000000e+01,
       1.09500000e+03, 1.80000000e+03, 5.60000000e+00, 8.82000000e+01,
       3.00000000e+00, 8.00000000e+00, 3.12500000e-01, 5.00000000e+00,
       6.00000000e+00, 2.40000000e+01, 4.45000000e+02, 2.10000000e+01,
       4.90000000e+01, 3.00000000e+00, 6.83000000e+02, 2.80000000e+01,
       2.60000000e+01, 1.11000000e+00, 2.50000000e+02, 8.00000000e+03,
       1.74000000e+02, 1.66666667e-01, 2.00000000e+01, 9.00000000e+01,
       5.00000000e+02, 1.20000000e+03, 3.00000000e+00, 7.00000000e+00,
      

In [None]:
# Continue training with a new Trainer, restoring state from the last saved checkpoint.
train_params["max_epochs"] = 8

trainer = pl.Trainer(**train_params)

trainer.fit(model, ckpt_path="t5_sat_generator/last.ckpt")

INFO: Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
INFO:lightning.pytorch.utilities.rank_zero:Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /content/t5_sat_generator exists and is not empty.
INFO: Restoring states from the checkpoint path at t5

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]


[Callback ✅] Epoch 4 — running prediction...


[🔎 EPOCH 4] Prediction: Let x be the number of eggs that Sally has, multiply( 5 , 5



Validation: |          | 0/? [00:00<?, ?it/s]


[Callback ✅] Epoch 5 — running prediction...


[🔎 EPOCH 5] Prediction: Let x be the number of eggs that Sally has. Multiply that by 5 .



Validation: |          | 0/? [00:00<?, ?it/s]


[Callback ✅] Epoch 6 — running prediction...


[🔎 EPOCH 6] Prediction: Let x be the number of eggs that Sally has. Multiply that by 5 .



Validation: |          | 0/? [00:00<?, ?it/s]


[Callback ✅] Epoch 7 — running prediction...


[🔎 EPOCH 7] Prediction: Let x be the number of eggs that Sally has, multiply( 5 , 5



INFO: `Trainer.fit` stopped: `max_epochs=8` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=8` reached.


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub repository that the model and tokenizer were pushed to earlier in this notebook.
model_fromhf = "andrewyw/mathsolverprelim"

# NOTE: these two lines rebind `tokenizer` and `model`. From here on `model` is the
# object returned by AutoModelForSeq2SeqLM, not the T5FineTuner instance used above.
tokenizer = AutoTokenizer.from_pretrained(model_fromhf)
model = AutoModelForSeq2SeqLM.from_pretrained(model_fromhf)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [None]:
def output_formula_pretrained(model, tokenizer, problem):
    """Generate the formula string for one problem with a model exposing `generate` directly.

    Args:
        model: seq2seq model (called as `model.generate(...)`).
        tokenizer: tokenizer used to encode `problem` and decode the output.
        problem: the problem text.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    encoded = tokenizer(problem, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Deterministic beam search producing between 10 and 100 tokens.
    generation_kwargs = dict(
        max_length=100,
        min_length=10,
        do_sample=False,
        num_beams=4,
        early_stopping=True,
    )

    with torch.no_grad():
        generated = model.generate(input_ids=encoded.input_ids, **generation_kwargs)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)


In [None]:
from difflib import SequenceMatcher

test_df['annotated_formula'] = test_df['annotated_formula'].apply(insert_spaces)
test_df['annotated_formula'] = test_df['annotated_formula'].apply(remove_const)
test_df['annotated_formula'] = test_df['annotated_formula'].apply(lambda x: fuse_operator_parens(x, ops))

part_df = test_df.sample(n=100, random_state=42)


def normalized_levenshtein(pred, truth):
    """Return the similarity of two strings as a float in [0, 1] (1.0 = identical).

    The value is `difflib.SequenceMatcher.ratio()`, i.e. a matching-blocks
    ratio rather than a true Levenshtein distance.
    """
    return SequenceMatcher(a=pred, b=truth).ratio()

# Generate a formula for every sampled problem with the reloaded model, score it against
# the reference formula, and print the mean similarity over the sample.
part_df['prediction'] = part_df['Problem'].apply(lambda x: output_formula_pretrained(model, tokenizer, x))
part_df['score'] = part_df.apply(lambda x: normalized_levenshtein(x['prediction'], x['annotated_formula']), axis=1)
print(part_df['score'].mean())

In [None]:
closeish_df = part_df[part_df['score'] >= 0.9][['Problem', 'annotated_formula', 'prediction', 'score']]
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

closeish_df