In [1]:
!pip install lightning

Collecting lightning
  Downloading lightning-2.5.1-py3-none-any.whl.metadata (39 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.7.1-py3-none-any.whl.metadata (21 kB)
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.5.1-py3-none-any.whl.metadata (20 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<4.0,>=2.1.0->lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<4.0,>=2.1.0->lightning)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<4.0,>=2.1.0->lightning)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Co

In [2]:
import pandas as pd
import numpy as np
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import lightning as pl
from sklearn.model_selection import train_test_split

from torch.optim import AdamW
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [88]:
class T5FineTuner(pl.LightningModule):
  def __init__(self, hparams, train_data, val_data):
    super(T5FineTuner, self).__init__()
    self.save_hyperparameters(hparams)
    self.train_dataset = train_data
    self.val_dataset = val_data
    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.model.train()
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
    #self.train_losses = []
    #self.val_losses = []

  def is_logger(self):
    return self.trainer.global_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels = labels,
    )

  def _step(self, batch):
    labels = batch["target_ids"]
    labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels= labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]
    return loss

  def training_step(self, batch, batch_idx):
    loss = self._step(batch)
    tensorboard_logs = {"train_loss": loss}
    #print(f"Model in training mode? {self.model.training}")
    #self.train_losses.append(loss)
    return {"loss": loss}

  def validation_step(self, batch, batch_idx):
    self.model.eval()
    loss = self._step(batch)
    tensorboard_logs = {"val_loss": loss}
    #self.val_losses.append(loss)
    self.log("val_loss", loss)
    return {"val_loss": loss}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]

  def optimizer_step(self, epoch=None, batch_idx=None, optimizer=None, optimizer_closure=None,):
    optimizer.step(optimizer_closure)
    optimizer.zero_grad()
    self.lr_scheduler.step()

  def get_tqdm_dict(self):
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    train_dataset = self.train_dataset
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    val_dataset = self.val_dataset
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  def on_validation_end(self, trainer, pl_module):
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      metrics = trainer.callback_metrics
      # Log results
      for key in sorted(metrics):
        if key not in ["log", "progress_bar"]:
          logger.info("{} = {}\n".format(key, str(metrics[key])))

  def on_test_end(self, trainer, pl_module):
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      metrics = trainer.callback_metrics

      # Log and save results to file
      output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for key in sorted(metrics):
          if key not in ["log", "progress_bar"]:
            logger.info("{} = {}\n".format(key, str(metrics[key])))
            writer.write("{} = {}\n".format(key, str(metrics[key])))

class PredictionCallback(pl.Callback):
    def __init__(self, tokenizer, example_text):
        self.tokenizer = tokenizer
        self.example_text = example_text

    def on_train_epoch_end(self, trainer, pl_module):  # 🔥 use `on_train_epoch_end` not `on_epoch_end`
        print(f"\n[Callback ✅] Epoch {trainer.current_epoch} — running prediction...\n")

        pl_module.model.eval()

        input_ids = self.tokenizer(
            self.example_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).input_ids.to(pl_module.device)

        with torch.no_grad():
            output_ids = pl_module.model.generate(
                input_ids=input_ids,
                max_length=50,
                do_sample=False,
                num_beams=4,
                early_stopping=True
            )

        decoded = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print(f"\n[🔎 EPOCH {trainer.current_epoch}] Prediction: {decoded}\n")


In [170]:
train_df = pd.read_csv('train_answerextracted.csv')
val_df = pd.read_csv('validation_answerextracted.csv')
test_df = pd.read_csv('test_answerextracted.csv')

def insert_spaces(formula):
    return re.sub(r'([(),])', r' \1 ', formula).replace("  ", " ").strip()

def remove_const(expression):
    return re.sub(r'const_([-0-9_.]+)', r'\1', expression)

ops = ['add', 'subtract', 'multiply', 'divide', 'power', 'sqrt', 'log', 'choose', 'speed',
       'volume_rectangular_prism', 'square_area', 'circle_area', 'circumface']

def fuse_operator_parens(expression, operators):
    for op in operators:
        expression = re.sub(rf'\b{op}\s*\(', f'{op}(', expression)
    return expression

train_df['annotated_formula'] = train_df['annotated_formula'].apply(insert_spaces)
val_df['annotated_formula'] = val_df['annotated_formula'].apply(insert_spaces)

train_df['annotated_formula'] = train_df['annotated_formula'].apply(remove_const)
val_df['annotated_formula'] = val_df['annotated_formula'].apply(remove_const)

train_df['annotated_formula'] = train_df['annotated_formula'].apply(lambda x: fuse_operator_parens(x, ops))
val_df['annotated_formula'] = val_df['annotated_formula'].apply(lambda x: fuse_operator_parens(x, ops))

In [172]:
tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-base')

class SATDataset(Dataset):
  def __init__(self, tokenizer, data,  max_len=512):
    self.data_column = "Problem"
    self.class_column = "annotated_formula"
    self.data = data

    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []

    self._build()

  def __len__(self):
    return len(self.inputs)

  def __getitem__(self, index):
    source_ids = self.inputs[index]["input_ids"].squeeze()
    target_ids = self.targets[index]["input_ids"].squeeze()

    src_mask    = self.inputs[index]["attention_mask"].squeeze()
    target_mask = self.targets[index]["attention_mask"].squeeze()

    return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

  def _build(self):
    for idx in range(len(self.data)):
      input_, target = self.data.loc[idx, self.data_column], self.data.loc[idx, self.class_column]

      input_ = input_ + ' '
      target = target + " "

      tokenized_inputs = self.tokenizer.batch_encode_plus(
          [input_], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
      )
      tokenized_targets = self.tokenizer.batch_encode_plus(
          [target], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
      )
      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)

In [173]:
train_dataset = SATDataset(tokenizer, train_df)
print(len(train_dataset))
val_dataset = SATDataset(tokenizer, val_df)
print(len(val_dataset))

29649
4454


In [175]:
args_dict = dict(
    output_dir="t5_brainrot_classifier",
    model_name_or_path='google/flan-t5-base',
    tokenizer_name_or_path='google/flan-t5-base',
    max_seq_length=512,
    learning_rate=8e-5,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=4,
    eval_batch_size=4,
    num_train_epochs=4,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    seed=42,
)

args = argparse.Namespace(**args_dict)

checkpoint_callback = pl.pytorch.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, filename="checkpoint", monitor="val_loss", mode="min", save_top_k=5, save_last=True
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    accelerator = "gpu",
    devices=args.n_gpu,
    max_epochs=args.num_train_epochs,
    precision=16
)

In [176]:
model = T5FineTuner(args, train_dataset, val_dataset)

sample_problem = "Sally has 5 times as many eggs as Bob. Bob has 5 eggs. How many eggs does Sally have?"
prediction_cb = PredictionCallback(tokenizer, sample_problem)
train_params["callbacks"] = [LoggingCallback(), prediction_cb, checkpoint_callback]

trainer = pl.Trainer(**train_params)
trainer.fit(model)

/usr/local/lib/python3.11/dist-packages/lightning/fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /content/t5_brainrot_classifier exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]


[Callback ✅] Epoch 0 — running prediction...


[🔎 EPOCH 0] Prediction: Bob has 5 eggs so Sally has 5 * 5 = 30 eggs. Bob has 5 eggs so Sally has 30 + 5 = 35 eggs. Sally has 5 times as many eggs as Bob so Sally has 35 / 5 =



Validation: |          | 0/? [00:00<?, ?it/s]


[Callback ✅] Epoch 1 — running prediction...


[🔎 EPOCH 1] Prediction: Bob has 5 eggs so Sally has 5 * 5 = 30 eggs. Bob has 5 eggs so Sally has 30 + 5 = 35 eggs. Sally has 5 times as many eggs as Bob so Sally has 35 / 5 =



Validation: |          | 0/? [00:00<?, ?it/s]


[Callback ✅] Epoch 2 — running prediction...


[🔎 EPOCH 2] Prediction: Bob has 5 eggs so Sally has 5 * 5 = 30 eggs. Bob has 5 eggs so Sally has 30 + 5 = 35 eggs. Sally has 5 times as many eggs as Bob so Sally has 35 / 5 =



Validation: |          | 0/? [00:00<?, ?it/s]


[Callback ✅] Epoch 3 — running prediction...


[🔎 EPOCH 3] Prediction: Bob has 5 eggs so Sally has 5 * 5 = 30 eggs. Bob has 5 eggs so Sally has 30 + 5 = 35 eggs. Sally has 5 times as many eggs as Bob so Sally has 35 / 5 =



INFO: `Trainer.fit` stopped: `max_epochs=4` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.


In [178]:
print(model.trainer.callback_metrics)

{'val_loss': tensor(nan)}


In [179]:
loader = DataLoader(val_dataset, batch_size=32, shuffle=True)
it = iter(loader)
batch = next(it)
model.model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [183]:
def output_formula(model, tokenizer, problem):
    input_ids = tokenizer(problem, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.cuda()

    with torch.no_grad():
        output_ids = model.model.generate(
          input_ids,
          min_length=10,
          max_length=500,
          do_sample=False,
          num_beams=4,
          early_stopping=True
      )

    classification = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return classification

In [184]:
easy = 'each week a restaurant serving mexican food uses the same volume of chili paste , which comes in either 35 - ounce cans or 25 - ounce cans of chili paste . if the restaurant must order 20 more of the smaller cans than the larger cans to fulfill its weekly needs , then how manysmallercans are required to fulfill its weekly needs ?'

output_formula(model, tokenizer, easy)

'The restaurant must order 35 - ounce cans of chili paste , or 35 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 35 - ounce cans of chili paste , or 35 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 35 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 25 - ounce cans of chili paste , or 2