In [5]:
# Uninstall potentially conflicting packages
!pip uninstall -y transformers accelerate unsloth torch torchvision torchaudio

# Install base packages
!pip install unsloth

# Install dependencies
!pip install -q transformers accelerate peft
!pip install -q datasets evaluate bitsandbytes trl
!pip install -q torch torchvision torchaudio

# Install Colab-optimized unsloth
!pip uninstall unsloth -y
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install other tools
!pip install pandas scikit-learn
!pip install -q ipywidgets

Found existing installation: transformers 4.46.2
Uninstalling transformers-4.46.2:
  Successfully uninstalled transformers-4.46.2
Found existing installation: accelerate 1.1.1
Uninstalling accelerate-1.1.1:
  Successfully uninstalled accelerate-1.1.1
Found existing installation: unsloth 2024.11.6
Uninstalling unsloth-2024.11.6:
  Successfully uninstalled unsloth-2024.11.6
Found existing installation: torch 2.5.1
Uninstalling torch-2.5.1:
  Successfully uninstalled torch-2.5.1
Found existing installation: torchvision 0.20.1
Uninstalling torchvision-0.20.1:
  Successfully uninstalled torchvision-0.20.1
Found existing installation: torchaudio 2.5.1
Uninstalling torchaudio-2.5.1:
  Successfully uninstalled torchaudio-2.5.1
Collecting unsloth
  Using cached unsloth-2024.11.6-py3-none-any.whl.metadata (59 kB)
Collecting torch>=2.4.0 (from unsloth)
  Using cached torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting transformers>=4.46.1 (from unsloth)
  Using cached transf

Found existing installation: unsloth 2024.11.6
Uninstalling unsloth-2024.11.6:
  Successfully uninstalled unsloth-2024.11.6
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-rz51z8vq/unsloth_f2d6924234e04527be13eac3a89f2dd2
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-rz51z8vq/unsloth_f2d6924234e04527be13eac3a89f2dd2
  Resolved https://github.com/unslothai/unsloth.git to commit d8ff860c842095f4729fdd1d5aedf567a9e2c4da
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2024.11.6-py3-none-a



Preparations
------

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Environment setup
import os
import warnings
import random
import numpy as np
import torch
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import gc
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
import transformers
import accelerate

# Print versions
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")

# Configure environment
os.environ["CUDA_VISIBLE_DEVICES"]="0"
warnings.filterwarnings('ignore')
torch.set_float32_matmul_precision('high')

# Set random seeds
def set_seeds(seed=3407):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Memory management utilities
def clear_memory():
    gc.collect()
    torch.cuda.empty_cache()

def print_gpu_utilization():
    print("\nGPU Memory Usage:")
    !nvidia-smi | grep -E "Memory|Volatile"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
PyTorch version: 2.5.1+cu124
Transformers version: 4.46.2
Accelerate version: 1.1.1


set up wandb

In [None]:
!pip install wandb --upgrade

import wandb
#wandb.login()



define the sweep

In [None]:
sweep_config = {
    'method': 'random'
    }

metric = {
    'name': 'loss',
    'goal': 'minimize'
    }

sweep_config['metric'] = metric

parameters_dict = {
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 1e-3
        },
    'warmup_ratio': {
        'values': [0.05, 0.1, 0.2]
        },
    'weight_decay': {
        'values': [0.01, 0.03, 0.05]
        },
    'per_device_train_batch_size': {
        'values': [2, 4]
        },
    'gradient_accumulation_steps': {
        'values': [2, 4, 8]
        },
    'epochs': {
        'value': 1
        }
}

sweep_config['parameters'] = parameters_dict
import pprint
pprint.pprint(sweep_config)

In [None]:
import pprint

pprint.pprint(sweep_config)

Define class
---

In [2]:
class MathVerificationTrainer:
    def __init__(self, max_seq_length=2048, save_dir='/content/drive/MyDrive/math_verification_10000_samples'):
        self.max_seq_length = max_seq_length
        self.save_dir = save_dir
        self.model = None
        self.tokenizer = None
        self.train_dataset = None
        self.eval_dataset = None
        os.makedirs(self.save_dir, exist_ok=True)

    def setup_model(self, checkpoint_dir=None):
        clear_memory()
        print("Loading model...")

        try:
            if checkpoint_dir:
                # Load model from checkpoint if specified
                model, tokenizer = FastLanguageModel.from_pretrained(
                    checkpoint_dir,
                    max_seq_length=self.max_seq_length,
                    load_in_4bit=True,
                )
                print(f"Model loaded from checkpoint: {checkpoint_dir}")
            else:
                # Load model from scratch if no checkpoint is specified
                model, tokenizer = FastLanguageModel.from_pretrained(
                    model_name="unsloth/Meta-Llama-3.1-8B",
                    max_seq_length=self.max_seq_length,
                    load_in_4bit=True,
                )

            model = FastLanguageModel.get_peft_model(
                model,
                r=16,
                target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
                lora_alpha=16,
                lora_dropout=0.1,
                bias="none",
                use_gradient_checkpointing=True,
                random_state=3407,
                use_rslora=True,
            )

            self.model = model
            self.tokenizer = tokenizer
            print("Model loaded successfully!")

        except Exception as e:
            print(f"Error loading model: {str(e)}")
            raise

    def process_training_example(self, example):
        # Handle LaTeX formatting
        question = example['question'].replace("$", "").replace("\\", "\\\\")

        # Process code blocks
        solution = example['solution']
        solution = solution.replace("<llm-code>", "\nCode:\n").replace("</llm-code>", "")
        solution = solution.replace("<llm-code-output>", "\nOutput:\n").replace("</llm-code-output>", "")

        prompt = (
            "You are a mathematics expert. Verify this answer:\n\n"
            f"Problem: {question}\n"
            f"Given Answer: {example['answer']}\n\n"
            "Verification Process:\n"
            f"{solution}\n\n"
            "Based on the verification above, respond with EXACTLY 'True' or 'False'.\n"
            f"Answer: {str(example['is_correct'])}"
        ) + self.tokenizer.eos_token

        return {"text": prompt}

    def process_test_example(self, example):
        question = example['question'].replace("$", "").replace("\\", "\\\\")
        solution = example['solution']
        solution = solution.replace("<llm-code>", "\nCode:\n").replace("</llm-code>", "")
        solution = solution.replace("<llm-code-output>", "\nOutput:\n").replace("</llm-code-output>", "")

        prompt = (
            "You are a mathematics expert. Verify this answer:\n\n"
            f"Problem: {question}\n"
            f"Given Answer: {example['answer']}\n\n"
            "Verification Process:\n"
            f"{solution}\n\n"
            "Based on the verification above, respond with EXACTLY 'True' or 'False'.\n"
        )
        return prompt

    def prepare_datasets(self, max_samples=1000):
        clear_memory()
        print("Preparing datasets...")

        try:
            dataset = load_dataset(
                "ad6398/nyu-dl-teach-maths-comp",
                split='train',
                streaming=True
            )

            train_data = list(dataset.take(max_samples))

            train_idx, val_idx = train_test_split(
                range(len(train_data)),
                test_size=0.1,
                random_state=3407
            )

            train_examples = [self.process_training_example(train_data[i]) for i in train_idx]
            eval_examples = [self.process_training_example(train_data[i]) for i in val_idx]

            self.train_dataset = Dataset.from_list(train_examples)
            self.eval_dataset = Dataset.from_list(eval_examples)

            del train_data, train_examples, eval_examples
            clear_memory()

            print(f"Datasets prepared! Train size: {len(self.train_dataset)}, Eval size: {len(self.eval_dataset)}")

        except Exception as e:
            print(f"Error preparing datasets: {str(e)}")
            raise

    def setup_training_args(self, config=None):
        if config is None:
            # Default training arguments
            return TrainingArguments(
                output_dir=os.path.join(self.save_dir, "checkpoints"),
                per_device_train_batch_size=2,
                gradient_accumulation_steps=8,
                warmup_ratio=0.1,
                num_train_epochs=1,
                learning_rate=0.0006026,
                fp16=True,
                logging_steps=10,
                optim="adamw_torch",
                weight_decay=0.05,
                lr_scheduler_type="cosine",
                seed=3407,
                evaluation_strategy="steps",
                eval_steps=50,
                save_strategy="steps",
                save_steps=50,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                gradient_checkpointing=True,
                max_grad_norm=0.3,
                report_to="none",
                remove_unused_columns=True,
                dataloader_pin_memory=False
            )
        else:
            # Training arguments for hyperparameter sweep
            return TrainingArguments(
                output_dir=os.path.join(self.save_dir, "checkpoints"),
                per_device_train_batch_size=config.per_device_train_batch_size,
                gradient_accumulation_steps=config.gradient_accumulation_steps,
                warmup_ratio=config.warmup_ratio,
                num_train_epochs=config.epochs,
                learning_rate=config.learning_rate,
                fp16=True,
                logging_steps=10,
                optim="adamw_torch",
                weight_decay=config.weight_decay,
                lr_scheduler_type="cosine",
                seed=3407,
                evaluation_strategy="steps",
                eval_steps=50,
                save_strategy="steps",
                save_steps=50,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                gradient_checkpointing=True,
                max_grad_norm=0.3,
                report_to="wandb",
                remove_unused_columns=True,
                dataloader_pin_memory=False,
            )

    def train(self, checkpoint_dir=None):
        clear_memory()
        print("Starting training...")

        try:
            # Load model from checkpoint if available
            self.setup_model(checkpoint_dir=checkpoint_dir)

            trainer = SFTTrainer(
                model=self.model,
                tokenizer=self.tokenizer,
                train_dataset=self.train_dataset,
                eval_dataset=self.eval_dataset,
                dataset_text_field="text",
                max_seq_length=self.max_seq_length,
                dataset_num_proc=2,
                packing=False,
                args=self.setup_training_args()
            )

            trainer.train()

            final_save_path = os.path.join(self.save_dir, "final_model")
            self.model.save_pretrained(final_save_path)
            self.tokenizer.save_pretrained(final_save_path)
            print(f"Training completed! Model saved to {final_save_path}")

        except Exception as e:
            print(f"Error during training: {str(e)}")
            raise

    def sweep(self, checkpoint_dir=None):
        print("Starting sweeping...")

        with wandb.init():
            config = wandb.config
            training_args = self.setup_training_args(config)

            try:
                # Load model from checkpoint if available
                self.setup_model(checkpoint_dir=checkpoint_dir)

                trainer = SFTTrainer(
                    model=self.model,
                    tokenizer=self.tokenizer,
                    train_dataset=self.train_dataset,
                    eval_dataset=self.eval_dataset,
                    dataset_text_field="text",
                    max_seq_length=self.max_seq_length,
                    dataset_num_proc=2,
                    packing=False,
                    args=training_args
                )

                trainer.train()

                final_save_path = os.path.join(self.save_dir, "final_model")
                self.model.save_pretrained(final_save_path)
                self.tokenizer.save_pretrained(final_save_path)
                print(f"Training completed! Model saved to {final_save_path}")

            except Exception as e:
                print(f"Error during sweeping: {str(e)}")
                raise

    def generate_predictions(self, test_data, batch_size=16):
        clear_memory()
        print("Generating predictions...")

        try:
            FastLanguageModel.for_inference(self.model)
            predictions = []

            # Convert test data to a list to support batch processing
            test_examples = list(test_data)
            total_batches = (len(test_examples) + batch_size - 1) // batch_size

            all_predictions = []

            # Process in batches
            for i in range(0, len(test_examples), batch_size):
                if i % (batch_size * 10) == 0:
                    print(f"Processing batch {i//batch_size}/{total_batches}")

                # Get current batch samples
                batch = test_examples[i:i + batch_size]
                prompts = [self.process_test_example(example) for example in batch]

                # Batch encoding
                inputs = self.tokenizer(
                    prompts,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=self.max_seq_length
                ).to("cuda")

                # Batch generation
                with torch.inference_mode():
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=8,  # Reduce the number of generated tokens as we only need True/False
                        temperature=0.1,
                        top_p=0.9,
                        do_sample=False,    # Disable sampling for faster generation
                        use_cache=True,
                        pad_token_id=self.tokenizer.pad_token_id,
                    )

                input_length = inputs['input_ids'].shape[1]
                responses = self.tokenizer.batch_decode(
                    [output[input_length:] for output in outputs],
                    skip_special_tokens=True
                )

                # Batch processing prediction results
                batch_predictions = ["true" in response.lower() for response in responses]
                all_predictions.extend(batch_predictions)

                # Periodically clear memory
                if i % (batch_size * 50) == 0:
                    clear_memory()

            print(f"Total predictions: {len(all_predictions)}")
            assert len(all_predictions) == len(test_examples)

            return all_predictions

        except Exception as e:
            print(f"Error generating predictions: {str(e)}")
            raise

    def create_submission(self):
        print("Creating submission file...")
        try:
            test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")['test']
            print(f"Test dataset size: {len(test_dataset)}")
            predictions = self.generate_predictions(test_dataset, batch_size=16)
            print(f"Generated predictions: {len(predictions)}")

            assert len(predictions) == len(test_dataset), \
                f"Prediction count mismatch! Expected {len(test_dataset)}, got {len(predictions)}"

            submission_df = pd.DataFrame({
                'ID': range(len(predictions)),
                'is_correct': predictions
            })

            print(f"Submission DataFrame shape: {submission_df.shape}")

            submission_path = os.path.join(self.save_dir, 'submission.csv')
            submission_df.to_csv(submission_path, index=False)
            print(f"Submission saved to {submission_path}")

            saved_df = pd.read_csv(submission_path)
            print(f"Saved file shape: {saved_df.shape}")

        except Exception as e:
            print(f"Error creating submission: {str(e)}")
            raise

# Hyper Parameters sweeping

## 1. Initial the sweep

In [None]:
sweep_id = wandb.sweep(sweep_config, project="math_verification")

## 2. Run sweep agent

In [None]:
def run_sweep():
    trainer = MathVerificationTrainer()
    trainer.setup_model()
    trainer.prepare_datasets(max_samples=1000)  # Adjust based on available RAM
    trainer.sweep()

wandb.agent(sweep_id, run_sweep, count = 50)

# Get final result

Before running the main function, hyper parameter in trainer.train() should be changed.

In [4]:
def main():
    try:
        set_seeds()
        print("Starting training pipeline...")

        # Initialize and run trainer
        trainer = MathVerificationTrainer() # save_dir='/content/drive/MyDrive/math_verification_12000_samples'
        trainer.setup_model(checkpoint_dir='/content/drive/MyDrive/math_verification_10000_samples/checkpoints/checkpoint-550')
        trainer.prepare_datasets(max_samples=12000)  # Adjust based on available RAM
        trainer.train(checkpoint_dir='/content/drive/MyDrive/math_verification_10000_samples/checkpoints/checkpoint-550')
        trainer.create_submission()

        print("Training pipeline completed successfully!")

    except Exception as e:
        print(f"Fatal error in main: {str(e)}")
        raise
    finally:
        clear_memory()
        print_gpu_utilization()

if __name__ == "__main__":
    main()

Starting training pipeline...
Loading model...
==((====))==  Unsloth 2024.11.6: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Already have LoRA adapters! We shall skip this step.


Model loaded from checkpoint: /content/drive/MyDrive/math_verification_10000_samples/checkpoints/checkpoint-550
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 10800, Eval size: 1200
Starting training...
Loading model...
==((====))==  Unsloth 2024.11.6: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Already have LoRA adapters! We shall skip this step.


Model loaded from checkpoint: /content/drive/MyDrive/math_verification_10000_samples/checkpoints/checkpoint-550
Model loaded successfully!


Map (num_proc=2):   0%|          | 0/10800 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1200 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,800 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 675
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.5193,0.784097
100,0.5414,0.793473
150,0.5412,0.813341
200,0.5504,0.819851
250,0.5623,0.814547
300,0.5681,0.800602
350,0.552,0.787862
400,0.5941,0.772201
450,0.5677,0.755366
500,0.5956,0.746332


Training completed! Model saved to /content/drive/MyDrive/math_verification_10000_samples/final_model
Creating submission file...
Test dataset size: 10000
Generating predictions...
Processing batch 0/625
Processing batch 10/625
Processing batch 20/625
Processing batch 30/625
Processing batch 40/625
Processing batch 50/625
Processing batch 60/625
Processing batch 70/625
Processing batch 80/625
Processing batch 90/625
Processing batch 100/625
Processing batch 110/625
Processing batch 120/625
Processing batch 130/625
Processing batch 140/625
Processing batch 150/625
Processing batch 160/625
Processing batch 170/625
Processing batch 180/625
Processing batch 190/625
Processing batch 200/625
Processing batch 210/625
Processing batch 220/625
Processing batch 230/625
Processing batch 240/625
Processing batch 250/625
Processing batch 260/625
Processing batch 270/625
Processing batch 280/625
Processing batch 290/625
Processing batch 300/625
Processing batch 310/625
Processing batch 320/625
Proce

In [3]:
# generate submission without training
def main():
    try:
        set_seeds()
        print("Starting training pipeline...")

        # Initialize and run trainer
        trainer = MathVerificationTrainer() # save_dir='/content/drive/MyDrive/math_verification_12000_samples'
        trainer.setup_model(checkpoint_dir='/content/drive/MyDrive/math_verification_10000_samples/checkpoints/checkpoint-550')
        trainer.prepare_datasets(max_samples=12000)  # Adjust based on available RAM
        #trainer.train(checkpoint_dir='/content/drive/MyDrive/math_verification_10000_samples/checkpoints/checkpoint-1200')
        trainer.create_submission()

        print("Training pipeline completed successfully!")

    except Exception as e:
        print(f"Fatal error in main: {str(e)}")
        raise
    finally:
        clear_memory()
        print_gpu_utilization()

if __name__ == "__main__":
    main()

Starting training pipeline...
Loading model...
==((====))==  Unsloth 2024.11.6: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

Unsloth 2024.11.6 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.
Unsloth: Already have LoRA adapters! We shall skip this step.


Model loaded from checkpoint: /content/drive/MyDrive/math_verification_10000_samples/checkpoints/checkpoint-550
Model loaded successfully!
Preparing datasets...


README.md:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

Datasets prepared! Train size: 10800, Eval size: 1200
Creating submission file...


train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Test dataset size: 10000
Generating predictions...
Processing batch 0/625
Processing batch 10/625
Processing batch 20/625
Processing batch 30/625
Processing batch 40/625
Processing batch 50/625
Processing batch 60/625
Processing batch 70/625
Processing batch 80/625
Processing batch 90/625
Processing batch 100/625
Processing batch 110/625
Processing batch 120/625
Processing batch 130/625
Processing batch 140/625
Processing batch 150/625
Processing batch 160/625
Processing batch 170/625
Processing batch 180/625
Processing batch 190/625
Processing batch 200/625
Processing batch 210/625
Processing batch 220/625
Processing batch 230/625
Processing batch 240/625
Processing batch 250/625
Processing batch 260/625
Processing batch 270/625
Processing batch 280/625
Processing batch 290/625
Processing batch 300/625
Processing batch 310/625
Processing batch 320/625
Processing batch 330/625
Processing batch 340/625
Processing batch 350/625
Processing batch 360/625
Processing batch 370/625
Processing