In [None]:
# Uninstall potentially conflicting packages so the installs below start from a clean state
!pip uninstall -y transformers accelerate unsloth torch torchvision torchaudio

# Install base packages (this unsloth install is removed and replaced further down)
!pip install unsloth

# Install dependencies (-q keeps pip's output quiet)
!pip install -q transformers accelerate peft
!pip install -q datasets evaluate bitsandbytes trl
!pip install -q torch torchvision torchaudio

# Install Colab-optimized unsloth: drop the release installed above, then install
# the "colab-new" extra directly from the GitHub repository without using pip's cache
!pip uninstall unsloth -y
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install other tools
# NOTE(review): none of the installs in this cell pin a version, so the resolved
# environment depends on the install date -- consider pinning for reproducibility.
!pip install pandas scikit-learn
!pip install -q ipywidgets

Preparations
------

In [2]:
# Mount Google Drive at /content/drive (Colab only); the trainer class defined
# later in this notebook writes its outputs under this mount by default.
from google.colab import drive
drive.mount('/content/drive')

# Environment setup: every import the notebook's later cells rely on
import os
import warnings
import random
import numpy as np
import torch
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import gc
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
import transformers
import accelerate

# Print versions of the core libraries so a run can be matched to its environment
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")

# Configure environment
# Expose only GPU 0 to this process.
# NOTE(review): this is set after torch/unsloth were imported; presumably it only
# takes effect if CUDA has not been initialised yet -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Suppress all Python warnings for the rest of the session
warnings.filterwarnings('ignore')
# Select PyTorch's 'high' float32 matmul precision setting
torch.set_float32_matmul_precision('high')

# Set random seeds
def set_seeds(seed=3407):
    """Seed every random number generator the notebook uses.

    Seeds, in order: Python's ``random``, NumPy's global generator, PyTorch's
    default generator, and PyTorch's generators for all CUDA devices.

    Args:
        seed: Integer seed applied to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Memory management utilities
def clear_memory():
    """Run a garbage-collection pass, then empty PyTorch's CUDA memory cache."""
    # Collect first so objects that are no longer referenced are gone before
    # the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

def print_gpu_utilization():
    """Print the ``nvidia-smi`` output lines that mention "Memory" or "Volatile".

    This is the pure-Python equivalent of ``nvidia-smi | grep -E "Memory|Volatile"``.
    It uses ``subprocess`` instead of the IPython ``!`` shell escape so the
    function is valid Python outside a notebook as well.

    The function never raises: if ``nvidia-smi`` is missing, times out or exits
    with an error, a short notice is printed instead. This matters because it
    is called from a ``finally`` block and must not mask the original error.

    Returns:
        None
    """
    import subprocess  # local import: only this helper needs it

    print("\nGPU Memory Usage:")
    try:
        result = subprocess.run(
            ["nvidia-smi"],
            capture_output=True,
            text=True,
            check=False,
            timeout=30,
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError covers a missing executable; SubprocessError covers a timeout.
        print(f"nvidia-smi unavailable: {exc}")
        return

    # Same filter as grep -E "Memory|Volatile": case-sensitive substring match.
    for line in result.stdout.splitlines():
        if "Memory" in line or "Volatile" in line:
            print(line)

    if result.returncode != 0:
        print(f"nvidia-smi exited with status {result.returncode}")

Mounted at /content/drive
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
PyTorch version: 2.5.1+cu124
Transformers version: 4.46.2
Accelerate version: 1.1.1


Set up Weights & Biases (wandb)

In [None]:
# Install the wandb client, upgrading it if it is already present
!pip install wandb --upgrade

import wandb
# Log in to Weights & Biases before the sweep is created and run below
wandb.login()

Define the sweep configuration

In [None]:
import pprint

# Random search: each sweep run samples one value per hyperparameter below.
sweep_config = {
    'method': 'random'
    }

# Metric the sweep ranks runs by. The Hugging Face Trainer's W&B callback logs
# metrics with a "train/" or "eval/" prefix, so a bare 'loss' key is never
# logged. 'eval/loss' is the validation loss, the same quantity the trainer
# uses for metric_for_best_model="eval_loss".
metric = {
    'name': 'eval/loss',
    'goal': 'minimize'
    }

sweep_config['metric'] = metric

# Search space. Keys must match the attributes read from wandb.config in
# MathVerificationTrainer.setup_training_args.
parameters_dict = {
    # Sampled log-uniformly between 1e-5 and 1e-3
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 1e-3
        },
    'warmup_ratio': {
        'values': [0.05, 0.1, 0.2]
        },
    'weight_decay': {
        'values': [0.01, 0.03, 0.05]
        },
    'per_device_train_batch_size': {
        'values': [2, 4]
        },
    'gradient_accumulation_steps': {
        'values': [2, 4, 8]
        },
    # Fixed (not swept): every run trains for one epoch
    'epochs': {
        'value': 1
        }
}

sweep_config['parameters'] = parameters_dict
pprint.pprint(sweep_config)

In [None]:
import pprint

# Pretty-print the assembled sweep configuration for a visual check
# (shows the same dictionary that the sweep-definition cell prints).
pprint.pprint(sweep_config)

Define the trainer class
---

In [4]:
class MathVerificationTrainer:
    """LoRA fine-tuning pipeline for verifying answers to math problems.

    The class loads a 4-bit Llama 3.1 8B model through unsloth, builds
    prompt-formatted train/eval datasets, trains with TRL's ``SFTTrainer``
    (either with built-in default hyperparameters or with values supplied by a
    W&B sweep), and writes a ``submission.csv`` of True/False predictions for
    the test split.

    Typical call order::

        trainer = MathVerificationTrainer()
        trainer.setup_model()
        trainer.prepare_datasets()
        trainer.train()            # or trainer.sweep() inside a wandb agent
        trainer.create_submission()
    """

    # Hub identifiers shared by several methods.
    MODEL_NAME = "unsloth/Meta-Llama-3.1-8B"
    DATASET_NAME = "ad6398/nyu-dl-teach-maths-comp"

    def __init__(self, max_seq_length=2048, save_dir='/content/drive/MyDrive/math_verification_sweep'):
        """Store configuration and create the output directory.

        Args:
            max_seq_length: Maximum token length used for model loading,
                training and prompt truncation at inference.
            save_dir: Directory for checkpoints, the final model and the
                submission file. Created if it does not exist.
        """
        self.max_seq_length = max_seq_length
        self.save_dir = save_dir
        # Populated by setup_model()
        self.model = None
        self.tokenizer = None
        # Populated by prepare_datasets()
        self.train_dataset = None
        self.eval_dataset = None
        os.makedirs(self.save_dir, exist_ok=True)

    def setup_model(self):
        """Load the 4-bit base model and wrap it with LoRA adapters.

        Sets ``self.model`` and ``self.tokenizer``.

        Raises:
            Exception: Any loading error is printed and re-raised unchanged.
        """
        clear_memory()
        print("Loading model...")

        try:
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name=self.MODEL_NAME,
                max_seq_length=self.max_seq_length,
                load_in_4bit=True,
            )

            # LoRA on the attention projections only (rank 16, rank-stabilised).
            model = FastLanguageModel.get_peft_model(
                model,
                r=16,
                target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
                lora_alpha=16,
                lora_dropout=0.1,
                bias="none",
                use_gradient_checkpointing=True,
                random_state=3407,
                use_rslora=True,
            )

            self.model = model
            self.tokenizer = tokenizer
            print("Model loaded successfully!")

        except Exception as e:
            print(f"Error loading model: {str(e)}")
            raise

    @staticmethod
    def _build_prompt(example):
        """Build the verification prompt shared by training and inference.

        Args:
            example: Mapping with ``'question'``, ``'solution'`` and
                ``'answer'`` entries.

        Returns:
            The prompt text, ending just before the ``Answer:`` line.
        """
        # Handle LaTeX formatting: drop "$" delimiters and double every backslash
        question = example['question'].replace("$", "").replace("\\", "\\\\")

        # Turn the <llm-code> / <llm-code-output> tags into plain-text section labels
        solution = example['solution']
        solution = solution.replace("<llm-code>", "\nCode:\n").replace("</llm-code>", "")
        solution = solution.replace("<llm-code-output>", "\nOutput:\n").replace("</llm-code-output>", "")

        return (
            "You are a mathematics expert. Verify this answer:\n\n"
            f"Problem: {question}\n"
            f"Given Answer: {example['answer']}\n\n"
            "Verification Process:\n"
            f"{solution}\n\n"
            "Based on the verification above, respond with EXACTLY 'True' or 'False'.\n"
        )

    def process_training_example(self, example):
        """Format one labelled example as supervised fine-tuning text.

        Args:
            example: Mapping with ``'question'``, ``'solution'``,
                ``'answer'`` and ``'is_correct'`` entries.

        Returns:
            ``{"text": prompt}`` where the prompt ends with
            ``Answer: <is_correct>`` followed by the tokenizer's EOS token.
        """
        prompt = (
            self._build_prompt(example)
            + f"Answer: {str(example['is_correct'])}"
            + self.tokenizer.eos_token
        )
        return {"text": prompt}

    def process_test_example(self, example):
        """Format one unlabelled example as an inference prompt.

        Args:
            example: Mapping with ``'question'``, ``'solution'`` and
                ``'answer'`` entries.

        Returns:
            The prompt string, without an answer line or EOS token.
        """
        return self._build_prompt(example)

    def prepare_datasets(self, max_samples=1000):
        """Stream training rows and build a 90/10 train/eval split.

        Sets ``self.train_dataset`` and ``self.eval_dataset``. Both contain
        the full training text including the answer line.

        Args:
            max_samples: Number of rows taken from the start of the streamed
                train split.

        Raises:
            Exception: Any error is printed and re-raised unchanged.
        """
        clear_memory()
        print("Preparing datasets...")

        try:
            # Streaming avoids downloading the whole split when only the
            # first max_samples rows are needed.
            dataset = load_dataset(
                self.DATASET_NAME,
                split='train',
                streaming=True
            )

            train_data = list(dataset.take(max_samples))

            train_idx, val_idx = train_test_split(
                range(len(train_data)),
                test_size=0.1,
                random_state=3407
            )

            train_examples = [self.process_training_example(train_data[i]) for i in train_idx]
            eval_examples = [self.process_training_example(train_data[i]) for i in val_idx]

            self.train_dataset = Dataset.from_list(train_examples)
            self.eval_dataset = Dataset.from_list(eval_examples)

            # Release the intermediate Python lists
            del train_data, train_examples, eval_examples
            clear_memory()

            print(f"Datasets prepared! Train size: {len(self.train_dataset)}, Eval size: {len(self.eval_dataset)}")

        except Exception as e:
            print(f"Error preparing datasets: {str(e)}")
            raise

    def setup_training_args(self, config=None):
        """Build the ``TrainingArguments`` for a run.

        Args:
            config: ``None`` to use the built-in default hyperparameters with
                no experiment tracking, or an object exposing
                ``per_device_train_batch_size``, ``gradient_accumulation_steps``,
                ``warmup_ratio``, ``epochs``, ``learning_rate`` and
                ``weight_decay`` as attributes (e.g. ``wandb.config``), in
                which case metrics are reported to W&B.

        Returns:
            A ``TrainingArguments`` instance.
        """
        if config is None:
            # Default training arguments
            hyperparameters = dict(
                per_device_train_batch_size=2,
                gradient_accumulation_steps=8,
                warmup_ratio=0.1,
                num_train_epochs=1,
                learning_rate=0.0006026,
                weight_decay=0.05,
            )
            report_to = "none"
        else:
            # Training arguments for hyperparameter sweep
            hyperparameters = dict(
                per_device_train_batch_size=config.per_device_train_batch_size,
                gradient_accumulation_steps=config.gradient_accumulation_steps,
                warmup_ratio=config.warmup_ratio,
                num_train_epochs=config.epochs,
                learning_rate=config.learning_rate,
                weight_decay=config.weight_decay,
            )
            report_to = "wandb"

        # Everything below is identical for default and sweep runs.
        return TrainingArguments(
            output_dir=os.path.join(self.save_dir, "checkpoints"),
            fp16=True,
            logging_steps=10,
            optim="adamw_torch",
            lr_scheduler_type="cosine",
            seed=3407,
            # eval_strategy is the current name of the deprecated
            # evaluation_strategy argument.
            eval_strategy="steps",
            eval_steps=50,
            # Save cadence matches eval cadence, as load_best_model_at_end requires
            save_strategy="steps",
            save_steps=50,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            gradient_checkpointing=True,
            max_grad_norm=0.3,
            report_to=report_to,
            remove_unused_columns=True,
            dataloader_pin_memory=False,
            **hyperparameters,
        )

    def _train_and_save(self, training_args):
        """Train with ``SFTTrainer`` and save model and tokenizer to ``final_model``."""
        trainer = SFTTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            dataset_text_field="text",
            max_seq_length=self.max_seq_length,
            dataset_num_proc=2,
            packing=False,
            args=training_args
        )

        trainer.train()

        final_save_path = os.path.join(self.save_dir, "final_model")
        self.model.save_pretrained(final_save_path)
        self.tokenizer.save_pretrained(final_save_path)
        print(f"Training completed! Model saved to {final_save_path}")

    def train(self, config=None):
        """Fine-tune the model and save it under ``<save_dir>/final_model``.

        Args:
            config: Optional hyperparameter object forwarded to
                ``setup_training_args``. ``None`` (the default) trains with
                the built-in defaults and no experiment tracking.

        Raises:
            Exception: Any training error is printed and re-raised unchanged.
        """
        clear_memory()
        print("Starting training...")

        try:
            self._train_and_save(self.setup_training_args(config))

        except Exception as e:
            print(f"Error during training: {str(e)}")
            raise

    def sweep(self):
        """Run one W&B sweep trial using hyperparameters from ``wandb.config``.

        Intended to be called from a function passed to ``wandb.agent``.
        Every trial saves to the same ``<save_dir>/final_model`` path, so
        later trials overwrite earlier ones.

        Raises:
            Exception: Any training error is printed and re-raised unchanged.
        """
        print("Starting sweeping...")

        with wandb.init():
            config = wandb.config
            training_args = self.setup_training_args(config)

            try:
                self._train_and_save(training_args)

            except Exception as e:
                print(f"Error during training: {str(e)}")
                raise

    def _predict_batch(self, batch):
        """Return one boolean verdict per example in ``batch``, using greedy decoding."""
        prompts = [self.process_test_example(example) for example in batch]

        # Batch encoding
        inputs = self.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_seq_length
        ).to("cuda")

        # Batch generation. Decoding is greedy, so no sampling parameters
        # (temperature / top_p) are passed -- they would be ignored.
        with torch.inference_mode():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=8,  # Only a short True/False answer is needed
                do_sample=False,
                use_cache=True,
                pad_token_id=self.tokenizer.pad_token_id,
            )

        # Keep only the newly generated tokens (everything after the padded prompt)
        input_length = inputs['input_ids'].shape[1]
        responses = self.tokenizer.batch_decode(
            [output[input_length:] for output in outputs],
            skip_special_tokens=True
        )

        # An example is predicted correct when the reply contains "true" (case-insensitive)
        return ["true" in response.lower() for response in responses]

    def generate_predictions(self, test_data, batch_size=16):
        """Predict True/False for every example in ``test_data``.

        Args:
            test_data: Iterable of examples accepted by ``process_test_example``.
            batch_size: Number of prompts generated per forward batch.

        Returns:
            A list of booleans, one per input example, in input order.

        Raises:
            Exception: Any error is printed and re-raised unchanged.
        """
        clear_memory()
        print("Generating predictions...")

        try:
            FastLanguageModel.for_inference(self.model)

            # Batched generation with a decoder-only model needs left padding,
            # so that each prompt ends directly before the first generated
            # token. Set it explicitly and restore the previous value after.
            original_padding_side = self.tokenizer.padding_side
            self.tokenizer.padding_side = "left"

            # Convert test data to a list to support batch processing
            test_examples = list(test_data)
            total_batches = (len(test_examples) + batch_size - 1) // batch_size

            all_predictions = []

            try:
                # Process in batches
                for i in range(0, len(test_examples), batch_size):
                    # Report progress every 10 batches
                    if i % (batch_size * 10) == 0:
                        print(f"Processing batch {i//batch_size}/{total_batches}")

                    batch = test_examples[i:i + batch_size]
                    all_predictions.extend(self._predict_batch(batch))

                    # Periodically clear memory (every 50 batches)
                    if i % (batch_size * 50) == 0:
                        clear_memory()
            finally:
                self.tokenizer.padding_side = original_padding_side

            print(f"Total predictions: {len(all_predictions)}")
            assert len(all_predictions) == len(test_examples)

            return all_predictions

        except Exception as e:
            print(f"Error generating predictions: {str(e)}")
            raise

    def create_submission(self):
        """Predict on the test split and write ``<save_dir>/submission.csv``.

        The CSV has an ``ID`` column (0..n-1 in test-set order) and an
        ``is_correct`` column of boolean predictions. The file is read back
        after writing and its shape printed as a check.

        Raises:
            Exception: Any error is printed and re-raised unchanged.
        """
        print("Creating submission file...")
        try:
            test_dataset = load_dataset(self.DATASET_NAME)['test']
            print(f"Test dataset size: {len(test_dataset)}")
            predictions = self.generate_predictions(test_dataset, batch_size=16)
            print(f"Generated predictions: {len(predictions)}")

            assert len(predictions) == len(test_dataset), \
                f"Prediction count mismatch! Expected {len(test_dataset)}, got {len(predictions)}"

            submission_df = pd.DataFrame({
                'ID': range(len(predictions)),
                'is_correct': predictions
            })

            print(f"Submission DataFrame shape: {submission_df.shape}")

            submission_path = os.path.join(self.save_dir, 'submission.csv')
            submission_df.to_csv(submission_path, index=False)
            print(f"Submission saved to {submission_path}")

            # Read the file back to confirm it was written with the expected shape
            saved_df = pd.read_csv(submission_path)
            print(f"Saved file shape: {saved_df.shape}")

        except Exception as e:
            print(f"Error creating submission: {str(e)}")
            raise

# Hyperparameter sweep

## 1. Initialize the sweep

In [None]:
# Register the sweep defined by sweep_config under the "math_verification"
# project and keep the returned id for the agent cell below.
sweep_id = wandb.sweep(sweep_config, project="math_verification")

## 2. Run the sweep agent

In [None]:
def run_sweep():
    """Run a single sweep trial from scratch.

    Builds a new trainer, loads the model, prepares the datasets and then
    trains with the hyperparameters W&B assigned to this trial.
    """
    sweep_trainer = MathVerificationTrainer()
    sweep_trainer.setup_model()
    sweep_trainer.prepare_datasets(max_samples=1000)  # Adjust based on available RAM
    sweep_trainer.sweep()

# Start an agent that calls run_sweep for up to 50 trials of the registered sweep
wandb.agent(sweep_id, run_sweep, count=50)

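# Get the final result

Before running `main()`, replace the default hyperparameters in `MathVerificationTrainer.setup_training_args()` (the values `trainer.train()` uses) with the best values found by the sweep.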

In [None]:
def main():
    """Run the whole pipeline: seed, load model, prepare data, train, write submission.

    Any exception is reported and re-raised. Whether or not the run succeeds,
    memory is cleared and GPU utilization is printed at the end.
    """
    try:
        set_seeds()
        print("Starting training pipeline...")

        # Initialize and run trainer
        pipeline = MathVerificationTrainer()
        pipeline.setup_model()
        pipeline.prepare_datasets(max_samples=1000)  # Adjust based on available RAM
        pipeline.train()
        pipeline.create_submission()

        print("Training pipeline completed successfully!")

    except Exception as err:
        print(f"Fatal error in main: {str(err)}")
        raise
    finally:
        # Always release memory and report GPU state, even after a failure
        clear_memory()
        print_gpu_utilization()

if __name__ == "__main__":
    main()

Starting training pipeline...
Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.11.5 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Model loaded successfully!
Preparing datasets...


README.md:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

Datasets prepared! Train size: 900, Eval size: 100
Starting training...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
