In [1]:
# Environment bootstrap for a Colab runtime. Every line below is an IPython
# shell escape (`!`), so this cell only runs inside a notebook kernel.
# NOTE(review): no versions are pinned, so a re-run may resolve to different
# releases than the ones recorded in the saved output.

# Uninstall potentially conflicting packages
!pip uninstall -y transformers accelerate unsloth torch torchvision torchaudio

# Install base packages
!pip install unsloth

# Install dependencies
!pip install -q transformers accelerate peft
!pip install -q datasets evaluate bitsandbytes trl
!pip install -q torch torchvision torchaudio

# Install Colab-optimized unsloth
# The PyPI build installed above is removed and replaced by the GitHub build
# with the `colab-new` extra.
!pip uninstall unsloth -y
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install other tools
!pip install pandas scikit-learn
!pip install -q ipywidgets

Found existing installation: transformers 4.44.2
Uninstalling transformers-4.44.2:
  Successfully uninstalled transformers-4.44.2
Found existing installation: accelerate 0.34.2
Uninstalling accelerate-0.34.2:
  Successfully uninstalled accelerate-0.34.2
[0mFound existing installation: torch 2.5.0+cu121
Uninstalling torch-2.5.0+cu121:
  Successfully uninstalled torch-2.5.0+cu121
Found existing installation: torchvision 0.20.0+cu121
Uninstalling torchvision-0.20.0+cu121:
  Successfully uninstalled torchvision-0.20.0+cu121
Found existing installation: torchaudio 2.5.0+cu121
Uninstalling torchaudio-2.5.0+cu121:
  Successfully uninstalled torchaudio-2.5.0+cu121
Collecting unsloth
  Downloading unsloth-2024.11.5-py3-none-any.whl.metadata (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth-zoo>=2024.11.1 (from unsloth)
  Downloading unsloth_zoo-2024.11.4-py3-none-any.whl.metadata (16 kB)
Coll

In [None]:
# Mount Google Drive at /content/drive; the trainer's default save_dir
# (/content/drive/MyDrive/math_verification) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

# Environment setup
import os
import warnings
import random
import numpy as np
import torch
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import gc
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
import transformers
import accelerate

# Print versions of the core libraries so the run can be matched to an environment
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")

# Configure environment
# Restrict the process to GPU 0.
# NOTE(review): this is set after `import torch`; presumably it still takes
# effect because CUDA has not been initialised yet - confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Silences every Python warning for the rest of the session, including deprecation notices.
warnings.filterwarnings('ignore')
# Select PyTorch's 'high' float32 matmul precision mode.
torch.set_float32_matmul_precision('high')

# Set random seeds
def set_seeds(seed=3407):
    """Seed every random number generator used by the pipeline.

    Args:
        seed: Integer applied, in order, to Python's ``random`` module,
            NumPy's global RNG, PyTorch's default generator and the
            generators of all CUDA devices.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Memory management utilities
def clear_memory():
    """Free as much memory as possible between pipeline stages.

    Runs Python's garbage collector first, then asks PyTorch to empty its
    CUDA cache.
    """
    _ = gc.collect()
    torch.cuda.empty_cache()

def print_gpu_utilization():
    """Print a short GPU memory report taken from ``nvidia-smi``.

    The body uses an IPython shell escape (``!``), so this function only
    works when the cell is executed by a Jupyter/IPython kernel.
    """
    print("\nGPU Memory Usage:")
    # Keep only the nvidia-smi lines that contain "Memory" or "Volatile".
    !nvidia-smi | grep -E "Memory|Volatile"

class MathVerificationTrainer:
    """LoRA fine-tuning pipeline that teaches a Llama model to judge math answers.

    Intended call order: ``setup_model()`` -> ``prepare_datasets()`` ->
    ``train()`` (or ``sweep()`` from inside a wandb agent) ->
    ``create_submission()``.

    Attributes:
        max_seq_length: Maximum sequence length given to the model loader and
            to ``SFTTrainer``.
        save_dir: Directory that receives checkpoints, the final model and the
            submission CSV. Created on construction if missing.
        model, tokenizer: Populated by ``setup_model()``; ``None`` before that.
        train_dataset, eval_dataset: Populated by ``prepare_datasets()``;
            ``None`` before that.
    """

    def __init__(self, max_seq_length=2048, save_dir='/content/drive/MyDrive/math_verification'):
        """Store the configuration and make sure ``save_dir`` exists."""
        self.max_seq_length = max_seq_length
        self.save_dir = save_dir
        self.model = None
        self.tokenizer = None
        self.train_dataset = None
        self.eval_dataset = None
        os.makedirs(self.save_dir, exist_ok=True)

    # ------------------------------------------------------------------
    # Prompt helpers (shared by training and inference so both see the
    # exact same text format)
    # ------------------------------------------------------------------
    @staticmethod
    def _build_prompt(example):
        """Return the prompt for ``example`` up to and including the final question.

        ``example`` must provide ``question`` and ``answer``; ``solution`` is
        optional and falls back to ``'Analyzing...'``. The prompt deliberately
        ends without a trailing space so that the verdict is generated (and
        trained) as ``" True"`` / ``" False"``.
        """
        return (
            "Analyze this mathematics problem and solution:\n\n"
            f"Question: {example['question']}\n"
            f"Student's Answer: {example['answer']}\n"
            "Let's verify this step by step:\n"
            f"{example.get('solution', 'Analyzing...')}\n"
            "Is the answer correct?"
        )

    def _build_training_text(self, example):
        """Return the full supervised text: prompt, verdict and EOS token."""
        return (
            self._build_prompt(example)
            + f" {str(example['is_correct'])}"
            + self.tokenizer.eos_token
        )

    @staticmethod
    def _parse_verdict(response):
        """Map generated text to a boolean verdict.

        Returns ``True`` only when "true" occurs and is not preceded by
        "false" (case-insensitive); text containing neither word yields
        ``False``.
        """
        text = response.strip().lower()
        true_at = text.find("true")
        false_at = text.find("false")
        if true_at == -1:
            return False
        return false_at == -1 or true_at < false_at

    # ------------------------------------------------------------------
    # Pipeline stages
    # ------------------------------------------------------------------
    def setup_model(self):
        """Load the 4-bit base model and wrap it with LoRA adapters.

        Populates ``self.model`` and ``self.tokenizer``.

        Raises:
            Exception: Any loader error is printed and re-raised unchanged.
        """
        clear_memory()
        print("Loading model...")

        try:
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name="unsloth/Meta-Llama-3.1-8B",
                max_seq_length=self.max_seq_length,
                load_in_4bit=True,
            )

            # LoRA on the attention projections only.
            model = FastLanguageModel.get_peft_model(
                model,
                r=16,
                target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
                lora_alpha=16,
                lora_dropout=0.1,
                bias="none",
                use_gradient_checkpointing=True,
                random_state=3407,
                use_rslora=True,
            )

            self.model = model
            self.tokenizer = tokenizer
            print("Model loaded successfully!")

        except Exception as e:
            print(f"Error loading model: {str(e)}")
            raise

    def prepare_datasets(self, max_samples=1000):
        """Stream ``max_samples`` training rows and build train/eval datasets.

        The rows are split 90/10 with a fixed random state and converted to
        single-column (``text``) datasets stored on ``self.train_dataset`` and
        ``self.eval_dataset``.

        Args:
            max_samples: Number of rows taken from the streamed train split.

        Raises:
            RuntimeError: If ``setup_model()`` has not been called yet (the
                tokenizer's EOS token is needed to build the training text).
            Exception: Any other error is printed and re-raised unchanged.
        """
        clear_memory()
        print("Preparing datasets...")

        try:
            if self.tokenizer is None:
                raise RuntimeError("Call setup_model() before prepare_datasets().")

            dataset = load_dataset(
                "ad6398/nyu-dl-teach-maths-comp",
                split='train',
                streaming=True
            )

            train_data = list(dataset.take(max_samples))

            train_idx, val_idx = train_test_split(
                range(len(train_data)),
                test_size=0.1,
                random_state=3407
            )

            train_examples = [
                {"text": self._build_training_text(train_data[i])} for i in train_idx
            ]
            eval_examples = [
                {"text": self._build_training_text(train_data[i])} for i in val_idx
            ]

            self.train_dataset = Dataset.from_list(train_examples)
            self.eval_dataset = Dataset.from_list(eval_examples)

            # Drop the raw rows early; only the Dataset objects are needed from here on.
            del train_data, train_examples, eval_examples
            clear_memory()

            print(f"Datasets prepared! Train size: {len(self.train_dataset)}, Eval size: {len(self.eval_dataset)}")

        except Exception as e:
            print(f"Error preparing datasets: {str(e)}")
            raise

    def setup_training_args(self, **overrides):
        """Build the ``TrainingArguments`` used for fine-tuning.

        Args:
            **overrides: Optional keyword arguments that replace the defaults
                below (used by ``sweep()`` to inject swept hyperparameters).
                Calling without arguments returns the original defaults.

        Returns:
            A ``TrainingArguments`` instance.
        """
        params = dict(
            output_dir=os.path.join(self.save_dir, "checkpoints"),
            per_device_train_batch_size=2,
            gradient_accumulation_steps=8,
            warmup_ratio=0.1,
            num_train_epochs=3,
            learning_rate=1e-4,
            fp16=True,  # Updated to always use fp16
            logging_steps=10,
            optim="adamw_torch",  # Updated optimizer
            weight_decay=0.05,
            lr_scheduler_type="cosine",
            seed=3407,
            # NOTE(review): newer transformers releases name this `eval_strategy`;
            # confirm against the installed version before upgrading.
            evaluation_strategy="steps",
            eval_steps=50,
            save_strategy="steps",
            save_steps=50,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            gradient_checkpointing=True,
            max_grad_norm=0.3,
            report_to="none",
            remove_unused_columns=True,
            dataloader_pin_memory=False,
        )
        params.update(overrides)
        return TrainingArguments(**params)

    def _run_training(self, training_args):
        """Fine-tune with ``training_args`` and save model + tokenizer to ``final_model``."""
        if self.model is None or self.tokenizer is None:
            raise RuntimeError("Call setup_model() before training.")
        if self.train_dataset is None or self.eval_dataset is None:
            raise RuntimeError("Call prepare_datasets() before training.")

        trainer = SFTTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            dataset_text_field="text",
            max_seq_length=self.max_seq_length,
            dataset_num_proc=2,
            packing=False,
            args=training_args
        )

        trainer.train()

        final_save_path = os.path.join(self.save_dir, "final_model")
        self.model.save_pretrained(final_save_path)
        self.tokenizer.save_pretrained(final_save_path)
        print(f"Training completed! Model saved to {final_save_path}")

    def train(self):
        """Fine-tune with the default training arguments and save the result.

        Raises:
            Exception: Any training error is printed and re-raised unchanged.
        """
        clear_memory()
        print("Starting training...")

        try:
            self._run_training(self.setup_training_args())

        except Exception as e:
            print(f"Error during training: {str(e)}")
            raise

    def sweep(self):
        """Run one wandb sweep trial using hyperparameters from ``wandb.config``.

        Reads ``per_device_train_batch_size``, ``gradient_accumulation_steps``,
        ``warmup_ratio``, ``epochs``, ``learning_rate`` and ``weight_decay``
        from the sweep configuration and reports metrics to wandb.

        Raises:
            Exception: Any training error is printed and re-raised unchanged.
        """
        # Imported here because this class is defined in a cell that runs
        # before the notebook's `import wandb` cell.
        import wandb

        clear_memory()
        print("Starting training...")

        wandb.init()
        config = wandb.config

        # The swept values must actually reach the trainer, so build the
        # arguments from the config and pass exactly this object on.
        training_args = self.setup_training_args(
            per_device_train_batch_size=config.per_device_train_batch_size,
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            warmup_ratio=config.warmup_ratio,
            num_train_epochs=config.epochs,
            learning_rate=config.learning_rate,
            weight_decay=config.weight_decay,
            report_to="wandb",
        )

        try:
            self._run_training(training_args)

        except Exception as e:
            print(f"Error during training: {str(e)}")
            raise

    def generate_predictions(self, test_data, max_new_tokens=8):
        """Predict a boolean verdict for every example in ``test_data``.

        The prompt is the same text the model was trained on, cut right before
        the verdict, and decoding is greedy so repeated runs give the same
        predictions.

        Args:
            test_data: Sized iterable of mappings with ``question`` and
                ``answer`` (``solution`` optional).
            max_new_tokens: Generation budget per example; the verdict is
                expected at the very start of the continuation, so a small
                budget suffices.

        Returns:
            List of ``bool``, one per example, in input order.

        Raises:
            Exception: Any generation error is printed and re-raised unchanged.
        """
        clear_memory()
        print("Generating predictions...")

        try:
            FastLanguageModel.for_inference(self.model)
            predictions = []
            total = len(test_data)

            for i, example in enumerate(test_data):
                if i % 50 == 0:  # More frequent updates
                    print(f"Processing example {i}/{total}")
                    clear_memory()

                prompt = self._build_prompt(example)

                inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda")
                with torch.inference_mode():
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=max_new_tokens,
                        do_sample=False,  # greedy: classification must be deterministic
                        use_cache=True
                    )

                # Decode only the newly generated tokens, not the echoed prompt.
                prompt_len = inputs['input_ids'].shape[1]
                response = self.tokenizer.batch_decode(
                    [outputs[0][prompt_len:]],
                    skip_special_tokens=True
                )[0]

                predictions.append(self._parse_verdict(response))

            return predictions

        except Exception as e:
            print(f"Error generating predictions: {str(e)}")
            raise

    def create_submission(self):
        """Predict on the test split and write ``submission.csv`` to ``save_dir``.

        The CSV has two columns: ``ID`` (0-based row index) and ``is_correct``.

        Raises:
            Exception: Any error is printed and re-raised unchanged.
        """
        print("Creating submission file...")
        try:
            test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp", split="test")
            predictions = self.generate_predictions(test_dataset)

            submission_df = pd.DataFrame({
                'ID': range(len(predictions)),
                'is_correct': predictions
            })

            submission_path = os.path.join(self.save_dir, 'submission.csv')
            submission_df.to_csv(submission_path, index=False)
            print(f"Submission saved to {submission_path}")

        except Exception as e:
            print(f"Error creating submission: {str(e)}")
            raise

def main():
    """Run the whole pipeline once: seed, load, prepare, train, submit.

    Errors are reported with a "Fatal error" line and re-raised; memory is
    cleared and GPU usage printed whether or not the run succeeded.
    """
    try:
        set_seeds()
        print("Starting training pipeline...")

        # Build the pipeline object and walk through its stages in order.
        pipeline = MathVerificationTrainer()
        pipeline.setup_model()
        pipeline.prepare_datasets(max_samples=1000)  # Adjust based on available RAM
        pipeline.train()
        pipeline.create_submission()

        print("Training pipeline completed successfully!")

    except Exception as exc:
        print(f"Fatal error in main: {str(exc)}")
        raise
    finally:
        clear_memory()
        print_gpu_utilization()


if __name__ == "__main__":
    main()

Mounted at /content/drive
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
PyTorch version: 2.5.1+cu124
Transformers version: 4.46.2
Accelerate version: 1.1.1
Starting training pipeline...
Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.11.5 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Model loaded successfully!
Preparing datasets...


README.md:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

Datasets prepared! Train size: 900, Eval size: 100
Starting training...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 168
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8267,0.814631
100,0.7564,0.784168
150,0.7177,0.784338


Training completed! Model saved to /content/drive/MyDrive/math_verification/final_model
Creating submission file...


train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating predictions...
Processing example 0/10000
Processing example 50/10000


Hyperparameter sweep
------

Set up Weights & Biases (wandb)

In [None]:
# Install or upgrade the wandb client (IPython shell escape; notebook-only).
!pip install wandb --upgrade

import wandb
# Log this session in to Weights & Biases before any sweep is created.
wandb.login()

Define the sweep configuration

In [None]:
# Random search: every trial samples each parameter independently.
sweep_config = {
    'method': 'random'
    }

# Rank trials by validation loss. The Hugging Face wandb callback logs
# `eval_loss` under the key `eval/loss`, so that is the name wandb sees.
metric = {
    'name': 'eval/loss',
    'goal': 'minimize'
    }

sweep_config['metric'] = metric

# Keys must match the attribute names read from `wandb.config` in `sweep()`.
parameters_dict = {
    'learning_rate': {
        # `*_values` variant: min/max are the actual learning-rate bounds.
        # Plain `q_log_uniform` would treat them as exponents (exp(min)..exp(max)).
        'distribution': 'q_log_uniform_values',
        'min': 1e-5,
        'max': 1e-3,
        'q': 1e-5
        },
    'warmup_ratio': {
        'values': [0.05, 0.1, 0.2]
        },
    'weight_decay': {
        'values': [0.01, 0.03, 0.05]
        },
    'per_device_train_batch_size': {
        'values': [4, 8, 16, 32]
        },
    'gradient_accumulation_steps': {
        'values': [2, 4, 8]
        },
    # Fixed (not swept): one epoch per trial.
    'epochs': {
        'value': 1
    }
}

sweep_config['parameters'] = parameters_dict

In [None]:
import pprint

# Show the assembled sweep configuration for a visual check before it is registered.
print(pprint.pformat(sweep_config))

Initialize the sweep
-----

In [None]:
# Register the sweep under the "math_verification" project; the returned id is
# what the agent cell passes to wandb.agent.
sweep_id = wandb.sweep(sweep_config, project="math_verification")

Run the sweep agent
-----

In [None]:
def sweep(self):
    """Run one wandb sweep trial for a trainer object passed as ``self``.

    Module-level counterpart of ``MathVerificationTrainer.sweep``. Nothing in
    the visible notebook attaches it to the class, so it has to be called as
    ``sweep(trainer)``.

    Args:
        self: Object providing ``save_dir``, ``model``, ``tokenizer``,
            ``train_dataset``, ``eval_dataset`` and ``max_seq_length``.

    Raises:
        Exception: Any training error is printed and re-raised unchanged.
    """
    clear_memory()
    print("Starting training...")

    wandb.init()
    config = wandb.config

    # Hyperparameters come from the sweep; everything else is fixed.
    # `report_to` is given exactly once (a repeated keyword is a SyntaxError).
    training_args = TrainingArguments(
        output_dir=os.path.join(self.save_dir, "checkpoints"),
        per_device_train_batch_size=config.per_device_train_batch_size,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        warmup_ratio=config.warmup_ratio,
        num_train_epochs=config.epochs,
        learning_rate=config.learning_rate,
        fp16=True,  # Updated to always use fp16
        logging_steps=10,
        optim="adamw_torch",  # Updated optimizer
        weight_decay=config.weight_decay,
        lr_scheduler_type="cosine",
        seed=3407,
        evaluation_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        gradient_checkpointing=True,
        max_grad_norm=0.3,
        remove_unused_columns=True,
        dataloader_pin_memory=False,
        report_to="wandb"
    )

    try:
        trainer = SFTTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            dataset_text_field="text",
            max_seq_length=self.max_seq_length,
            dataset_num_proc=2,
            packing=False,
            # Use the sweep-configured arguments, not the fixed defaults.
            args=training_args
        )

        trainer.train()

        final_save_path = os.path.join(self.save_dir, "final_model")
        self.model.save_pretrained(final_save_path)
        self.tokenizer.save_pretrained(final_save_path)
        print(f"Training completed! Model saved to {final_save_path}")

    except Exception as e:
        print(f"Error during training: {str(e)}")
        raise

In [None]:
def run_sweep():
    """Execute a single sweep trial from scratch.

    Builds a new ``MathVerificationTrainer``, loads the model, prepares the
    datasets and then hands control to the trainer's ``sweep()`` method.
    """
    sweep_runner = MathVerificationTrainer()
    sweep_runner.setup_model()
    sweep_runner.prepare_datasets(max_samples=1000)  # Adjust based on available RAM
    sweep_runner.sweep()

# Start an agent for the registered sweep; it invokes run_sweep once per trial,
# with count=50 as the trial budget.
wandb.agent(sweep_id, run_sweep, count = 50)