In [None]:
# Uninstall potentially conflicting packages
!pip uninstall -y transformers accelerate unsloth torch torchvision torchaudio

# Install base packages
!pip install unsloth

# Install dependencies
!pip install -q transformers accelerate peft
!pip install -q datasets evaluate bitsandbytes trl
!pip install -q torch torchvision torchaudio

# Install Colab-optimized unsloth
!pip uninstall unsloth -y
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install other tools
!pip install pandas scikit-learn
!pip install -q ipywidgets

Found existing installation: transformers 4.44.2
Uninstalling transformers-4.44.2:
  Successfully uninstalled transformers-4.44.2
Found existing installation: accelerate 0.34.2
Uninstalling accelerate-0.34.2:
  Successfully uninstalled accelerate-0.34.2
[0mFound existing installation: torch 2.5.0+cu121
Uninstalling torch-2.5.0+cu121:
  Successfully uninstalled torch-2.5.0+cu121
Found existing installation: torchvision 0.20.0+cu121
Uninstalling torchvision-0.20.0+cu121:
  Successfully uninstalled torchvision-0.20.0+cu121
Found existing installation: torchaudio 2.5.0+cu121
Uninstalling torchaudio-2.5.0+cu121:
  Successfully uninstalled torchaudio-2.5.0+cu121
Collecting unsloth
  Downloading unsloth-2024.11.5-py3-none-any.whl.metadata (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth-zoo>=2024.11.1 (from unsloth)
  Downloading unsloth_zoo-2024.11.4-py3-none-any.whl.metadata (16 kB)
Coll

Preparations
------

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Environment setup
import os
import warnings
import random
import numpy as np
import torch
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import gc
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
import transformers
import accelerate

# Print versions
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")

# Configure environment
os.environ["CUDA_VISIBLE_DEVICES"]="0"
warnings.filterwarnings('ignore')
torch.set_float32_matmul_precision('high')

# Set random seeds
def set_seeds(seed=3407):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Memory management utilities
def clear_memory():
    gc.collect()
    torch.cuda.empty_cache()

def print_gpu_utilization():
    print("\nGPU Memory Usage:")
    !nvidia-smi | grep -E "Memory|Volatile"

Mounted at /content/drive
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
PyTorch version: 2.5.1+cu124
Transformers version: 4.46.2
Accelerate version: 1.1.1


set up wandb

In [None]:
!pip install wandb --upgrade

import wandb
wandb.login()

Collecting wandb
  Downloading wandb-0.18.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Downloading wandb-0.18.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wandb
  Attempting uninstall: wandb
    Found existing installation: wandb 0.18.5
    Uninstalling wandb-0.18.5:
      Successfully uninstalled wandb-0.18.5
Successfully installed wandb-0.18.6


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

define the sweep

In [None]:
sweep_config = {
    'method': 'random'
    }

metric = {
    'name': 'loss',
    'goal': 'minimize'
    }

sweep_config['metric'] = metric

parameters_dict = {
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 1e-3
        },
    'warmup_ratio': {
        'values': [0.05, 0.1, 0.2]
        },
    'weight_decay': {
        'values': [0.01, 0.03, 0.05]
        },
    'per_device_train_batch_size': {
        'values': [2, 4]
        },
    'gradient_accumulation_steps': {
        'values': [2, 4, 8]
        },
    'epochs': {
        'value': 1
        }
}

sweep_config['parameters'] = parameters_dict

In [None]:
import pprint

pprint.pprint(sweep_config)

{'method': 'random',
 'metric': {'goal': 'minimize', 'name': 'loss'},
 'parameters': {'epochs': {'value': 1},
                'gradient_accumulation_steps': {'values': [2, 4, 8]},
                'learning_rate': {'distribution': 'log_uniform_values',
                                  'max': 0.001,
                                  'min': 1e-05},
                'per_device_train_batch_size': {'values': [2, 4]},
                'warmup_ratio': {'values': [0.05, 0.1, 0.2]},
                'weight_decay': {'values': [0.01, 0.03, 0.05]}}}


Define class
---

In [None]:
class MathVerificationTrainer:
    def __init__(self, max_seq_length=2048, save_dir='/content/drive/MyDrive/math_verification_sweep'):
        self.max_seq_length = max_seq_length
        self.save_dir = save_dir
        self.model = None
        self.tokenizer = None
        self.train_dataset = None
        self.eval_dataset = None
        os.makedirs(self.save_dir, exist_ok=True)

    def setup_model(self):
        clear_memory()
        print("Loading model...")

        try:
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name="unsloth/Meta-Llama-3.1-8B",
                max_seq_length=self.max_seq_length,
                load_in_4bit=True,
            )

            # Updated LoRA configuration
            model = FastLanguageModel.get_peft_model(
                model,
                r=16,
                target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
                lora_alpha=16,
                lora_dropout=0.1,
                bias="none",
                use_gradient_checkpointing=True,
                random_state=3407,
                use_rslora=True,
            )

            self.model = model
            self.tokenizer = tokenizer
            print("Model loaded successfully!")

        except Exception as e:
            print(f"Error loading model: {str(e)}")
            raise

    def prepare_datasets(self, max_samples=1000):
        clear_memory()
        print("Preparing datasets...")

        try:
            dataset = load_dataset(
                "ad6398/nyu-dl-teach-maths-comp",
                split='train',
                streaming=True
            )

            train_data = list(dataset.take(max_samples))

            train_idx, val_idx = train_test_split(
                range(len(train_data)),
                test_size=0.1,
                random_state=3407
            )

            def process_example(example):
                prompt = (
                    "Analyze this mathematics problem and solution:\n\n"
                    f"Question: {example['question']}\n"
                    f"Student's Answer: {example['answer']}\n"
                    "Let's verify this step by step:\n"
                    f"{example.get('solution', 'Analyzing...')}\n"
                    f"Is the answer correct? {str(example['is_correct'])}"
                ) + self.tokenizer.eos_token

                return {"text": prompt}

            train_examples = [process_example(train_data[i]) for i in train_idx]
            eval_examples = [process_example(train_data[i]) for i in val_idx]

            self.train_dataset = Dataset.from_list(train_examples)
            self.eval_dataset = Dataset.from_list(eval_examples)

            del train_data, train_examples, eval_examples
            clear_memory()

            print(f"Datasets prepared! Train size: {len(self.train_dataset)}, Eval size: {len(self.eval_dataset)}")

        except Exception as e:
            print(f"Error preparing datasets: {str(e)}")
            raise

    def setup_training_args(self):
        return TrainingArguments(
            output_dir=os.path.join(self.save_dir, "checkpoints"),
            per_device_train_batch_size=2,
            gradient_accumulation_steps=8,
            warmup_ratio=0.1,
            num_train_epochs=3,
            learning_rate=1e-4,
            fp16=True,  # Updated to always use fp16
            logging_steps=10,
            optim="adamw_torch",  # Updated optimizer
            weight_decay=0.05,
            lr_scheduler_type="cosine",
            seed=3407,
            evaluation_strategy="steps",
            eval_steps=50,
            save_strategy="steps",
            save_steps=50,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            gradient_checkpointing=True,
            max_grad_norm=0.3,
            report_to="none",
            remove_unused_columns=True,
            dataloader_pin_memory=False
        )

    def train(self):
        clear_memory()
        print("Starting training...")

        try:
            trainer = SFTTrainer(
                model=self.model,
                tokenizer=self.tokenizer,
                train_dataset=self.train_dataset,
                eval_dataset=self.eval_dataset,
                dataset_text_field="text",
                max_seq_length=self.max_seq_length,
                dataset_num_proc=2,
                packing=False,
                args=self.setup_training_args()
            )

            trainer.train()

            final_save_path = os.path.join(self.save_dir, "final_model")
            self.model.save_pretrained(final_save_path)
            self.tokenizer.save_pretrained(final_save_path)
            print(f"Training completed! Model saved to {final_save_path}")

        except Exception as e:
            print(f"Error during training: {str(e)}")
            raise

    def sweep(self):
        print("Starting sweeping...")

        with wandb.init():
            config = wandb.config

            training_args = TrainingArguments(
                output_dir=os.path.join(self.save_dir, "checkpoints"),
                per_device_train_batch_size=config.per_device_train_batch_size,
                gradient_accumulation_steps=config.gradient_accumulation_steps,
                warmup_ratio=config.warmup_ratio,
                num_train_epochs=config.epochs,
                learning_rate=config.learning_rate,
                fp16=True,  # Updated to always use fp16
                logging_steps=10,
                optim="adamw_torch",  # Updated optimizer
                weight_decay=config.weight_decay,
                lr_scheduler_type="cosine",
                seed=3407,
                evaluation_strategy="steps",
                eval_steps=50,
                save_strategy="steps",
                save_steps=50,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                gradient_checkpointing=True,
                max_grad_norm=0.3,
                report_to="wandb",
                remove_unused_columns=True,
                dataloader_pin_memory=False,
            )

            try:
                trainer = SFTTrainer(
                    model=self.model,
                    tokenizer=self.tokenizer,
                    train_dataset=self.train_dataset,
                    eval_dataset=self.eval_dataset,
                    dataset_text_field="text",
                    max_seq_length=self.max_seq_length,
                    dataset_num_proc=2,
                    packing=False,
                    args=training_args
                )

                trainer.train()

                final_save_path = os.path.join(self.save_dir, "final_model")
                self.model.save_pretrained(final_save_path)
                self.tokenizer.save_pretrained(final_save_path)
                print(f"Training completed! Model saved to {final_save_path}")

            except Exception as e:
                print(f"Error during training: {str(e)}")
                raise

    def generate_predictions(self, test_data):
        clear_memory()
        print("Generating predictions...")

        try:
            FastLanguageModel.for_inference(self.model)
            predictions = []

            for i, example in enumerate(test_data):
                if i % 50 == 0:  # More frequent updates
                    print(f"Processing example {i}/{len(test_data)}")
                    clear_memory()

                prompt = (
                    "Analyze this mathematics problem and solution:\n\n"
                    f"Question: {example['question']}\n"
                    f"Student's Answer: {example['answer']}\n"
                    "Is this answer correct (True/False)?\n"
                )

                inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda")
                with torch.inference_mode():
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=64,
                        temperature=0.7,
                        top_p=0.9,
                        do_sample=True,
                        use_cache=True
                    )

                response = self.tokenizer.batch_decode(
                    [outputs[0][inputs['input_ids'].shape[1]:]],
                    skip_special_tokens=True
                )[0].strip().lower()

                predictions.append("true" in response)

            return predictions

        except Exception as e:
            print(f"Error generating predictions: {str(e)}")
            raise

    def create_submission(self):
        print("Creating submission file...")
        try:
            test_dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")['test']
            predictions = self.generate_predictions(test_dataset)

            submission_df = pd.DataFrame({
                'ID': range(len(predictions)),
                'is_correct': predictions
            })

            submission_path = os.path.join(self.save_dir, 'submission.csv')
            submission_df.to_csv(submission_path, index=False)
            print(f"Submission saved to {submission_path}")

        except Exception as e:
            print(f"Error creating submission: {str(e)}")
            raise

# Hyper Parameters sweeping

## 1. Initial the sweep

In [None]:
sweep_id = wandb.sweep(sweep_config, project="math_verification")

Create sweep with ID: n9hno2jm
Sweep URL: https://wandb.ai/bw2676-new-york-university/math_verification/sweeps/n9hno2jm


## 2. Run sweep agent

In [8]:
def run_sweep():
    trainer = MathVerificationTrainer()
    trainer.setup_model()
    trainer.prepare_datasets(max_samples=1000)  # Adjust based on available RAM
    trainer.sweep()

wandb.agent(sweep_id, run_sweep, count = 50)

[34m[1mwandb[0m: Agent Starting Run: tg2iycnn with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 9.192921588298024e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.11.5 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Model loaded successfully!
Preparing datasets...


README.md:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


[34m[1mwandb[0m: Currently logged in as: [33mbw2676[0m ([33mbw2676-new-york-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8368,0.824147


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▄▂▁▂
train/learning_rate,▇█▆▃▁
train/loss,█▄▂▁▁

0,1
eval/loss,0.82415
eval/runtime,45.599
eval/samples_per_second,2.193
eval/steps_per_second,0.285
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.6767
train/learning_rate,0.0
train/loss,0.8368


[34m[1mwandb[0m: Agent Starting Run: usu2lv44 with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 7.626315258122584e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8416,0.827964


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▁▁▁▁
train/learning_rate,█▇▅▂▁
train/loss,█▃▂▁▁

0,1
eval/loss,0.82796
eval/runtime,45.3803
eval/samples_per_second,2.204
eval/steps_per_second,0.286
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.73445
train/learning_rate,0.0
train/loss,0.8416


[34m[1mwandb[0m: Agent Starting Run: 7kw2t51i with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 4.172348964048659e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8888,0.868061


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▇▄▁▂
train/learning_rate,█▇▄▂▁
train/loss,█▄▂▁▁

0,1
eval/loss,0.86806
eval/runtime,45.4977
eval/samples_per_second,2.198
eval/steps_per_second,0.286
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.86042
train/learning_rate,0.0
train/loss,0.8888


[34m[1mwandb[0m: Agent Starting Run: adp9ash2 with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 2.5184215334854048e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,1.0035,0.972108


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,▂█▃▁▃
train/learning_rate,█▇▄▂▁
train/loss,█▅▃▂▁

0,1
eval/loss,0.97211
eval/runtime,45.3594
eval/samples_per_second,2.205
eval/steps_per_second,0.287
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,1.18233
train/learning_rate,0.0
train/loss,1.0035


[34m[1mwandb[0m: Agent Starting Run: 1pmoqbrm with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 0.0004101334263733724
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112715011111302, max=1.0…

Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 112
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8184,0.798576
100,0.7593,0.779979


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▂▃▄▄▄▅▆▆▇▇██
train/global_step,▁▂▂▃▄▄▄▅▆▆▇▇██
train/grad_norm,▇▂▃▂█▃▃▁▂▂▁
train/learning_rate,▇██▇▆▅▄▃▂▁▁
train/loss,█▂▂▂▂▂▂▁▁▁▁

0,1
eval/loss,0.77998
eval/runtime,45.3585
eval/samples_per_second,2.205
eval/steps_per_second,0.287
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,112.0
train/grad_norm,0.5959
train/learning_rate,0.0
train/loss,0.7908


[34m[1mwandb[0m: Agent Starting Run: tc83keks with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.0845340246781782e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,1.2733,1.227553


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,▁▃▅▆█
train/learning_rate,▇█▆▃▁
train/loss,█▇▄▂▁

0,1
eval/loss,1.22755
eval/runtime,45.4631
eval/samples_per_second,2.2
eval/steps_per_second,0.286
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,1.61457
train/learning_rate,0.0
train/loss,1.2733


[34m[1mwandb[0m: Agent Starting Run: kdrwvzbs with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.0001593219500970072
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8206,0.81032


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▂▁▁▂
train/learning_rate,█▇▄▂▁
train/loss,█▂▁▁▁

0,1
eval/loss,0.81032
eval/runtime,45.4313
eval/samples_per_second,2.201
eval/steps_per_second,0.286
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.63489
train/learning_rate,0.0
train/loss,0.8206


[34m[1mwandb[0m: Agent Starting Run: pg7mn6p5 with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.00012065934051908218
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 28
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
train/epoch,▁▅█
train/global_step,▁▅█
train/grad_norm,█▁
train/learning_rate,█▁
train/loss,█▁

0,1
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,28.0
train/grad_norm,0.5573
train/learning_rate,4e-05
train/loss,0.9491
train_loss,1.08338
train_runtime,1048.5202
train_samples_per_second,0.858
train_steps_per_second,0.027


[34m[1mwandb[0m: Agent Starting Run: wgvv5cj9 with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 7.693767731804174e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8414,0.827667


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▂▁▁▁
train/learning_rate,█▇▄▂▁
train/loss,█▃▁▁▁

0,1
eval/loss,0.82767
eval/runtime,45.2572
eval/samples_per_second,2.21
eval/steps_per_second,0.287
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.72107
train/learning_rate,0.0
train/loss,0.8414


[34m[1mwandb[0m: Agent Starting Run: t6zlwh5v with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 3.425210036931093e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.9193,0.892802


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,▁█▁▁▁
train/learning_rate,█▇▅▂▁
train/loss,█▅▂▂▁

0,1
eval/loss,0.8928
eval/runtime,45.5953
eval/samples_per_second,2.193
eval/steps_per_second,0.285
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,1.04567
train/learning_rate,0.0
train/loss,0.9193


[34m[1mwandb[0m: Agent Starting Run: 37b2p5qp with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.00011483101359700156
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 28
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
train/epoch,▁▅█
train/global_step,▁▅█
train/grad_norm,█▁
train/learning_rate,█▁
train/loss,█▁

0,1
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,28.0
train/grad_norm,0.59141
train/learning_rate,3e-05
train/loss,0.9535
train_loss,1.08702
train_runtime,1047.9987
train_samples_per_second,0.859
train_steps_per_second,0.027


[34m[1mwandb[0m: Agent Starting Run: urh0hrmx with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 3.795480775536448e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 225
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,1.0184,0.920984
100,0.8597,0.829742
150,0.885,0.815663
200,0.8046,0.812249


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▂▁▁
eval/runtime,██▃▁
eval/samples_per_second,▁▁▆█
eval/steps_per_second,▁▁▁▁
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,▂▁█▃▃▃▂▁▁▁▁▂▁▂▁▁▂▂▂▃▁▂
train/learning_rate,▃▄▆▇███▇▇▇▆▅▅▄▄▃▃▂▂▁▁▁
train/loss,██▅▄▃▂▂▂▂▂▂▁▂▂▂▁▂▁▁▁▁▂

0,1
eval/loss,0.81225
eval/runtime,45.413
eval/samples_per_second,2.202
eval/steps_per_second,0.286
total_flos,1.323828272160768e+16
train/epoch,1.0
train/global_step,225.0
train/grad_norm,1.48795
train/learning_rate,0.0
train/loss,0.8917


[34m[1mwandb[0m: Agent Starting Run: x5vm9upk with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.6693602551757924e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 112
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,1.0384,0.986562
100,0.892,0.889678


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▂▂▃▄▄▄▅▆▆▇▇██
train/global_step,▁▂▂▃▄▄▄▅▆▆▇▇██
train/grad_norm,▁▅▆▄▂▆█▅▆▇▆
train/learning_rate,██▇▆▅▄▃▂▂▁▁
train/loss,█▅▅▃▂▂▂▁▁▁▁

0,1
eval/loss,0.88968
eval/runtime,45.4869
eval/samples_per_second,2.198
eval/steps_per_second,0.286
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,112.0
train/grad_norm,1.85452
train/learning_rate,0.0
train/loss,0.9166


[34m[1mwandb[0m: Agent Starting Run: 7vrlmaau with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 0.00011957968900483828
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 225
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8546,0.829527
100,0.8359,0.800822
150,0.8478,0.781579
200,0.7607,0.778389


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▄▁▁
eval/runtime,█▁▆▄
eval/samples_per_second,▁█▂▅
eval/steps_per_second,▁▁▁▁
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,█▄▄▃▆▃▄▁▂▂▃▂▂▃▁▂▃▄▂█▁▂
train/learning_rate,▄▇███▇▇▇▆▆▅▅▄▄▃▃▂▂▂▁▁▁
train/loss,█▆▂▂▂▂▂▂▂▂▂▁▂▂▂▁▂▁▁▁▁▂

0,1
eval/loss,0.77839
eval/runtime,45.4317
eval/samples_per_second,2.201
eval/steps_per_second,0.286
total_flos,1.323828272160768e+16
train/epoch,1.0
train/global_step,225.0
train/grad_norm,0.98611
train/learning_rate,0.0
train/loss,0.8557


[34m[1mwandb[0m: Agent Starting Run: fjn3dkyq with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 4.390975255491766e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8829,0.862308


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,▇█▃▁▁
train/learning_rate,█▇▅▂▁
train/loss,█▅▂▁▁

0,1
eval/loss,0.86231
eval/runtime,45.3506
eval/samples_per_second,2.205
eval/steps_per_second,0.287
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.8531
train/learning_rate,0.0
train/loss,0.8829


[34m[1mwandb[0m: Agent Starting Run: tz6vqnq1 with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.85654668740785e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 112
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,1.0187,0.96408
100,0.8774,0.877952


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▂▃▄▄▄▅▆▆▇▇██
train/global_step,▁▂▂▃▄▄▄▅▆▆▇▇██
train/grad_norm,▁▆█▃▂▇█▅▅▆▆
train/learning_rate,██▇▆▅▄▃▂▂▁▁
train/loss,█▅▅▃▂▂▂▁▁▁▁

0,1
eval/loss,0.87795
eval/runtime,45.3514
eval/samples_per_second,2.205
eval/steps_per_second,0.287
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,112.0
train/grad_norm,1.69421
train/learning_rate,0.0
train/loss,0.9043


[34m[1mwandb[0m: Agent Starting Run: p2eo5t9h with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 1.2815980026020225e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 225
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,1.1634,1.075698
100,0.9235,0.888049
150,0.9312,0.856111
200,0.8524,0.848654


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▂▁▁
eval/runtime,▁▅▄█
eval/samples_per_second,█▄▅▁
eval/steps_per_second,▁▁▁▁
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,▃▁▂▆▅▃▄█▄▅▄▅▂▄▃▂▅▅▇▆▃▅
train/learning_rate,▇███▇▇▇▆▆▆▅▄▄▃▃▃▂▂▁▁▁▁
train/loss,██▅▄▄▃▃▂▂▂▂▁▂▂▂▁▂▁▁▁▁▂

0,1
eval/loss,0.84865
eval/runtime,45.5079
eval/samples_per_second,2.197
eval/steps_per_second,0.286
total_flos,1.323828272160768e+16
train/epoch,1.0
train/global_step,225.0
train/grad_norm,2.65303
train/learning_rate,0.0
train/loss,0.9335


[34m[1mwandb[0m: Agent Starting Run: 1jltfeex with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.0002881043108347149
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8034,0.794815


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▃▂▁▁
train/learning_rate,▇█▆▃▁
train/loss,█▂▂▁▁

0,1
eval/loss,0.79481
eval/runtime,45.3076
eval/samples_per_second,2.207
eval/steps_per_second,0.287
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.49082
train/learning_rate,2e-05
train/loss,0.8034


[34m[1mwandb[0m: Agent Starting Run: svzwuigc with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 0.0006454115659541343
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.7921,0.78558


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▁▁▁▁
train/learning_rate,█▇▄▂▁
train/loss,█▂▂▁▁

0,1
eval/loss,0.78558
eval/runtime,45.4339
eval/samples_per_second,2.201
eval/steps_per_second,0.286
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.48386
train/learning_rate,2e-05
train/loss,0.7921


[34m[1mwandb[0m: Agent Starting Run: 0bhyw47e with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.0003051778808583243
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.7996,0.791436


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▂▁▁▁
train/learning_rate,▇█▆▃▁
train/loss,█▂▂▁▁

0,1
eval/loss,0.79144
eval/runtime,45.4643
eval/samples_per_second,2.2
eval/steps_per_second,0.286
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.51598
train/learning_rate,1e-05
train/loss,0.7996


[34m[1mwandb[0m: Agent Starting Run: ep53m84s with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 0.000447768831736641
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 112
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8161,0.800138
100,0.7579,0.779939


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▂▃▄▄▄▅▆▆▇▇██
train/global_step,▁▂▂▃▄▄▄▅▆▆▇▇██
train/grad_norm,█▂▂▂▂▁▂▁▃▂▁
train/learning_rate,██▇▆▆▄▃▃▂▁▁
train/loss,█▂▂▂▂▂▂▁▂▁▁

0,1
eval/loss,0.77994
eval/runtime,45.472
eval/samples_per_second,2.199
eval/steps_per_second,0.286
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,112.0
train/grad_norm,0.62889
train/learning_rate,0.0
train/loss,0.7903


[34m[1mwandb[0m: Agent Starting Run: 2d3jfihf with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 3.5605404698529166e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.9136,0.888989


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,▃█▁▁▁
train/learning_rate,█▇▄▂▁
train/loss,█▅▂▂▁

0,1
eval/loss,0.88899
eval/runtime,45.2157
eval/samples_per_second,2.212
eval/steps_per_second,0.288
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,1.00742
train/learning_rate,0.0
train/loss,0.9136


[34m[1mwandb[0m: Agent Starting Run: 5k1jhzlk with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.0007025255860692792
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 28
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
train/epoch,▁▅█
train/global_step,▁▅█
train/grad_norm,█▁
train/learning_rate,█▁
train/loss,█▁

0,1
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,28.0
train/grad_norm,0.36449
train/learning_rate,0.00015
train/loss,0.8499
train_loss,0.9387
train_runtime,1038.0757
train_samples_per_second,0.867
train_steps_per_second,0.027


[34m[1mwandb[0m: Agent Starting Run: hrpk7fdh with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 7.472602379003287e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 28
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
train/epoch,▁▅█
train/global_step,▁▅█
train/grad_norm,█▁
train/learning_rate,█▁
train/loss,█▁

0,1
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,28.0
train/grad_norm,0.69318
train/learning_rate,2e-05
train/loss,1.0529
train_loss,1.16141
train_runtime,1035.5131
train_samples_per_second,0.869
train_steps_per_second,0.027


[34m[1mwandb[0m: Agent Starting Run: qo90ugx8 with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 0.00021370399816615437
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113424466667412, max=1.0…

Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 112
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8227,0.801119
100,0.7597,0.779798


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▂▂▃▄▄▄▅▆▆▇▇██
train/global_step,▁▂▂▃▄▄▄▅▆▆▇▇██
train/grad_norm,█▂▃▂▂▁ ▁▂▄▁
train/learning_rate,██▇▆▅▄▃▃▂▁▁
train/loss,█▂▂▂▂▂▂▁▂▁▁

0,1
eval/loss,0.7798
eval/runtime,45.2973
eval/samples_per_second,2.208
eval/steps_per_second,0.287
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,112.0
train/grad_norm,0.6923
train/learning_rate,0.0
train/loss,0.796


[34m[1mwandb[0m: Agent Starting Run: 9shrzg1s with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 7.158639549606323e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 28
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
train/epoch,▁▅█
train/global_step,▁▅█
train/grad_norm,█▁
train/learning_rate,█▁
train/loss,█▁

0,1
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,28.0
train/grad_norm,0.69952
train/learning_rate,2e-05
train/loss,1.0273
train_loss,1.13139
train_runtime,1048.2731
train_samples_per_second,0.859
train_steps_per_second,0.027


[34m[1mwandb[0m: Agent Starting Run: ftp8nq6p with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 0.0008218874426410474
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 225
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8617,0.86394
100,0.9422,0.905065
150,0.9203,0.850623
200,0.798,0.816428


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▅█▄▁
eval/runtime,▁▂▇█
eval/samples_per_second,█▇▃▁
eval/steps_per_second,██▄▁
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,▇▇▁▂▄▄▅▄▆█▅▄▄▄▂▃▂▁▂▄▁▂
train/learning_rate,▃▄▆▇███▇▇▇▆▅▅▄▄▃▃▂▂▁▁▁
train/loss,█▄▂▁▂▂▂▂▂▃▃▂▂▂▂▁▂▂▁▁▁▂

0,1
eval/loss,0.81643
eval/runtime,45.2269
eval/samples_per_second,2.211
eval/steps_per_second,0.287
total_flos,1.323828272160768e+16
train/epoch,1.0
train/global_step,225.0
train/grad_norm,1.17823
train/learning_rate,0.0
train/loss,0.8861


[34m[1mwandb[0m: Agent Starting Run: l2ezcktf with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 0.00040644263211728497
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 225
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8399,0.81978
100,0.8403,0.805128
150,0.853,0.789063
200,0.7608,0.781823


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▅▂▁
eval/runtime,▁▃▃█
eval/samples_per_second,█▆▆▁
eval/steps_per_second,█▆▄▁
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,█▄▂▂▄▄▅▁▅▃▃▂▁▂▁▂▂▂▂▅▁▁
train/learning_rate,▇███▇▇▇▆▆▅▅▄▄▃▃▂▂▂▁▁▁▁
train/loss,█▄▂▂▂▂▂▂▂▂▂▁▂▂▂▁▂▂▁▁▁▂

0,1
eval/loss,0.78182
eval/runtime,45.485
eval/samples_per_second,2.199
eval/steps_per_second,0.286
total_flos,1.323828272160768e+16
train/epoch,1.0
train/global_step,225.0
train/grad_norm,0.86973
train/learning_rate,0.0
train/loss,0.853


[34m[1mwandb[0m: Agent Starting Run: g3n97dps with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 3.2000869269149885e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.9382,0.90824


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,▄█▂▁▅
train/learning_rate,█▇▅▂▁
train/loss,█▅▃▂▁

0,1
eval/loss,0.90824
eval/runtime,45.337
eval/samples_per_second,2.206
eval/steps_per_second,0.287
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,1.2366
train/learning_rate,0.0
train/loss,0.9382


[34m[1mwandb[0m: Agent Starting Run: rcyg43xk with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 3.653399657315536e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 225
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.9212,0.87458
100,0.8565,0.825663
150,0.8842,0.816061
200,0.8039,0.813215


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▂▁▁
eval/runtime,▁▂▇█
eval/samples_per_second,██▂▁
eval/steps_per_second,██▁▁
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,▄▂▃▄█▃▃▁▁▁▁▂▁▃▁▁▂▃▃▆▁▃
train/learning_rate,▄▇███▇▇▇▆▆▅▅▄▄▃▃▂▂▁▁▁▁
train/loss,█▇▄▃▂▂▂▂▂▂▂▁▂▂▂▁▂▁▁▁▁▂

0,1
eval/loss,0.81322
eval/runtime,45.4692
eval/samples_per_second,2.199
eval/steps_per_second,0.286
total_flos,1.323828272160768e+16
train/epoch,1.0
train/global_step,225.0
train/grad_norm,1.61448
train/learning_rate,0.0
train/loss,0.8937


[34m[1mwandb[0m: Agent Starting Run: ihl9okba with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.0006025903090293916
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.79,0.784909


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▅▂▁▁
train/learning_rate,█▇▅▂▁
train/loss,█▂▂▁▁

0,1
eval/loss,0.78491
eval/runtime,45.5678
eval/samples_per_second,2.195
eval/steps_per_second,0.285
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.4635
train/learning_rate,2e-05
train/loss,0.79


[34m[1mwandb[0m: Agent Starting Run: kp3dq5fp with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 7.556288074458126e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 112
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.851,0.830221
100,0.7951,0.810582


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁▁
train/epoch,▁▂▂▃▄▄▄▅▆▆▇▇██
train/global_step,▁▂▂▃▄▄▄▅▆▆▇▇██
train/grad_norm,▂█▃▁▁▁▅▁▂▄▁
train/learning_rate,▄▇█▇▇▅▄▃▂▁▁
train/loss,█▅▃▂▁▁▁▁▁▁▁

0,1
eval/loss,0.81058
eval/runtime,45.4135
eval/samples_per_second,2.202
eval/steps_per_second,0.286
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,112.0
train/grad_norm,0.89912
train/learning_rate,0.0
train/loss,0.8258


[34m[1mwandb[0m: Agent Starting Run: unnsh4y3 with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 3.935572083321597e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 225
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,1.018,0.919953
100,0.8582,0.828344
150,0.8849,0.815039
200,0.8018,0.811821


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▂▁▁
eval/runtime,▁▂█▁
eval/samples_per_second,█▇▁█
eval/steps_per_second,▁▁▁▁
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,▅▁█▇█▄▄▂▂▂▂▃▁▄▂▁▃▄▃▇▁▃
train/learning_rate,▃▄▆▇███▇▇▇▆▅▅▄▄▃▃▂▂▁▁▁
train/loss,██▅▄▃▂▂▂▂▂▂▁▂▂▂▁▂▁▁▁▁▂

0,1
eval/loss,0.81182
eval/runtime,45.4008
eval/samples_per_second,2.203
eval/steps_per_second,0.286
total_flos,1.323828272160768e+16
train/epoch,1.0
train/global_step,225.0
train/grad_norm,1.6249
train/learning_rate,0.0
train/loss,0.8911


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: gqh5mjzo with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 1.4602420047129336e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 28
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
train/epoch,▁▅█
train/global_step,▁▅█
train/grad_norm,▁█
train/learning_rate,█▁
train/loss,█▁

0,1
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,28.0
train/grad_norm,1.14043
train/learning_rate,0.0
train/loss,1.4172
train_loss,1.44857
train_runtime,1047.8095
train_samples_per_second,0.859
train_steps_per_second,0.027


[34m[1mwandb[0m: Agent Starting Run: s4bfyfxj with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 1.3495747103100194e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 225
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,1.2019,1.095695
100,0.9207,0.884512
150,0.9238,0.850046
200,0.8451,0.843332


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▂▁▁
eval/runtime,▂█▁▂
eval/samples_per_second,▇▁█▇
eval/steps_per_second,█▁██
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,▃▁▂▅▇▃▄█▄▄▃▅▂▄▃▃▅▅▆▆▃▅
train/learning_rate,▄▇███▇▇▇▆▆▅▅▄▄▃▃▂▂▂▁▁▁
train/loss,██▅▅▄▄▃▂▂▂▂▁▂▂▂▁▂▁▁▁▁▂

0,1
eval/loss,0.84333
eval/runtime,45.4059
eval/samples_per_second,2.202
eval/steps_per_second,0.286
total_flos,1.323828272160768e+16
train/epoch,1.0
train/global_step,225.0
train/grad_norm,2.44097
train/learning_rate,0.0
train/loss,0.9273


[34m[1mwandb[0m: Agent Starting Run: opx7d41v with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 0.0007571815165370608
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 112
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8552,0.838927
100,0.771,0.792796


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,▁▁
train/epoch,▁▂▂▃▄▄▄▅▆▆▇▇██
train/global_step,▁▂▂▃▄▄▄▅▆▆▇▇██
train/grad_norm,█▂▅▂▇█▂▃▂▅▁
train/learning_rate,▄▇█▇▇▅▄▃▂▁▁
train/loss,█▂▂▂▂▂▂▁▂▁▁

0,1
eval/loss,0.7928
eval/runtime,45.5335
eval/samples_per_second,2.196
eval/steps_per_second,0.286
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,112.0
train/grad_norm,0.67072
train/learning_rate,0.0
train/loss,0.8091


[34m[1mwandb[0m: Agent Starting Run: gkijafjg with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 0.00039058283216112706
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 225
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8492,0.835757
100,0.8485,0.811499
150,0.8548,0.792736
200,0.7628,0.782636


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▅▂▁
eval/runtime,▄█▂▁
eval/samples_per_second,▆▁▇█
eval/steps_per_second,▅▁██
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,▅▇▂▂▃▂▃▂▂▂▂▂▂▂▁▂▁▁▁█▁▁
train/learning_rate,▃▄▆▇███▇▇▇▆▅▅▄▄▃▃▂▂▁▁▁
train/loss,█▅▂▂▂▂▂▂▂▂▂▁▂▂▂▁▂▂▁▁▁▂

0,1
eval/loss,0.78264
eval/runtime,45.3232
eval/samples_per_second,2.206
eval/steps_per_second,0.287
total_flos,1.323828272160768e+16
train/epoch,1.0
train/global_step,225.0
train/grad_norm,0.92429
train/learning_rate,0.0
train/loss,0.8578


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 4j03ve2f with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.00022150679642872972
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8116,0.803201


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▂▁▁▁
train/learning_rate,█▇▅▂▁
train/loss,█▂▁▁▁

0,1
eval/loss,0.8032
eval/runtime,45.3247
eval/samples_per_second,2.206
eval/steps_per_second,0.287
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.52961
train/learning_rate,1e-05
train/loss,0.8116


[34m[1mwandb[0m: Agent Starting Run: s5cun4kz with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.00030076474660550314
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.05


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8004,0.791839


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▂▁▁▁
train/learning_rate,▇█▆▃▁
train/loss,█▂▂▁▁

0,1
eval/loss,0.79184
eval/runtime,45.5066
eval/samples_per_second,2.197
eval/steps_per_second,0.286
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.4801
train/learning_rate,2e-05
train/loss,0.8004


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 279j63mm with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 7.9063489984884e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 225
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.855,0.832457
100,0.842,0.812291
150,0.8656,0.799079
200,0.7787,0.79403


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▄▂▁
eval/runtime,▆▇█▁
eval/samples_per_second,▃▁▁█
eval/steps_per_second,▁▁▁█
train/epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
train/grad_norm,▆▄▄▂▆▂▃▁▁▁▁▂▁▂▁▁▂▃▂█▁▂
train/learning_rate,▇███▇▇▇▆▆▅▅▄▄▃▃▂▂▂▁▁▁▁
train/loss,█▅▂▂▂▂▂▂▂▂▂▁▂▂▂▁▂▂▁▁▁▂

0,1
eval/loss,0.79403
eval/runtime,45.3324
eval/samples_per_second,2.206
eval/steps_per_second,0.287
total_flos,1.323828272160768e+16
train/epoch,1.0
train/global_step,225.0
train/grad_norm,1.12827
train/learning_rate,0.0
train/loss,0.8702


[34m[1mwandb[0m: Agent Starting Run: 87chnd8v with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 0.00026004915966650326
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 112
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8272,0.802579
100,0.7605,0.780248


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▂▂▃▄▄▄▅▆▆▇▇██
train/global_step,▁▂▂▃▄▄▄▅▆▆▇▇██
train/grad_norm,█▅▃▁▂▁ ▁▂▁▁
train/learning_rate,▄▇█▇▇▅▄▃▂▁▁
train/loss,█▃▂▂▂▂▂▁▁▁▁

0,1
eval/loss,0.78025
eval/runtime,45.4325
eval/samples_per_second,2.201
eval/steps_per_second,0.286
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,112.0
train/grad_norm,0.68232
train/learning_rate,0.0
train/loss,0.7951


[34m[1mwandb[0m: Agent Starting Run: 7kb103zv with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 8
[34m[1mwandb[0m: 	learning_rate: 0.00013319524239158856
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
50,0.8241,0.813823


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,█▃▁▁▂
train/learning_rate,█▇▅▂▁
train/loss,█▂▁▁▁

0,1
eval/loss,0.81382
eval/runtime,45.619
eval/samples_per_second,2.192
eval/steps_per_second,0.285
total_flos,1.31675481452544e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,0.65577
train/learning_rate,0.0
train/loss,0.8241


[34m[1mwandb[0m: Agent Starting Run: ryydalfn with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 0.0003589965507358379
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 112
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8133,0.795885
100,0.7581,0.779339


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,█▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▂▂▃▄▄▄▅▆▆▇▇██
train/global_step,▁▂▂▃▄▄▄▅▆▆▇▇██
train/grad_norm,▄▂▂▁▁▂█▁▂▂▁
train/learning_rate,▇██▇▆▅▄▃▂▁▁
train/loss,█▂▂▂▂▂▂▁▁▁▁

0,1
eval/loss,0.77934
eval/runtime,45.4231
eval/samples_per_second,2.202
eval/steps_per_second,0.286
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,112.0
train/grad_norm,0.62913
train/learning_rate,0.0
train/loss,0.7941


[34m[1mwandb[0m: Agent Starting Run: slimmv68 with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.9034719380448617e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 4
[34m[1mwandb[0m: 	warmup_ratio: 0.2
[34m[1mwandb[0m: 	weight_decay: 0.03


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 56
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,1.0808,1.042244


Training completed! Model saved to /content/drive/MyDrive/math_verification_sweep/final_model


0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▃▄▆▇▇█
train/global_step,▁▃▄▆▇▇█
train/grad_norm,▁▃█▄▃
train/learning_rate,▇█▆▃▁
train/loss,█▆▄▂▁

0,1
eval/loss,1.04224
eval/runtime,45.5477
eval/samples_per_second,2.195
eval/steps_per_second,0.285
total_flos,1.599323206975488e+16
train/epoch,0.99556
train/global_step,56.0
train/grad_norm,1.25346
train/learning_rate,0.0
train/loss,1.0808


[34m[1mwandb[0m: Agent Starting Run: cn6xf35t with config:
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 0.0004158100217634244
[34m[1mwandb[0m: 	per_device_train_batch_size: 2
[34m[1mwandb[0m: 	warmup_ratio: 0.05
[34m[1mwandb[0m: 	weight_decay: 0.01


Loading model...
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded successfully!
Preparing datasets...
Datasets prepared! Train size: 900, Eval size: 100
Starting sweeping...


Map (num_proc=2):   0%|          | 0/900 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 900 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 112
 "-____-"     Number of trainable parameters = 13,631,488


Step,Training Loss,Validation Loss
50,0.8154,0.796753


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


# Get final result

Before running the main function, hyper parameter in trainer.train() should be changed.

In [None]:
def main():
    try:
        set_seeds()
        print("Starting training pipeline...")

        # Initialize and run trainer
        trainer = MathVerificationTrainer()
        trainer.setup_model()
        trainer.prepare_datasets(max_samples=1000)  # Adjust based on available RAM
        trainer.train()
        trainer.create_submission()

        print("Training pipeline completed successfully!")

    except Exception as e:
        print(f"Fatal error in main: {str(e)}")
        raise
    finally:
        clear_memory()
        print_gpu_utilization()

if __name__ == "__main__":
    main()