# Model

In [3]:
import re
import json
from datasets import load_dataset
from pathlib import Path

## Combine data

In [4]:
with open("gsm8k_train_flawed_plus1_final_answer.jsonl", "r") as f:
    lines = f.readlines()
    print(f"Total lines: {len(lines)}")
    print("First line:", lines[0] if lines else "No data")


Total lines: 7473
First line: {"id": 0, "question": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?", "flawed_answer": "Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 73", "label": {"verdict": "Flawed", "error_details": {"error_type": "computational_error", "erroneous_line_number": "L3", "explanation": "The final answer is too high by 1. It should be 72, not 73.", "error_in_text": "#### 72", "correction_in_text": "#### 72"}}}



In [5]:
original_dataset = load_dataset("openai/gsm8k", "main")

def load_jsonl(path):
    with open(path, 'r') as f:
        return [json.loads(line) for line in f]

flawed_train_final_answer = load_jsonl("gsm8k_train_flawed_plus1_final_answer.jsonl")
flawed_test = load_jsonl("gsm8k_test_flawed_plus1_final_answer.jsonl")

combined_train_final_answer = []

# Add correct examples from original dataset
for ex in original_dataset["train"]:
    combined_train_final_answer.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# Add flawed examples
for ex in flawed_train_final_answer:
    combined_train_final_answer.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined training set size: {len(combined_train_final_answer)}")

combined_test_final_answer = []

# Add correct examples from original test split
for ex in original_dataset["test"]:
    combined_test_final_answer.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# Add flawed examples from test JSONL
for ex in flawed_test:
    combined_test_final_answer.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined test set size: {len(combined_test_final_answer)}")


Combined training set size: 14946
Combined test set size: 2638


In [6]:
original_dataset = load_dataset("openai/gsm8k", "main")

def load_jsonl(path):
    with open(path, 'r') as f:
        return [json.loads(line) for line in f]

flawed_train_2nd_last = load_jsonl("gsm8k_train_flawed_plus1_2nd_last.jsonl")
flawed_test = load_jsonl("gsm8k_test_flawed_plus1_2nd_last.jsonl")

combined_train_2nd_last = []

# Add correct examples from original dataset
for ex in original_dataset["train"]:
    combined_train_2nd_last.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# Add flawed examples
for ex in flawed_train_2nd_last:
    combined_train_2nd_last.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined training set size: {len(combined_train_2nd_last)}")

combined_test_2nd_last = []

# Add correct examples from original test split
for ex in original_dataset["test"]:
    combined_test_2nd_last.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# Add flawed examples from test JSONL
for ex in flawed_test:
    combined_test_2nd_last.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined test set size: {len(combined_test_2nd_last)}")


Combined training set size: 14946
Combined test set size: 2638


In [7]:
# Convert to HUgging Face Dataset
from datasets import Dataset
train_dataset = Dataset.from_list(combined_train_final_answer)
test_dataset = Dataset.from_list(combined_test_final_answer)

In [8]:
import torch
print("Torch version:", torch.__version__)
print("MPS available:", torch.backends.mps.is_available())

Torch version: 2.5.1
MPS available: True


## AutoTokenizer

Converts text into numbers that neural networks can understand.

### Special Tokens
- `[CLS]` (101): Start of sequence
- `[SEP]` (102): Separator between segments  
- `[PAD]` (0): Padding token
- `[UNK]`: Unknown/out-of-vocabulary words

In [18]:
# Preprocessing with tokenizer
from transformers import AutoTokenizer
# Use the same model for both tokenization and training
model_name = "distilbert-base-uncased"  # ← Consistent choice

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Text to token
text = "Hello world"
tokens = tokenizer.tokenize(text)  # ['hello', 'world']
print("Tokens:", tokens)

# Convert tokens to IDs
ids = tokenizer.convert_tokens_to_ids(tokens)  # [7592, 2088]
print("IDs:", ids)

# All-in-one tokenization
result = tokenizer(text, truncation=True, padding="max_length", max_length=512)

# input_ids: numerical representation of the text; special tokens like [CLS] and [SEP] are included
# attention_mask: indicates which tokens are real (1) vs padding (0)
print("Tokenization result:", result['input_ids'][:10], result['attention_mask'][:10])  

Tokens: ['hello', 'world']
IDs: [7592, 2088]
Tokenization result: [101, 7592, 2088, 102, 0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]


| Parameter | Purpose |
|-----------|---------|
| `truncation=True` | Cut text if > 512 tokens |
| `padding="max_length"` | Add padding to reach exactly 512 tokens |
| `max_length=512` | Set maximum sequence length |

In [19]:
def tokenize_fn(example):
    # Combine the question and solution into a single input string
    full_input = f"Question:\n{example['question']}\n\nSolution:\n{example['solution']}"

    # Tokenize the combined input with truncation, padding, and a max length of 512 tokens
    return tokenizer(full_input, truncation=True, padding="max_length", max_length=512)

# Apply the tokenization function to the training dataset
train_dataset = train_dataset.map(tokenize_fn)

# Apply the tokenization function to the test dataset
test_dataset = test_dataset.map(tokenize_fn)

Map:   0%|          | 0/14946 [00:00<?, ? examples/s]

Map:   0%|          | 0/2638 [00:00<?, ? examples/s]

In [20]:
print(train_dataset.column_names)

['question', 'solution', 'label', 'input_ids', 'attention_mask']


In [21]:
# The `dir()` function is used to list all the attributes and methods that can be applied to the `train_dataset` object.
dir(train_dataset)

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contigu

#

In [22]:
# Label encoding
label_map = {"Correct": 0, "Flawed": 1}
train_dataset = train_dataset.map(lambda e: {"labels": label_map[e["label"]]})
test_dataset = test_dataset.map(lambda e: {"labels": label_map[e["label"]]})

Map:   0%|          | 0/14946 [00:00<?, ? examples/s]

Map:   0%|          | 0/2638 [00:00<?, ? examples/s]

## Model Setup & Training

In [31]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

## Fine-tuning

#### Epoch

An epoch refers to one complete pass through the entire training dataset. During an epoch, the model processes all the training samples once, updating its weights based on the computed loss. Training for multiple epochs allows the model to learn and refine its parameters iteratively.

Increasing the number of epochs allows the model to learn more but risks overfitting if too high. A balance between batch size and epochs is crucial for optimal performance.

#### Batch size

Batch size determines the number of samples processed before the model updates its weights. For example, a batch size of 8 means 8 samples (e.g., 8 question-answer pairs) are processed together in one forward and backward pass during training.

A smaller batch size uses less memory but may take longer to converge, while a larger batch size can speed up training but requires more memory. 

#### Optimizer - AdamW

AdamW is an optimizer that implements the Adam algorithm with weight decay regularization. It helps prevent overfitting by penalizing large weights, which is particularly useful in deep learning models. AdamW is widely used because it combines the benefits of Adam (adaptive learning rates) with weight decay for better generalization.

Other optimizer choices include:
- **SGD (Stochastic Gradient Descent)**: A simple optimizer with momentum and learning rate decay options.
- **RMSprop**: Designed for non-stationary objectives, often used in RNNs.
- **Adagrad**: Adapts learning rates based on parameter updates, suitable for sparse data.
- **Adadelta**: An extension of Adagrad that reduces aggressive learning rate decay.
- **Adam**: Similar to AdamW but without weight decay.
- **Nadam**: Adam with Nesterov momentum.

#### tqdm

`tqdm` is a Python library used to display progress bars for loops. It provides a visual representation of the progress of an iterable, such as a training loop or data processing, making it easier to monitor the execution time and completion percentage. It is especially useful in long-running tasks.

For example:
```python
from tqdm import tqdm
for i in tqdm(range(100)):
    # Simulate some work
    pass
```

This will display a progress bar in the console, showing the percentage completed, elapsed time, and estimated time remaining.

In [49]:
train_dataset = Dataset.from_list(combined_train_final_answer)
test_dataset = Dataset.from_list(combined_test_final_answer)

# train_dataset = Dataset.from_list(combined_train_2nd_last)
# test_dataset = Dataset.from_list(combined_test_2nd_last)

In [None]:
# STEP 1: Choose ONE model and stick with it
model_name = "prajjwal1/bert-tiny"  # Faster option
# "distilbert-base-uncased"

# STEP 2: Initialize tokenizer ONCE
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

print(f"Using model: {model_name}")

# STEP 3: Tokenize your datasets with the SAME tokenizer
def tokenize_fn(example):
    full_input = f"Question:\n{example['question']}\n\nSolution:\n{example['solution']}"
    return tokenizer(full_input, truncation=True, padding="max_length", max_length=256)

# Apply to both datasets
train_dataset = train_dataset.map(tokenize_fn)
test_dataset = test_dataset.map(tokenize_fn)

# STEP 4: Encode labels
label_map = {"Correct": 0, "Flawed": 1}
train_dataset = train_dataset.map(lambda e: {"labels": label_map[e["label"]]})
test_dataset = test_dataset.map(lambda e: {"labels": label_map[e["label"]]})

# STEP 5: Set format
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# STEP 6: Continue with training using the SAME model and tokenizer
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using model: prajjwal1/bert-tiny


Map:   0%|          | 0/14946 [00:00<?, ? examples/s]

Map:   0%|          | 0/2638 [00:00<?, ? examples/s]

Map:   0%|          | 0/14946 [00:00<?, ? examples/s]

Map:   0%|          | 0/2638 [00:00<?, ? examples/s]

Using device: mps


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [55]:
# OPTIMIZATION 2: Create smaller datasets for faster experimentation
def create_subset(dataset, size):
    indices = list(range(min(size, len(dataset))))
    return dataset.select(indices)

# 🔥 ENABLE SMALLER SUBSETS FOR FASTER LR TESTING
train_subset = create_subset(train_dataset, 2000)  # Only 2000 samples
test_subset = create_subset(test_dataset, 500)     # Only 500 samples

print(f"LR Testing on {len(train_subset)} training samples")
print(f"LR Testing on {len(test_subset)} test samples")

# Re-tokenize with the new tokenizer
def tokenize_fn(example):
    full_input = f"Question:\n{example['question']}\n\nSolution:\n{example['solution']}"
    return tokenizer(full_input, truncation=True, padding="max_length", max_length=256)  # Reduced from 512

train_subset = train_subset.map(tokenize_fn)
test_subset = test_subset.map(tokenize_fn)

# Re-encode labels
label_map = {"Correct": 0, "Flawed": 1}
train_subset = train_subset.map(lambda e: {"labels": label_map[e["label"]]})
test_subset = test_subset.map(lambda e: {"labels": label_map[e["label"]]})

# Set format
train_subset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_subset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# OPTIMIZATION 3: Batch size for faster training
train_dataloader = DataLoader(train_subset, batch_size=16, shuffle=True)  # Increased for speed
eval_dataloader = DataLoader(test_subset, batch_size=16, shuffle=False)   # Increased for speed

# 🎯 LEARNING RATE EXPERIMENT FUNCTION
def test_learning_rates(learning_rates, model_template, train_dataloader, eval_dataloader, device):
    """Test multiple learning rates and return results"""
    results = {}
    
    for lr in learning_rates:
        print(f"\n🧪 Testing Learning Rate: {lr:.2e}")
        
        # Create fresh model copy
        test_model = type(model_template)(model_template.config)
        test_model.load_state_dict(model_template.state_dict())
        test_model.to(device)
        
        # Create optimizer with current LR
        optimizer = AdamW(test_model.parameters(), lr=lr)
        
        # Train for 1 epoch only (quick test)
        test_model.train()
        total_loss = 0
        batch_count = 0
        
        for batch in tqdm(train_dataloader, desc=f"LR {lr:.2e}", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            outputs = test_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            batch_count += 1
            
            # Early stopping for quick test - only process first 50 batches
            if batch_count >= 50:
                break
        
        avg_loss = total_loss / batch_count
        
        # Quick evaluation
        test_model.eval()
        eval_loss = 0
        eval_batches = 0
        
        with torch.no_grad():
            for batch in eval_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = test_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                eval_loss += outputs.loss.item()
                eval_batches += 1
                
                # Early stopping for quick eval
                if eval_batches >= 20:
                    break
        
        avg_eval_loss = eval_loss / eval_batches
        
        results[lr] = {
            'train_loss': avg_loss,
            'eval_loss': avg_eval_loss
        }
        
        print(f"   Train Loss: {avg_loss:.4f}, Eval Loss: {avg_eval_loss:.4f}")
        
        # Clean up
        del test_model
        del optimizer
        torch.mps.empty_cache() if device.type == 'mps' else None
    
    return results

# 🎯 RUN LEARNING RATE EXPERIMENTS
learning_rates_to_test = [5e-5, 1e-4, 2e-4, 3e-4, 5e-4, 1e-3]

print("🚀 Starting Learning Rate Experiments...")
lr_results = test_learning_rates(learning_rates_to_test, model, train_dataloader, eval_dataloader, device)

# 📊 ANALYZE RESULTS
print("\n📊 Learning Rate Results Summary:")
print("=" * 50)
for lr, metrics in lr_results.items():
    print(f"LR {lr:.2e}: Train={metrics['train_loss']:.4f}, Eval={metrics['eval_loss']:.4f}")

# Find best LR based on lowest evaluation loss
best_lr = min(lr_results.keys(), key=lambda x: lr_results[x]['eval_loss'])
print(f"\n🏆 Best Learning Rate: {best_lr:.2e}")
print(f"   Best Eval Loss: {lr_results[best_lr]['eval_loss']:.4f}")

# ========================================
# 🎯 NOW TRAIN ON FULL DATASET WITH BEST LR
# ========================================
print(f"\n🎯 Now preparing FULL dataset training with best LR: {best_lr:.2e}")

# Make sure full datasets are properly tokenized
print(f"Full training set: {len(train_dataset)} samples")
print(f"Full test set: {len(test_dataset)} samples")

# Ensure full datasets are tokenized (they should be from earlier, but let's be safe)
if 'input_ids' not in train_dataset.column_names:
    print("Tokenizing full training dataset...")
    train_dataset = train_dataset.map(tokenize_fn)
    train_dataset = train_dataset.map(lambda e: {"labels": label_map[e["label"]]})
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

if 'input_ids' not in test_dataset.column_names:
    print("Tokenizing full test dataset...")
    test_dataset = test_dataset.map(tokenize_fn)
    test_dataset = test_dataset.map(lambda e: {"labels": label_map[e["label"]]})
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Create full dataset loaders with smaller batch size for memory efficiency
full_train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Smaller batch for full dataset
full_eval_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Reset model to original state for full training
print("Resetting model for full dataset training...")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Use best LR found from experiments
optimizer = AdamW(model.parameters(), lr=best_lr)

# OPTIMIZATION 5: Learning rate scheduling (optional)
from torch.optim.lr_scheduler import CosineAnnealingLR
scheduler = CosineAnnealingLR(optimizer, T_max=3)  # Cosine annealing over 3 epochs

# OPTIMIZATION 6: Gradient accumulation for stability
def train_epoch_optimized(model, dataloader, optimizer, scheduler, device, accumulation_steps=2):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    
    for i, batch in enumerate(tqdm(dataloader, desc="Training")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss / accumulation_steps  # Scale loss
        
        loss.backward()
        
        # Update weights every accumulation_steps
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        
        total_loss += loss.item() * accumulation_steps
    
    # Handle remaining gradients
    if len(dataloader) % accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()
    
    # Step scheduler
    if scheduler:
        scheduler.step()
    
    return total_loss / len(dataloader)

# Evaluation function (same as before)
def evaluate(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []
    total_loss = 0
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            
            preds = torch.argmax(outputs.logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy

# OPTIMIZATION 7: Train on FULL dataset with optimized settings
num_epochs = 3
print(f"\n🚀 Starting FULL dataset training with LR {best_lr:.2e} for {num_epochs} epochs...")
print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

for epoch in range(num_epochs):
    current_lr = optimizer.param_groups[0]['lr']
    print(f"\nEpoch {epoch + 1}/{num_epochs} (LR: {current_lr:.2e})")
    
    # Train on FULL dataset
    train_loss = train_epoch_optimized(model, full_train_dataloader, optimizer, scheduler, device)
    print(f"Training loss: {train_loss:.4f}")
    
    # Evaluate on FULL test set
    eval_loss, eval_accuracy = evaluate(model, full_eval_dataloader, device)
    print(f"Evaluation loss: {eval_loss:.4f}")
    print(f"Evaluation accuracy: {eval_accuracy:.4f}")
    
    # Save checkpoint
    torch.save(model.state_dict(), f"./full_verifier_lr{best_lr:.0e}_epoch_{epoch+1}.pt")

print("\n🚀 Full dataset training completed!")

# Save final model trained on full dataset
model.save_pretrained(f"./full_verifier_model_lr{best_lr:.0e}")
tokenizer.save_pretrained(f"./full_verifier_model_lr{best_lr:.0e}")
print(f"✅ Full model saved to './full_verifier_model_lr{best_lr:.0e}'")

# 📊 FINAL SUMMARY
print("\n" + "="*60)
print("🎉 TRAINING COMPLETE!")
print("="*60)
print(f"✅ Learning Rate Optimization: DONE (Best LR: {best_lr:.2e})")
print(f"✅ Full Dataset Training: DONE ({len(train_dataset)} samples)")
print(f"✅ Model Saved: ./full_verifier_model_lr{best_lr:.0e}")
print("="*60)

LR Testing on 2000 training samples
LR Testing on 500 test samples


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

🚀 Starting Learning Rate Experiments...

🧪 Testing Learning Rate: 5.00e-05


                                                             

   Train Loss: 0.0006, Eval Loss: 0.0005

🧪 Testing Learning Rate: 1.00e-04


                                                             

   Train Loss: 0.0005, Eval Loss: 0.0003

🧪 Testing Learning Rate: 2.00e-04


                                                             

   Train Loss: 0.0003, Eval Loss: 0.0001

🧪 Testing Learning Rate: 3.00e-04


                                                             

   Train Loss: 0.0003, Eval Loss: 0.0001

🧪 Testing Learning Rate: 5.00e-04


                                                             

   Train Loss: 0.0002, Eval Loss: 0.0000

🧪 Testing Learning Rate: 1.00e-03


                                                             

   Train Loss: 0.0001, Eval Loss: 0.0000

📊 Learning Rate Results Summary:
LR 5.00e-05: Train=0.0006, Eval=0.0005
LR 1.00e-04: Train=0.0005, Eval=0.0003
LR 2.00e-04: Train=0.0003, Eval=0.0001
LR 3.00e-04: Train=0.0003, Eval=0.0001
LR 5.00e-04: Train=0.0002, Eval=0.0000
LR 1.00e-03: Train=0.0001, Eval=0.0000

🏆 Best Learning Rate: 1.00e-03
   Best Eval Loss: 0.0000

🎯 Now preparing FULL dataset training with best LR: 1.00e-03
Full training set: 14946 samples
Full test set: 2638 samples
Resetting model for full dataset training...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🚀 Starting FULL dataset training with LR 1.00e-03 for 3 epochs...
Training samples: 14946
Test samples: 2638

Epoch 1/3 (LR: 1.00e-03)


Training: 100%|██████████| 1869/1869 [00:42<00:00, 44.28it/s]


Training loss: 0.7001


Evaluating: 100%|██████████| 330/330 [00:03<00:00, 88.13it/s] 


Evaluation loss: 0.6932
Evaluation accuracy: 0.5000

Epoch 2/3 (LR: 7.50e-04)


Training: 100%|██████████| 1869/1869 [00:47<00:00, 39.31it/s]


Training loss: 0.6964


Evaluating: 100%|██████████| 330/330 [00:04<00:00, 71.44it/s]


Evaluation loss: 0.6938
Evaluation accuracy: 0.5000

Epoch 3/3 (LR: 2.50e-04)


Training: 100%|██████████| 1869/1869 [00:57<00:00, 32.78it/s]


Training loss: 0.6943


Evaluating: 100%|██████████| 330/330 [00:04<00:00, 70.93it/s]


Evaluation loss: 0.6934
Evaluation accuracy: 0.5000

🚀 Full dataset training completed!
✅ Full model saved to './full_verifier_model_lr1e-03'

🎉 TRAINING COMPLETE!
✅ Learning Rate Optimization: DONE (Best LR: 1.00e-03)
✅ Full Dataset Training: DONE (14946 samples)
✅ Model Saved: ./full_verifier_model_lr1e-03


# Backup

In [30]:
# Load the pre-trained DistilBERT model for sequence classification with 2 labels (e.g., Correct and Flawed)
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Determine the device to use for training (MPS for Apple Silicon, if available, otherwise CPU)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the selected device
model.to(device)

# Set the dataset format to PyTorch tensors for compatibility with the DataLoader
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Prepare DataLoader objects for training and evaluation datasets
# Batch size is set to 8, and training data is shuffled
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Initialize the AdamW optimizer with a learning rate of 5e-5
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define the training function for one epoch
def train_epoch(model, dataloader, optimizer, device):
    model.train()  # Set the model to training mode
    total_loss = 0  # Initialize total loss for the epoch
    
    # Iterate through batches in the DataLoader
    for batch in tqdm(dataloader, desc="Training", leave=False):
        # Move input data and labels to the selected device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Perform a forward pass and compute the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        # Perform a backward pass and update model parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Accumulate the loss
        total_loss += loss.item()
    
    # Return the average loss for the epoch
    return total_loss / len(dataloader)

# Define the evaluation function
def evaluate(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    predictions = []  # List to store predictions
    true_labels = []  # List to store true labels
    total_loss = 0  # Initialize total loss for evaluation
    
    # Disable gradient computation for evaluation
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Move input data and labels to the selected device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Perform a forward pass and compute the loss
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            
            # Get predictions by taking the argmax of the logits
            preds = torch.argmax(outputs.logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    # Compute accuracy using sklearn's accuracy_score
    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy

# Training loop for multiple epochs
num_epochs = 3  # Number of epochs to train
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Train the model for one epoch and compute the training loss
    train_loss = train_epoch(model, train_dataloader, optimizer, device)
    print(f"Training loss: {train_loss:.4f}")
    
    # Evaluate the model and compute the evaluation loss and accuracy
    eval_loss, eval_accuracy = evaluate(model, eval_dataloader, device)
    print(f"Evaluation loss: {eval_loss:.4f}")
    print(f"Evaluation accuracy: {eval_accuracy:.4f}")
    
    # Save the model checkpoint for the current epoch
    torch.save(model.state_dict(), f"./verifier_model_epoch_{epoch+1}.pt")

print("\nTraining completed!")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: mps

Epoch 1/3


Training:  18%|█▊        | 339/1869 [13:21<1:00:17,  2.36s/it]


KeyboardInterrupt: 

# Backup

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Check current versions first
import transformers
import accelerate

print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")



In [26]:
# Load a lightweight and stable pre-trained model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load tokenizer for the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Load model for binary classification

# Define training arguments for the Trainer
training_args = TrainingArguments(
    output_dir="./verifier_model",  # Directory to save the model and checkpoints
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    num_train_epochs=3,  # Number of training epochs
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    logging_dir="./logs",  # Directory to save logs
    logging_steps=10,  # Log every 10 steps
    report_to="none",  # Disable reporting to external tools like WandB
    # Fix for compatibility issue
    dataloader_drop_last=False,
    remove_unused_columns=False,
    dataloader_num_workers=0,  # Disable multiprocessing
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    accuracy = accuracy_score(labels, predictions)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Add to your trainer
trainer = Trainer(
    model=model,  # The model to train
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=test_dataset,  # Evaluation dataset
    processing_class=tokenizer,  # Use this instead of tokenizer
    compute_metrics=compute_metrics  # Function to compute metrics during evaluation
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Train the model using the Trainer
trainer.train()

TypeError: DataLoader.__init__() got an unexpected keyword argument 'in_order'