In [15]:
import accelerate
print("Accelerate version:", accelerate.__version__)

Accelerate version: 0.26.0


In [16]:
# Standard library imports
import re
import json
from pathlib import Path

# Third-party imports
import torch
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Datasets and ML frameworks
from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

# Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)


In [17]:
with open("gsm8k_train_flawed_plus1_final_answer.jsonl", "r") as f:
    lines = f.readlines()
    print(f"Total lines: {len(lines)}")
    print("First line:", lines[0] if lines else "No data")


Total lines: 7473
First line: {"id": 0, "question": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?", "flawed_answer": "Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 73", "label": {"verdict": "Flawed", "error_details": {"error_type": "computational_error", "erroneous_line_number": "L3", "explanation": "The final answer is too high by 1. It should be 72, not 73.", "error_in_text": "#### 72", "correction_in_text": "#### 72"}}}



In [18]:
original_dataset = load_dataset("openai/gsm8k", "main")

def load_jsonl(path):
    with open(path, 'r') as f:
        return [json.loads(line) for line in f]

flawed_train_final_answer = load_jsonl("gsm8k_train_flawed_plus1_final_answer.jsonl")
flawed_test = load_jsonl("gsm8k_test_flawed_plus1_final_answer.jsonl")

combined_train_final_answer = []

# 🔥 Add FIRST HALF of correct examples from original dataset
train_size = len(original_dataset["train"])
half_train_size = train_size // 2
print(f"Using first {half_train_size} out of {train_size} training examples")

for i, ex in enumerate(original_dataset["train"]):
    if i >= half_train_size:  # Stop at halfway point
        break
    combined_train_final_answer.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# 🔥 Add FIRST HALF of flawed examples
flawed_size = len(flawed_train_final_answer)
half_flawed_size = flawed_size // 2
print(f"Using first {half_flawed_size} out of {flawed_size} flawed examples")

for i, ex in enumerate(flawed_train_final_answer):
    if i >= half_flawed_size:  # Stop at halfway point
        break
    combined_train_final_answer.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined training set size: {len(combined_train_final_answer)}")

combined_test_final_answer = []

# 🔥 Add FIRST HALF of correct test examples
test_size = len(original_dataset["test"])
half_test_size = test_size // 2
print(f"Using first {half_test_size} out of {test_size} test examples")

for i, ex in enumerate(original_dataset["test"]):
    if i >= half_test_size:  # Stop at halfway point
        break
    combined_test_final_answer.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# 🔥 Add FIRST HALF of flawed test examples
flawed_test_size = len(flawed_test)
half_flawed_test_size = flawed_test_size // 2
print(f"Using first {half_flawed_test_size} out of {flawed_test_size} flawed test examples")

for i, ex in enumerate(flawed_test):
    if i >= half_flawed_test_size:  # Stop at halfway point
        break
    combined_test_final_answer.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined test set size: {len(combined_test_final_answer)}")

Using first 3736 out of 7473 training examples
Using first 3736 out of 7473 flawed examples
Combined training set size: 7472
Using first 659 out of 1319 test examples
Using first 659 out of 1319 flawed test examples
Combined test set size: 1318


In [19]:
original_dataset = load_dataset("openai/gsm8k", "main")

def load_jsonl(path):
    with open(path, 'r') as f:
        return [json.loads(line) for line in f]

flawed_train_2nd_last = load_jsonl("gsm8k_train_flawed_plus1_2nd_last.jsonl")
flawed_test = load_jsonl("gsm8k_test_flawed_plus1_2nd_last.jsonl")

combined_train_2nd_last = []

# 🔥 Add FIRST HALF of correct examples from original dataset
train_size = len(original_dataset["train"])
half_train_size = train_size // 2
print(f"Using first {half_train_size} out of {train_size} correct training examples")

for i, ex in enumerate(original_dataset["train"]):
    if i >= half_train_size:  # Stop at halfway point
        break
    combined_train_2nd_last.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# 🔥 Add FIRST HALF of flawed examples
flawed_size = len(flawed_train_2nd_last)
half_flawed_size = flawed_size // 2
print(f"Using first {half_flawed_size} out of {flawed_size} flawed training examples")

for i, ex in enumerate(flawed_train_2nd_last):
    if i >= half_flawed_size:  # Stop at halfway point
        break
    combined_train_2nd_last.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined training set size: {len(combined_train_2nd_last)}")

combined_test_2nd_last = []

# 🔥 Add FIRST HALF of correct test examples
test_size = len(original_dataset["test"])
half_test_size = test_size // 2
print(f"Using first {half_test_size} out of {test_size} correct test examples")

for i, ex in enumerate(original_dataset["test"]):
    if i >= half_test_size:  # Stop at halfway point
        break
    combined_test_2nd_last.append({
        "question": ex["question"],
        "solution": ex["answer"],
        "label": "Correct"
    })

# 🔥 Add FIRST HALF of flawed test examples
flawed_test_size = len(flawed_test)
half_flawed_test_size = flawed_test_size // 2
print(f"Using first {half_flawed_test_size} out of {flawed_test_size} flawed test examples")

for i, ex in enumerate(flawed_test):
    if i >= half_flawed_test_size:  # Stop at halfway point
        break
    combined_test_2nd_last.append({
        "question": ex["question"],
        "solution": ex["flawed_answer"],
        "label": "Flawed"
    })

print(f"Combined test set size: {len(combined_test_2nd_last)}")

Using first 3736 out of 7473 correct training examples
Using first 3736 out of 7473 flawed training examples
Combined training set size: 7472
Using first 659 out of 1319 correct test examples
Using first 659 out of 1319 flawed test examples
Combined test set size: 1318


In [20]:
import torch
print("Torch version:", torch.__version__)
print("MPS available:", torch.backends.mps.is_available())

Torch version: 2.5.1
MPS available: True


# Data

In [21]:
train_dataset = Dataset.from_list(combined_train_final_answer)
test_dataset = Dataset.from_list(combined_test_final_answer)

# train_dataset = Dataset.from_list(combined_train_2nd_last)
# test_dataset = Dataset.from_list(combined_test_2nd_last)

# AutoTokenizer

Converts text into numbers that neural networks can understand.

### Special Tokens
- `[CLS]` (101): Start of sequence
- `[SEP]` (102): Separator between segments  
- `[PAD]` (0): Padding token
- `[UNK]`: Unknown/out-of-vocabulary words

In [None]:
# Preprocessing with tokenizer
from transformers import AutoTokenizer
# Use the same model for both tokenization and training
model_name = "prajjwal1/bert-tiny"
# "gpt2-tiny"
# "distilbert-base-uncased"  

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Text to token
text = "Hello world"
tokens = tokenizer.tokenize(text)  # ['hello', 'world']
print("Tokens:", tokens)

# Convert tokens to IDs
ids = tokenizer.convert_tokens_to_ids(tokens)  # [7592, 2088]
print("IDs:", ids)

# All-in-one tokenization
result = tokenizer(text, truncation=True, padding="max_length", max_length=512)

# input_ids: numerical representation of the text; special tokens like [CLS] and [SEP] are included
# attention_mask: indicates which tokens are real (1) vs padding (0)
print("Tokenization result:", result['input_ids'][:10], result['attention_mask'][:10])  

Tokens: ['hello', 'world']
IDs: [7592, 2088]
Tokenization result: [101, 7592, 2088, 102, 0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]


| Parameter | Purpose |
|-----------|---------|
| `truncation=True` | Cut text if > 512 tokens |
| `padding="max_length"` | Add padding to reach exactly 512 tokens |
| `max_length=512` | Set maximum sequence length |

In [23]:
def tokenize_fn(example):
    # Combine the question and solution into a single input string
    full_input = f"Question:\n{example['question']}\n\nSolution:\n{example['solution']}"

    # Tokenize the combined input with truncation, padding, and a max length of 512 tokens
    return tokenizer(full_input, truncation=True, padding="max_length", max_length=512)

# Apply the tokenization function to the training dataset
train_dataset = train_dataset.map(tokenize_fn)

# Apply the tokenization function to the test dataset
test_dataset = test_dataset.map(tokenize_fn)

Map:   0%|          | 0/7472 [00:00<?, ? examples/s]

Map:   0%|          | 0/1318 [00:00<?, ? examples/s]

In [24]:
print(train_dataset.column_names)

['question', 'solution', 'label', 'input_ids', 'token_type_ids', 'attention_mask']


In [25]:
# Label encoding
label_map = {"Correct": 0, "Flawed": 1}
train_dataset = train_dataset.map(lambda e: {"labels": label_map[e["label"]]})
test_dataset = test_dataset.map(lambda e: {"labels": label_map[e["label"]]})

Map:   0%|          | 0/7472 [00:00<?, ? examples/s]

Map:   0%|          | 0/1318 [00:00<?, ? examples/s]

# Model Setup & Training

## Fine-tuning

#### Epoch

An epoch refers to one complete pass through the entire training dataset. During an epoch, the model processes all the training samples once, updating its weights based on the computed loss. Training for multiple epochs allows the model to learn and refine its parameters iteratively.

Increasing the number of epochs allows the model to learn more but risks overfitting if too high. A balance between batch size and epochs is crucial for optimal performance.

#### Batch size

Batch size determines the number of samples processed before the model updates its weights. For example, a batch size of 8 means 8 samples (e.g., 8 question-answer pairs) are processed together in one forward and backward pass during training.

A smaller batch size uses less memory but may take longer to converge, while a larger batch size can speed up training but requires more memory. 

#### Optimizer - AdamW

AdamW is an optimizer that implements the Adam algorithm with weight decay regularization. It helps prevent overfitting by penalizing large weights, which is particularly useful in deep learning models. AdamW is widely used because it combines the benefits of Adam (adaptive learning rates) with weight decay for better generalization.

Other optimizer choices include:
- **SGD (Stochastic Gradient Descent)**: A simple optimizer with momentum and learning rate decay options.
- **RMSprop**: Designed for non-stationary objectives, often used in RNNs.
- **Adagrad**: Adapts learning rates based on parameter updates, suitable for sparse data.
- **Adadelta**: An extension of Adagrad that reduces aggressive learning rate decay.
- **Adam**: Similar to AdamW but without weight decay.
- **Nadam**: Adam with Nesterov momentum.

#### tqdm

`tqdm` is a Python library used to display progress bars for loops. It provides a visual representation of the progress of an iterable, such as a training loop or data processing, making it easier to monitor the execution time and completion percentage. It is especially useful in long-running tasks.

For example:
```python
from tqdm import tqdm
for i in tqdm(range(100)):
    # Simulate some work
    pass
```

This will display a progress bar in the console, showing the percentage completed, elapsed time, and estimated time remaining.

### Notes

Check if labels were fed to the model

Model: gpt2

Learning rate scheduler

torch.optim.lr_scheduler

https://www.datacamp.com/tutorial/fine-tuning-large-language-models

Increase batch_size 

total_norm_util = clip_grad_norm_(model.parameters(), max_norm=float('inf')) 

weight-decay (decided based on if it is overfitting)

cross validation

1500 - 1999

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    
    # 🔥 LEARNING RATE - Most Important Change
    learning_rate=5e-4,  # Increased from 5e-5 - try 1e-4 to 3e-4 range
    
    # 🔥 MORE TRAINING
    num_train_epochs=5,  # Increased from 3 - more learning time
    
    # 🔥 BATCH SIZE - Better gradient estimates
    per_device_train_batch_size=32,  # Increased from 8
    per_device_eval_batch_size=32,   # Increased from 8
    
    # 🔥 LEARNING RATE SCHEDULING
    lr_scheduler_type="cosine",  # Add learning rate decay
    warmup_steps=500,           # Gradual warmup
    
    # 🔥 REGULARIZATION ADJUSTMENTS
    weight_decay=0.1,           # Increased from 0.01
    
    # 🔥 EVALUATION & EARLY STOPPING
    eval_steps=250,             # Evaluate more frequently
    load_best_model_at_end=True,  # Load best checkpoint
    metric_for_best_model="eval_f1",  # Use F1 score for model selection
    greater_is_better=True,
    
    # 🔥 IMPROVED LOGGING
    logging_steps=50,           # More frequent logging
    save_total_limit=3,         # Keep more checkpoints
    
    # Keep these settings
    report_to="none",
    remove_unused_columns=False,
    dataloader_num_workers=0,
    dataloader_drop_last=False,
    dataloader_pin_memory=False,
    skip_memory_metrics=True,
)

# Complete the compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    accuracy = accuracy_score(labels, predictions)
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Ensure datasets are properly formatted
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,  # Use tokenizer instead of processing_class
    compute_metrics=compute_metrics,
)

# Start training
print("Starting training with Trainer...")
trainer.train()

# Save the model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
print("Training completed and model saved!")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training with Trainer...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.696,0.693155,0.499241,0.359223,0.498652,0.280728
2,0.6727,0.591831,0.682853,0.60566,0.800499,0.487102
3,0.5394,0.482739,0.742033,0.668616,0.934605,0.520486
4,0.4298,0.403382,0.788316,0.783888,0.800633,0.76783
5,0.3524,0.389379,0.801214,0.793049,0.827018,0.76176


Training completed and model saved!


In [35]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    
    # 🔥 LEARNING RATE - Most Important Change
    learning_rate=5e-4,  # Increased from 5e-5 - try 1e-4 to 3e-4 range
    
    # 🔥 MORE TRAINING
    num_train_epochs=10,  # Increased from 3 - more learning time
    
    # 🔥 BATCH SIZE - Better gradient estimates
    per_device_train_batch_size=32,  # Increased from 8
    per_device_eval_batch_size=32,   # Increased from 8
    
    # 🔥 LEARNING RATE SCHEDULING
    lr_scheduler_type="cosine",  # Add learning rate decay
    warmup_steps=500,           # Gradual warmup
    
    # 🔥 REGULARIZATION ADJUSTMENTS
    weight_decay=0.1,           # Increased from 0.01
    
    # 🔥 EVALUATION & EARLY STOPPING
    eval_steps=250,             # Evaluate more frequently
    load_best_model_at_end=True,  # Load best checkpoint
    metric_for_best_model="eval_f1",  # Use F1 score for model selection
    greater_is_better=True,
    
    # 🔥 IMPROVED LOGGING
    logging_steps=50,           # More frequent logging
    save_total_limit=3,         # Keep more checkpoints
    
    # Keep these settings
    report_to="none",
    remove_unused_columns=False,
    dataloader_num_workers=0,
    dataloader_drop_last=False,
    dataloader_pin_memory=False,
    skip_memory_metrics=True,
)

# Complete the compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    accuracy = accuracy_score(labels, predictions)
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Ensure datasets are properly formatted
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,  # Use tokenizer instead of processing_class
    compute_metrics=compute_metrics,
)

# Start training
print("Starting training with Trainer...")
trainer.train()

# Save the model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
print("Training completed and model saved!")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training with Trainer...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6965,0.693432,0.5,0.006033,0.5,0.003035
2,0.6655,0.61262,0.682094,0.705965,0.656658,0.763278
3,0.5352,0.512835,0.688923,0.5629,0.946237,0.400607
4,0.494,0.51343,0.739757,0.755175,0.712938,0.802731
5,0.4646,0.438555,0.745827,0.686036,0.897059,0.555387
6,0.4385,0.410214,0.748103,0.779841,0.69258,0.892261
7,0.3655,0.395925,0.774659,0.727273,0.92093,0.60091
8,0.3466,0.418233,0.763278,0.783333,0.722151,0.855842
9,0.3406,0.406329,0.773141,0.746825,0.844828,0.669196
10,0.3236,0.412946,0.76176,0.764264,0.756315,0.772382


Training completed and model saved!


## Manual training

In [30]:
# STEP 1: Choose ONE model and stick with it
model_name = "prajjwal1/bert-tiny"  # Faster option
# "distilbert-base-uncased"

# STEP 2: Initialize tokenizer ONCE
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

print(f"Using model: {model_name}")

# STEP 3: Tokenize your datasets with the SAME tokenizer
def tokenize_fn(example):
    full_input = f"Question:\n{example['question']}\n\nSolution:\n{example['solution']}"
    return tokenizer(full_input, truncation=True, padding="max_length", max_length=256)

train_dataset = Dataset.from_list(combined_train_final_answer)
test_dataset = Dataset.from_list(combined_test_final_answer)

train_dataset = train_dataset.map(tokenize_fn)
test_dataset = test_dataset.map(tokenize_fn)

label_map = {"Correct": 0, "Flawed": 1}
train_dataset = train_dataset.map(lambda e: {"labels": label_map[e["label"]]})
test_dataset = test_dataset.map(lambda e: {"labels": label_map[e["label"]]})

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# STEP 6: Continue with training using the SAME model and tokenizer
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using model: prajjwal1/bert-tiny


Map:   0%|          | 0/7472 [00:00<?, ? examples/s]

Map:   0%|          | 0/1318 [00:00<?, ? examples/s]

Map:   0%|          | 0/7472 [00:00<?, ? examples/s]

Map:   0%|          | 0/1318 [00:00<?, ? examples/s]

Using device: mps


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [32]:
# 🔥 SIMPLE: Just take first 2000 training samples and first 500 test samples
train_subset = train_dataset.select(range(2000))  # First 2000 samples
test_subset = test_dataset.select(range(500))     # First 500 samples

print(f"Using first {len(train_subset)} training samples")
print(f"Using first {len(test_subset)} test samples")

# Create dataloaders
train_dataloader = DataLoader(train_subset, batch_size=32, shuffle=True)
eval_dataloader = DataLoader(test_subset, batch_size=32, shuffle=False)

# 🎯 QUICK LR FINDER (much shorter)
def find_best_lr_quick(model, train_dataloader, eval_dataloader, device):
    learning_rates = [1e-4, 2e-4, 3e-4, 5e-4]
    best_lr = 2e-4
    best_loss = float('inf')
    
    for lr in learning_rates:
        print(f"Testing LR {lr:.2e}...")
        
        # Fresh model copy
        test_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
        test_model.to(device)
        optimizer = AdamW(test_model.parameters(), lr=lr)
        
        # Quick training - just 20 batches
        test_model.train()
        for i, batch in enumerate(train_dataloader):
            if i >= 20: break
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            optimizer.zero_grad()
            outputs = test_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            outputs.loss.backward()
            optimizer.step()
        
        # Quick eval - just 10 batches
        test_model.eval()
        eval_loss = 0
        with torch.no_grad():
            for i, batch in enumerate(eval_dataloader):
                if i >= 10: break
                
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = test_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                eval_loss += outputs.loss.item()
        
        avg_eval_loss = eval_loss / 10
        print(f"  Eval Loss: {avg_eval_loss:.4f}")
        
        if avg_eval_loss < best_loss:
            best_loss = avg_eval_loss
            best_lr = lr
        
        del test_model, optimizer
        torch.mps.empty_cache() if device.type == 'mps' else None
    
    print(f"🏆 Best LR: {best_lr:.2e}")
    return best_lr

# Find best LR
best_lr = find_best_lr_quick(model, train_dataloader, eval_dataloader, device)

# 🚀 TRAIN ON FULL DATASET WITH BEST LR
print(f"\n🚀 Training full dataset with LR {best_lr:.2e}...")

# Create full dataset loaders
full_train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
full_eval_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Reset model for full training
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)
optimizer = AdamW(model.parameters(), lr=best_lr)

# Simple training loop
for epoch in range(3):
    print(f"\nEpoch {epoch + 1}/3")
    
    # Training
    model.train()
    total_loss = 0
    for batch in tqdm(full_train_dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        outputs.loss.backward()
        optimizer.step()
        
        total_loss += outputs.loss.item()
    
    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(full_eval_dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            predictions = torch.argmax(outputs.logits, dim=-1)
            
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    
    accuracy = correct / total
    avg_loss = total_loss / len(full_train_dataloader)
    
    print(f"  Train Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")
    
    # Save checkpoint
    torch.save(model.state_dict(), f"./model_epoch_{epoch+1}.pt")

# Save final model
model.save_pretrained(f"./verifier_model_lr{best_lr:.0e}")
tokenizer.save_pretrained(f"./verifier_model_lr{best_lr:.0e}")
print(f"✅ Model saved to './verifier_model_lr{best_lr:.0e}'")

Using first 2000 training samples
Using first 500 test samples
Testing LR 1.00e-04...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Eval Loss: 0.2322
Testing LR 2.00e-04...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Eval Loss: 0.1340
Testing LR 3.00e-04...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Eval Loss: 0.0862
Testing LR 5.00e-04...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Eval Loss: 0.0427
🏆 Best LR: 5.00e-04

🚀 Training full dataset with LR 5.00e-04...

Epoch 1/3


Training: 100%|██████████| 467/467 [00:49<00:00,  9.34it/s]
Evaluating: 100%|██████████| 83/83 [00:02<00:00, 35.41it/s]


  Train Loss: 0.6977, Accuracy: 0.5000

Epoch 2/3


Training: 100%|██████████| 467/467 [00:43<00:00, 10.83it/s]
Evaluating: 100%|██████████| 83/83 [00:01<00:00, 42.41it/s]


  Train Loss: 0.6969, Accuracy: 0.5000

Epoch 3/3


Training: 100%|██████████| 467/467 [00:42<00:00, 10.89it/s]
Evaluating: 100%|██████████| 83/83 [00:02<00:00, 41.31it/s]


  Train Loss: 0.6941, Accuracy: 0.5539
✅ Model saved to './verifier_model_lr5e-04'


# Backup

In [30]:
# Load the pre-trained DistilBERT model for sequence classification with 2 labels (e.g., Correct and Flawed)
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Determine the device to use for training (MPS for Apple Silicon, if available, otherwise CPU)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the selected device
model.to(device)

# Set the dataset format to PyTorch tensors for compatibility with the DataLoader
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Prepare DataLoader objects for training and evaluation datasets
# Batch size is set to 8, and training data is shuffled
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Initialize the AdamW optimizer with a learning rate of 5e-5
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define the training function for one epoch
def train_epoch(model, dataloader, optimizer, device):
    model.train()  # Set the model to training mode
    total_loss = 0  # Initialize total loss for the epoch
    
    # Iterate through batches in the DataLoader
    for batch in tqdm(dataloader, desc="Training", leave=False):
        # Move input data and labels to the selected device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Perform a forward pass and compute the loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        # Perform a backward pass and update model parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Accumulate the loss
        total_loss += loss.item()
    
    # Return the average loss for the epoch
    return total_loss / len(dataloader)

# Define the evaluation function
def evaluate(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    predictions = []  # List to store predictions
    true_labels = []  # List to store true labels
    total_loss = 0  # Initialize total loss for evaluation
    
    # Disable gradient computation for evaluation
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Move input data and labels to the selected device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Perform a forward pass and compute the loss
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            
            # Get predictions by taking the argmax of the logits
            preds = torch.argmax(outputs.logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    # Compute accuracy using sklearn's accuracy_score
    accuracy = accuracy_score(true_labels, predictions)
    return total_loss / len(dataloader), accuracy

# Training loop for multiple epochs
num_epochs = 3  # Number of epochs to train
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    
    # Train the model for one epoch and compute the training loss
    train_loss = train_epoch(model, train_dataloader, optimizer, device)
    print(f"Training loss: {train_loss:.4f}")
    
    # Evaluate the model and compute the evaluation loss and accuracy
    eval_loss, eval_accuracy = evaluate(model, eval_dataloader, device)
    print(f"Evaluation loss: {eval_loss:.4f}")
    print(f"Evaluation accuracy: {eval_accuracy:.4f}")
    
    # Save the model checkpoint for the current epoch
    torch.save(model.state_dict(), f"./verifier_model_epoch_{epoch+1}.pt")

print("\nTraining completed!")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: mps

Epoch 1/3


Training:  18%|█▊        | 339/1869 [13:21<1:00:17,  2.36s/it]


KeyboardInterrupt: 

# Backup

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Check current versions first
import transformers
import accelerate

print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")



In [26]:
# Load a lightweight and stable pre-trained model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load tokenizer for the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Load model for binary classification

# Define training arguments for the Trainer
training_args = TrainingArguments(
    output_dir="./verifier_model",  # Directory to save the model and checkpoints
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    num_train_epochs=3,  # Number of training epochs
    evaluation_strategy="epoch",  # Evaluate the model at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    logging_dir="./logs",  # Directory to save logs
    logging_steps=10,  # Log every 10 steps
    report_to="none",  # Disable reporting to external tools like WandB
    # Fix for compatibility issue
    dataloader_drop_last=False,
    remove_unused_columns=False,
    dataloader_num_workers=0,  # Disable multiprocessing
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    accuracy = accuracy_score(labels, predictions)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Add to your trainer
trainer = Trainer(
    model=model,  # The model to train
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=test_dataset,  # Evaluation dataset
    processing_class=tokenizer,  # Use this instead of tokenizer
    compute_metrics=compute_metrics  # Function to compute metrics during evaluation
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Train the model using the Trainer
trainer.train()

TypeError: DataLoader.__init__() got an unexpected keyword argument 'in_order'