In [None]:
# Cell 1: Setup and Installations

# 1.1 Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
print("Google Drive mounted successfully.")

# 1.2 Install required libraries
# Note: TRL is included for consistency with your original script, but is not
# strictly required for this sequence classification task.
!pip install -Uq transformers
!pip install -Uq peft
!pip install -Uq trl
!pip install -Uq accelerate
!pip install -Uq datasets
!pip install -Uq bitsandbytes

# Install Flash Attention 2
!pip install flash-attn==2.7.4.post1 \
  --extra-index-url https://download.pytorch.org/whl/cu124 \
  --no-build-isolation

# 1.3 Unzip the dataset
# Assumes the dataset ZIP file is located in your Google Drive's root directory.
# Adjust the path if it is stored elsewhere.
!unzip -q -o /content/drive/My\ Drive/level-1-binary.zip -d /content/
print("Dataset unzipped to '/content/level-1-binary'.")

In [None]:
# Cell 2: Project Configuration

class Config:
    """Notebook-wide settings, read as class attributes (never instantiated here)."""

    # --- Model ---
    # Hugging Face Hub identifier of the base checkpoint.
    MODEL_ID = "microsoft/Phi-4-mini-instruct"
    # Number of classes the classification head predicts.
    NUM_LABELS = 2

    # --- Paths ---
    # Local directory holding the unzipped dataset.
    DATASET_PATH = "/content/level-1-binary"
    # Directory that receives checkpoints and the final saved model.
    OUTPUT_DIR = "/content/level1-classifier-output"

In [None]:
# Cell 3: Data Loading and Preprocessing

from datasets import load_from_disk
from transformers import AutoTokenizer

# 3.1 Tokenizer used for preprocessing, loaded from the checkpoint named in Config.
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_ID, trust_remote_code=True)

# Batches are padded later, so a pad token must exist; fall back to EOS if none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 3.2 Raw dataset, read from the local directory named in Config.
raw_dataset = load_from_disk(Config.DATASET_PATH)

# 3.3 Define the preprocessing function
def preprocess_function(examples):
    """
    Build one prompt per (question, solution) pair and tokenize the batch.

    Args:
        examples: Batch mapping with parallel "question" and "solution"
            sequences. No other field (such as a label) is read here, and
            nothing about the label is written into the prompt text.

    Returns:
        The output of the module-level `tokenizer` for the built prompts,
        called with truncation enabled, max_length=512 and padding disabled.
    """
    instruction = "Analyze the following mathematical problem and solution to determine if the solution is correct or flawed."

    prompts = []
    for question, solution in zip(examples["question"], examples["solution"]):
        prompts.append(
            f"{instruction}\n\n### Problem:\n{question}\n\n### Solution:\n{solution}"
        )

    # Padding is left off here; the data collator pads each batch at training time.
    tokenize_options = {
        "truncation": True,
        "max_length": 512,
        "padding": False,
    }
    return tokenizer(prompts, **tokenize_options)

# 3.4 Tokenize the dataset in batches; the raw text columns are dropped
# afterwards because only the tokenizer output is needed downstream.
tokenized_dataset = raw_dataset.map(
    preprocess_function, batched=True, remove_columns=["question", "solution"]
)

# 3.5 Show the resulting structure and the first training record as a sanity check.
print("--- Tokenized dataset ---", tokenized_dataset, sep="\n")
print("\nExample record:", tokenized_dataset["train"][0], sep="\n")

In [None]:
# Cell 3.5: Merge Datasets for Training

from datasets import concatenate_datasets, DatasetDict

# 3.5.1 Fold the 'validation' split into 'train' to enlarge the training set.
full_train_dataset = concatenate_datasets(
    [tokenized_dataset[split] for split in ("train", "validation")]
)

# 3.5.2 Repackage: merged data under "train", the untouched test split under "test".
final_dataset = DatasetDict(
    {"train": full_train_dataset, "test": tokenized_dataset["test"]}
)

print("--- Merged dataset for training ---", final_dataset, sep="\n")

In [None]:
# Cell 4: Model and Tokenizer Initialization

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig

# 4.1 4-bit quantization settings
# Weights are loaded as NF4 4-bit values; computation runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# 4.2 Tokenizer
# Loaded again from the same checkpoint ID as in the preprocessing cell. This
# creates a new tokenizer object and rebinds the `tokenizer` name to it.
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4.3 Model with a sequence-classification head
# AutoModelForSequenceClassification yields the base model plus a head sized
# by `num_labels`.
model = AutoModelForSequenceClassification.from_pretrained(
    Config.MODEL_ID,
    trust_remote_code=True,
    device_map="auto",  # Let the library place layers on the available hardware
    quantization_config=quantization_config,
    num_labels=Config.NUM_LABELS,
)

# 4.4 Tell the model which token ID is padding, so padded batches are handled
# consistently with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id

# 4.5 Summary of what was loaded
print(
    "--- Model and Tokenizer Loaded ---",
    f"Model class: {type(model)}",
    f"Number of labels: {model.config.num_labels}",
    f"Model pad token ID set to: {model.config.pad_token_id}",
    sep="\n",
)

In [None]:
# Cell 5: LoRA and Model Preparation

from peft import prepare_model_for_kbit_training, LoraConfig

# 5.1 Run PEFT's preparation step on the quantized model before adapters are added.
model = prepare_model_for_kbit_training(model)

# 5.2 LoRA hyperparameters.
# This cell only builds the configuration object; nothing here attaches it to the model.
lora_config = LoraConfig(
    task_type="SEQ_CLS",          # Sequence-classification task type for PEFT.
    target_modules="all-linear",  # Apply LoRA to all linear layers.
    r=16,                         # Dimension of the LoRA update matrices.
    lora_alpha=32,                # Scaling factor for the LoRA updates.
    lora_dropout=0.05,            # Dropout probability inside LoRA layers.
    bias="none",                  # Bias terms are not trained.
)

print("--- LoRA Configured ---")

In [None]:
# Cell 6: Metrics Definition

import numpy as np
import evaluate

# 6.1 Load the accuracy metric from the 'evaluate' library.
# The resulting object is used by compute_metrics below through its .compute() method.
accuracy_metric = evaluate.load("accuracy")

# 6.2 Define the function to compute metrics
def compute_metrics(p):
    """
    Compute accuracy from model predictions and true labels.

    Args:
        p: EvalPrediction-like object with `predictions` and `label_ids`
            attributes. `predictions` may be the logits array itself or a
            tuple/list whose first element is the logits array.

    Returns:
        The result of `accuracy_metric.compute(...)` for the predicted classes.
    """
    raw_predictions = p.predictions

    # Only unwrap real containers. Indexing a bare logits array with [0] would
    # select the first example's row instead of the logits matrix, and the
    # argmax over axis 1 below would then fail.
    if isinstance(raw_predictions, (tuple, list)):
        logits = raw_predictions[0]
    else:
        logits = raw_predictions

    # Predicted class = index of the largest logit in each row.
    predictions = np.argmax(logits, axis=1)

    # The 'label_ids' field contains the true labels.
    labels = p.label_ids

    return accuracy_metric.compute(predictions=predictions, references=labels)

print("--- Metrics function defined ---")

In [None]:
# Cell 7: Trainer Setup

from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# 7.1 Define Training Arguments
# These arguments control the training process.
training_args = TrainingArguments(
    output_dir=Config.OUTPUT_DIR,

    # --- Batching and Training ---
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Reduced for Colab stability
    gradient_accumulation_steps=8,  # Effective batch size = 4 * 8 = 32

    # --- Optimizer and Scheduling ---
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,

    # --- Logging and Saving ---
    logging_strategy="steps",
    logging_steps=25,
    save_strategy="epoch",  # Save a checkpoint at the end of each epoch
    save_total_limit=1,     # Only keep the last checkpoint

    # --- Evaluation ---
    # No evaluation is performed during training. The strategy is deliberately
    # left at its default ("no") instead of being passed explicitly: the old
    # `evaluation_strategy` keyword was renamed to `eval_strategy` and is
    # rejected by recent transformers releases, while the default behaves the
    # same on old and new versions.

    # --- Precision ---
    bf16=True,  # Use bfloat16 for performance if on Ampere GPU or newer
)

# 7.2 Attach the LoRA adapters and initialize the Trainer
# `transformers.Trainer` does not accept a `peft_config` argument (that keyword
# belongs to TRL's trainers), so the adapters are attached explicitly here.
from peft import PeftModel, get_peft_model

# The guard keeps this cell re-runnable: an already-wrapped model is not wrapped twice.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    # eval_dataset is omitted: no evaluation runs during training; the test
    # split is passed explicitly to trainer.evaluate() afterwards.
    # `processing_class` replaces the deprecated `tokenizer` keyword.
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),  # Handles padding batches
    compute_metrics=compute_metrics,  # Will be used for final evaluation
)

print("--- Trainer Initialized ---")

In [None]:
# Cell 8: Execute Training

# Run the training loop of the Trainer built in the previous cell.
# The surrounding prints mark the start and the end of the run in the cell output.
print("Starting model training...")
trainer.train()
print("Training complete.")

In [None]:
# Cell 9: Final Evaluation and Saving

# 9.1 Score the trained model on the held-out test split
print("\n--- Evaluating on the test set ---")
test_results = trainer.evaluate(eval_dataset=final_dataset["test"])

# 9.2 Report the metrics returned by the evaluation
print("Test set performance:", test_results, sep="\n")

# 9.3 Persist the trained model to the configured output directory
print(f"\nSaving final model adapter to {Config.OUTPUT_DIR}...")
trainer.save_model(Config.OUTPUT_DIR)
print("Model saved successfully.")