In [None]:
# ==============================================================================
# CELL 1: SETUP & INSTRUCTIONS
# ==============================================================================
# ---
# ## Project Setup Instructions
#
# 1.  **Dependencies:** Ensure you have installed all required libraries by running `pip install -r requirements.txt` from the project's root directory.
# 2.  **Data:** This notebook requires the "golden dataset" generated by Phase 1. Make sure you have run the `1-Phase1_...` notebook and that a `.parquet` file exists in the `output/high_purity_golden_datasets/` directory.
# 3.  **Hardware:** A GPU (like a T4 or better in Colab) is **required** to run the teacher model (Cell 3) in a reasonable amount of time.
# ---

print("--- [1] Importing necessary libraries ---")
import glob
import inspect
import json
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    BitsAndBytesConfig
)

print("\n✅ All libraries imported successfully.")

--- [1] Installing necessary libraries ---

✅ All libraries installed and imported successfully.


In [None]:

# ==============================================================================
# CELL 2: CONFIGURATION & PORTABLE PATHS
# ==============================================================================
print("--- [2] Setting up configuration and relative paths ---")

# --- Project Paths ---
# Everything is expressed relative to the notebooks/ directory, so the notebook
# must be executed with that directory as the working directory.
ROOT_DIR = '..'
OUTPUT_DIR = os.path.join(ROOT_DIR, 'output')

GOLDEN_DATA_DIR = os.path.join(OUTPUT_DIR, 'high_purity_golden_datasets/')
LABELED_DATA_PATH = os.path.join(OUTPUT_DIR, 'labeled_datasets/teacher_labeled_prompts.csv')
STUDENT_MODEL_OUTPUT_PATH = os.path.join(OUTPUT_DIR, 'models/roberta-prompt-classifier/')

# --- Models ---
LLM_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"   # teacher: produces the labels
STUDENT_MODEL_NAME = 'roberta-base'               # student: fine-tuned on those labels

# --- Data Generation Parameters ---
TARGET_SAMPLES_PER_CLASS = 150   # per-category quota of accepted samples
CONFIDENCE_THRESHOLD = 0.80      # minimum teacher confidence for a label to be kept
LLM_BATCH_SIZE = 8               # prompts sent to the teacher per batch

# --- Student Training Parameters ---
TRAIN_BATCH_SIZE = EVAL_BATCH_SIZE = 16
NUM_EPOCHS = 3

# --- Create the directories that later cells write into ---
for _required_dir in (os.path.dirname(LABELED_DATA_PATH), STUDENT_MODEL_OUTPUT_PATH):
    os.makedirs(_required_dir, exist_ok=True)

# --- Report the resolved absolute locations ---
for _description, _location in (
    ("Reading golden data from", GOLDEN_DATA_DIR),
    ("Labeled data will be saved to", LABELED_DATA_PATH),
    ("Student model will be saved to", STUDENT_MODEL_OUTPUT_PATH),
):
    print(f"{_description}: {os.path.abspath(_location)}")
print("\n✅ Configuration complete.")



--- [2] Setting up configuration and paths ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Project Path: /content/drive/MyDrive/JudgeModel/
Labeled Data will be saved to: /content/drive/MyDrive/JudgeModel/labeled_datasets/roberta_teacher_labeled.csv
Student Model will be saved to: /content/drive/MyDrive/JudgeModel/roberta-10-class-classifier
Student model set to: roberta-base

✅ Configuration complete.


In [None]:

# ==============================================================================
# CELL 3: LOAD THE "TEACHER" LLM (ZEPHYR-7B-BETA)
# ==============================================================================
print(f"\n--- [3] Initializing Teacher LLM ({LLM_MODEL_NAME}) ---")
print("This step is memory-intensive and requires a GPU.")

# Re-running this cell must not reload the model: the pipeline is only built
# when no usable `text_generator` is left over from an earlier execution.
if 'text_generator' not in locals() or text_generator is None:
    try:
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
            print("WARNING: No GPU detected. Loading the LLM on CPU will be extremely slow.")

        # 4-bit NF4 weights with double quantization; computation runs in bfloat16.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

        llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
        # Fall back to the EOS token when the tokenizer ships without a pad
        # token, and pad on the left side of each prompt.
        if llm_tokenizer.pad_token is None:
            llm_tokenizer.pad_token = llm_tokenizer.eos_token
        llm_tokenizer.padding_side = "left"

        llm_model = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_NAME,
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )

        # Generation pipeline used by the labeling cell: at most 150 new tokens
        # per prompt, and the prompt itself is not echoed back in the output.
        text_generator = pipeline(
            "text-generation",
            model=llm_model,
            tokenizer=llm_tokenizer,
            max_new_tokens=150,
            return_full_text=False,
        )
        print(f"\n✅ LLM Teacher Model '{LLM_MODEL_NAME}' loaded successfully.")

    except Exception as load_error:
        print(f"CRITICAL ERROR: Failed to load LLM model. {load_error}")
        print("Ensure you have a T4 GPU (or better) enabled and sufficient RAM.")
        # Downstream cells test `text_generator` for truthiness, so make the
        # failure explicit instead of leaving the name undefined.
        text_generator = None
else:
    print("✅ LLM pipeline already exists. Skipping loading.")


--- [3] Initializing Teacher LLM (HuggingFaceH4/zephyr-7b-beta) ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]



model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cpu



✅ LLM Model 'HuggingFaceH4/zephyr-7b-beta' loaded successfully.


In [None]:

# ==============================================================================
# CELL 4: GENERATE HIGH-QUALITY LABELS VIA BATCH INFERENCE
# ==============================================================================
if text_generator:
    print("\n--- [4] Starting Data Labeling Process ---")

    # Category name -> one-line description. The names are the only values the
    # teacher's `prompt_type` answer is allowed to take.
    PROMPT_CATEGORIES = {
        "Code & Programming": "Requests for generating, debugging, optimizing, or explaining code.",
        "Roleplay & Persona": "Instructions for the AI to adopt a specific character or persona.",
        "Factual Information": "Questions seeking direct, factual answers or definitions.",
        "Creative Content": "Requests for generating stories, poems, jokes, or other creative text.",
        "Math & Logic Puzzles": "Problems requiring mathematical calculation or logical deduction.",
        "Personal Advice & Guidance": "Seeking general advice for well-being, habits, or relationships.",
        "Data/Content Transformation": "Prompts to reformat, summarize, translate, or extract information.",
        "Technical Inquiry": "Questions asking for detailed technical or scientific explanations.",
        "Business & Professional": "Requests for drafting professional documents, marketing, or business plans.",
        "General Interaction": "Open-ended greetings, informal chat, or questions about the AI itself.",
    }

    # Markdown bullet list of all categories, embedded in the system prompt.
    categories_text = "\n".join(
        f"- **{name}**: {desc}" for name, desc in PROMPT_CATEGORIES.items()
    )

    # Fixed conversation prefix; the prompt to classify is appended to it as an
    # additional user message for every request.
    MESSAGES_TEMPLATE = [
        {
            "role": "system",
            "content": f"""You are an expert text classification assistant. First, provide a brief `Reasoning` for your choice. Then, on a new line, provide ONLY a valid JSON object with your classification.

**Categories:**
{categories_text}

**Output Format:**
Reasoning: [Your one-sentence reasoning for the classification.]
{{"prompt_type": "The Best-Fit Category", "confidence": A float from 0.0 to 1.0}}
""",
        },
        {
            "role": "user",
            "content": "Please classify the following prompt.",
        },
    ]

    def find_latest_golden_file(directory):
        """Return the `.parquet` file in `directory` with the greatest ctime.

        Args:
            directory: Folder that is searched (non-recursively) for `*.parquet`.

        Returns:
            The path of the matching file whose `os.path.getctime` is largest,
            or None when the folder contains no parquet file.
        """
        candidates = glob.glob(os.path.join(directory, '*.parquet'))
        if not candidates:
            return None
        return max(candidates, key=os.path.getctime)

    def generate_balanced_llm_labels():
        """Label golden-dataset prompts with the teacher LLM until every category quota is met.

        Resumes from ``LABELED_DATA_PATH`` when that CSV already exists, then
        classifies the not-yet-labeled prompts of the newest golden parquet file
        in batches of ``LLM_BATCH_SIZE``. A prediction is accepted only when its
        category is a key of ``PROMPT_CATEGORIES``, its confidence is at least
        ``CONFIDENCE_THRESHOLD`` and that category still holds fewer than
        ``TARGET_SAMPLES_PER_CLASS`` samples.

        Returns:
            pd.DataFrame: Every accepted record (resumed plus newly labeled). A
            non-empty result is also written to ``LABELED_DATA_PATH``.
        """
        labeled_data, processed_prompts, category_counts = [], set(), Counter()

        # --- Resume from a previous run, if any ---
        if os.path.exists(LABELED_DATA_PATH):
            print(f"Loading existing data from {LABELED_DATA_PATH} to continue.")
            df_existing = pd.read_csv(LABELED_DATA_PATH)
            labeled_data.extend(df_existing.to_dict('records'))
            processed_prompts.update(df_existing['prompt'].tolist())
            category_counts.update(df_existing['prompt_type'].tolist())
            print(f"Loaded {len(df_existing)} samples. Current counts:\n{category_counts}")

        latest_golden_file = find_latest_golden_file(GOLDEN_DATA_DIR)
        if not latest_golden_file:
            print(f"ERROR: No golden data file found in {GOLDEN_DATA_DIR}. Cannot generate new labels.")
            return pd.DataFrame(labeled_data)

        # --- Select the prompts that still need a label ---
        print(f"Loading high-purity prompts from: {latest_golden_file}")
        df_source = pd.read_parquet(latest_golden_file)
        df_to_label = (
            df_source[~df_source['prompt'].isin(processed_prompts)]
            .dropna(subset=['prompt'])
            # Each prompt is labeled at most once, so the same text can never
            # end up in the labeled set twice.
            .drop_duplicates(subset=['prompt'])
            .sample(frac=1, random_state=42)
            .reset_index(drop=True)
        )
        print(f"Found {len(df_to_label)} new unique prompts to classify.")

        total_target = len(PROMPT_CATEGORIES) * TARGET_SAMPLES_PER_CLASS
        with tqdm(total=total_target, initial=len(labeled_data), desc="Generating Labels") as pbar:
            for i in range(0, len(df_to_label), LLM_BATCH_SIZE):
                # Check every configured category, not just the ones already
                # present in the Counter: on a fresh run the Counter is empty
                # and `all()` over its values would be vacuously true, stopping
                # the loop before a single prompt has been labeled.
                quotas_met = all(
                    category_counts[name] >= TARGET_SAMPLES_PER_CLASS
                    for name in PROMPT_CATEGORIES
                )
                if len(labeled_data) >= total_target or quotas_met:
                    print("All categories have reached their target sample count. Stopping.")
                    break

                batch_df = df_to_label.iloc[i:i + LLM_BATCH_SIZE]
                prompts_batch = batch_df['prompt'].tolist()

                formatted_prompts = [
                    llm_tokenizer.apply_chat_template(
                        MESSAGES_TEMPLATE + [{"role": "user", "content": p}],
                        tokenize=False,
                        add_generation_prompt=True,
                    )
                    for p in prompts_batch
                ]

                try:
                    # Pass the batch size explicitly so the whole batch is
                    # generated together rather than one prompt at a time.
                    response_outputs = text_generator(formatted_prompts, batch_size=LLM_BATCH_SIZE)
                except Exception as e:
                    # Best effort: a failed batch is reported and skipped.
                    print(f"Error during batch generation: {e}")
                    continue

                for idx, result_list in enumerate(response_outputs):
                    if not result_list:
                        continue
                    generated_text = result_list[0]['generated_text']
                    # The teacher answers with free-text reasoning followed by
                    # a JSON object; grab the span from the first '{' to the last '}'.
                    json_match = re.search(r'\{.*\}', generated_text, re.DOTALL)
                    if not json_match:
                        continue
                    try:
                        result_json = json.loads(json_match.group(0))
                        prompt_type = result_json.get("prompt_type")
                        confidence = float(result_json.get("confidence", 0.0))

                        if (
                            prompt_type in PROMPT_CATEGORIES
                            and confidence >= CONFIDENCE_THRESHOLD
                            and category_counts[prompt_type] < TARGET_SAMPLES_PER_CLASS
                        ):
                            new_record = batch_df.iloc[idx].to_dict()
                            new_record['prompt_type'] = prompt_type
                            new_record['confidence'] = confidence
                            labeled_data.append(new_record)
                            processed_prompts.add(new_record['prompt'])
                            category_counts[prompt_type] += 1
                            pbar.update(1)
                    except (json.JSONDecodeError, TypeError, ValueError):
                        # Malformed JSON or a non-numeric confidence: drop this answer.
                        continue

        df_labeled = pd.DataFrame(labeled_data)
        if not df_labeled.empty:
            df_labeled.to_csv(LABELED_DATA_PATH, index=False)
        print("\n--- Balanced Labeling Complete ---")
        print(f"Total high-confidence samples: {len(df_labeled)}")
        if df_labeled.empty:
            # An empty frame has no `prompt` / `prompt_type` columns, so a CSV
            # written from it would break the resume logic on the next run.
            print("No samples were accepted; nothing was written to disk.")
        else:
            print("Final counts per category:")
            print(df_labeled['prompt_type'].value_counts())
        return df_labeled

    # Teacher pipeline is available (this is the body of `if text_generator:`):
    # run the labeling job and keep its result in `df_labeled`.
    df_labeled = generate_balanced_llm_labels()
    print("\n✅ Data labeling finished.")
else:
    # `text_generator` is falsy, i.e. the teacher model was not loaded.
    print("\n⚠️ Skipping data labeling because the Teacher LLM failed to load.")
    # Fall back to labels saved by an earlier run so that `df_labeled` is
    # always defined after this cell.
    if os.path.exists(LABELED_DATA_PATH):
        df_labeled = pd.read_csv(LABELED_DATA_PATH)
        print(f"Loaded {len(df_labeled)} existing labeled samples.")
    else:
        # No saved labels either: leave an empty frame behind.
        df_labeled = pd.DataFrame()


--- [4] Starting Data Labeling Process ---
Loading existing data from /content/drive/MyDrive/JudgeModel/labeled_datasets/roberta_teacher_labeled.csv to continue.
Loaded 300 samples. Current counts:
Counter({'Technical Inquiry': 104, 'Factual Information': 80, 'Data/Content Transformation': 37, 'Creative Content': 24, 'Personal Advice & Guidance': 16, 'Code & Programming': 15, 'Math & Logic Puzzles': 11, 'Roleplay & Persona': 6, 'Business & Professional': 5, 'General Interaction': 2})
Loading high-purity prompts from: /content/drive/MyDrive/JudgeModel/high_purity_golden_datasets/high_purity_golden_data_v_2025-07-05T14-27-23.parquet
Found 39666 new unique prompts to classify.


 20%|##        | 300/1500 [00:00<?, ?it/s]

In [None]:
# Optional: uncomment and run this cell to load previously labeled data from the CSV file instead of running the labeling cell (Cell 4).

# # --- Define Categories and CoT Prompt ---
# PROMPT_CATEGORIES = {
#     "Code & Programming": "Requests for generating, debugging, optimizing, or explaining code.",
#     "Roleplay & Persona": "Instructions for the AI to adopt a specific character or persona.",
#     "Factual Information": "Questions seeking direct, factual answers or definitions.",
#     "Creative Content": "Requests for generating stories, poems, jokes, or other creative text.",
#     "Math & Logic Puzzles": "Problems requiring mathematical calculation or logical deduction.",
#     "Personal Advice & Guidance": "Seeking general advice for well-being, habits, or relationships.",
#     "Data/Content Transformation": "Prompts to reformat, summarize, translate, or extract information.",
#     "Technical Inquiry": "Questions asking for detailed technical or scientific explanations.",
#     "Business & Professional": "Requests for drafting professional documents, marketing, or business plans.",
#     "General Interaction": "Open-ended greetings, informal chat, or questions about the AI itself."
# }


# df_labeled = pd.read_csv(LABELED_DATA_PATH)
# df_labeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           300 non-null    int64  
 1   prompt       300 non-null    object 
 2   response_a   300 non-null    object 
 3   response_b   300 non-null    object 
 4   winner       300 non-null    object 
 5   prompt_type  300 non-null    object 
 6   confidence   300 non-null    float64
dtypes: float64(1), int64(1), object(5)
memory usage: 16.5+ KB


In [None]:

# ==============================================================================
# CELL 5: TRAIN THE "STUDENT" CLASSIFIER (ROBERTA)
# ==============================================================================
print("\n--- [5] Starting Student Model Training ---")

if 'df_labeled' in locals() and not df_labeled.empty:
    print("Preparing data for training...")
    # Alphabetically sorted label <-> id mapping; it is also stored in the
    # model config so the saved classifier reports category names.
    labels = sorted(df_labeled['prompt_type'].unique().tolist())
    label2id = {label: i for i, label in enumerate(labels)}
    id2label = {i: label for i, label in enumerate(labels)}
    df_labeled['label'] = df_labeled['prompt_type'].map(label2id)

    # A stratified split needs at least two samples of every class; with a
    # singleton class train_test_split raises. Fall back to a plain random
    # split in that case instead of aborting the whole cell.
    class_counts = df_labeled['label'].value_counts()
    if class_counts.min() >= 2:
        stratify_labels = df_labeled['label']
    else:
        stratify_labels = None
        rare_categories = [id2label[i] for i in class_counts[class_counts < 2].index]
        print(f"WARNING: categories with fewer than 2 samples {rare_categories}; using a non-stratified split.")

    train_df, test_df = train_test_split(
        df_labeled, test_size=0.2, random_state=42, stratify=stratify_labels
    )
    # preserve_index=False keeps the pandas index from being carried along as
    # an extra `__index_level_0__` column.
    dataset_dict = DatasetDict({
        'train': Dataset.from_pandas(train_df, preserve_index=False),
        'test': Dataset.from_pandas(test_df, preserve_index=False),
    })
    print("Dataset successfully split and prepared:\n", dataset_dict)

    print(f"\nTokenizing data for {STUDENT_MODEL_NAME}...")
    student_tokenizer = AutoTokenizer.from_pretrained(STUDENT_MODEL_NAME)

    def tokenize_function(examples):
        """Tokenize the `prompt` column, padded and truncated to 256 tokens."""
        return student_tokenizer(examples['prompt'], padding='max_length', truncation=True, max_length=256)

    tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

    print("\nSetting up model and trainer...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    student_model = AutoModelForSequenceClassification.from_pretrained(
        STUDENT_MODEL_NAME, num_labels=len(labels), id2label=id2label, label2id=label2id
    ).to(device)

    def compute_metrics(p):
        """Return accuracy and weighted F1 for one evaluation pass.

        Args:
            p: Evaluation result exposing `predictions` (per-class scores,
               arg-maxed over axis 1) and `label_ids` (true label ids).
        """
        preds = np.argmax(p.predictions, axis=1)
        return {
            "accuracy": accuracy_score(p.label_ids, preds),
            "f1_weighted": f1_score(p.label_ids, preds, average='weighted'),
        }

    training_args = TrainingArguments(
        output_dir=STUDENT_MODEL_OUTPUT_PATH, eval_strategy="epoch", save_strategy="epoch",
        num_train_epochs=NUM_EPOCHS, per_device_train_batch_size=TRAIN_BATCH_SIZE, per_device_eval_batch_size=EVAL_BATCH_SIZE,
        fp16=True if device == "cuda" else False, learning_rate=2e-5, weight_decay=0.01, warmup_ratio=0.1,
        logging_dir=os.path.join(OUTPUT_DIR, 'logs'), logging_steps=20,
        load_best_model_at_end=True, metric_for_best_model="f1_weighted", report_to="none"
    )

    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate the `tokenizer` keyword; choose whichever the installed
    # Trainer accepts so the cell runs without a warning on both.
    tokenizer_kwarg = (
        "processing_class"
        if "processing_class" in inspect.signature(Trainer.__init__).parameters
        else "tokenizer"
    )
    trainer = Trainer(
        model=student_model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: student_tokenizer},
    )

    print("\nStarting training...")
    trainer.train()
    print("\n--- Training complete ---")

    print("\nEvaluating the best model on the test set...")
    eval_results = trainer.evaluate()
    print("\nFinal Evaluation Results:", json.dumps(eval_results, indent=2))
    trainer.save_model(STUDENT_MODEL_OUTPUT_PATH)
    print(f"\n✅ Model saved to {STUDENT_MODEL_OUTPUT_PATH}")
else:
    print("⚠️ Could not proceed to model training because the labeled dataset was not created or was empty.")


--- [5] Starting Student Model Training ---
Preparing data for training...
Dataset successfully split and prepared:
DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'response_a', 'response_b', 'winner', 'prompt_type', 'confidence', 'label', '__index_level_0__'],
        num_rows: 240
    })
    test: Dataset({
        features: ['id', 'prompt', 'response_a', 'response_b', 'winner', 'prompt_type', 'confidence', 'label', '__index_level_0__'],
        num_rows: 60
    })
})

Tokenizing data for roberta-base...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]


Setting up model and trainer...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



Starting training...


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,No log,1.954018,0.283333,0.146519
2,2.138800,1.727131,0.366667,0.215123
3,1.721300,1.710989,0.35,0.181481



--- Training complete ---

Evaluating the best model on the test set...



Final Evaluation Results: {
  "eval_loss": 1.7271308898925781,
  "eval_accuracy": 0.36666666666666664,
  "eval_f1_weighted": 0.21512254901960787,
  "eval_runtime": 53.6002,
  "eval_samples_per_second": 1.119,
  "eval_steps_per_second": 0.075,
  "epoch": 3.0
}

✅ Model saved to /content/drive/MyDrive/JudgeModel/roberta-10-class-classifier


In [None]:

# ==============================================================================
# CELL 6: INFERENCE DEMONSTRATION WITH THE FINAL STUDENT MODEL
# ==============================================================================
print("\n--- [6] Inference Demonstration ---")
# Only run the demo when a saved model (identified by its config.json) is
# present in the student output directory.
if os.path.exists(STUDENT_MODEL_OUTPUT_PATH) and os.path.exists(os.path.join(STUDENT_MODEL_OUTPUT_PATH, 'config.json')):
    classifier_pipe = pipeline("text-classification", model=STUDENT_MODEL_OUTPUT_PATH, device=0 if torch.cuda.is_available() else -1)
    test_prompts = [
        "Write a python function to sort a dictionary by its values.", "Act as a cynical pirate captain who is skeptical of a treasure map.",
        "What is the average lifespan of a giant tortoise?", "Create a haiku about a rainy day in a busy city.",
        "Can you create a marketing slogan for a new brand of eco-friendly coffee?",
        "Please translate the following sentence into French: 'The quick brown fox jumps over the lazy dog.'",
        "How does a CPU's cache hierarchy work?", "what's up doc", "if a train leaves chicago at 3pm going 60mph..."
    ]
    PREVIEW_CHARS = 70  # longest prompt prefix echoed next to each prediction
    print("\nClassifying test prompts with the fine-tuned student model:")
    for p in test_prompts:
        result = classifier_pipe(p)
        # Append an ellipsis only when the prompt really was cut off, so short
        # prompts are shown exactly as they were classified.
        preview = p if len(p) <= PREVIEW_CHARS else f"{p[:PREVIEW_CHARS]}..."
        print(f"Prompt: '{preview}'")
        print(f"  -> Predicted Category: {result[0]['label']} (Score: {result[0]['score']:.4f})\n")
    print("\n✅ Inference demonstration complete.")
else:
    print(f"⚠️ Model not found at {STUDENT_MODEL_OUTPUT_PATH}. Please ensure training was successful.")

--- [6] Inference Demonstration ---


Device set to use cpu



Classifying test prompts with the fine-tuned RoBERTa model:
Prompt: 'Write a python function to sort a dictionary by its values....'
  -> Predicted Category: Technical Inquiry (Score: 0.4162)

Prompt: 'Act as a cynical pirate captain who is skeptical of a treasure map....'
  -> Predicted Category: Technical Inquiry (Score: 0.3600)

Prompt: 'What is the average lifespan of a giant tortoise?...'
  -> Predicted Category: Technical Inquiry (Score: 0.3575)

Prompt: 'Create a haiku about a rainy day in a busy city....'
  -> Predicted Category: Factual Information (Score: 0.3363)

Prompt: 'Can you create a marketing slogan for a new brand of eco-friendly coff...'
  -> Predicted Category: Technical Inquiry (Score: 0.4222)

Prompt: 'Please translate the following sentence into French: 'The quick brown ...'
  -> Predicted Category: Technical Inquiry (Score: 0.3503)

Prompt: 'How does a CPU's cache hierarchy work?...'
  -> Predicted Category: Technical Inquiry (Score: 0.4218)

Prompt: 'what's up