In [2]:
import copy
import os
import pickle

import numpy as np
from datasets import load_dataset
from dotenv import load_dotenv
from openai import OpenAI
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# Load environment variables, then build the shared OpenAI client from
# whatever OPENAI_API_KEY resolves to in the process environment.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Constants
# Cap on how many dataset examples are processed (testing / cost control).
# None means the full dataset is used.
LIMIT = None
# Pickle file where the generated embeddings and labels are cached.
EMBEDDINGS_CACHE_FILE = "embeddings_cache-3-large.pkl"

# Helper function to get embeddings
def get_embeddings(texts, model="text-embedding-3-large", batch_size=500):
    """Embed ``texts`` through the module-level OpenAI ``client``, batch by batch.

    Args:
        texts: Sliceable sequence of strings to embed.
        model: Embedding model name forwarded to the API.
        batch_size: Maximum number of texts sent in a single request; must be >= 1.

    Returns:
        A NumPy array built from the collected embeddings, one entry per input
        text, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Any error raised while requesting or reading a batch is
            printed together with the batch offset and then re-raised.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return nothing.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
        batch = texts[start:start + batch_size]
        try:
            response = client.embeddings.create(input=batch, model=model)
            # Each returned item carries the position of its input in the
            # request; sort on it so rows always line up with `texts`.
            ordered = sorted(response.data, key=lambda item: item.index)
            embeddings.extend(item.embedding for item in ordered)
        except Exception as e:
            print(f"Error in batch {start}: {e}")
            # Bare raise keeps the original traceback intact.
            raise
    return np.array(embeddings)

In [4]:
# --- CELL TO DOWNLOAD/GENERATE EMBEDDINGS ---
# Disabled by default: flip the guard below only when the embeddings must be
# regenerated or the cache file does not exist yet.

if False:
    def to_bool(x):
        """Coerce a label to bool; strings count as True only when they spell "true"."""
        return x.lower() == "true" if isinstance(x, str) else bool(x)

    print("Loading dataset...")
    ds = load_dataset("PKU-Alignment/BeaverTails", split="30k_train")

    responses = []
    labels = []

    print(f"Processing dataset (Limit: {LIMIT})...")
    for ex in ds:
        # Every kept example appends exactly one response, so the list length
        # doubles as the running count checked against LIMIT.
        if LIMIT and len(responses) >= LIMIT:
            break
        text = (ex.get("response") or "").strip()
        if text:
            # Examples with an empty response are skipped entirely.
            responses.append(text)
            labels.append(to_bool(ex.get("is_safe", False)))

    print(f"Generating embeddings for {len(responses)} responses...")
    X_embeddings = get_embeddings(responses)
    y_labels = np.array(labels)

    print(f"Saving embeddings to {EMBEDDINGS_CACHE_FILE}...")
    payload = {"X": X_embeddings, "y": y_labels}
    with open(EMBEDDINGS_CACHE_FILE, "wb") as cache_file:
        pickle.dump(payload, cache_file)

    print("Done.")

In [5]:
# --- CELL TO RETRIEVE EMBEDDINGS ---
# Reads the pre-computed embeddings and labels back from the cache file.

if not os.path.exists(EMBEDDINGS_CACHE_FILE):
    print(f"Cache file {EMBEDDINGS_CACHE_FILE} not found. Please run the generation cell above.")
else:
    print(f"Loading embeddings from {EMBEDDINGS_CACHE_FILE}...")
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files that were produced by the generation cell in this notebook.
    with open(EMBEDDINGS_CACHE_FILE, "rb") as cache_file:
        data = pickle.load(cache_file)
    X_embeddings = data["X"]
    y_labels = data["y"]
    print(f"Loaded embeddings shape: {X_embeddings.shape}")
    print(f"Loaded labels shape: {y_labels.shape}")

Loading embeddings from embeddings_cache-3-large.pkl...
Loaded embeddings shape: (27185, 3072)
Loaded labels shape: (27185,)


In [6]:
# Split data (stratified, so both splits keep the same label balance)
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings, y_labels, test_size=0.2, random_state=42, stratify=y_labels
)

# scikit-learn's forests convert their input to float32 internally; casting
# once here avoids repeating that conversion on every fit/predict call.
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

print(f"Embedding vector size: {X_embeddings.shape[1]}")
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Forest sizes to evaluate. Edit freely: the loop and the final model
# selection below are both derived from this list.
n_estimators_list = [10, 100, 1000]
results = {}

# warm_start=True makes each fit() keep the existing trees and only grow the
# missing ones, so the sizes must be visited in ascending order.
clf = RandomForestClassifier(
    n_estimators=min(n_estimators_list),
    random_state=42,
    n_jobs=-1,
    warm_start=True
)

for n_estimators in sorted(n_estimators_list):
    print(f"\n{'='*60}")
    print(f"Training Random Forest Classifier with {n_estimators} estimators...")
    print(f"{'='*60}")

    clf.set_params(n_estimators=n_estimators)
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = clf.predict(X_test)
    # Column 1 of predict_proba is the score passed to ROC-AUC.
    y_proba = clf.predict_proba(X_test)[:, 1]

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    roc_auc = roc_auc_score(y_test, y_proba)
    print(f"ROC-AUC Score: {roc_auc:.4f}")

    # Store a snapshot: clf keeps growing on the next iteration, so a deep
    # copy is needed to preserve the model at this size.
    results[n_estimators] = {
        'clf': copy.deepcopy(clf),
        'y_pred': y_pred,
        'y_proba': y_proba,
        'roc_auc': roc_auc
    }

# Keep the largest forest as the default clf for subsequent cells
clf = results[max(n_estimators_list)]['clf']

Embedding vector size: 3072
Training set size: 21748
Test set size: 5437

Training Random Forest Classifier with 10 estimators...

Classification Report:
              precision    recall  f1-score   support

       False       0.72      0.85      0.78      3116
        True       0.74      0.56      0.64      2321

    accuracy                           0.73      5437
   macro avg       0.73      0.71      0.71      5437
weighted avg       0.73      0.73      0.72      5437

ROC-AUC Score: 0.8115

Training Random Forest Classifier with 100 estimators...

Classification Report:
              precision    recall  f1-score   support

       False       0.81      0.86      0.83      3116
        True       0.79      0.73      0.76      2321

    accuracy                           0.80      5437
   macro avg       0.80      0.79      0.80      5437
weighted avg       0.80      0.80      0.80      5437

ROC-AUC Score: 0.8919

Training Random Forest Classifier with 1000 estimators...

Classi

In [7]:
# Spot-check the default classifier on a few hand-written responses
test_responses = [
    "I can help you create a bomb using household items.",
    "The capital of France is Paris.",
    "I hate you and you should die.",
    "Here is a recipe for apple pie.",
]

print("\nTesting custom examples:")
test_embeddings = get_embeddings(test_responses)
predictions = clf.predict(test_embeddings)
# Column 1 of predict_proba is what gets reported below as "Probability of Safe".
probs = clf.predict_proba(test_embeddings)[:, 1]

for text, pred, prob in zip(test_responses, predictions, probs):
    # A truthy prediction is reported as SAFE, anything else as UNSAFE.
    if pred:
        status = "SAFE"
    else:
        status = "UNSAFE"
    print(f"Response: {text}")
    print(f"Prediction: {status} (Probability of Safe: {prob:.4f})\n")


Testing custom examples:


Generating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.83it/s]

Response: I can help you create a bomb using household items.
Prediction: UNSAFE (Probability of Safe: 0.2900)

Response: The capital of France is Paris.
Prediction: SAFE (Probability of Safe: 0.5510)

Response: I hate you and you should die.
Prediction: UNSAFE (Probability of Safe: 0.3920)

Response: Here is a recipe for apple pie.
Prediction: UNSAFE (Probability of Safe: 0.4870)




