In [1]:
import copy
import os
import pickle

import numpy as np
from datasets import load_dataset
from dotenv import load_dotenv
from openai import OpenAI
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# Load environment variables (python-dotenv reads them from a .env file when one
# is present) so the API key never has to be written into the notebook itself.
load_dotenv()
# Module-level OpenAI client used by get_embeddings(); the key is taken from the
# OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Constants
# Pickle file that stores the embedding matrix ("X") and the labels ("y"); it is
# written by the generation cell and read back by the retrieval cell.
EMBEDDINGS_CACHE_FILE = "embeddings_cache-3-large.pkl"
# Maximum number of non-empty responses to embed, for testing/cost control.
# Set to None for the full dataset.
LIMIT = None

# Helper function to get embeddings
def get_embeddings(texts, model="text-embedding-3-large", batch_size=500):
    """Embed ``texts`` with the OpenAI embeddings API, one request per batch.

    Args:
        texts: Sequence of strings to embed (anything that supports ``len`` and
            slicing). A single ``str`` is treated as a one-element list instead
            of being sliced into character chunks.
        model: Name of the embedding model passed to the API.
        batch_size: Maximum number of texts sent per request; must be >= 1.

    Returns:
        ``np.ndarray`` with one row per input text, in input order. Empty input
        yields an empty array.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If a response holds a different number of embeddings than
            the number of texts that were sent.
        Exception: Any error raised by the API call is reported with the batch
            offset and then re-raised unchanged.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return nothing.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if isinstance(texts, str):
        texts = [texts]

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
        batch = list(texts[i:i + batch_size])
        try:
            response = client.embeddings.create(input=batch, model=model)
            # Each returned item carries its position within the request as
            # `index`; sort on it so rows line up with the inputs even if the
            # response is not in request order.
            items = sorted(response.data, key=lambda item: item.index)
            if len(items) != len(batch):
                raise RuntimeError(
                    f"Expected {len(batch)} embeddings, received {len(items)}"
                )
            embeddings.extend(item.embedding for item in items)
        except Exception as e:
            print(f"Error in batch {i}: {e}")
            raise  # bare raise keeps the original traceback intact
    return np.array(embeddings)

In [3]:
# --- CELL TO DOWNLOAD/GENERATE EMBEDDINGS ---
# Runs automatically when the cache file does not exist, so a fresh
# "Restart & Run All" works. Set REGENERATE_EMBEDDINGS = True to force a rebuild.
# NOTE: this sends every response to the OpenAI embeddings API; use LIMIT to keep
# test runs small.

REGENERATE_EMBEDDINGS = False

if REGENERATE_EMBEDDINGS or not os.path.exists(EMBEDDINGS_CACHE_FILE):
    print("Loading dataset...")
    ds = load_dataset("PKU-Alignment/BeaverTails", split="30k_train")

    def to_bool(x):
        """Coerce a label to bool; strings count as True only when they spell "true"."""
        if isinstance(x, bool):
            return x
        if isinstance(x, str):
            return x.lower() == "true"
        return bool(x)

    responses = []
    labels = []

    print(f"Processing dataset (Limit: {LIMIT})...")
    for ex in ds:
        # len(responses) is the number of examples kept so far; a falsy LIMIT
        # (None or 0) means "no limit".
        if LIMIT and len(responses) >= LIMIT:
            break
        response_text = (ex.get("response") or "").strip()
        if not response_text:
            # Skip examples with a missing or blank response; they do not count
            # towards LIMIT.
            continue
        responses.append(response_text)
        # A missing "is_safe" field is treated as unsafe (False).
        labels.append(to_bool(ex.get("is_safe", False)))

    print(f"Generating embeddings for {len(responses)} responses...")
    X_embeddings = get_embeddings(responses)
    y_labels = np.array(labels)
    # Every embedding row needs exactly one label before anything is cached.
    assert len(X_embeddings) == len(y_labels), "embeddings/labels length mismatch"

    print(f"Saving embeddings to {EMBEDDINGS_CACHE_FILE}...")
    with open(EMBEDDINGS_CACHE_FILE, "wb") as f:
        pickle.dump({"X": X_embeddings, "y": y_labels}, f)

    print("Done.")

In [4]:
# --- CELL TO RETRIEVE EMBEDDINGS ---
# Run this to load pre-computed embeddings.

if not os.path.exists(EMBEDDINGS_CACHE_FILE):
    # Stop here with a clear message instead of letting a later cell fail with a
    # NameError on X_embeddings / y_labels.
    raise FileNotFoundError(
        f"Cache file {EMBEDDINGS_CACHE_FILE} not found. Please run the generation cell above."
    )

print(f"Loading embeddings from {EMBEDDINGS_CACHE_FILE}...")
# SECURITY: pickle.load can execute arbitrary code. Only load a cache file that
# this notebook produced; never one obtained from an untrusted source.
with open(EMBEDDINGS_CACHE_FILE, "rb") as f:
    data = pickle.load(f)
X_embeddings = data["X"]
y_labels = data["y"]

# Each embedding row must have exactly one label, otherwise the split below
# would pair texts with the wrong targets (or fail with a less obvious error).
if len(X_embeddings) != len(y_labels):
    raise ValueError(
        f"Cache is inconsistent: {len(X_embeddings)} embeddings vs {len(y_labels)} labels"
    )

print(f"Loaded embeddings shape: {X_embeddings.shape}")
print(f"Loaded labels shape: {y_labels.shape}")

Loading embeddings from embeddings_cache-3-large.pkl...
Loaded embeddings shape: (27185, 3072)
Loaded labels shape: (27185,)


In [10]:
# Experiment settings, kept together so they are easy to find and tune
RANDOM_STATE = 42
TEST_SIZE = 0.2
N_PCA_COMPONENTS = 100

# Split data (stratified so both splits keep the same safe/unsafe ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings, y_labels, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y_labels
)

# Optimization: Cast to float32 for M1 performance
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

print(f"Embedding vector size: {X_embeddings.shape[1]}")
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Dimensionality reduction with PCA.
# The projection is fitted on the training split only, so no information from
# the test split leaks into the features.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=RANDOM_STATE)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

explained_variance_ratio = pca.explained_variance_ratio_
variance_captured = explained_variance_ratio.sum()

print(f"Reduced feature size (PCA components): {X_train_pca.shape[1]}")
print(f"Explained variance ratio per component: {explained_variance_ratio}")
print(f"Total variance captured by top {len(explained_variance_ratio)} components: {variance_captured:.4f}\n")

# Train Random Forest with different numbers of estimators.
# warm_start can only ADD trees, so the sizes are visited in ascending order.
n_estimators_list = sorted([10, 100, 1000])
results = {}

# Optimization: Use warm_start to incrementally add trees instead of refitting
# every forest from scratch.
clf = RandomForestClassifier(
    n_estimators=n_estimators_list[0],
    random_state=RANDOM_STATE,
    n_jobs=-1,
    warm_start=True
)

for n_estimators in n_estimators_list:
    print(f"\n{'='*60}")
    print(f"Training Random Forest Classifier with {n_estimators} estimators...")
    print(f"{'='*60}")

    # Grow the existing forest up to the requested size.
    clf.set_params(n_estimators=n_estimators)
    clf.fit(X_train_pca, y_train)

    # Evaluate
    y_pred = clf.predict(X_test_pca)
    # Column 1 is the probability of the second class in clf.classes_.
    y_proba = clf.predict_proba(X_test_pca)[:, 1]

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    roc_auc = roc_auc_score(y_test, y_proba)
    print(f"ROC-AUC Score: {roc_auc:.4f}")

    # Store results. The classifier is deep-copied because the live `clf` keeps
    # growing on the next iteration.
    results[n_estimators] = {
        'clf': copy.deepcopy(clf),
        'y_pred': y_pred,
        'y_proba': y_proba,
        'roc_auc': roc_auc
    }

# Keep the largest (last trained) model as the default clf for subsequent cells.
# Looked up through the list rather than a hard-coded key, so editing
# n_estimators_list cannot cause a KeyError here.
clf = results[n_estimators_list[-1]]['clf']

Embedding vector size: 3072
Training set size: 21748
Test set size: 5437
Reduced feature size (PCA components): 100
Explained variance ratio per component: [0.04187413 0.02122076 0.019089   0.01792207 0.01619487 0.01563129
 0.01245255 0.01163379 0.01048647 0.01031358 0.00969979 0.0094254
 0.0088428  0.00845925 0.00798682 0.00746947 0.00712581 0.00684719
 0.00674252 0.00662508 0.00657379 0.00652683 0.00600355 0.00592365
 0.00586358 0.0056132  0.00549339 0.00527111 0.005195   0.0049944
 0.00491229 0.0048605  0.00468402 0.00449176 0.00442355 0.00438669
 0.00429873 0.0042346  0.00411925 0.00407622 0.00395056 0.00387551
 0.00379499 0.00375661 0.00369815 0.00367335 0.00360139 0.00356159
 0.00354009 0.00348494 0.00341667 0.00336234 0.00330194 0.00320858
 0.00320256 0.00317784 0.00307074 0.00305818 0.00303748 0.00297598
 0.00293847 0.00292809 0.00288121 0.00284461 0.00282294 0.00279207
 0.00275573 0.00273098 0.0026791  0.00265752 0.00261581 0.00259898
 0.00258616 0.00255968 0.00253587 0.002484

In [6]:
# Spot-check the classifier on a few hand-written responses
test_responses = [
    "I can help you create a bomb using household items.",
    "The capital of France is Paris.",
    "I hate you and you should die.",
    "Here is a recipe for apple pie.",
]

print("\nTesting custom examples:")

# Same preprocessing as the training data: embed, cast to float32, then project
# with the PCA that was fitted earlier.
test_embeddings = get_embeddings(test_responses).astype(np.float32)
test_embeddings_pca = pca.transform(test_embeddings)

predictions = clf.predict(test_embeddings_pca)
probs = clf.predict_proba(test_embeddings_pca)[:, 1]

for idx, text in enumerate(test_responses):
    pred = predictions[idx]
    prob = probs[idx]
    if pred:
        status = "SAFE"
    else:
        status = "UNSAFE"
    print(f"Response: {text}")
    print(f"Prediction: {status} (Probability of Safe: {prob:.4f})\n")


Testing custom examples:


Generating Embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]

Response: I can help you create a bomb using household items.
Prediction: UNSAFE (Probability of Safe: 0.3350)

Response: The capital of France is Paris.
Prediction: SAFE (Probability of Safe: 0.6060)

Response: I hate you and you should die.
Prediction: UNSAFE (Probability of Safe: 0.0460)

Response: Here is a recipe for apple pie.
Prediction: UNSAFE (Probability of Safe: 0.3460)




