In [14]:
import os
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from dotenv import load_dotenv
from openai import OpenAI
from tqdm.auto import tqdm

# Load environment variables via python-dotenv, then build the OpenAI client
# from the OPENAI_API_KEY environment variable.
load_dotenv()
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [15]:
# Fetch the "30k_train" split of the BeaverTails dataset
print("Loading dataset...")
ds = load_dataset(
    "PKU-Alignment/BeaverTails",
    split="30k_train",
)

def to_bool(x):
    """Coerce a raw label value to a Python ``bool``.

    Args:
        x: The value to convert. ``bool`` values are returned unchanged,
            strings are compared case-insensitively against ``"true"``
            after stripping surrounding whitespace, and anything else is
            passed through ``bool()``.

    Returns:
        ``True`` or ``False``. Any string other than ``"true"`` (ignoring
        case and surrounding whitespace) yields ``False``.
    """
    if isinstance(x, bool):
        return x
    if isinstance(x, str):
        # Strip first so padded values such as " True\n" are not silently
        # read as False.
        return x.strip().lower() == "true"
    return bool(x)

# Parallel lists: one response text and one boolean safety label per kept example
responses, labels = [], []

# Optional cap on the number of kept examples (testing / cost control).
# Set to None or a large number to run on the full dataset
LIMIT = 10000

print(f"Processing dataset (Limit: {LIMIT})...")

count = 0
for example in ds:
    # A falsy LIMIT disables the cap; otherwise stop once enough examples are kept.
    if LIMIT and count >= LIMIT:
        break

    text = (example.get("response") or "").strip()
    if text:
        # Only examples with a non-empty response count toward LIMIT.
        responses.append(text)
        labels.append(to_bool(example.get("is_safe", False)))
        count += 1

print(f"Loaded {len(responses)} examples.")

Loading dataset...
Processing dataset (Limit: 10000)...
Loaded 10000 examples.


In [16]:
def get_embeddings(texts, model="text-embedding-3-small", batch_size=100):
    """Embed ``texts`` with the OpenAI embeddings API, one request per batch.

    Args:
        texts: Sequence of strings to embed.
        model: Name of the embedding model passed to the API.
        batch_size: Maximum number of texts sent per request; must be >= 1.

    Returns:
        A NumPy array with one row per input text, in input order. For empty
        input an empty ``(0, 0)`` array is returned without calling the API.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
        RuntimeError: If a response holds a different number of embeddings
            than texts were sent (rows would no longer line up with labels).
        Exception: Any error raised while processing a batch is printed with
            the batch start offset and re-raised.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return nothing.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if len(texts) == 0:
        return np.empty((0, 0))

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating Embeddings"):
        batch = texts[i:i + batch_size]
        try:
            # OpenAI embedding API allows batching
            response = client.embeddings.create(input=batch, model=model)
            if len(response.data) != len(batch):
                raise RuntimeError(
                    f"expected {len(batch)} embeddings, got {len(response.data)}"
                )
            # Each returned item carries the index of its input; sort on it so
            # rows stay aligned with `texts` whatever order the API returns.
            ordered = sorted(response.data, key=lambda item: item.index)
            embeddings.extend(item.embedding for item in ordered)
        except Exception as e:
            print(f"Error in batch {i}: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    return np.array(embeddings)

# Embed every collected response and turn the label list into a NumPy array
print("Generating embeddings for responses...")
X_embeddings = get_embeddings(texts=responses)
y_labels = np.asarray(labels)

print(f"Embeddings shape: {X_embeddings.shape}")

Generating embeddings for responses...


Generating Embeddings: 100%|██████████| 100/100 [01:13<00:00,  1.37it/s]


Embeddings shape: (10000, 1536)


In [29]:
# Split data: hold out 20% for evaluation, stratified on the label so both
# splits keep the same class ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings, y_labels, test_size=0.2, random_state=42, stratify=y_labels
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Train Random Forest
print("Training Random Forest Classifier...")
clf = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    # Fit/predict across all available cores; with a fixed random_state the
    # resulting forest does not depend on the number of jobs.
    n_jobs=-1,
)
clf.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = clf.predict(X_test)
# Column 1 is the probability of clf.classes_[1]
# NOTE(review): presumably the True ("safe") label -- confirm via clf.classes_
y_proba = clf.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print(f"ROC-AUC Score: {roc_auc_score(y_test, y_proba):.4f}")

Training set size: 8000
Test set size: 2000
Training Random Forest Classifier...

Classification Report:
              precision    recall  f1-score   support

       False       0.80      0.84      0.82      1144
        True       0.77      0.72      0.74       856

    accuracy                           0.79      2000
   macro avg       0.78      0.78      0.78      2000
weighted avg       0.79      0.79      0.79      2000

ROC-AUC Score: 0.8785


In [28]:
# Debug peek at the held-out labels returned by the train/test split.
print(y_test)

[ True  True False ...  True False False]


In [19]:
# Spot-check the trained classifier on a few hand-written responses
test_responses = [
    "I can help you create a bomb using household items.",
    "The capital of France is Paris.",
    "I hate you and you should die.",
    "Here is a recipe for apple pie.",
]

print("\nTesting custom examples:")
test_embeddings = get_embeddings(test_responses)
predictions = clf.predict(test_embeddings)
# Second predict_proba column, reported below as the probability of "safe"
probs = clf.predict_proba(test_embeddings)[:, 1]

for idx, text in enumerate(test_responses):
    # A truthy prediction is reported as SAFE, a falsy one as UNSAFE
    if predictions[idx]:
        status = "SAFE"
    else:
        status = "UNSAFE"
    print(f"Response: {text}")
    print(f"Prediction: {status} (Probability of Safe: {probs[idx]:.4f})\n")


Testing custom examples:


Generating Embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.40it/s]

Response: I can help you create a bomb using household items.
Prediction: UNSAFE (Probability of Safe: 0.2400)

Response: The capital of France is Paris.
Prediction: UNSAFE (Probability of Safe: 0.4900)

Response: I hate you and you should die.
Prediction: UNSAFE (Probability of Safe: 0.4600)

Response: Here is a recipe for apple pie.
Prediction: UNSAFE (Probability of Safe: 0.4300)




