In [None]:
# Imports
import numpy as np
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from scipy.sparse import load_npz, hstack, csr_matrix

In [None]:
# Paths
# Every location below is derived from PROJECT_DIR, so relocating the project
# means editing this single line instead of three separate absolute literals.
# Separators are Windows-style backslashes, matching how the paths are joined
# elsewhere in this notebook.
PROJECT_DIR = r"C:\Users\apara\Desktop\MDM"
BASE_DIR = f"{PROJECT_DIR}\\saved_models"
BERT_DIR = f"{BASE_DIR}\\distilBERT_4_epochs"
FEATURES_PATH = f"{BASE_DIR}\\extracted_features.csv"
LABEL_ENCODER_PATH = f"{BASE_DIR}\\label_encoder.pkl"
X_TEST_TFIDF_PATH = f"{BASE_DIR}\\X_test_tfidf.npz"
TRAIN_CSV_PATH = f"{PROJECT_DIR}\\train_none.csv"

# Values of the "model" column that are kept when filtering the test split.
MODELS = ["cohere-chat", "gpt4", "mistral-chat", "mpt-chat", "llama-chat"]
# Seed shared by the train/test split, the stacking folds and the meta-classifier.
RANDOM_STATE = 5

# Label encoder
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load artefacts that were produced by this project.
le = joblib.load(LABEL_ENCODER_PATH)
print(f"Classes: {list(le.classes_)}")

# Load test labels
features_df = pd.read_csv(FEATURES_PATH)
# Select the test rows once; the labels and the numeric features below are
# both taken from this same frame so they cannot drift apart.
test_rows = features_df[features_df['split'] == 'test']
test_labels = test_rows['label']
y_test = le.transform(test_labels)

# Load test texts
# The texts are not stored with the features: they are recovered by repeating
# the train/test split with the same seed and keeping only the MODELS rows.
train_df = pd.read_csv(TRAIN_CSV_PATH)
_, test_df = train_test_split(train_df, test_size=0.2, random_state=RANDOM_STATE, stratify=train_df["model"])
test_df = test_df[test_df["model"].isin(MODELS)]
test_texts = test_df["generation"].astype(str).reset_index(drop=True)

# Texts and labels come from two different files and are later paired purely
# by position. Fail loudly here rather than display a text next to the ground
# truth of some other row.
if len(test_texts) != len(y_test):
    raise ValueError(
        f"Test texts ({len(test_texts)}) and test labels ({len(y_test)}) differ in length; "
        "the re-created split does not match the saved features file."
    )
if not np.array_equal(
    test_df["model"].astype(str).to_numpy(),
    test_labels.astype(str).to_numpy(),
):
    raise ValueError(
        "Row order of the re-created test split does not match the labels in the "
        "saved features file; texts and predictions would be misaligned."
    )

# Load TF-IDF and features for ML models
test_features = test_rows.drop(['label', 'split'], axis=1)
X_test_tfidf = load_npz(X_TEST_TFIDF_PATH)
X_test_num = csr_matrix(test_features.values)
# Column order is TF-IDF block first, then the numeric feature block.
X_test = hstack([X_test_tfidf, X_test_num], format='csr')

# Load ML models
# joblib.load unpickles each file, so these artefacts must come from a trusted
# source. The three files are read in the order listed.
rf_model, xgb_model, lgb_model = map(
    joblib.load,
    (
        f"{BASE_DIR}\\random_forest.pkl",
        f"{BASE_DIR}\\xgboost.pkl",
        f"{BASE_DIR}\\lightgbm.pkl",
    ),
)

# Get ML predictions and probabilities
# For every model: predicted labels first, then the class-probability matrix.
rf_preds, rf_probs = rf_model.predict(X_test), rf_model.predict_proba(X_test)
xgb_preds, xgb_probs = xgb_model.predict(X_test), xgb_model.predict_proba(X_test)
lgb_preds, lgb_probs = lgb_model.predict(X_test), lgb_model.predict_proba(X_test)

# Load DistilBERT predictions
# These arrays are read from disk as saved; no DistilBERT inference runs here.
bert_preds, bert_probs = map(
    np.load,
    (
        f"{BERT_DIR}\\bert_preds.npy",
        f"{BERT_DIR}\\bert_probs.npy",
    ),
)

print(f"Loaded {len(y_test)} test samples")

# Create Stacking Ensemble predictions
# model_names fixes the order in which the base models' probability blocks
# are laid out side by side in the meta-feature matrix.
model_names = ['RandomForest', 'XGBoost', 'LightGBM', 'DistilBERT']
probabilities = dict(zip(model_names, (rf_probs, xgb_probs, lgb_probs, bert_probs)))
predictions = dict(zip(model_names, (rf_preds, xgb_preds, lgb_preds, bert_preds)))

# Create stacking meta-features: one row per test sample, the per-model
# probability columns concatenated horizontally.
meta_features = np.hstack([probabilities[name] for name in model_names])

# Train stacking with cross-validation
# Each fold fits a fresh logistic-regression meta-classifier on the other
# folds and predicts the held-out fold, so every sample receives exactly one
# out-of-fold prediction.
# NOTE: the meta-classifier is fitted on test-set labels (y_test), so the
# accuracy printed below is an out-of-fold estimate over the test set.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
stacking_preds = np.zeros(len(y_test), dtype=int)

for fit_rows, holdout_rows in skf.split(meta_features, y_test):
    meta_clf = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    meta_clf.fit(meta_features[fit_rows], y_test[fit_rows])
    stacking_preds[holdout_rows] = meta_clf.predict(meta_features[holdout_rows])

print(f"Stacking Accuracy: {accuracy_score(y_test, stacking_preds):.4f}")

# Print 20 random samples
# The seed is fixed so the same rows are shown on every run. The sample size
# is capped at the number of test rows because drawing without replacement
# raises when more rows are requested than exist.
np.random.seed(42)
sample_size = min(20, len(y_test))
sample_indices = np.random.choice(len(y_test), size=sample_size, replace=False)

# (row label, encoded prediction array) in the order the rows are printed.
model_rows = [
    ('DistilBERT:', bert_preds),
    ('LightGBM:', lgb_preds),
    ('XGBoost:', xgb_preds),
    ('Random Forest:', rf_preds),
    ('Stacking Ensemble:', stacking_preds),
]

for idx in sample_indices:
    text = test_texts.iloc[idx]
    ground_truth = le.classes_[y_test[idx]]

    # Truncate text for display
    display_text = text[:500] + "..." if len(text) > 500 else text

    print("=" * 70)
    print(f"\n{display_text}\n")
    print("-" * 70)
    print(f"Ground Truth: {ground_truth}")
    print("-" * 70)
    print(f"{'Model':<25} {'Prediction':<20} {'Correct?':<10}")
    print("-" * 70)
    for row_label, encoded_preds in model_rows:
        # Map the encoded class index back to its class name before comparing.
        predicted = le.classes_[encoded_preds[idx]]
        print(f"{row_label:<25} {predicted:<20} | {'Y' if predicted == ground_truth else 'N'}")
    print("\n")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Classes: ['cohere-chat', 'gpt4', 'llama-chat', 'mistral-chat', 'mpt-chat']


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loaded 42787 test samples
Stacking Accuracy: 0.9178

Ingredients:
- 1 cup granulated sugar
- 1 cup water
- 1 (14 oz) can sweetened condensed milk
- 4 large eggs
- 2 tsp vanilla extract
- 1/4 cup caramel sauce

Instructions:

1. Preheat the oven to 325Â°F.

2. In a medium saucepan, combine sugar and water. Cook over medium heat, stirring occasionally, until the sugar dissolves. Increase heat to high and bring to a boil. Cook for about 10-15 minutes, until the mixture turns into a golden brown color. Remove from heat and let it cool.

3. In a mixing ...

----------------------------------------------------------------------
Ground Truth: mpt-chat
----------------------------------------------------------------------
Model                     Prediction           Correct?  
----------------------------------------------------------------------
DistilBERT:               mpt-chat             | Y
LightGBM:                 mpt-chat             | Y
XGBoost:                  mpt-chat           