# Ensemble Model Comparison
Combines predictions from:
- RandomForest
- XGBoost
- LightGBM
- DistilBERT

Ensemble methods:
- Hard Voting (Majority)
- Soft Voting (Average Probabilities)
- Weighted Voting
- Stacking

In [None]:
# Imports
import os

import joblib
import numpy as np
import pandas as pd
import torch
from scipy.sparse import load_npz, hstack, csr_matrix
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report,
    confusion_matrix, cohen_kappa_score
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

print("Imports complete!")

Imports complete!


In [None]:
# Configuration
# Every artifact path is derived from BASE_DIR. The defaults reproduce the
# original machine-specific locations; set MDM_SAVED_MODELS_DIR / MDM_BERT_DIR
# in the environment to run the notebook elsewhere without editing this cell.
BASE_DIR = os.environ.get(
    "MDM_SAVED_MODELS_DIR", r"C:\Users\apara\Desktop\MDM\saved_models"
)
BERT_DIR = os.environ.get("MDM_BERT_DIR", os.path.join(BASE_DIR, "BERT_4_epochs"))

# Model paths (pickled classical models, keyed by display name)
MODEL_PATHS = {
    'RandomForest': os.path.join(BASE_DIR, 'random_forest.pkl'),
    'XGBoost': os.path.join(BASE_DIR, 'xgboost.pkl'),
    'LightGBM': os.path.join(BASE_DIR, 'lightgbm.pkl'),
}

# Other files
LABEL_ENCODER_PATH = os.path.join(BASE_DIR, 'label_encoder.pkl')
TFIDF_PATH = os.path.join(BASE_DIR, 'tfidf_vectorizer.pkl')
X_TEST_TFIDF_PATH = os.path.join(BASE_DIR, 'X_test_tfidf.npz')
FEATURES_PATH = os.path.join(BASE_DIR, 'extracted_features.csv')

# Tokenizer truncation length and inference batch size used for DistilBERT
BERT_MAX_LENGTH = 256
BERT_BATCH_SIZE = 32

print("Configuration set!")
print(f"Base directory: {BASE_DIR}")
print(f"BERT directory: {BERT_DIR}")

Configuration set!
Base directory: C:\Users\apara\Desktop\MDM\saved_models
BERT directory: C:\Users\apara\Desktop\MDM\saved_models\BERT_4_epochs


In [None]:
# Load label encoder and features
# NOTE: joblib.load unpickles arbitrary objects - only load artifacts you
# produced yourself (see the scikit-learn model persistence notes).
print("Loading label encoder...")
le = joblib.load(LABEL_ENCODER_PATH)
print(f"Classes: {list(le.classes_)}")

# Load extracted features
print("\nLoading extracted features...")
features_df = pd.read_csv(FEATURES_PATH)

# Keep only the rows flagged as the test split; 'label' and 'split' are
# bookkeeping columns, everything else is treated as a numeric feature.
is_test_row = features_df['split'] == 'test'
test_features = features_df.loc[is_test_row].drop(['label', 'split'], axis=1)
test_labels = features_df.loc[is_test_row, 'label']

# Encode labels
y_test = le.transform(test_labels)
print(f"Test samples: {len(y_test)}")

# Load TF-IDF
print("\nLoading TF-IDF features...")
X_test_tfidf = load_npz(X_TEST_TFIDF_PATH)
if X_test_tfidf.shape[0] != len(test_features):
    raise ValueError(
        f"TF-IDF matrix has {X_test_tfidf.shape[0]} rows but the feature table "
        f"has {len(test_features)} test rows"
    )

# Combine features: sparse TF-IDF columns first, then the numeric features
X_test_num = csr_matrix(test_features.values)
X_test = hstack([X_test_tfidf, X_test_num], format='csr')
print(f"Combined test features shape: {X_test.shape}")

# Get test texts for BERT
# The raw texts are not stored with the features, so the split is rebuilt here
# with the same seed and stratification column.
train_df = pd.read_csv(r"C:\Users\apara\Desktop\MDM\train_none.csv")
MODELS = ["cohere-chat", "gpt4", "mistral-chat", "mpt-chat", "llama-chat"]
_, test_df = train_test_split(train_df, test_size=0.2, random_state=5, stratify=train_df["model"])
test_df = test_df[test_df["model"].isin(MODELS)]
test_texts = test_df["generation"].astype(str)
print(f"Test texts loaded: {len(test_texts)}")

# The texts come from an independent re-split, so verify row-for-row agreement
# with y_test before any BERT prediction is scored against it.
if len(test_texts) != len(y_test):
    raise ValueError(
        f"Rebuilt text split has {len(test_texts)} rows but y_test has {len(y_test)}"
    )
if not np.array_equal(le.transform(test_df["model"]), y_test):
    raise ValueError(
        "Rebuilt text split is not aligned with the feature table: "
        "labels differ row-for-row from y_test"
    )

Loading label encoder...
Classes: ['cohere-chat', 'gpt4', 'llama-chat', 'mistral-chat', 'mpt-chat']

Loading extracted features...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Test samples: 42787

Loading TF-IDF features...
Combined test features shape: (42787, 5041)
Test texts loaded: 42787


In [None]:
# Load all ML models and get predictions
# Each pickled classifier is scored on the combined sparse test matrix; hard
# labels and class probabilities are kept per model for the ensembles below.
print("Loading ML models and getting predictions...\n")

models = {}
predictions = {}
probabilities = {}

for model_name in MODEL_PATHS:
    print(f"Loading {model_name}...")
    clf = joblib.load(MODEL_PATHS[model_name])
    models[model_name] = clf

    hard_labels = clf.predict(X_test)
    predictions[model_name] = hard_labels
    probabilities[model_name] = clf.predict_proba(X_test)

    acc = accuracy_score(y_test, hard_labels)
    f1 = f1_score(y_test, hard_labels, average='macro')
    print(f"  Accuracy: {acc:.4f}, F1: {f1:.4f}")

print("\n‚úÖ All ML models loaded!")

Loading ML models and getting predictions...

Loading RandomForest...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


  Accuracy: 0.7786, F1: 0.7768
Loading XGBoost...
  Accuracy: 0.8278, F1: 0.8300
Loading LightGBM...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


  Accuracy: 0.8618, F1: 0.8656

‚úÖ All ML models loaded!


In [None]:
# Load DistilBERT and get predictions
# Runs the fine-tuned sequence classifier over the test texts in batches and
# stores argmax labels plus softmax probabilities next to the ML models.
# NOTE(review): scoring against y_test assumes the model's output index order
# matches le.classes_ - confirm against the training notebook.
print("Loading DistilBERT model...")

tokenizer = AutoTokenizer.from_pretrained(BERT_DIR)
bert_model = AutoModelForSequenceClassification.from_pretrained(BERT_DIR)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
bert_model.to(device)
bert_model.eval()

bert_preds = []
bert_probs = []

# Report progress roughly every PROGRESS_EVERY samples. A plain
# `i % 5000 == 0` test only fires when the batch start is a multiple of both
# 5000 and the batch size (every 20000 samples with batch size 32).
PROGRESS_EVERY = 5000
n_texts = len(test_texts)
next_report = 0

print("Getting BERT predictions...")
for start in range(0, n_texts, BERT_BATCH_SIZE):
    batch_texts = test_texts.iloc[start:start + BERT_BATCH_SIZE].tolist()

    encodings = tokenizer(
        batch_texts,
        truncation=True,
        padding=True,
        max_length=BERT_MAX_LENGTH,
        return_tensors='pt'
    )
    encodings = {k: v.to(device) for k, v in encodings.items()}

    with torch.no_grad():
        outputs = bert_model(**encodings)
        probs = torch.softmax(outputs.logits, dim=1)
        preds = torch.argmax(outputs.logits, dim=1)

    bert_preds.extend(preds.cpu().numpy())
    bert_probs.extend(probs.cpu().numpy())

    if start >= next_report:
        print(f"  Processed {start}/{n_texts}")
        next_report += PROGRESS_EVERY

predictions['DistilBERT'] = np.array(bert_preds)
probabilities['DistilBERT'] = np.array(bert_probs)

acc = accuracy_score(y_test, predictions['DistilBERT'])
f1 = f1_score(y_test, predictions['DistilBERT'], average='macro')
print(f"\nDistilBERT - Accuracy: {acc:.4f}, F1: {f1:.4f}")
print("\n‚úÖ DistilBERT loaded!")

Loading DistilBERT model...
Using device: cpu
Getting BERT predictions...
  Processed 0/42787
  Processed 20000/42787
  Processed 40000/42787

DistilBERT - Accuracy: 0.8978, F1: 0.9017

‚úÖ DistilBERT loaded!


In [None]:
# Save to CSV
# Persist the DistilBERT outputs so the slow inference cell need not be re-run.
pd.DataFrame({'bert_preds': bert_preds}).to_csv('bert_preds.csv', index=False)

# bert_probs is a list of per-sample probability vectors. Writing it as a
# single column stores the array repr as text, which cannot be parsed back,
# so write one numeric column per class index instead.
bert_probs_matrix = np.asarray(bert_probs)
if bert_probs_matrix.ndim != 2:
    raise ValueError(
        f"Expected a 2-D (samples x classes) probability matrix, got shape {bert_probs_matrix.shape}"
    )
prob_columns = [f'prob_{class_idx}' for class_idx in range(bert_probs_matrix.shape[1])]
pd.DataFrame(bert_probs_matrix, columns=prob_columns).to_csv('bert_probs.csv', index=False)

# Save to numpy files (faster to load)
np.save('bert_preds.npy', bert_preds)
np.save('bert_probs.npy', bert_probs_matrix)

In [None]:
# Individual Model Performance Summary
# One row of test-set metrics per model, ranked by accuracy (best first).
separator = "=" * 60
print(separator)
print("INDIVIDUAL MODEL PERFORMANCE")
print(separator)

results = [
    {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, model_preds),
        'F1 (Macro)': f1_score(y_test, model_preds, average='macro'),
        'F1 (Weighted)': f1_score(y_test, model_preds, average='weighted'),
        'Cohen Kappa': cohen_kappa_score(y_test, model_preds),
    }
    for model_name, model_preds in predictions.items()
]

results_df = pd.DataFrame(results).sort_values('Accuracy', ascending=False)
print(results_df.to_string(index=False))

INDIVIDUAL MODEL PERFORMANCE
       Model  Accuracy  F1 (Macro)  F1 (Weighted)  Cohen Kappa
  DistilBERT  0.897773    0.901723       0.898093     0.868727
    LightGBM  0.861780    0.865595       0.861500     0.822834
     XGBoost  0.827751    0.829966       0.827406     0.778989
RandomForest  0.778648    0.776762       0.777052     0.714240


In [None]:
# Correlation Analysis
# Measures how similar the models' hard predictions are to each other.
# NOTE(review): Pearson is computed on the encoded class ids, which are nominal
# categories; the pairwise agreement figures below are the more directly
# interpretable measure.
separator = "=" * 60
print("\n" + separator)
print("PREDICTION CORRELATION ANALYSIS")
print(separator)

model_names = list(predictions.keys())
n_models = len(model_names)

# Pearson correlation: diagonal fixed at 1.0, every off-diagonal pair computed
corr_matrix = np.zeros((n_models, n_models))
for row in range(n_models):
    corr_matrix[row, row] = 1.0
    for col in range(n_models):
        if row == col:
            continue
        r_value, _ = pearsonr(predictions[model_names[row]], predictions[model_names[col]])
        corr_matrix[row, col] = r_value

corr_df = pd.DataFrame(corr_matrix, index=model_names, columns=model_names)
print("\nPearson Correlation Matrix:")
print(corr_df.round(3))

# Agreement analysis: share of samples on which each unordered pair agrees
print("\n\nPairwise Agreement (% same predictions):")
for first in range(n_models):
    for second in range(first + 1, n_models):
        m1 = model_names[first]
        m2 = model_names[second]
        agreement = np.mean(predictions[m1] == predictions[m2]) * 100
        print(f"  {m1} vs {m2}: {agreement:.1f}%")


PREDICTION CORRELATION ANALYSIS

Pearson Correlation Matrix:
              RandomForest  XGBoost  LightGBM  DistilBERT
RandomForest         1.000    0.824     0.796       0.719
XGBoost              0.824    1.000     0.890       0.759
LightGBM             0.796    0.890     1.000       0.800
DistilBERT           0.719    0.759     0.800       1.000


Pairwise Agreement (% same predictions):
  RandomForest vs XGBoost: 86.5%
  RandomForest vs LightGBM: 84.1%
  RandomForest vs DistilBERT: 77.4%
  XGBoost vs LightGBM: 91.9%
  XGBoost vs DistilBERT: 81.2%
  LightGBM vs DistilBERT: 84.1%


In [None]:
# Ensemble Methods
# Combines the four base models by hard voting, soft voting and two weighted
# soft-voting schemes; every result is appended to ensemble_results.
separator = "=" * 60
print("\n" + separator)
print("ENSEMBLE METHODS")
print(separator)

ensemble_results = []


def _score_ensemble(method, preds):
    """Print accuracy / macro-F1 of `preds` on y_test, record them, return both."""
    method_acc = accuracy_score(y_test, preds)
    method_f1 = f1_score(y_test, preds, average='macro')
    print(f"   Accuracy: {method_acc:.4f}, F1: {method_f1:.4f}")
    ensemble_results.append({'Method': method, 'Accuracy': method_acc, 'F1 (Macro)': method_f1})
    return method_acc, method_f1


# 1. Hard Voting (Majority Vote)
# Ties resolve to the lowest class index because argmax returns the first maximum.
print("\n1. HARD VOTING (Majority Vote)")
all_preds = np.array([predictions[m] for m in model_names])
n_classes = len(le.classes_)
hard_vote_preds = np.apply_along_axis(
    lambda votes: np.bincount(votes, minlength=n_classes).argmax(),
    0,
    all_preds.astype(int),
)
acc, f1 = _score_ensemble('Hard Voting', hard_vote_preds)

# 2. Soft Voting (Average Probabilities)
print("\n2. SOFT VOTING (Average Probabilities)")
avg_probs = np.mean([probabilities[m] for m in model_names], axis=0)
soft_vote_preds = np.argmax(avg_probs, axis=1)
acc, f1 = _score_ensemble('Soft Voting', soft_vote_preds)

# 3. Weighted Voting (by individual accuracy)
# The weights are each model's accuracy on y_test, normalised to sum to 1.
print("\n3. WEIGHTED VOTING (by accuracy)")
weights = np.array([accuracy_score(y_test, predictions[m]) for m in model_names])
weights = weights / weights.sum()
print(f"   Weights: {dict(zip(model_names, weights.round(3)))}")

weighted_probs = np.zeros_like(probabilities[model_names[0]])
for model_name, model_weight in zip(model_names, weights):
    weighted_probs += model_weight * probabilities[model_name]
weighted_vote_preds = np.argmax(weighted_probs, axis=1)
acc, f1 = _score_ensemble('Weighted Voting', weighted_vote_preds)

# 4. Weighted Voting (BERT gets higher weight)
print("\n4. WEIGHTED VOTING (BERT boosted)")
bert_boost_weights = {'RandomForest': 1, 'XGBoost': 1, 'LightGBM': 1, 'DistilBERT': 2}
total = sum(bert_boost_weights.values())
bert_boost_weights = {k: v/total for k, v in bert_boost_weights.items()}
print(f"   Weights: {bert_boost_weights}")

boosted_probs = np.zeros_like(probabilities[model_names[0]])
for model_name in model_names:
    boosted_probs += bert_boost_weights[model_name] * probabilities[model_name]
boosted_preds = np.argmax(boosted_probs, axis=1)
acc, f1 = _score_ensemble('BERT Boosted', boosted_preds)


ENSEMBLE METHODS

1. HARD VOTING (Majority Vote)
   Accuracy: 0.8606, F1: 0.8654

2. SOFT VOTING (Average Probabilities)
   Accuracy: 0.9008, F1: 0.9047

3. WEIGHTED VOTING (by accuracy)
   Weights: {'RandomForest': np.float64(0.231), 'XGBoost': np.float64(0.246), 'LightGBM': np.float64(0.256), 'DistilBERT': np.float64(0.267)}
   Accuracy: 0.9028, F1: 0.9067

4. WEIGHTED VOTING (BERT boosted)
   Weights: {'RandomForest': 0.2, 'XGBoost': 0.2, 'LightGBM': 0.2, 'DistilBERT': 0.4}
   Accuracy: 0.9117, F1: 0.9155


In [None]:
# Stacking Ensemble
# A logistic-regression meta-learner is fitted on the base models' outputs.
# Only test-set outputs are available here, so the meta-learner is trained and
# scored with 5-fold cross-validation over the test set: each sample is
# predicted by a meta-model that did not see it, but the reported score still
# uses test labels for fitting and is not a fully held-out estimate.


def _stack_out_of_fold(features, labels, splitter):
    """Return out-of-fold meta-learner predictions for every row of `features`.

    features: 2-D array of meta-features, one row per sample.
    labels:   1-D array of encoded class ids aligned with `features`.
    splitter: cross-validation splitter exposing `.split(X, y)`.

    Predictions are stored with the same integer dtype as `labels` instead of
    the float default of np.zeros, so they stay valid class ids.
    """
    oof_preds = np.zeros(len(labels), dtype=np.asarray(labels).dtype)
    for train_idx, val_idx in splitter.split(features, labels):
        fold_clf = LogisticRegression(max_iter=1000, random_state=5)
        fold_clf.fit(features[train_idx], labels[train_idx])
        oof_preds[val_idx] = fold_clf.predict(features[val_idx])
    return oof_preds


print("\n5. STACKING (Meta-learner on probabilities)")

# Create meta-features from all model probabilities (columns grouped per model)
meta_features = np.hstack([probabilities[m] for m in model_names])
print(f"   Meta-features shape: {meta_features.shape}")

# The same splitter is reused for both stacking variants so they share folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
stacking_preds = _stack_out_of_fold(meta_features, y_test, skf)

acc = accuracy_score(y_test, stacking_preds)
f1 = f1_score(y_test, stacking_preds, average='macro')
print(f"   Accuracy: {acc:.4f}, F1: {f1:.4f}")
ensemble_results.append({'Method': 'Stacking (LR)', 'Accuracy': acc, 'F1 (Macro)': f1})

# Stacking with predictions as features (not probabilities)
# The hard labels enter the linear meta-learner as plain numeric columns.
print("\n6. STACKING (Predictions as features)")
pred_features = np.column_stack([predictions[m] for m in model_names])
print(f"   Prediction features shape: {pred_features.shape}")

stacking_preds2 = _stack_out_of_fold(pred_features, y_test, skf)

acc = accuracy_score(y_test, stacking_preds2)
f1 = f1_score(y_test, stacking_preds2, average='macro')
print(f"   Accuracy: {acc:.4f}, F1: {f1:.4f}")
ensemble_results.append({'Method': 'Stacking (Preds)', 'Accuracy': acc, 'F1 (Macro)': f1})


5. STACKING (Meta-learner on probabilities)
   Meta-features shape: (42787, 20)
   Accuracy: 0.9178, F1: 0.9218

6. STACKING (Predictions as features)
   Prediction features shape: (42787, 4)
   Accuracy: 0.8592, F1: 0.8550


In [None]:
# Best Models Only Ensemble
# Soft voting restricted to (a) the three most accurate models according to
# results_df and (b) the three classical ML models without DistilBERT.
print("\n7. TOP-3 MODELS SOFT VOTING")
# results_df is sorted by accuracy (descending), so its first rows are the best
top3 = results_df['Model'].head(3).tolist()
print(f"   Top 3 models: {top3}")

top3_probs = np.mean([probabilities[model_name] for model_name in top3], axis=0)
top3_preds = top3_probs.argmax(axis=1)

acc = accuracy_score(y_test, top3_preds)
f1 = f1_score(y_test, top3_preds, average='macro')
print(f"   Accuracy: {acc:.4f}, F1: {f1:.4f}")
ensemble_results.append({'Method': 'Top-3 Soft Voting', 'Accuracy': acc, 'F1 (Macro)': f1})

# ML only (no BERT)
print("\n8. ML MODELS ONLY (No BERT)")
ml_models = ['RandomForest', 'XGBoost', 'LightGBM']
ml_probs = np.mean([probabilities[model_name] for model_name in ml_models], axis=0)
ml_preds = ml_probs.argmax(axis=1)

acc = accuracy_score(y_test, ml_preds)
f1 = f1_score(y_test, ml_preds, average='macro')
print(f"   Accuracy: {acc:.4f}, F1: {f1:.4f}")
ensemble_results.append({'Method': 'ML Only Ensemble', 'Accuracy': acc, 'F1 (Macro)': f1})


7. TOP-3 MODELS SOFT VOTING
   Top 3 models: ['DistilBERT', 'LightGBM', 'XGBoost']
   Accuracy: 0.9069, F1: 0.9107

8. ML MODELS ONLY (No BERT)
   Accuracy: 0.8482, F1: 0.8516


In [None]:
# Final Comparison
# Merges the individual-model table and the ensemble results into one ranking
# and reports the top entry against the 90% accuracy target.
separator = "=" * 60
print("\n" + separator)
print("FINAL COMPARISON - ALL METHODS")
print(separator)

# Individual models first, in results_df order
individual_rows = [
    {'Method': model_name, 'Type': 'Individual', 'Accuracy': model_acc, 'F1 (Macro)': model_f1}
    for model_name, model_acc, model_f1 in zip(
        results_df['Model'], results_df['Accuracy'], results_df['F1 (Macro)']
    )
]

# Then every ensemble, in the order it was recorded
ensemble_rows = [
    {
        'Method': res['Method'],
        'Type': 'Ensemble',
        'Accuracy': res['Accuracy'],
        'F1 (Macro)': res['F1 (Macro)'],
    }
    for res in ensemble_results
]

all_results = individual_rows + ensemble_rows

final_df = pd.DataFrame(all_results).sort_values('Accuracy', ascending=False)
print(final_df.to_string(index=False))

# Highlight best (first row after the descending sort)
best = final_df.iloc[0]
best_accuracy = best['Accuracy']
print("\n" + separator)
print(f"üèÜ BEST METHOD: {best['Method']}")
print(f"   Type: {best['Type']}")
print(f"   Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")
print(f"   F1 (Macro): {best['F1 (Macro)']:.4f}")
print(separator)

if best_accuracy >= 0.90:
    print("\n‚úÖ TARGET 90% ACCURACY ACHIEVED!")
else:
    print(f"\n‚ùå Gap to 90%: {(0.90 - best_accuracy)*100:.2f}%")


FINAL COMPARISON - ALL METHODS
           Method       Type  Accuracy  F1 (Macro)
    Stacking (LR)   Ensemble  0.917802    0.921785
     BERT Boosted   Ensemble  0.911702    0.915538
Top-3 Soft Voting   Ensemble  0.906911    0.910677
  Weighted Voting   Ensemble  0.902844    0.906706
      Soft Voting   Ensemble  0.900788    0.904698
       DistilBERT Individual  0.897773    0.901723
         LightGBM Individual  0.861780    0.865595
      Hard Voting   Ensemble  0.860612    0.865370
 Stacking (Preds)   Ensemble  0.859233    0.854983
 ML Only Ensemble   Ensemble  0.848202    0.851589
          XGBoost Individual  0.827751    0.829966
     RandomForest Individual  0.778648    0.776762

üèÜ BEST METHOD: Stacking (LR)
   Type: Ensemble
   Accuracy: 0.9178 (91.78%)
   F1 (Macro): 0.9218

‚úÖ TARGET 90% ACCURACY ACHIEVED!


In [None]:
# Classification Report for Best Method
print("\n" + "="*60)
print(f"CLASSIFICATION REPORT - {best['Method']}")
print("="*60)

# Map every method name that can appear in final_df to its prediction vector.
# The earlier if/elif chain had no branch for the stacking or ML-only
# ensembles and silently fell back to soft voting, so the report (and the
# predictions saved later) could describe a different method than the title.
preds_by_method = {
    **predictions,  # individual models, keyed by model name
    'Hard Voting': hard_vote_preds,
    'Soft Voting': soft_vote_preds,
    'Weighted Voting': weighted_vote_preds,
    'BERT Boosted': boosted_preds,
    'Stacking (LR)': stacking_preds,
    'Stacking (Preds)': stacking_preds2,
    'Top-3 Soft Voting': top3_preds,
    'ML Only Ensemble': ml_preds,
}

best_method = best['Method']
if best_method not in preds_by_method:
    raise KeyError(f"No stored predictions for best method {best_method!r}")

# Stacking predictions may be stored as floats; normalise to integer class ids
best_preds = np.asarray(preds_by_method[best_method]).astype(int)

print(classification_report(y_test, best_preds, target_names=le.classes_))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, best_preds)
cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
print(cm_df)


CLASSIFICATION REPORT - Stacking (LR)
              precision    recall  f1-score   support

 cohere-chat       0.92      0.83      0.88      5348
        gpt4       0.98      0.95      0.96      5348
  llama-chat       0.92      0.96      0.94     10697
mistral-chat       0.84      0.88      0.86     10697
    mpt-chat       0.90      0.87      0.89     10697

    accuracy                           0.90     42787
   macro avg       0.91      0.90      0.90     42787
weighted avg       0.90      0.90      0.90     42787


Confusion Matrix:
              cohere-chat  gpt4  llama-chat  mistral-chat  mpt-chat
cohere-chat          4455    41         100           441       311
gpt4                   18  5102         151            52        25
llama-chat             20    25       10270           315        67
mistral-chat          162    35         468          9397       635
mpt-chat              162    25         194           998      9318


In [None]:
# Save Results
# Writes the ranking table, the correlation matrix and the best method's
# predictions into an 'ensemble_results' folder under BASE_DIR.
output_dir = os.path.join(BASE_DIR, 'ensemble_results')
os.makedirs(output_dir, exist_ok=True)

comparison_path = os.path.join(output_dir, 'model_comparison.csv')
correlation_path = os.path.join(output_dir, 'correlation_matrix.csv')
best_preds_path = os.path.join(output_dir, 'best_ensemble_predictions.pkl')

# Comparison table (row index dropped)
final_df.to_csv(comparison_path, index=False)

# Correlation matrix (index kept so the rows stay labelled by model)
corr_df.to_csv(correlation_path)

# Best predictions together with the metadata needed to interpret them
best_payload = {
    'predictions': best_preds,
    'method': best['Method'],
    'accuracy': best['Accuracy'],
    'f1': best['F1 (Macro)']
}
joblib.dump(best_payload, best_preds_path)

print(f"\n‚úÖ Results saved to: {output_dir}")
for saved_name in ("model_comparison.csv", "correlation_matrix.csv", "best_ensemble_predictions.pkl"):
    print(f"   - {saved_name}")


‚úÖ Results saved to: C:\Users\apara\Desktop\MDM\saved_models\ensemble_results
   - model_comparison.csv
   - correlation_matrix.csv
   - best_ensemble_predictions.pkl


# **WITHOUT RANDOM FOREST**

In [None]:
# COMPARISON - With vs Without RandomForest
# Re-runs every ensemble strategy on the three non-RandomForest models and
# compares each accuracy with the matching with-RF entry of ensemble_results
# (read at full precision rather than from hand-copied, rounded constants).
print("="*60)
print("FULL ENSEMBLE COMPARISON: WITH vs WITHOUT RANDOM FOREST")
print("="*60)

# Models without RF
models_no_rf = ['XGBoost', 'LightGBM', 'DistilBERT']

no_rf_results = []


def _record_no_rf(method, preds):
    """Print accuracy / macro-F1 of `preds` on y_test and append them to no_rf_results."""
    method_acc = accuracy_score(y_test, preds)
    method_f1 = f1_score(y_test, preds, average='macro')
    print(f"   Accuracy: {method_acc:.4f}, F1: {method_f1:.4f}")
    no_rf_results.append({'Method': method, 'Accuracy': method_acc, 'F1': method_f1})


def _blend_probabilities(names, model_weights):
    """Weighted sum of the named models' probability matrices.

    Accumulates in float64 so the result does not depend on the dtype of
    whichever model happens to be listed first.
    """
    blended = np.zeros(probabilities[names[0]].shape, dtype=np.float64)
    for model_name, model_weight in zip(names, model_weights):
        blended += model_weight * probabilities[model_name]
    return blended


def _no_rf_out_of_fold(features):
    """Out-of-fold logistic-regression predictions over y_test, using the `skf` folds."""
    oof_preds = np.zeros(len(y_test), dtype=np.asarray(y_test).dtype)
    for train_idx, val_idx in skf.split(features, y_test):
        fold_clf = LogisticRegression(max_iter=1000, random_state=5)
        fold_clf.fit(features[train_idx], y_test[train_idx])
        oof_preds[val_idx] = fold_clf.predict(features[val_idx])
    return oof_preds


# 1. Hard Voting (no RF)
# With three voters a three-way split resolves to the lowest class index.
print("\n1. HARD VOTING (no RF)")
no_rf_all_preds = np.array([predictions[m] for m in models_no_rf])
no_rf_n_classes = len(le.classes_)
no_rf_hard_preds = np.apply_along_axis(
    lambda votes: np.bincount(votes, minlength=no_rf_n_classes).argmax(),
    0,
    no_rf_all_preds.astype(int),
)
_record_no_rf('Hard Voting', no_rf_hard_preds)

# 2. Soft Voting (no RF)
print("\n2. SOFT VOTING (no RF)")
no_rf_probs = np.mean([probabilities[m] for m in models_no_rf], axis=0)
no_rf_soft_preds = np.argmax(no_rf_probs, axis=1)
_record_no_rf('Soft Voting', no_rf_soft_preds)

# 3. Weighted Voting (no RF)
print("\n3. WEIGHTED VOTING by accuracy (no RF)")
no_rf_weights = np.array([accuracy_score(y_test, predictions[m]) for m in models_no_rf])
no_rf_weights = no_rf_weights / no_rf_weights.sum()
print(f"   Weights: {dict(zip(models_no_rf, no_rf_weights.round(3)))}")

no_rf_weighted_probs = _blend_probabilities(models_no_rf, no_rf_weights)
no_rf_weighted_preds = np.argmax(no_rf_weighted_probs, axis=1)
_record_no_rf('Weighted Voting', no_rf_weighted_preds)

# 4. BERT Boosted (no RF)
print("\n4. BERT BOOSTED (no RF)")
no_rf_bert_weights = {'XGBoost': 1, 'LightGBM': 1, 'DistilBERT': 2}
total = sum(no_rf_bert_weights.values())
no_rf_bert_weights = {k: v/total for k, v in no_rf_bert_weights.items()}
print(f"   Weights: {no_rf_bert_weights}")

no_rf_boosted_probs = _blend_probabilities(
    models_no_rf, [no_rf_bert_weights[m] for m in models_no_rf]
)
no_rf_boosted_preds = np.argmax(no_rf_boosted_probs, axis=1)
_record_no_rf('BERT Boosted', no_rf_boosted_preds)

# 5. Stacking on probabilities (no RF)
print("\n5. STACKING on probabilities (no RF)")
no_rf_meta_features = np.hstack([probabilities[m] for m in models_no_rf])
print(f"   Meta-features shape: {no_rf_meta_features.shape}")

no_rf_stacking_preds = _no_rf_out_of_fold(no_rf_meta_features)
_record_no_rf('Stacking (LR)', no_rf_stacking_preds)

# 6. Stacking on predictions (no RF)
print("\n6. STACKING on predictions (no RF)")
no_rf_pred_features = np.column_stack([predictions[m] for m in models_no_rf])
print(f"   Prediction features shape: {no_rf_pred_features.shape}")

no_rf_stacking_preds2 = _no_rf_out_of_fold(no_rf_pred_features)
_record_no_rf('Stacking (Preds)', no_rf_stacking_preds2)

# 7. ML only without RF: soft voting over XGBoost + LightGBM
print("\n7. ML ONLY (no RF) - XGBoost + LightGBM")
ml_no_rf = ['XGBoost', 'LightGBM']
ml_no_rf_probs = np.mean([probabilities[m] for m in ml_no_rf], axis=0)
ml_no_rf_preds = np.argmax(ml_no_rf_probs, axis=1)
_record_no_rf('ML Only (no RF)', ml_no_rf_preds)

# HEAD TO HEAD COMPARISON
print("\n" + "="*60)
print("HEAD TO HEAD: WITH RF vs WITHOUT RF")
print("="*60)

# Both result lists are keyed by a shared comparison name. Only with-RF methods
# that have a no-RF counterpart are kept (Top-3 Soft Voting is excluded: it has
# no counterpart in no_rf_results).
with_rf_aliases = {'ML Only Ensemble': 'ML Only'}
no_rf_aliases = {'ML Only (no RF)': 'ML Only'}
compared_methods = {no_rf_aliases.get(res['Method'], res['Method']) for res in no_rf_results}

with_rf_results = {}
for res in ensemble_results:
    comparison_key = with_rf_aliases.get(res['Method'], res['Method'])
    if comparison_key in compared_methods:
        with_rf_results[comparison_key] = res['Accuracy']

if not with_rf_results:
    raise RuntimeError("ensemble_results has no comparable entries - run the ensemble cells first")

print(f"\n{'Method':<20} {'With RF':<12} {'Without RF':<12} {'Diff':<10} {'Winner':<10}")
print("-"*70)

no_rf_wins = 0
for res in no_rf_results:
    method = res['Method']
    no_rf_acc = res['Accuracy']

    # A missing with-RF score is reported as n/a instead of being treated as
    # 0, which would hand the comparison to the no-RF variant by default.
    with_rf_acc = with_rf_results.get(no_rf_aliases.get(method, method))
    if with_rf_acc is None:
        print(f"{method:<20} {'n/a':<12} {no_rf_acc:<12.4f}")
        continue

    diff = no_rf_acc - with_rf_acc
    if diff > 0:
        winner = "NO RF ‚úì"
        no_rf_wins += 1
    elif diff < 0:
        winner = "WITH RF ‚úì"
    else:
        winner = "TIE"

    print(f"{method:<20} {with_rf_acc:<12.4f} {no_rf_acc:<12.4f} {diff:+.4f}     {winner}")

# Summary
print("\n" + "="*60)
print("SUMMARY")
print("="*60)
print(f"Without RF wins: {no_rf_wins}/{len(no_rf_results)} comparisons")

best_with_rf_method = max(with_rf_results, key=with_rf_results.get)
best_with_rf = with_rf_results[best_with_rf_method]
best_no_rf = max(res['Accuracy'] for res in no_rf_results)
print(f"\nBest WITH RF:    {best_with_rf:.4f} ({best_with_rf_method})")
print(f"Best WITHOUT RF: {best_no_rf:.4f}")
print(f"\nRECOMMENDATION: {'DROP RandomForest' if best_no_rf > best_with_rf else 'KEEP RandomForest'}")

FULL ENSEMBLE COMPARISON: WITH vs WITHOUT RANDOM FOREST

1. HARD VOTING (no RF)
   Accuracy: 0.8704, F1: 0.8736

2. SOFT VOTING (no RF)
   Accuracy: 0.9069, F1: 0.9107

3. WEIGHTED VOTING by accuracy (no RF)
   Weights: {'XGBoost': np.float64(0.32), 'LightGBM': np.float64(0.333), 'DistilBERT': np.float64(0.347)}
   Accuracy: 0.9080, F1: 0.9117

4. BERT BOOSTED (no RF)
   Weights: {'XGBoost': 0.25, 'LightGBM': 0.25, 'DistilBERT': 0.5}
   Accuracy: 0.9100, F1: 0.9140

5. STACKING on probabilities (no RF)
   Meta-features shape: (42787, 15)
   Accuracy: 0.9174, F1: 0.9214

6. STACKING on predictions (no RF)
   Prediction features shape: (42787, 3)
   Accuracy: 0.8709, F1: 0.8734

7. ML ONLY (no RF) - XGBoost + LightGBM
   Accuracy: 0.8520, F1: 0.8555

HEAD TO HEAD: WITH RF vs WITHOUT RF

Method               With RF      Without RF   Diff       Winner    
----------------------------------------------------------------------
Hard Voting          0.8606       0.8704       +0.0098     NO RF