# Ensemble Model - Complete Implementation

Complete ensemble model implementation combining MLP, PyTorch NN, and Gradient Boosting with probability calibration using StackingClassifier and CalibratedClassifierCV.


In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path().absolute().parent))

import numpy as np
import json
import pickle
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, brier_score_loss, log_loss

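# Note: This notebook assumes the MLP and PyTorch models are already trained and saved.
# The model class definitions (PatentNoveltyClassifier, PyTorchPatentClassifier) are in
# train_mlp.ipynb and train_pytorch.ipynb and are NOT imported by the cell above, so the
# "Load Base Models" cell raises NameError on a fresh kernel until they are in scope.
# For a fully self-contained version, copy those class definitions here or import them from the .py files.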


## Load Features


In [None]:
# Directory holding the pre-computed "v2" feature arrays (relative to the working directory).
features_dir = Path("data/features")

# Load the six arrays in a fixed order: (X, y) for train, then validation, then test.
X_train, y_train, X_val, y_val, X_test, y_test = (
    np.load(features_dir / filename)
    for filename in (
        "train_features_v2.X.npy",
        "train_features_v2.y.npy",
        "val_features_v2.X.npy",
        "val_features_v2.y.npy",
        "test_features_v2.X.npy",
        "test_features_v2.y.npy",
    )
)

# The feature names are stored next to the arrays as a JSON document.
with (features_dir / "feature_names_v2.json").open("r") as f:
    feature_names = json.load(f)

# Quick sanity check of the split sizes.
print(f"Training: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")


## Load Base Models


In [None]:
# Restore the two previously trained base models from disk.
# NOTE(review): PatentNoveltyClassifier and PyTorchPatentClassifier are not imported in any
# cell shown in this notebook; they must already be in scope or this cell raises NameError.
mlp_model = PatentNoveltyClassifier.load('models')
pytorch_model = PyTorchPatentClassifier.load('models/pytorch_nn')

# Standardisation statistics are estimated on the training split only and then
# applied unchanged to the validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


## Get Base Model Predictions


In [None]:
# Positive-class probability (column 1 of predict_proba) from each base model, per split.
# The base models are given the raw, unscaled feature matrices.
# NOTE(review): if the base models were fitted on this same training split, the train
# probabilities below are in-sample and may look better than on unseen data — confirm.
mlp_proba_train, pytorch_proba_train = (
    model.predict_proba(X_train)[:, 1] for model in (mlp_model, pytorch_model)
)

mlp_proba_val, pytorch_proba_val = (
    model.predict_proba(X_val)[:, 1] for model in (mlp_model, pytorch_model)
)

mlp_proba_test, pytorch_proba_test = (
    model.predict_proba(X_test)[:, 1] for model in (mlp_model, pytorch_model)
)

# Meta-feature layout: [MLP probability, PyTorch probability, standardised features...].
X_meta_train = np.column_stack(
    (mlp_proba_train, pytorch_proba_train, X_train_scaled)
)
X_meta_val = np.column_stack(
    (mlp_proba_val, pytorch_proba_val, X_val_scaled)
)
X_meta_test = np.column_stack(
    (mlp_proba_test, pytorch_proba_test, X_test_scaled)
)


## Train Stacking Ensemble


In [None]:
# Final estimator: logistic regression fitted on the base estimators' outputs.
meta_model = LogisticRegression(max_iter=1000, random_state=42)

# The only estimator inside the stack is gradient boosting; it is trained on the
# meta-feature matrix (base-model probabilities plus standardised features).
base_models = [
    (
        'gb',
        GradientBoostingClassifier(
            random_state=42,
            max_depth=5,
            n_estimators=100,
        ),
    ),
]

# 5-fold stacking; n_jobs=-1 asks scikit-learn to use all available cores.
stacking = StackingClassifier(
    base_models,
    final_estimator=meta_model,
    n_jobs=-1,
    cv=5,
)

stacking.fit(X_meta_train, y_train)


## Calibrate Probabilities


In [None]:
# Calibrate the ALREADY-FITTED stacking model on the validation split.
#
# Passing the fitted estimator with cv=5 makes CalibratedClassifierCV clone it and
# refit the clones on folds of the data given to .fit() — here the validation split —
# so the model trained on the training split would be discarded. Instead the fitted
# model is kept as-is and only the sigmoid (Platt) calibrator is learned from
# (X_meta_val, y_val).
try:
    # scikit-learn >= 1.6: wrap the fitted model so that fit() leaves it untouched.
    from sklearn.frozen import FrozenEstimator
except ImportError:
    # Older scikit-learn: cv='prefit' is the equivalent switch.
    calibrated_ensemble = CalibratedClassifierCV(
        stacking,
        method='sigmoid',
        cv='prefit'
    )
else:
    calibrated_ensemble = CalibratedClassifierCV(
        FrozenEstimator(stacking),
        method='sigmoid'
    )

calibrated_ensemble.fit(X_meta_val, y_val)


## Evaluate Ensemble


In [None]:
# Score the calibrated ensemble on the test split: positive-class probabilities
# for the probability-based metrics, hard labels for the threshold-based ones.
y_proba_test = calibrated_ensemble.predict_proba(X_meta_test)[:, 1]
y_pred_test = calibrated_ensemble.predict(X_meta_test)

# Metrics are inserted in a fixed order, which is also the order they are printed in.
metrics = {}

# Metrics computed from predicted labels.
metrics.update(
    (name, scorer(y_test, y_pred_test))
    for name, scorer in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
)

# Metrics computed from predicted probabilities.
metrics.update(
    (name, scorer(y_test, y_proba_test))
    for name, scorer in (
        ('roc_auc', roc_auc_score),
        ('brier_score', brier_score_loss),
        ('log_loss', log_loss),
    )
)

print("Test Metrics:")
for key, value in metrics.items():
    print(f"  {key}: {value:.4f}")


## Save Ensemble Model


In [None]:
# Persist the fitted artefacts: the calibrated ensemble and the feature scaler.
# Rebuilding the meta-features at inference time also needs the two base models,
# which are not written by this cell.
ensemble_dir = Path("models/ensemble")
ensemble_dir.mkdir(parents=True, exist_ok=True)

for filename, artifact in (
    ("ensemble_model.pkl", calibrated_ensemble),
    ("scaler.pkl", scaler),
):
    with (ensemble_dir / filename).open("wb") as f:
        pickle.dump(artifact, f)

# Test-set metrics go to a separate results directory as indented JSON.
results_dir = Path("results/ensemble")
results_dir.mkdir(parents=True, exist_ok=True)

with (results_dir / "ensemble_metrics.json").open("w") as f:
    json.dump(metrics, f, indent=2)

print(f"Ensemble model saved to: {ensemble_dir}")
print(f"Metrics saved to: {results_dir}")
