In [33]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, log_loss

# 1. Build a synthetic classification dataset
# 1000 samples with 20 features; the fixed seed makes the data reproducible.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    random_state=42,
)

# Hold out 20% of the samples for evaluation, again with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 2. Base learners, each carrying the preprocessing it needs
# Model A: logistic regression, with feature standardisation bundled into
# the same pipeline so the scaler is fitted together with the model.
clf1 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(random_state=42)),
    ]
)

# Model B: random forest, used on the raw features without a scaling step.
clf2 = RandomForestClassifier(
    n_estimators=50,
    random_state=42,
)

# Model C: RBF-kernel SVM behind its own scaler.
# probability=True is set explicitly because SVC does not output
# probabilities by default.
clf3 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(kernel="rbf", probability=True, random_state=42)),
    ]
)

# 3. Combine the three base learners into a soft-voting ensemble
# Every learner starts with an equal weight of 1; the weights are spelled
# out explicitly so they can be tuned later.
eclf = VotingClassifier(
    estimators=[
        ("lr", clf1),
        ("rf", clf2),
        ("svc", clf3),
    ],
    voting="soft",
    weights=[1, 1, 1],
)

# 4. Fit the Ensemble
eclf.fit(X_train, y_train)

# 5. Evaluation
# Hard class labels feed the accuracy score; class probabilities feed the
# log loss.
y_pred = eclf.predict(X_test)
y_prob = eclf.predict_proba(X_test)

print(f"Ensemble Accuracy: {accuracy_score(y_test, y_pred):.4f}")
# Pass the fitted class labels explicitly: otherwise log_loss infers the
# label set from y_test alone and raises if that split happens to contain
# only one class.
print(f"Ensemble Log Loss: {log_loss(y_test, y_prob, labels=eclf.classes_):.4f}")

Ensemble Accuracy: 0.8700
Ensemble Log Loss: 0.3362
