In [20]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib
import os
from tensorflow.keras.models import load_model

# Générer un dataset non linéairement séparable
X, y = make_moons(n_samples=300, noise=0.2, random_state=42)

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

# Standardiser les données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Charger les modèles depuis le dossier "models"
models = {}
models_folder = os.path.join(os.getcwd(), "models")
for file in os.listdir(models_folder):
    model_name = file.split(".")[0]
    file_extension = file.split(".")[-1]
    if file_extension == "pkl":
        models[model_name] = joblib.load(os.path.join(models_folder, file))
        print(f"Imported sklearn model: {model_name}")
    elif file_extension == "keras":
        models[model_name] = load_model(os.path.join(models_folder, file))
        print(f"Imported keras model: {model_name}")


# Entraînement et prédictions binaires (1=correct, 0=faux)
oracle_outputs = {}
for name, model in models.items():
    if name == "cnn_model":
        y_pred = (model.predict(X_test_scaled) > 0.5).astype(int).flatten()
    else:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    oracle_outputs[name] = (y_pred == y_test).astype(int)  # 1 si correct, 0 sinon

# Matrice binaire (n_classifiers x n_samples)
print("Oracle outputs:", oracle_outputs)
oracle_matrix = np.array(list(oracle_outputs.values()))
print(oracle_matrix.shape)
n_classifiers, n_samples = oracle_matrix.shape

# Diversité pairwise (comparer chaque paire de classifieurs)
def pairwise_metrics(matrix):
    n = matrix.shape[0]
    Qs, corrs, disagreements, double_faults = [], [], [], []
    for i in range(n):
        for j in range(i + 1, n):
            a = matrix[i]
            b = matrix[j]
            N11 = np.sum((a == 1) & (b == 1))
            N00 = np.sum((a == 0) & (b == 0))
            N10 = np.sum((a == 1) & (b == 0))
            N01 = np.sum((a == 0) & (b == 1))
            N = len(a)

            # Q-statistic
            Q = (N11 * N00 - N10 * N01) / (N11 * N00 + N10 * N01 + 1e-10)
            # Correlation
            rho = np.corrcoef(a, b)[0, 1]
            # Disagreement
            dis = (N01 + N10) / N
            # Double fault
            df = N00 / N

            Qs.append(Q)
            corrs.append(rho)
            disagreements.append(dis)
            double_faults.append(df)

    return {
        "Q_statistic": np.mean(Qs),
        "correlation": np.mean(corrs),
        "disagreement": np.mean(disagreements),
        "double_fault": np.mean(double_faults)
    }

# Diversité non pairwise
def non_pairwise_metrics(matrix):
    m, n = matrix.shape
    mean_correct = np.mean(matrix, axis=0)

    # Entropy
    entropy = -np.mean(mean_correct * np.log2(mean_correct + 1e-10) + (1 - mean_correct) * np.log2(1 - mean_correct + 1e-10))
    
    # Kohavi-Wolpert variance
    KW = np.mean(mean_correct * (1 - mean_correct))

    # Interrater agreement (kappa)
    p_bar = np.mean(matrix)
    p_j = np.mean(matrix, axis=0)
    P_e = np.mean(p_j ** 2 + (1 - p_j) ** 2)
    kappa = (p_bar - P_e) / (1 - P_e + 1e-10)

    # Difficulty θ
    theta = np.var(mean_correct)

    # Generalized diversity
    incorrects = (matrix == 0).astype(int)
    shared_errors = np.dot(incorrects, incorrects.T)
    np.fill_diagonal(shared_errors, 0)
    GD = np.sum(shared_errors) / (m * (m - 1))

    # Coincident Failure Diversity (CFD)
    total_failures = np.sum(incorrects, axis=0)
    max_failures = np.max(total_failures)
    CFD = 1 - (max_failures / m)

    return {
        "entropy": entropy,
        "KW_variance": KW,
        "kappa": kappa,
        "theta": theta,
        "generalized_diversity": GD / n,
        "CFD": CFD
    }

# Calcul des 10 métriques
metrics_pairwise = pairwise_metrics(oracle_matrix)
metrics_non_pairwise = non_pairwise_metrics(oracle_matrix)

# Fusion
all_metrics = {**metrics_pairwise, **metrics_non_pairwise}

# Affichage
df_metrics = pd.DataFrame([all_metrics])
print(" Mesures de diversité pour l'ensemble des 4 modèles :")
display(df_metrics)


X_train shape: (240, 2), y_train shape: (240,)
Imported keras model: cnn_model
Imported sklearn model: decision_tree
Imported sklearn model: kernel_svc
Imported sklearn model: linear_svc
Imported sklearn model: random_forest
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step
Oracle outputs: {'cnn_model': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]), 'decision_tree': array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]), 'kernel_svc': array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]), 'linear_svc': array([1, 1, 1, 1,

Unnamed: 0,Q_statistic,correlation,disagreement,double_fault,entropy,KW_variance,kappa,theta,generalized_diversity,CFD
0,0.684158,0.399624,0.093333,0.04,0.157256,0.037333,-0.160714,0.041822,0.04,0.2


In [None]:
from itertools import combinations

# List of model names
model_names = list(models.keys())

# Generate all possible ensemble combinations
ensemble_combinations = []
for r in range(1, len(model_names) + 1):
    ensemble_combinations.extend(combinations(model_names, r))

# Display the combinations
print(f"Total combinations: {len(ensemble_combinations)}")
print(ensemble_combinations)