In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib
import os
from tensorflow.keras.models import load_model

### Importing pre-trained models and generating oracle matrix 
The oracle matrix contains the outputs of the models for the test set of our dataset

In [15]:
# Générer un dataset non linéairement séparable
Xg, yg = make_moons(n_samples=300, noise=0.2, random_state=42)

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(Xg, yg, test_size=0.2, random_state=42)
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

# Standardiser les données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Charger les modèles depuis le dossier "models"
models = {}
models_folder = os.path.join(os.getcwd(), "models")
for file in os.listdir(models_folder):
    model_name = file.split(".")[0]
    file_extension = file.split(".")[-1]
    if file_extension == "pkl":
        models[model_name] = joblib.load(os.path.join(models_folder, file))
        print(f"Imported sklearn model: {model_name}")
    elif file_extension == "keras":
        models[model_name] = load_model(os.path.join(models_folder, file))
        print(f"Imported keras model: {model_name}")


# Entraînement et prédictions binaires (1=correct, 0=faux)
oracle_outputs = {}
for name, model in models.items():
    if name == "cnn_model":
        y_pred = (model.predict(X_test_scaled) > 0.5).astype(int).flatten()
    else:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    oracle_outputs[name] = (y_pred == y_test).astype(int)  # 1 si correct, 0 sinon

# Matrice binaire (n_classifiers x n_samples)
print("Oracle outputs:", oracle_outputs)
oracle_matrix = np.array(list(oracle_outputs.values()))
print(oracle_matrix.shape)
n_classifiers, n_samples = oracle_matrix.shape

X_train shape: (240, 2), y_train shape: (240,)
Imported keras model: cnn_model
Imported sklearn model: decision_tree
Imported sklearn model: kernel_svc
Imported sklearn model: linear_svc
Imported sklearn model: random_forest
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Oracle outputs: {'cnn_model': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]), 'decision_tree': array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]), 'kernel_svc': array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]), 'linear_svc': array([1, 1, 1, 1, 

### Defining the diversity metrics and testing them

In [16]:
# Diversité pairwise (comparer chaque paire de classifieurs)
def pairwise_metrics(matrix):
    n = matrix.shape[0]
    Qs, corrs, disagreements, double_faults = [], [], [], []
    for i in range(n):
        for j in range(i + 1, n):
            a = matrix[i]
            b = matrix[j]
            N11 = np.sum((a == 1) & (b == 1))
            N00 = np.sum((a == 0) & (b == 0))
            N10 = np.sum((a == 1) & (b == 0))
            N01 = np.sum((a == 0) & (b == 1))
            N = len(a)

            # Q-statistic
            Q = (N11 * N00 - N10 * N01) / (N11 * N00 + N10 * N01 + 1e-10)
            # Correlation
            rho = np.corrcoef(a, b)[0, 1]
            # Disagreement
            dis = (N01 + N10) / N
            # Double fault
            df = N00 / N

            Qs.append(Q)
            corrs.append(rho)
            disagreements.append(dis)
            double_faults.append(df)

    return {
        "Q_statistic": np.mean(Qs),
        "correlation": np.mean(corrs),
        "disagreement": np.mean(disagreements),
        "double_fault": np.mean(double_faults)
    }

# Diversité non pairwise
def non_pairwise_metrics(matrix):
    m, n = matrix.shape
    mean_correct = np.mean(matrix, axis=0)

    # Entropy
    entropy = -np.mean(mean_correct * np.log2(mean_correct + 1e-10) + (1 - mean_correct) * np.log2(1 - mean_correct + 1e-10))
    
    # Kohavi-Wolpert variance
    KW = np.mean(mean_correct * (1 - mean_correct))

    # Interrater agreement (kappa)
    p_bar = np.mean(matrix)
    p_j = np.mean(matrix, axis=0)
    P_e = np.mean(p_j ** 2 + (1 - p_j) ** 2)
    kappa = (p_bar - P_e) / (1 - P_e + 1e-10)

    # Difficulty θ
    theta = np.var(mean_correct)

    # Generalized diversity
    incorrects = (matrix == 0).astype(int)
    shared_errors = np.dot(incorrects, incorrects.T)
    np.fill_diagonal(shared_errors, 0)
    GD = np.sum(shared_errors) / (m * (m - 1))

    # Coincident Failure Diversity (CFD)
    total_failures = np.sum(incorrects, axis=0)
    max_failures = np.max(total_failures)
    CFD = 1 - (max_failures / m)

    return {
        "entropy": entropy,
        "KW_variance": KW,
        "kappa": kappa,
        "theta": theta,
        "generalized_diversity": GD / n,
        "CFD": CFD
    }

# Calcul des 10 métriques
metrics_pairwise = pairwise_metrics(oracle_matrix)
metrics_non_pairwise = non_pairwise_metrics(oracle_matrix)

# Fusion
all_metrics = {**metrics_pairwise, **metrics_non_pairwise}

# Affichage
df_metrics = pd.DataFrame([all_metrics])
print(" Mesures de diversité pour l'ensemble des 4 modèles :")
display(df_metrics)

 Mesures de diversité pour l'ensemble des 4 modèles :


Unnamed: 0,Q_statistic,correlation,disagreement,double_fault,entropy,KW_variance,kappa,theta,generalized_diversity,CFD
0,0.681627,0.370333,0.093333,0.036667,0.157256,0.037333,-0.116071,0.039056,0.036667,0.2


In [17]:
import metrics as me

In [30]:
def get_oracle_output(model, X, y):
    """
    Generate oracle output (1=correct, 0=incorrect) for a given model.

    Parameters:
    - model: The model to evaluate.
    - X: Scaled test data.
    - y: Test labels.

    Returns:
    - numpy.ndarray: Array of oracle outputs (1=correct, 0=incorrect).
    """
    if hasattr(model, 'predict_proba'):
        y_pred = (model.predict(X) > 0.5).astype(int).flatten()
    else:
        y_pred = model.predict(X)
    return (y_pred == y).astype(int)

def calculate_q_statistic(M1, M2, X, y):
    """ 
    Calculate the Q-statistic between two models.
    Parameters:
    - M1: First model.
    - M2: Second model.
    - X: Feature data.
    - y: Labels.
    Returns:
    - float: Q-statistic value.
    """
    # Get oracle outputs
    oracle_M1 = get_oracle_output(M1, X, y)
    oracle_M2 = get_oracle_output(M2, X, y)

    # Calculate Q-statistic
    Q = me.q_statistic(oracle_M1, oracle_M2)
    return Q

def calculate_q_statistic_for_models(models, X, y, k= -1):
    """
    Calculate Q-statistic for an array of models.
    Parameters:
    - models: List of models.
    - X: Feature data.
    - y: Labels.
    - k: Parameter for the number of best pairs to return
    Returns:
    - dict: Dictionary of k best pairs of models by the value of Q-statistics.
        The Q-statistics are sorted
    """
    n = len(models)
    model_list = list(models.values())  
    model_names = list(models.keys())
    results = {}
    for i in range(n):
        for j in range(i + 1, n):
            Q_statistic = calculate_q_statistic(model_list[i], model_list[j], X, y)
            results[f"{model_names[i]} vs {model_names[j]}"] = Q_statistic
    results = dict(list(sorted(results.items(), key=lambda item: item[1]))[:])
    return results

In [31]:
diversity_values = calculate_q_statistic_for_models(models, X_test_scaled, y_test)

print("Diversity Values:")
for key, value in diversity_values.items():
    print(f"{key}: {value}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Diversity Values:
kernel_svc vs random_forest: -0.9999999999875
cnn_model vs decision_tree: 0.0
cnn_model vs kernel_svc: 0.0
cnn_model vs linear_svc: 0.0
cnn_model vs random_forest: 0.0
decision_tree vs random_forest: 0.7931034482744946
linear_svc vs random_forest: 0.8275862068951249
decision_tree vs linear_svc: 0.8518518518513261
kernel_svc vs linear_svc: 0.8571428571420918
decision_tree vs kernel_svc: 0.9499999999994063


In [1]:
qstat= calculate_q_statistic(models["cnn_model"], models["random_forest"], X_test_scaled, y_test)
print(f"qstat: {qstat}")
oracle= get_oracle_output(models["cnn_model"], X_test_scaled, y_test)
print(f"oracle: {oracle}")

NameError: name 'calculate_q_statistic' is not defined

In [None]:
def calculate_pairwise_metrics(M1, M2, X, y):
    """
    Calculate pairwise metrics between two models.
    Parameters:
    - M1: First model.
    - M2: Second model.
    - X: Feature data.
    - y: Labels.
    Returns:
    - dict: Dictionary of pairwise metrics.
    """
    # Get oracle outputs
    oracle_M1 = get_oracle_output(M1, X, y)
    oracle_M2 = get_oracle_output(M2, X, y)

    # Calculate metrics
    metrics = {
        "Q_statistic": me.q_statistic(oracle_M1, oracle_M2),
        "correlation": me.correlation(oracle_M1, oracle_M2),
        "disagreement": me.disagreement(oracle_M1, oracle_M2),
        "double_fault": me.double_fault(oracle_M1, oracle_M2)
    }
    return metrics