In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib
import os
from tensorflow.keras.models import load_model

In [2]:
import metrics as me
import utils as ut
import seaborn as sns
import matplotlib.pyplot as plt


### Importing pre-trained models and generating oracle matrix 
The oracle matrix contains the outputs of the models for the test set of our dataset

In [3]:
# Générer un dataset non linéairement séparable
Xg, yg = make_moons(n_samples=300, noise=0.2, random_state=42)

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(Xg, yg, test_size=0.2, random_state=42)
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

# Standardiser les données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Charger les modèles depuis le dossier "models"
models_folder = "models"
models = ut.load_models(os.path.join(models_folder))
# Entraînement et prédictions binaires (1=correct, 0=faux)
oracle_outputs = {}
for name, model in models.items():
    if name == "cnn_model":
        y_pred = (model.predict(X_test_scaled) > 0.5).astype(int).flatten()
    else:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    oracle_outputs[name] = (y_pred == y_test).astype(int)  # 1 si correct, 0 sinon

# Matrice binaire (n_classifiers x n_samples)
print("Oracle outputs:", oracle_outputs)
oracle_matrix = np.array(list(oracle_outputs.values()))
print(oracle_matrix.shape)
n_classifiers, n_samples = oracle_matrix.shape

X_train shape: (240, 2), y_train shape: (240,)
Imported sklearn model: decision_tree
Imported sklearn model: kernel_svc
Imported sklearn model: linear_svc
Imported sklearn model: random_forest
Oracle outputs: {'decision_tree': array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]), 'kernel_svc': array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]), 'linear_svc': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]), 'random_forest': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 

In [4]:
correlation_matrix= ut.calculate_diversity_metrics_correlation(models, X_test_scaled, y_test)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title("Matrice de corrélation des métriques de diversité")
plt.tight_layout()
plt.show()

NameError: name 'ut' is not defined

In [5]:
corr_matrix= ut.calculate_diversity_metrics_correlation(models, X_test_scaled, y_test)

NameError: name 'ut' is not defined

In [8]:
def get_least_correlated_metrics(correlation_matrix, k):
    """
    Get the least correlated metric pairs from a correlation matrix.
    Parameters:
    correlation_matrix (pd.DataFrame): A DataFrame containing the correlation matrix.
    k (int): The number of least correlated metric pairs to return.
    Returns:
    pd.DataFrame: A DataFrame containing the least correlated metric pairs and their correlation values.
    """
    # Flatten the correlation matrix and get the absolute values
    corr_values = correlation_matrix.abs().unstack()
    
    # Remove self-correlations (diagonal elements)
    corr_values = corr_values[corr_values.index.get_level_values(0) != corr_values.index.get_level_values(1)]
    
    # Sort by correlation values in ascending order
    least_correlated = corr_values.sort_values().head(k)
    least_correlated = least_correlated.reset_index()
    least_correlated.columns = ['Metric_1', 'Metric_2', 'Correlation']
    # Return the metric pairs and their correlation values
    return least_correlated

In [9]:
corr_matrix

Unnamed: 0,entropy,KW_variance,kappa,theta,generalized_diversity,CFD,Q_statistic,correlation,disagreement,double_fault
entropy,1.0,1.0,-0.29554,0.633241,0.57735,,0.366807,0.307435,1.0,0.57735
KW_variance,1.0,1.0,-0.29554,0.633241,0.57735,,0.366807,0.307435,1.0,0.57735
kappa,-0.29554,-0.29554,1.0,-0.92653,-0.950654,,-0.99099,-0.998727,-0.29554,-0.950654
theta,0.633241,0.633241,-0.92653,1.0,0.997533,,0.947297,0.930183,0.633241,0.997533
generalized_diversity,0.57735,0.57735,-0.950654,0.997533,1.0,,0.966098,0.953429,0.57735,1.0
CFD,,,,,,,,,,
Q_statistic,0.366807,0.366807,-0.99099,0.947297,0.966098,,1.0,0.985391,0.366807,0.966098
correlation,0.307435,0.307435,-0.998727,0.930183,0.953429,,0.985391,1.0,0.307435,0.953429
disagreement,1.0,1.0,-0.29554,0.633241,0.57735,,0.366807,0.307435,1.0,0.57735
double_fault,0.57735,0.57735,-0.950654,0.997533,1.0,,0.966098,0.953429,0.57735,1.0


In [10]:
get_least_correlated_metrics(corr_matrix, 4)

Unnamed: 0,Metric_1,Metric_2,Correlation
0,kappa,disagreement,0.29554
1,disagreement,kappa,0.29554
2,kappa,entropy,0.29554
3,entropy,kappa,0.29554


### Testing the diversity metrics

In [4]:
# Calcul des 10 métriques pour l'ensemble des classifieurs
# 1. Pairwise metrics
metrics_pairwise = me.pairwise_metrics(oracle_matrix)
# 2. Non-pairwise metrics
metrics_non_pairwise = me.non_pairwise_metrics(oracle_matrix)

# Fusion
all_metrics = {**metrics_pairwise, **metrics_non_pairwise}

# Affichage
df_metrics = pd.DataFrame([all_metrics])
print(" Mesures de diversité pour l'ensemble des 4 modèles :")
display(df_metrics)

 Mesures de diversité pour l'ensemble des 4 modèles :


Unnamed: 0,Q_statistic,correlation,disagreement,double_fault,entropy,KW_variance,kappa,theta,generalized_diversity,CFD
0,0.882421,0.392217,0.094444,0.036111,0.147794,0.035417,-0.176471,0.040972,0.036111,0.0


### Finding the best k pairs of classifiers in diversity

For Q statistic only

In [5]:
diversity_values = me.calculate_q_statistic_for_models(models, X_test_scaled, y_test, 2)

print("Diversity Values:")
for key, value in diversity_values.items():
    print(f"{key}: {value}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Diversity Values:
cnn_model vs decision_tree: 0.0
cnn_model vs kernel_svc: 0.0


For all pairwise metrics: top k pairs for each metric

In [11]:
print("Pairwise Metrics:")
pairwise_metrics = me.calculate_pairwise_metrics_for_models(models, X_test_scaled, y_test)
for metric, values in pairwise_metrics.items():
    print(f"{metric}:")
    for key, value in values.items():
        print(f"  {key}: {value[metric]}")

Pairwise Metrics:
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
Q_statistic:
  cnn_model vs decision_tree: 0.0
  cnn_model vs kernel_svc: 0.0
  cnn_model vs linear_svc: 0.0
  cnn_model vs random_forest: 0.0
  kernel_svc vs random_forest: 0.7999999999986667
  decision_tree vs linear_svc: 0.8518518518513261
  kernel_svc vs linear_svc: 0.8571428571420918
  decision_tree vs random_forest: 0.9082568807331117
  linear_svc vs random_forest: 0.9272727272718843
correlation:
  cnn_model vs decision_tree: 0.0
  cnn_model vs kernel_svc: 0.0
  cnn_model vs linear_svc: 0.0
  cnn_model vs random_forest: 0.0
  kernel_svc vs random_forest: 0.24525573579398632
  kernel_svc vs linear_svc: 0.3563483225498992
  decision_tree vs random_forest: 0.3930521575839001
  decis