In [12]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib
import os
from tensorflow.keras.models import load_model

In [13]:
import metrics as me
import utils as ut

### Importing pre-trained models and generating oracle matrix 
The oracle matrix contains the outputs of the models for the test set of our dataset

In [19]:
# Générer un dataset non linéairement séparable
Xg, yg = make_moons(n_samples=300, noise=0.2, random_state=42)

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(Xg, yg, test_size=0.2, random_state=42)
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

# Standardiser les données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Charger les modèles depuis le dossier "models"
models_folder = "models"
models = ut.load_models(os.path.join(models_folder))
# Entraînement et prédictions binaires (1=correct, 0=faux)
oracle_outputs = {}
for name, model in models.items():
    if name == "cnn_model":
        y_pred = (model.predict(X_test_scaled) > 0.5).astype(int).flatten()
    else:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    oracle_outputs[name] = (y_pred == y_test).astype(int)  # 1 si correct, 0 sinon

# Matrice binaire (n_classifiers x n_samples)
print("Oracle outputs:", oracle_outputs)
oracle_matrix = np.array(list(oracle_outputs.values()))
print(oracle_matrix.shape)
n_classifiers, n_samples = oracle_matrix.shape

X_train shape: (240, 2), y_train shape: (240,)


NameError: name 'os' is not defined

### Testing the diversity metrics

In [20]:
# Calcul des 10 métriques pour l'ensemble des classifieurs
# 1. Pairwise metrics
metrics_pairwise = me.pairwise_metrics(oracle_matrix)
# 2. Non-pairwise metrics
metrics_non_pairwise = me.non_pairwise_metrics(oracle_matrix)

# Fusion
all_metrics = {**metrics_pairwise, **metrics_non_pairwise}

# Affichage
df_metrics = pd.DataFrame([all_metrics])
print(" Mesures de diversité pour l'ensemble des 4 modèles :")
display(df_metrics)

NameError: name 'oracle_matrix' is not defined

### Finding the best k pairs of classifiers in diversity

For Q statistic only

In [5]:
diversity_values = me.calculate_q_statistic_for_models(models, X_test_scaled, y_test, 2)

print("Diversity Values:")
for key, value in diversity_values.items():
    print(f"{key}: {value}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Diversity Values:
cnn_model vs decision_tree: 0.0
cnn_model vs kernel_svc: 0.0


For all pairwise metrics: top k pairs for each metric

In [11]:
print("Pairwise Metrics:")
pairwise_metrics = me.calculate_pairwise_metrics_for_models(models, X_test_scaled, y_test)
for metric, values in pairwise_metrics.items():
    print(f"{metric}:")
    for key, value in values.items():
        print(f"  {key}: {value[metric]}")

Pairwise Metrics:
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
Q_statistic:
  cnn_model vs decision_tree: 0.0
  cnn_model vs kernel_svc: 0.0
  cnn_model vs linear_svc: 0.0
  cnn_model vs random_forest: 0.0
  kernel_svc vs random_forest: 0.7999999999986667
  decision_tree vs linear_svc: 0.8518518518513261
  kernel_svc vs linear_svc: 0.8571428571420918
  decision_tree vs random_forest: 0.9082568807331117
  linear_svc vs random_forest: 0.9272727272718843
correlation:
  cnn_model vs decision_tree: 0.0
  cnn_model vs kernel_svc: 0.0
  cnn_model vs linear_svc: 0.0
  cnn_model vs random_forest: 0.0
  kernel_svc vs random_forest: 0.24525573579398632
  kernel_svc vs linear_svc: 0.3563483225498992
  decision_tree vs random_forest: 0.3930521575839001
  decis