In [1]:
# Star import kept first so that the explicit imports below still take
# precedence over any name it happens to export.
from utils.utils import *

import json
import os

import numpy as np
import pandas as pd
from cobras_ts.cobras_kmeans import COBRAS_kmeans
from cobras_ts.querier.labelquerier import LabelQuerier

from DQuerier import DQuerier
from model_explainer import ClusteringExplainer
from RandomQuerier import RandomQuerier
from xcobras_kmeans_exp_merge import XCOBRAS_kmeans
from XQuerier import XQuerier

# -- others
import warnings
warnings.filterwarnings("ignore")


# Directory holding the benchmark ARFF files; dataset file names are appended to it.
PATH = "../../../datasets/deric_benchmark/real-world/"

# Dataset files processed by every experiment cell.
names = ['wine.arff', 'wisc.arff', 'glass.arff']

# budget: handed to the clusterers as their `budget` and used as the length of
#         every metric series accumulated in the experiment cells.
# runs:   number of repetitions averaged for each configuration.
# NOTE(review): the stored cell outputs below report 10 runs, not 5 — confirm
# which value the saved results were produced with.
budget, runs = 180, 5

Écriture des résultats :
* Un fichier pour la moyenne empirique des $n$ exécutions
  > ``NumberCluster (average) | ARI (Average) | AMI (Average) | V-Measure (Average) | runtimes (average; if any)``
* Un fichier pour chacune des exécutions
  > ``Num Experiment | Number of cluster (per exp)``

### Ground Truth 

In [3]:
# Baseline experiment: COBRAS k-means whose queries are answered by
# LabelQuerier(y), i.e. from the ground-truth labels.
# For each dataset, the per-query series returned by cluster_analysis are
# averaged over `runs` executions and saved as CSV; the raw per-run series,
# together with the `ml` / `cl` values returned by cluster(), are saved as JSON.
out_dir = "./results/ground_truth/"
# Neither DataFrame.to_csv nor open(..., "w") creates missing directories.
os.makedirs(out_dir, exist_ok=True)

for dataset_name in names:
    print(f"Dataset: {dataset_name} ... ")
    data = read_arff_dataset(PATH + dataset_name)
    X, y = data.drop(["class"], axis=1).values, data["class"].values

    # Running sums of the per-query series; turned into means after the run loop.
    NumClusters = np.zeros(budget, dtype=float)
    ARIs = np.zeros(budget, dtype=float)
    AMIs = np.zeros(budget, dtype=float)
    VMIs = np.zeros(budget, dtype=float)
    exec_time = np.zeros(budget, dtype=float)

    # Raw results of every run, keyed by run index.
    data_df_2 = {}

    print(f"Excecution of {runs} runs for *LabelQuerier() := Ground Truth*")
    for num_run in range(runs):
        print(f"   run n{num_run}")
        clusterer = COBRAS_kmeans(X, LabelQuerier(y), budget)
        clustering, intermediate_clusterings, runtimes, ml, cl = clusterer.cluster()

        NumClusters_tmp, ARIs_tmp, AMIs_tmp, VMSs_tmp, exec_time_tmp = cluster_analysis(intermediate_clusterings, y, runtimes=runtimes)

        # Validate the per-run series *before* accumulating. The accumulators
        # always have length `budget`, so checking them afterwards can never
        # fail, and a length-1 series would otherwise be broadcast silently.
        assert len(ARIs_tmp) == budget
        assert len(NumClusters_tmp) == budget

        NumClusters += NumClusters_tmp
        ARIs += ARIs_tmp
        AMIs += AMIs_tmp
        VMIs += VMSs_tmp
        exec_time += exec_time_tmp

        data_df_2[num_run] = {
            'nbr cluster': list(NumClusters_tmp),
            'ari': list(ARIs_tmp),
            'ami': list(AMIs_tmp),
            'v-score': list(VMSs_tmp),
            'exec_time': list(exec_time_tmp),
            'ml' : ml,
            'cl' : cl
        }

    # Sums -> empirical means over the runs.
    NumClusters /= runs
    ARIs /= runs
    AMIs /= runs
    VMIs /= runs
    exec_time /= runs

    data_df = {
        'NumClusters (average)' : NumClusters,
        'ARI' : ARIs,
        'AMI' : AMIs,
        'VMI' : VMIs,
        'runtimes' : exec_time
    }

    # Common prefix of both output files: dataset name without its extension.
    out_stem = out_dir + os.path.splitext(dataset_name)[0] + "_budget_" + str(budget)

    df = pd.DataFrame(data=data_df)
    df.to_csv(out_stem + ".csv", index=False)

    with open(out_stem + "_per run.json", "w") as write_file:
        json.dump(data_df_2, write_file, indent=4)

print("... Saved in: **./results/ground_truth/**")

Dataset: wine.arff ... 
Excecution of 10 runs for *LabelQuerier() := Ground Truth*
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Dataset: wisc.arff ... 
Excecution of 10 runs for *LabelQuerier() := Ground Truth*
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Dataset: glass.arff ... 
Excecution of 10 runs for *LabelQuerier() := Ground Truth*
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
... Saved in: **./results/ground_truth/**


### Random Querier

In [4]:
# Baseline experiment: COBRAS k-means whose queries are answered by
# RandomQuerier().
# For each dataset, the per-query series returned by cluster_analysis are
# averaged over `runs` executions and saved as CSV; the raw per-run series,
# together with the `ml` / `cl` values returned by cluster(), are saved as JSON.
out_dir = "./results/random/"
# Neither DataFrame.to_csv nor open(..., "w") creates missing directories.
os.makedirs(out_dir, exist_ok=True)

for dataset_name in names:
    print(f"Dataset: {dataset_name} ... ")
    data = read_arff_dataset(PATH + dataset_name)
    X, y = data.drop(["class"], axis=1).values, data["class"].values

    # Running sums of the per-query series; turned into means after the run loop.
    NumClusters = np.zeros(budget, dtype=float)
    ARIs = np.zeros(budget, dtype=float)
    AMIs = np.zeros(budget, dtype=float)
    VMIs = np.zeros(budget, dtype=float)
    exec_time = np.zeros(budget, dtype=float)

    # Raw results of every run, keyed by run index.
    data_df_2 = {}

    print(f"Excecution of {runs} runs for *RandomQuerier() := P(answer=True)=P(answer=False)=0.5*")
    for num_run in range(runs):
        print(f"   run n{num_run}")
        clusterer = COBRAS_kmeans(X, RandomQuerier(), budget)
        clustering, intermediate_clusterings, runtimes, ml, cl = clusterer.cluster()

        NumClusters_tmp, ARIs_tmp, AMIs_tmp, VMSs_tmp, exec_time_tmp = cluster_analysis(intermediate_clusterings, y, runtimes=runtimes)

        # Validate the per-run series *before* accumulating. The accumulators
        # always have length `budget`, so checking them afterwards can never
        # fail, and a length-1 series would otherwise be broadcast silently.
        assert len(ARIs_tmp) == budget
        assert len(NumClusters_tmp) == budget

        NumClusters += NumClusters_tmp
        ARIs += ARIs_tmp
        AMIs += AMIs_tmp
        VMIs += VMSs_tmp
        exec_time += exec_time_tmp

        data_df_2[num_run] = {
            'nbr cluster': list(NumClusters_tmp),
            'ari': list(ARIs_tmp),
            'ami': list(AMIs_tmp),
            'v-score': list(VMSs_tmp),
            'exec_time': list(exec_time_tmp),
            'ml' : ml,
            'cl' : cl
        }

    # Sums -> empirical means over the runs.
    NumClusters /= runs
    ARIs /= runs
    AMIs /= runs
    VMIs /= runs
    exec_time /= runs

    data_df = {
        'NumClusters (average)' : NumClusters,
        'ARI' : ARIs,
        'AMI' : AMIs,
        'VMI' : VMIs,
        'runtimes' : exec_time
    }

    # Common prefix of both output files: dataset name without its extension.
    out_stem = out_dir + os.path.splitext(dataset_name)[0] + "_budget_" + str(budget)

    df = pd.DataFrame(data=data_df)
    df.to_csv(out_stem + ".csv", index=False)

    with open(out_stem + "_per run.json", "w") as write_file:
        json.dump(data_df_2, write_file, indent=4)
print("... Saved in: **./results/random/**")

Dataset: wine.arff ... 
Excecution of 10 runs for *RandomQuerier() := P(answer=True)=P(answer=False)=0.5*
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Dataset: wisc.arff ... 
Excecution of 10 runs for *RandomQuerier() := P(answer=True)=P(answer=False)=0.5*
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Dataset: glass.arff ... 
Excecution of 10 runs for *RandomQuerier() := P(answer=True)=P(answer=False)=0.5*
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
... Saved in: **./results/random/**


### LIME

#### 1. Top-n

In [3]:
# LIME / 'commun_fraction' experiment: XCOBRAS k-means driven by an XQuerier,
# swept over several `top_n` values.
# For each (dataset, top_n) pair, the per-query series returned by
# cluster_analysis are averaged over `runs` executions and saved as CSV; the raw
# per-run series, together with the `ml` / `cl` values returned by fit(), are
# saved as JSON.
out_dir = "./results/lime/commun_fraction/"
# Neither DataFrame.to_csv nor open(..., "w") creates missing directories.
os.makedirs(out_dir, exist_ok=True)

for dataset_name in names:
    print(f"Dataset: {dataset_name} ... ")

    # The dataset does not depend on top_n, so it is loaded once per dataset.
    # TODO: handle CSV files as well as ARFF files.
    data = read_arff_dataset(PATH + dataset_name)
    X, y = data.drop(["class"], axis=1), data["class"]
    feature_names = X.columns

    for top_n in [3, 5, 7, 9, 10]:
        print(f"top_n: {top_n}")

        # build querier
        xai_querier = XQuerier(
            y.values,
            xai_method= 'lime',
            strat     = 'commun_fraction',
            top_n     = top_n,
            threshold = 0.5
        )

        # build explainer (one instance shared by all runs of this top_n)
        model_explainer = ClusteringExplainer(
            model       = 'rbf_svm', # rbf_svm
            xai_model   = 'lime', # lime/shap
            test_size   = 0.4,
            verbose     = False
        )

        # Running sums of the per-query series; turned into means after the run loop.
        NumClusters = np.zeros(budget, dtype=float)
        ARIs = np.zeros(budget, dtype=float)
        AMIs = np.zeros(budget, dtype=float)
        VMIs = np.zeros(budget, dtype=float)
        exec_time = np.zeros(budget, dtype=float)

        # Raw results of every run, keyed by run index.
        data_df_2 = {}

        for num_run in range(runs):
            print(f"   run n{num_run}")
            # instantiate and train the model
            # NOTE(review): `xquerier` is left at its constructor default here,
            # whereas the LIME cosine-similarity and NDCG cells pass
            # xquerier=True — confirm this is intended.
            xcobras_kmeans = XCOBRAS_kmeans(budget = budget, model_explainer=model_explainer)
            clustering, intermediate_clusterings, runtimes, ml, cl = xcobras_kmeans.fit(X.values, feature_names=feature_names, y=xai_querier)

            NumClusters_tmp, ARIs_tmp, AMIs_tmp, VMSs_tmp, exec_time_tmp = cluster_analysis(intermediate_clusterings, y, runtimes=runtimes)

            # Validate the per-run series *before* accumulating. The accumulators
            # always have length `budget`, so checking them afterwards can never
            # fail, and a length-1 series would otherwise be broadcast silently.
            assert len(ARIs_tmp) == budget
            assert len(NumClusters_tmp) == budget

            NumClusters += NumClusters_tmp
            ARIs += ARIs_tmp
            AMIs += AMIs_tmp
            VMIs += VMSs_tmp
            exec_time += exec_time_tmp

            data_df_2[num_run] = {
                'nbr cluster': list(NumClusters_tmp),
                'ari': list(ARIs_tmp),
                'ami': list(AMIs_tmp),
                'v-score': list(VMSs_tmp),
                'exec_time': list(exec_time_tmp),
                'ml' : ml,
                'cl' : cl
            }

        # Sums -> empirical means over the runs.
        NumClusters /= runs
        ARIs /= runs
        AMIs /= runs
        VMIs /= runs
        exec_time /= runs

        data_df = {
            'NumClusters (average)' : NumClusters,
            'ARI' : ARIs,
            'AMI' : AMIs,
            'VMI' : VMIs,
            'runtimes' : exec_time
        }

        # Common prefix of both output files: dataset name without its
        # extension, followed by the budget and the top_n value.
        out_stem = out_dir + os.path.splitext(dataset_name)[0] + "_budget_" + str(budget) + "_" + str(top_n)

        df = pd.DataFrame(data=data_df)
        df.to_csv(out_stem + ".csv", index=False)

        with open(out_stem + "_num clusters per run.json", "w") as write_file:
            json.dump(data_df_2, write_file, indent=4)


        

Dataset: wine.arff ... 
top_n: 3
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 5
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 7
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 9
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 10
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Dataset: wisc.arff ... 
top_n: 3
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 5
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 7
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 9
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6


#### 2. Cosine Similarity

In [2]:
# LIME / 'cosine_similarity' experiment: XCOBRAS k-means driven by an XQuerier,
# swept over several similarity thresholds.
# For each (dataset, threshold) pair, the per-query series returned by
# cluster_analysis are averaged over `runs` executions and saved as CSV; the raw
# per-run series, together with the `ml` / `cl` values returned by fit(), are
# saved as JSON.
out_dir = "./results/lime/cosine_similarity/"
# Neither DataFrame.to_csv nor open(..., "w") creates missing directories.
os.makedirs(out_dir, exist_ok=True)

for dataset_name in names:
    print(f"Dataset: {dataset_name} ... ")

    # The dataset does not depend on the threshold, so it is loaded once per dataset.
    # TODO: handle CSV files as well as ARFF files.
    data = read_arff_dataset(PATH + dataset_name)
    X, y = data.drop(["class"], axis=1), data["class"]
    feature_names = X.columns

    for threshold in [0.5, 0.6, 0.7, 0.8, 0.9]:
        print(f"Threshold: {threshold}")

        # build querier
        xai_querier = XQuerier(
            y.values,
            xai_method= 'lime',
            strat     = 'cosine_similarity',
            # top_n     = 3,
            threshold = threshold
        )

        # build explainer (one instance shared by all runs of this threshold)
        model_explainer = ClusteringExplainer(
            model       = 'rbf_svm', # rbf_svm
            xai_model   = 'lime', # lime/shap
            test_size   = 0.4,
            verbose     = False
        )

        # Running sums of the per-query series; turned into means after the run loop.
        NumClusters = np.zeros(budget, dtype=float)
        ARIs = np.zeros(budget, dtype=float)
        AMIs = np.zeros(budget, dtype=float)
        VMIs = np.zeros(budget, dtype=float)
        exec_time = np.zeros(budget, dtype=float)

        # Raw results of every run, keyed by run index.
        data_df_2 = {}

        for num_run in range(runs):
            print(f"   run n{num_run}")
            # instantiate and train the model
            xcobras_kmeans = XCOBRAS_kmeans(xquerier=True, budget = budget, model_explainer=model_explainer)
            clustering, intermediate_clusterings, runtimes, ml, cl = xcobras_kmeans.fit(X.values, feature_names=feature_names, y=xai_querier)

            NumClusters_tmp, ARIs_tmp, AMIs_tmp, VMSs_tmp, exec_time_tmp = cluster_analysis(intermediate_clusterings, y, runtimes=runtimes)

            # Validate the per-run series *before* accumulating. The accumulators
            # always have length `budget`, so checking them afterwards can never
            # fail, and a length-1 series would otherwise be broadcast silently.
            assert len(ARIs_tmp) == budget
            assert len(NumClusters_tmp) == budget

            NumClusters += NumClusters_tmp
            ARIs += ARIs_tmp
            AMIs += AMIs_tmp
            VMIs += VMSs_tmp
            exec_time += exec_time_tmp

            data_df_2[num_run] = {
                'nbr cluster': list(NumClusters_tmp),
                'ari': list(ARIs_tmp),
                'ami': list(AMIs_tmp),
                'v-score': list(VMSs_tmp),
                'exec_time': list(exec_time_tmp),
                'ml' : ml,
                'cl' : cl
            }
            # Debug trace of two counters exposed by the fitted model.
            print(f"Helloooo - nbr GT: {xcobras_kmeans.nbr_data_used} and nbr EXP: {xcobras_kmeans.nbr_expplications_used}")

        # Sums -> empirical means over the runs.
        NumClusters /= runs
        ARIs /= runs
        AMIs /= runs
        VMIs /= runs
        exec_time /= runs

        data_df = {
            'NumClusters (average)' : NumClusters,
            'ARI' : ARIs,
            'AMI' : AMIs,
            'VMI' : VMIs,
            'runtimes' : exec_time
        }

        # Threshold as an integer percentage for the file name. round() rather
        # than plain int(): int() truncates, so a value such as 0.29 * 100
        # (28.999...) would be written as 28.
        threshold_pct = int(round(threshold * 100))
        # Common prefix of both output files: dataset name without its extension.
        out_stem = out_dir + os.path.splitext(dataset_name)[0] + "_budget_" + str(budget) + "_" + str(threshold_pct)

        df = pd.DataFrame(data=data_df)
        df.to_csv(out_stem + ".csv", index=False)

        with open(out_stem + "_per run.json", "w") as write_file:
            json.dump(data_df_2, write_file, indent=4)


        

Dataset: wine.arff ... 
Threshold: 0.5
   run n0
Split:	Vérité terrain!!!!!!
Salut 
Split:	Vérité terrain!!!!!!
Salut 
Split:	Explications!!!!!!
Merge:	Explications!!!!!!
Split:	Vérité terrain!!!!!!
Salut 
Split:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!!
Salut 
Split:	Explications!!!!!!
Merge:	Explications!!!!!!
Split:	Explications!!!!!!
Merge:	Explications!!!!!!
Merge:	Explications!!!!!!
Split:	Explications!!!!!!
Merge:	Explications!!!!!!
Merge:	Explications!!!!!!
Split:	Vérité terrain!!!!!!
Salut 
Split:	Vérité terrain!!!!!!
Salut 
Split:	Vérité terrain!!!!!!
Salut 
Split:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!!
Salut 
Merge:	Vérité terrain!!!!!

ValueError: 
All the 84 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
84 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\anaconda3\envs\XClustering_env39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\anaconda3\envs\XClustering_env39\lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\user\anaconda3\envs\XClustering_env39\lib\site-packages\sklearn\svm\_base.py", line 201, in fit
    y = self._validate_targets(y)
  File "c:\Users\user\anaconda3\envs\XClustering_env39\lib\site-packages\sklearn\svm\_base.py", line 749, in _validate_targets
    raise ValueError(
ValueError: The number of classes has to be greater than one; got 1 class


#### 3. NDCG (Normalized Discounted Cumulative Gain)

In [None]:
# LIME / 'ndcg' experiment: XCOBRAS k-means driven by an XQuerier, swept over
# several thresholds.
# For each (dataset, threshold) pair, the per-query series returned by
# cluster_analysis are averaged over `runs` executions and saved as CSV; the raw
# per-run series, together with the `ml` / `cl` values returned by fit(), are
# saved as JSON.
out_dir = "./results/lime/ndcg/"
# Neither DataFrame.to_csv nor open(..., "w") creates missing directories.
os.makedirs(out_dir, exist_ok=True)

for dataset_name in names:
    print(f"Dataset: {dataset_name} ... ")

    # The dataset does not depend on the threshold, so it is loaded once per dataset.
    # TODO: handle CSV files as well as ARFF files.
    data = read_arff_dataset(PATH + dataset_name)
    X, y = data.drop(["class"], axis=1), data["class"]
    feature_names = X.columns

    for threshold in [0.5, 0.6, 0.7, 0.8, 0.9]:
        print(f"Threshold: {threshold}")

        # build querier
        xai_querier = XQuerier(
            y.values,
            xai_method= 'lime',
            strat     = 'ndcg',
            # top_n     = 3,
            threshold = threshold
        )

        # build explainer (one instance shared by all runs of this threshold)
        model_explainer = ClusteringExplainer(
            model       = 'rbf_svm', # rbf_svm
            xai_model   = 'lime', # lime/shap
            test_size   = 0.4,
            verbose     = False
        )

        # Running sums of the per-query series; turned into means after the run loop.
        NumClusters = np.zeros(budget, dtype=float)
        ARIs = np.zeros(budget, dtype=float)
        AMIs = np.zeros(budget, dtype=float)
        VMIs = np.zeros(budget, dtype=float)
        exec_time = np.zeros(budget, dtype=float)

        # Raw results of every run, keyed by run index.
        data_df_2 = {}

        for num_run in range(runs):
            print(f"   run n{num_run}")
            # instantiate and train the model
            xcobras_kmeans = XCOBRAS_kmeans(xquerier=True, budget = budget, model_explainer=model_explainer)
            clustering, intermediate_clusterings, runtimes, ml, cl = xcobras_kmeans.fit(X.values, feature_names=feature_names, y=xai_querier)

            NumClusters_tmp, ARIs_tmp, AMIs_tmp, VMSs_tmp, exec_time_tmp = cluster_analysis(intermediate_clusterings, y, runtimes=runtimes)

            # Validate the per-run series *before* accumulating. The accumulators
            # always have length `budget`, so checking them afterwards can never
            # fail, and a length-1 series would otherwise be broadcast silently.
            assert len(ARIs_tmp) == budget
            assert len(NumClusters_tmp) == budget

            NumClusters += NumClusters_tmp
            ARIs += ARIs_tmp
            AMIs += AMIs_tmp
            VMIs += VMSs_tmp
            exec_time += exec_time_tmp

            data_df_2[num_run] = {
                'nbr cluster': list(NumClusters_tmp),
                'ari': list(ARIs_tmp),
                'ami': list(AMIs_tmp),
                'v-score': list(VMSs_tmp),
                'exec_time': list(exec_time_tmp),
                'ml' : ml,
                'cl' : cl
            }

        # Sums -> empirical means over the runs.
        NumClusters /= runs
        ARIs /= runs
        AMIs /= runs
        VMIs /= runs
        exec_time /= runs

        data_df = {
            'NumClusters (average)' : NumClusters,
            'ARI' : ARIs,
            'AMI' : AMIs,
            'VMI' : VMIs,
            'runtimes' : exec_time
        }

        # Threshold as an integer percentage for the file name. round() rather
        # than plain int(): int() truncates, so a value such as 0.29 * 100
        # (28.999...) would be written as 28.
        threshold_pct = int(round(threshold * 100))
        # Common prefix of both output files: dataset name without its extension.
        out_stem = out_dir + os.path.splitext(dataset_name)[0] + "_budget_" + str(budget) + "_" + str(threshold_pct)

        df = pd.DataFrame(data=data_df)
        df.to_csv(out_stem + ".csv", index=False)

        with open(out_stem + "_per run.json", "w") as write_file:
            json.dump(data_df_2, write_file, indent=4)


        

Dataset: wine.arff ... 
Threshold: 0.5
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.6
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.7
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.8
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.9
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Dataset: wisc.arff ... 
Threshold: 0.5
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.6
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.7
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.8
   run n0
   run 

### SHAP

#### 1. Top-n

In [2]:
# SHAP / 'commun_fraction' experiment: XCOBRAS k-means driven by an XQuerier,
# swept over several `top_n` values.
# For each (dataset, top_n) pair, the per-query series returned by
# cluster_analysis are averaged over `runs` executions and saved as CSV; the raw
# per-run series, together with the `ml` / `cl` values returned by fit(), are
# saved as JSON.
out_dir = "./results/shap/commun_fraction/"
# Neither DataFrame.to_csv nor open(..., "w") creates missing directories.
os.makedirs(out_dir, exist_ok=True)

for dataset_name in names:
    print(f"Dataset: {dataset_name} ... ")

    # The dataset does not depend on top_n, so it is loaded once per dataset.
    # TODO: handle CSV files as well as ARFF files.
    data = read_arff_dataset(PATH + dataset_name)
    X, y = data.drop(["class"], axis=1), data["class"]
    feature_names = X.columns

    for top_n in [3, 5, 7, 9, 10]:
        print(f"top_n: {top_n}")

        # build querier
        xai_querier = XQuerier(
            y.values,
            xai_method= 'shap',
            strat     = 'commun_fraction',
            top_n     = top_n,
            threshold = 0.5
        )

        # build explainer (one instance shared by all runs of this top_n)
        model_explainer = ClusteringExplainer(
            model       = 'rbf_svm', # rbf_svm
            xai_model   = 'shap', # lime/shap
            test_size   = 0.4,
            verbose     = False
        )

        # Running sums of the per-query series; turned into means after the run loop.
        NumClusters = np.zeros(budget, dtype=float)
        ARIs = np.zeros(budget, dtype=float)
        AMIs = np.zeros(budget, dtype=float)
        VMIs = np.zeros(budget, dtype=float)
        exec_time = np.zeros(budget, dtype=float)

        # Raw results of every run, keyed by run index.
        data_df_2 = {}

        for num_run in range(runs):
            print(f"   run n{num_run}")
            # instantiate and train the model
            # NOTE(review): `xquerier` is left at its constructor default here,
            # whereas the SHAP cosine-similarity cell passes xquerier=True —
            # confirm this is intended.
            xcobras_kmeans = XCOBRAS_kmeans(budget = budget, model_explainer=model_explainer)
            clustering, intermediate_clusterings, runtimes, ml, cl = xcobras_kmeans.fit(X.values, feature_names=feature_names, y=xai_querier)

            NumClusters_tmp, ARIs_tmp, AMIs_tmp, VMSs_tmp, exec_time_tmp = cluster_analysis(intermediate_clusterings, y, runtimes=runtimes)

            # Validate the per-run series *before* accumulating. The accumulators
            # always have length `budget`, so checking them afterwards can never
            # fail, and a length-1 series would otherwise be broadcast silently.
            assert len(ARIs_tmp) == budget
            assert len(NumClusters_tmp) == budget

            NumClusters += NumClusters_tmp
            ARIs += ARIs_tmp
            AMIs += AMIs_tmp
            VMIs += VMSs_tmp
            exec_time += exec_time_tmp

            data_df_2[num_run] = {
                'nbr cluster': list(NumClusters_tmp),
                'ari': list(ARIs_tmp),
                'ami': list(AMIs_tmp),
                'v-score': list(VMSs_tmp),
                'exec_time': list(exec_time_tmp),
                'ml' : ml,
                'cl' : cl
            }

        # Sums -> empirical means over the runs.
        NumClusters /= runs
        ARIs /= runs
        AMIs /= runs
        VMIs /= runs
        exec_time /= runs

        data_df = {
            'NumClusters (average)' : NumClusters,
            'ARI' : ARIs,
            'AMI' : AMIs,
            'VMI' : VMIs,
            'runtimes' : exec_time
        }

        # Common prefix of both output files: dataset name without its
        # extension, followed by the budget and the top_n value.
        out_stem = out_dir + os.path.splitext(dataset_name)[0] + "_budget_" + str(budget) + "_" + str(top_n)

        df = pd.DataFrame(data=data_df)
        df.to_csv(out_stem + ".csv", index=False)

        with open(out_stem + "_num clusters per run.json", "w") as write_file:
            json.dump(data_df_2, write_file, indent=4)


        

Dataset: wine.arff ... 
top_n: 3
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 5
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 7
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 9
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 10
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Dataset: wisc.arff ... 
top_n: 3
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 5
   run n0
   run n1
   run n2
   run n3


Exact explainer: 3it [26:13, 1573.87s/it]             


   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 7
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 9
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 10
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Dataset: glass.arff ... 
top_n: 3
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 5
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 7
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 9
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
top_n: 10
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9


#### 2. Cosine Similarity

In [2]:
# SHAP / 'cosine_similarity' experiment: XCOBRAS k-means driven by an XQuerier,
# swept over the similarity thresholds listed below.
# For each (dataset, threshold) pair, the per-query series returned by
# cluster_analysis are averaged over `runs` executions and saved as CSV; the raw
# per-run series, together with the `ml` / `cl` values returned by fit(), are
# saved as JSON.
out_dir = "./results/shap/cosine_similarity/"
# Neither DataFrame.to_csv nor open(..., "w") creates missing directories.
os.makedirs(out_dir, exist_ok=True)

for dataset_name in names:
    print(f"Dataset: {dataset_name} ... ")

    # The dataset does not depend on the threshold, so it is loaded once per dataset.
    # TODO: handle CSV files as well as ARFF files.
    data = read_arff_dataset(PATH + dataset_name)
    X, y = data.drop(["class"], axis=1), data["class"]
    feature_names = X.columns

    for threshold in [0.8, 0.9]:
        print(f"Threshold: {threshold}")

        # build querier
        xai_querier = XQuerier(
            y.values,
            xai_method= 'shap',
            strat     = 'cosine_similarity',
            # top_n     = 3,
            threshold = threshold
        )

        # build explainer (one instance shared by all runs of this threshold)
        model_explainer = ClusteringExplainer(
            model       = 'rbf_svm', # rbf_svm
            xai_model   = 'shap', # lime/shap
            test_size   = 0.4,
            verbose     = False
        )

        # Running sums of the per-query series; turned into means after the run loop.
        NumClusters = np.zeros(budget, dtype=float)
        ARIs = np.zeros(budget, dtype=float)
        AMIs = np.zeros(budget, dtype=float)
        VMIs = np.zeros(budget, dtype=float)
        exec_time = np.zeros(budget, dtype=float)

        # Raw results of every run, keyed by run index.
        data_df_2 = {}

        for num_run in range(runs):
            print(f"   run n{num_run}")
            # instantiate and train the model
            xcobras_kmeans = XCOBRAS_kmeans(xquerier = True, budget = budget, model_explainer=model_explainer)
            clustering, intermediate_clusterings, runtimes, ml, cl = xcobras_kmeans.fit(X.values, feature_names=feature_names, y=xai_querier)

            NumClusters_tmp, ARIs_tmp, AMIs_tmp, VMSs_tmp, exec_time_tmp = cluster_analysis(intermediate_clusterings, y, runtimes=runtimes)

            # Validate the per-run series *before* accumulating. The accumulators
            # always have length `budget`, so checking them afterwards can never
            # fail, and a length-1 series would otherwise be broadcast silently.
            assert len(ARIs_tmp) == budget
            assert len(NumClusters_tmp) == budget

            NumClusters += NumClusters_tmp
            ARIs += ARIs_tmp
            AMIs += AMIs_tmp
            VMIs += VMSs_tmp
            exec_time += exec_time_tmp

            data_df_2[num_run] = {
                'nbr cluster': list(NumClusters_tmp),
                'ari': list(ARIs_tmp),
                'ami': list(AMIs_tmp),
                'v-score': list(VMSs_tmp),
                'exec_time': list(exec_time_tmp),
                'ml' : ml,
                'cl' : cl
            }

        # Sums -> empirical means over the runs.
        NumClusters /= runs
        ARIs /= runs
        AMIs /= runs
        VMIs /= runs
        exec_time /= runs

        data_df = {
            'NumClusters (average)' : NumClusters,
            'ARI' : ARIs,
            'AMI' : AMIs,
            'VMI' : VMIs,
            'runtimes' : exec_time
        }

        # Threshold as an integer percentage for the file name. round() rather
        # than plain int(): int() truncates, so a value such as 0.29 * 100
        # (28.999...) would be written as 28.
        threshold_pct = int(round(threshold * 100))
        # Common prefix of both output files: dataset name without its extension.
        out_stem = out_dir + os.path.splitext(dataset_name)[0] + "_budget_" + str(budget) + "_" + str(threshold_pct)

        df = pd.DataFrame(data=data_df)
        df.to_csv(out_stem + ".csv", index=False)

        with open(out_stem + "_num clusters per run.json", "w") as write_file:
            json.dump(data_df_2, write_file, indent=4)


        

Dataset: wine.arff ... 
Threshold: 0.8
   run n0
Salut 
Salut 
Salut 
Salut 
Salut 
Salut 
Salut 
Salut 
Merge:	Explications!!!!!! | set(merging_y_hat)={0, 1} | size Class0:6|size Class1:1


ValueError: 
All the 84 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
84 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\anaconda3\envs\XClustering_env39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\anaconda3\envs\XClustering_env39\lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\user\anaconda3\envs\XClustering_env39\lib\site-packages\sklearn\svm\_base.py", line 201, in fit
    y = self._validate_targets(y)
  File "c:\Users\user\anaconda3\envs\XClustering_env39\lib\site-packages\sklearn\svm\_base.py", line 749, in _validate_targets
    raise ValueError(
ValueError: The number of classes has to be greater than one; got 1 class


#### 3. NDCG (Normalized Discounted Cumulative Gain)

In [2]:
# SHAP / 'ndcg' experiment: XCOBRAS k-means driven by an XQuerier, swept over
# several thresholds.
# For each (dataset, threshold) pair, the per-query series returned by
# cluster_analysis are averaged over `runs` executions and saved as CSV; the raw
# per-run series, together with the `ml` / `cl` values returned by fit(), are
# saved as JSON.
out_dir = "./results/shap/ndcg/"
# Neither DataFrame.to_csv nor open(..., "w") creates missing directories.
os.makedirs(out_dir, exist_ok=True)

for dataset_name in names:
    print(f"Dataset: {dataset_name} ... ")

    # The dataset does not depend on the threshold, so it is loaded once per dataset.
    # TODO: handle CSV files as well as ARFF files.
    data = read_arff_dataset(PATH + dataset_name)
    X, y = data.drop(["class"], axis=1), data["class"]
    feature_names = X.columns

    for threshold in [0.5, 0.6, 0.7, 0.8, 0.9]:
        print(f"Threshold: {threshold}")

        # build querier
        xai_querier = XQuerier(
            y.values,
            xai_method= 'shap',
            strat     = 'ndcg',
            # top_n     = 3,
            threshold = threshold
        )

        # build explainer (one instance shared by all runs of this threshold)
        model_explainer = ClusteringExplainer(
            model       = 'rbf_svm', # rbf_svm
            xai_model   = 'shap', # lime/shap
            test_size   = 0.4,
            verbose     = False
        )

        # Running sums of the per-query series; turned into means after the run loop.
        NumClusters = np.zeros(budget, dtype=float)
        ARIs = np.zeros(budget, dtype=float)
        AMIs = np.zeros(budget, dtype=float)
        VMIs = np.zeros(budget, dtype=float)
        exec_time = np.zeros(budget, dtype=float)

        # Raw results of every run, keyed by run index.
        data_df_2 = {}

        for num_run in range(runs):
            print(f"   run n{num_run}")
            # instantiate and train the model
            # NOTE(review): `xquerier` is left at its constructor default here,
            # whereas the SHAP cosine-similarity cell and the LIME NDCG cell
            # pass xquerier=True — confirm this is intended.
            xcobras_kmeans = XCOBRAS_kmeans(budget = budget, model_explainer=model_explainer)
            clustering, intermediate_clusterings, runtimes, ml, cl = xcobras_kmeans.fit(X.values, feature_names=feature_names, y=xai_querier)

            NumClusters_tmp, ARIs_tmp, AMIs_tmp, VMSs_tmp, exec_time_tmp = cluster_analysis(intermediate_clusterings, y, runtimes=runtimes)

            # Validate the per-run series *before* accumulating. The accumulators
            # always have length `budget`, so checking them afterwards can never
            # fail, and a length-1 series would otherwise be broadcast silently.
            assert len(ARIs_tmp) == budget
            assert len(NumClusters_tmp) == budget

            NumClusters += NumClusters_tmp
            ARIs += ARIs_tmp
            AMIs += AMIs_tmp
            VMIs += VMSs_tmp
            exec_time += exec_time_tmp

            data_df_2[num_run] = {
                'nbr cluster': list(NumClusters_tmp),
                'ari': list(ARIs_tmp),
                'ami': list(AMIs_tmp),
                'v-score': list(VMSs_tmp),
                'exec_time': list(exec_time_tmp),
                'ml' : ml,
                'cl' : cl
            }

        # Sums -> empirical means over the runs.
        NumClusters /= runs
        ARIs /= runs
        AMIs /= runs
        VMIs /= runs
        exec_time /= runs

        data_df = {
            'NumClusters (average)' : NumClusters,
            'ARI' : ARIs,
            'AMI' : AMIs,
            'VMI' : VMIs,
            'runtimes' : exec_time
        }

        # Threshold as an integer percentage for the file name. round() rather
        # than plain int(): int() truncates, so a value such as 0.29 * 100
        # (28.999...) would be written as 28.
        threshold_pct = int(round(threshold * 100))
        # Common prefix of both output files: dataset name without its extension.
        out_stem = out_dir + os.path.splitext(dataset_name)[0] + "_budget_" + str(budget) + "_" + str(threshold_pct)

        df = pd.DataFrame(data=data_df)
        df.to_csv(out_stem + ".csv", index=False)

        with open(out_stem + "_per run.json", "w") as write_file:
            json.dump(data_df_2, write_file, indent=4)


        

Dataset: wine.arff ... 
Threshold: 0.5
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.6
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.7
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.8
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.9
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Dataset: wisc.arff ... 
Threshold: 0.5
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.6
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.7
   run n0
   run n1
   run n2
   run n3
   run n4
   run n5
   run n6
   run n7
   run n8
   run n9
Threshold: 0.8
   run n0
   run 

### DQuerier

In [1]:
# Wildcard import stays first so the explicit imports below win on any name clash.
from utils.utils import *

import json
import os

import numpy as np
import pandas as pd
from cobras_ts.cobras_kmeans import COBRAS_kmeans
from cobras_ts.querier.labelquerier import LabelQuerier

from xcobras_kmeans import XCOBRAS_kmeans
from model_explainer import ClusteringExplainer
from XQuerier import XQuerier
from DQuerier import DQuerier
from RandomQuerier import RandomQuerier

# -- others
import warnings
warnings.filterwarnings("ignore")


# --- Experiment configuration -------------------------------------------------
# Folder holding the ARFF benchmark files, relative to this notebook.
PATH = "../../../datasets/deric_benchmark/real-world/"

# Dataset files processed by the experiment loops below.
names = ['wine.arff', 'wisc.arff', 'glass.arff']

# `budget` is passed to the clusterer and is also the length of every metric
# array; `runs` is the number of repetitions averaged per configuration.
budget, runs = 180, 10

#### 1. Cosine similarity

In [2]:
# DQuerier / cosine-similarity experiment.
# For every dataset and every similarity threshold: fit XCOBRAS k-means `runs`
# times, average the per-query metrics returned by `cluster_analysis`, and write
#   * one CSV with the averaged curves, and
#   * one JSON with the raw curves and constraints of each individual run.
results_dir = "./results/DQuerier/cosine_similarity/"
# Create the output folder up front: otherwise a missing directory only raises
# after all runs of the first threshold have completed, discarding their results.
os.makedirs(results_dir, exist_ok=True)

for dataset_name in names:
    print(f"Dataset: {dataset_name} ... ")

    # The dataset does not depend on the threshold, so it is read once per dataset.
    # TODO: handle CSV files as well as ARFF.
    data = read_arff_dataset(PATH + dataset_name)
    X, y = data.drop(["class"], axis=1), data["class"]
    feature_names = X.columns

    # Drop the file extension generically rather than assuming the 5 characters of ".arff".
    dataset_stem = os.path.splitext(dataset_name)[0]

    for threshold in [0.5, 0.6, 0.7, 0.8, 0.9]:
        print(f"Threshold: {threshold}")

        # Querier answering pairwise queries with the cosine-similarity strategy.
        # The same instance is reused for all runs of this threshold.
        xai_querier = DQuerier(
            y.values,
            strat     = 'cosine_similarity',
            threshold = threshold
        )

        # Running sums of the per-query curves; divided by `runs` after the loop.
        NumClusters = np.zeros(budget, dtype=float)
        ARIs = np.zeros(budget, dtype=float)
        AMIs = np.zeros(budget, dtype=float)
        VMIs = np.zeros(budget, dtype=float)
        exec_time = np.zeros(budget, dtype=float)

        # Raw results of each run, keyed by run index.
        data_df_2 = {}

        for num_run in range(runs):
            print(f"   run n{num_run}")
            # Instantiate and train the model.
            xcobras_kmeans = XCOBRAS_kmeans(xquerier=False,  budget = budget)
            clustering, intermediate_clusterings, runtimes, ml, cl = xcobras_kmeans.fit(X.values, feature_names=feature_names, y=xai_querier)

            NumClusters_tmp, ARIs_tmp, AMIs_tmp, VMSs_tmp, exec_time_tmp = cluster_analysis(intermediate_clusterings, y, runtimes=runtimes)

            NumClusters += NumClusters_tmp
            ARIs += ARIs_tmp
            AMIs += AMIs_tmp
            VMIs += VMSs_tmp
            exec_time += exec_time_tmp

            data_df_2[num_run] = {
                'nbr cluster': list(NumClusters_tmp),
                'ari': list(ARIs_tmp),
                'ami': list(AMIs_tmp),
                'v-score': list(VMSs_tmp),
                'exec_time': list(exec_time_tmp),
                'ml' : ml,
                'cl' : cl
            }

        # Turn the sums into empirical means over the runs.
        NumClusters /= runs
        ARIs /= runs
        AMIs /= runs
        VMIs /= runs
        exec_time /= runs

        # Simple sanity checks: one value per query of the budget.
        assert ARIs.shape[0] == budget
        assert NumClusters.shape[0] == budget

        data_df = {
            'NumClusters (average)' : NumClusters,
            'ARI' : ARIs,
            'AMI' : AMIs,
            'VMI' : VMIs,
            'runtimes' : exec_time
        }

        df = pd.DataFrame(data=data_df)

        # round() before int(): plain int() truncates, so a threshold such as 0.57
        # (0.57 * 100 == 56.99999999999999) would be labelled 56 in the file name.
        threshold_pct = int(round(threshold * 100))
        output_prefix = results_dir + dataset_stem + "_budget_" + str(budget) + "_" + str(threshold_pct)

        df.to_csv(output_prefix + ".csv", index=False)

        with open(output_prefix + "_num clusters per run.json", "w") as write_file:
            json.dump(data_df_2, write_file, indent=4)


        

Dataset: wine.arff ... 
Threshold: 0.5
   run n0
pt1:50
explanations[0]:[1.305e+01 1.730e+00 2.040e+00 1.240e+01 9.200e+01 2.720e+00 3.270e+00
 1.700e-01 2.910e+00 7.200e+00 1.120e+00 2.910e+00 1.150e+03]
data1--------- [1.305e+01 1.730e+00 2.040e+00 1.240e+01 9.200e+01 2.720e+00 3.270e+00
 1.700e-01 2.910e+00 7.200e+00 1.120e+00 2.910e+00 1.150e+03]
please [[0.99507465]]
Voici le answer: True
Voici le answer: True
pt1:91
explanations[0]:[ 12.     1.51   2.42  22.    86.     1.45   1.25   0.5    1.63   3.6
   1.05   2.65 450.  ]
data1--------- [ 12.     1.51   2.42  22.    86.     1.45   1.25   0.5    1.63   3.6
   1.05   2.65 450.  ]
please [[0.99824073]]
Voici le answer: True
Voici le answer: True
data1--------- None


ValueError: Expected 2D array, got 1D array instead:
array=[nan].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.