## I. Import + Load Data

In [1]:
# ------- IMPORTS
# -- model(s)
from cobras_ts.querier.labelquerier import LabelQuerier
from xcobras_kmeans import XCOBRAS_kmeans
from model_explainer import PartTwo
import shap

# -- metrics
from sklearn.metrics import adjusted_rand_score

# -- plot(s)
from utils.plots import plot_2D, plot_boundary
import matplotlib.pyplot as plt

# -- dataset(s) 
from sklearn import datasets
from scipy.io import arff
import pandas as pd
import numpy as np

# -- others
import warnings
warnings.filterwarnings("ignore")


# ------- CREATE TOY DATASET
# Three synthetic sets of `n_samples` points each: two 3-centre, 2-feature
# blob sets that differ only by their seed, and one noisy two-moons set.
n_samples = 500
X_blob0, y_blob0 = datasets.make_blobs(
    n_samples=n_samples,
    n_features=2,
    centers=3,
    random_state=0,
)
X_blob1, y_blob1 = datasets.make_blobs(
    n_samples=n_samples,
    n_features=2,
    centers=3,
    random_state=1,
)
X_moons1, y_moons1 = datasets.make_moons(
    n_samples=n_samples,
    noise=0.1,
    random_state=1,
)
# TODO: decide how to display these toy datasets

# ------- READ REAL DATASET
# Load a few ARFF files from the benchmark folder into `data`, a dict that maps
# the dataset name (last path component, without extension) to a DataFrame.
PATH = "../../../datasets/deric benchmark/"
ARTIFICIAL = PATH+"artificial/"
REALWORLD = PATH+"real-world/"

datasets_path = [
    ARTIFICIAL+"target",
    REALWORLD+"wine",
    REALWORLD+"iris"
]

data = {}
for dataset_path in datasets_path:
    # The context manager closes the file as soon as it has been parsed; the
    # previous bare open(...) left one handle open per dataset.
    with open(dataset_path+".arff", 'r') as arff_file:
        temp_data = arff.loadarff(arff_file)
    dataset_name = dataset_path.split("/")[-1]
    # loadarff returns (records, metadata); only the records are kept.
    frame = pd.DataFrame(temp_data[0])
    # The "class" column is loaded as bytes; decode it to str.
    frame["class"] = frame["class"].str.decode('utf-8')
    data[dataset_name] = frame

In [2]:
# Work on the wine dataset: "class" holds the labels, every other column is a
# feature.
wine = data["wine"]
y = wine["class"]
X = wine.drop(columns=["class"])

feature_names = X.columns.tolist()

# Passed to XCOBRAS_kmeans(...) in the next cell
# (presumably the maximum number of queries it may ask — confirm).
budget = 160

In [3]:
# Cluster the wine features with XCOBRAS, then compare the predicted cluster
# labels with the dataset's "class" column using the adjusted Rand index.
xcobras_kmeans = XCOBRAS_kmeans(budget)

print("Fitting...")
# presumably the querier answers constraint queries from the true labels — confirm
querier = LabelQuerier(y.values)
xcobras_kmeans.fit(X.values, querier)

print("Predicting...")
y_hat = xcobras_kmeans.predict(X.values)

ari = adjusted_rand_score(y_hat, y)
print(f"ARI: {ari:.2f}")

Fitting...


In [11]:
# Fit PartTwo on the same feature matrix, using the cluster labels predicted by
# XCOBRAS (y_hat) as targets instead of the dataset's own "class" column.
# verbose=True presumably enables the score report printed below this cell —
# confirm in model_explainer.
part_two = PartTwo(verbose=True)
part_two.fit(X.values, y_hat)

---------Some scores:---------
------------------------------
f1-score (macro): 0.8975108711950818
         (micro): 0.9027777777777778
accuracy_score:   0.9027777777777778
------------------------------


### Shap

**General idea:**  
1. Select all the instances assigned to one cluster
2. Compute explanations (SHAP values) for the whole cluster
3. Show the explanation of 2 samples

In [None]:
# 1  .................
# Pick one of the predicted clusters and keep only the rows of X assigned to it.
# (shap is already imported in the first cell, so it is not re-imported here.)
rng = np.random.default_rng(0)  # fixed seed: reruns explain the same cluster
cluster_labels = np.asarray(y_hat)
label = rng.choice(np.unique(cluster_labels))
print(f"Chosen label: {label}")
# The comparison must be element-wise on an array: `list(y_hat) == label`
# compares the whole list with a scalar and yields a single False, which makes
# `X[False]` fail instead of filtering rows.
X_label = X.loc[cluster_labels == label]
print(f"Number of instances: {X_label.shape[0]}")

In [None]:
# 2  .................
# Build one SHAP explainer per model and explain every instance of the chosen
# cluster. Both explainers get the cluster's instances as background data and
# are called on the same NumPy array, so the two sets of SHAP values are
# computed from identical inputs and are directly comparable.
X_label_values = X_label.values

#  ---- COBRAS
explainer_XCOBRAS_kmeans = shap.Explainer(
    xcobras_kmeans.predict,
    X_label_values,
    feature_names=feature_names
)
print("XCOBRAS explainer .... ")
XCOBRAS_shap_values = explainer_XCOBRAS_kmeans(X_label_values)


#  ---- PartTwo (RBF SVM)
explainer_part_two = shap.Explainer(
    part_two.predict,
    X_label_values,
    feature_names=feature_names
)
print("PartTwo explainer .... ")
# Was called with the DataFrame X_label; use the array like everywhere else
# (PartTwo was fitted on X.values, and the plots below use X_label.values).
part_two_shap_values = explainer_part_two(X_label_values)

In [None]:
# 3  .................
# Summary plot of the SHAP values computed for XCOBRAS on the chosen cluster.
shap.summary_plot(XCOBRAS_shap_values, X_label.values, cmap="plasma")

In [None]:
# Summary plot of the SHAP values computed for PartTwo on the same cluster.
shap.summary_plot(part_two_shap_values, X_label.values, cmap="plasma")