### import librairies

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import time
import seaborn as sns

### Load original_data

In [2]:
data = pd.read_csv("Animal_Original.csv")
# Assuming every other feature starting from the first one
selected_features = data.columns[::2][:100]

X = data[selected_features]
y = data.iloc[:, -1]
# Split data into trainig 75% and test 25%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [5]:
print(" Data size", data.shape)
print("X_train size",X_train.shape)
print("X_test size", X_test.shape)

 Data size (1499, 201)
X_train size (1124, 100)
X_test size (375, 100)


### Clustering  using k-Means and k-Medoids via Elbow 

In [3]:
"""Split your training original data by class"""
X_train_class_0 = X_train[y_train == 0]  # Class 0
y_train_class_0 = y_train[y_train == 0]  # Class 0
X_train_class_1 = X_train[y_train == 1]  # Class 1
y_train_class_1 = y_train[y_train == 1]  # Class 1

#### 1) Cluster's number selection:

#### a) For k-Medoids

In [4]:
k_values_Elbow = range(4, 80)
threshold = 0.9
#class 1
n = X_train_class_1.shape[0]
inertia_values_kmedoids_p = []
for k in k_values_Elbow:
    kmedoids = KMedoids(n_clusters=k,init='k-medoids++')
    kmedoids.fit(X_train_class_1)
    inertia_values_kmedoids_p.append(kmedoids.inertia_)
    # Choosing k using the elbow method for K-Medoids
inertia_cumsum_kmedoids_p = np.cumsum(inertia_values_kmedoids_p)
total_inertia_kmedoids_p = inertia_cumsum_kmedoids_p[-1]
chosen_k_medoids_p = next(k for k, inertia in zip(k_values_Elbow, inertia_cumsum_kmedoids_p / total_inertia_kmedoids_p) if inertia >= threshold)
print("Chosen k for class 1 with K-Medoids using Elbow method for n={} is k_opt = {}".format(n, chosen_k_medoids_p))
#class 0
n_0 = X_train_class_0.shape[0]
inertia_values_kmedoids_n = []
for k in k_values_Elbow:
    # K-Medoids
    kmedoids = KMedoids(n_clusters=k,init='k-medoids++')
    kmedoids.fit(X_train_class_0)
    inertia_values_kmedoids_n.append(kmedoids.inertia_)
    # Choosing k using the elbow method for K-Medoids
inertia_cumsum_kmedoids_n = np.cumsum(inertia_values_kmedoids_n)
total_inertia_kmedoids_n = inertia_cumsum_kmedoids_n[-1]
chosen_k_medoids_n = next(k for k, inertia in zip(k_values_Elbow, inertia_cumsum_kmedoids_n / total_inertia_kmedoids_n) if inertia >= threshold)
print("Chosen k for class 0 with K-Medoids using Elbow method for n={} is k_opt = {}".format(n_0, chosen_k_medoids_n))

Chosen k for class 1 with K-Medoids using Elbow method for n=754 is k_opt = 70
Chosen k for class 0 with K-Medoids using Elbow method for n=370 is k_opt = 64


#### b) For k-Means

In [6]:
#class 1
inertia_values_kmeans_p = []
for k in k_values_Elbow:
    # K-Means
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10)
    kmeans.fit(X_train_class_1)
    inertia_values_kmeans_p.append(kmeans.inertia_)
    # Choosing k using the elbow method for K-Means
inertia_cumsum_kmeans_p = np.cumsum(inertia_values_kmeans_p)
total_inertia_kmeans_p = inertia_cumsum_kmeans_p[-1]
chosen_k_means_p = next(k for k, inertia in zip(k_values_Elbow, inertia_cumsum_kmeans_p/ total_inertia_kmeans_p) if inertia >= threshold)
print("Chosen k for class 1 with K-Means using Elbow method for n={} is k_opt = {}".format(n, chosen_k_means_p))
#N class 0
inertia_values_kmeans_n = []
for k in k_values_Elbow:
    # K-Means
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10)
    kmeans.fit(X_train_class_0)
    inertia_values_kmeans_n.append(kmeans.inertia_)
    # Choosing k using the elbow method for K-Means
inertia_cumsum_kmeans_n = np.cumsum(inertia_values_kmeans_n)
total_inertia_kmeans_n = inertia_cumsum_kmeans_n[-1]
chosen_k_means_n = next(k for k, inertia in zip(k_values_Elbow, inertia_cumsum_kmeans_n/ total_inertia_kmeans_n) if inertia >= threshold)
print("Chosen k for class 0 with K-Means using Elbow method for n={} is k_opt = {}".format(n_0, chosen_k_means_n))

Chosen k for class 1 with K-Means using Elbow method for n=754 is k_opt = 66
Chosen k for class 0 with K-Means using Elbow method for n=370 is k_opt = 58


#### 2) Reduced training-data (samples):

#### a) Using k-Medoids

In [7]:
#Class 1
kmedoidsp=KMedoids(chosen_k_medoids_p,init='k-medoids++',max_iter=500).fit(X_train_class_1)
medoid_indices_p=kmedoidsp.medoid_indices_
x_reduced_medoid_p=np.array(X_train_class_1)[medoid_indices_p]
y_reduced_medoid_p=np.array(y_train_class_1)[medoid_indices_p]
#Class 0
kmedoidsn=KMedoids(n_clusters=chosen_k_medoids_n,init='k-medoids++',max_iter=500).fit(X_train_class_0)
medoid_indices_n=kmedoidsn.medoid_indices_
x_reduced_medoid_n=np.array(X_train_class_0)[medoid_indices_n]
y_reduced_medoid_n=np.array(y_train_class_0)[medoid_indices_n]
#Concatenate
X_red_kmedoid = np.concatenate([x_reduced_medoid_p, x_reduced_medoid_n])
y_red_kmedoid = np.concatenate([y_reduced_medoid_p, y_reduced_medoid_n])

"""Shapes!"""
X_red_kmedoid_size=X_red_kmedoid.shape
print("X_red_kmedoid_size",X_red_kmedoid_size)

X_red_kmedoid_size (134, 100)


#### b) Using k-Means

In [8]:
#Class 1
kmeansp=KMeans(n_clusters=chosen_k_means_p,n_init=10,init='k-means++').fit(X_train_class_1)
centroids1=kmeansp.cluster_centers_
knn=NearestNeighbors(n_neighbors=1).fit(X_train_class_1)
distances, indices = knn.kneighbors(centroids1)
x_reduced_1 = np.array(X_train_class_1)[np.squeeze(indices)]
y_reduced_1 = np.array(y_train_class_1)[np.squeeze(indices)]
#Class 0
kmeansn=KMeans(n_clusters=chosen_k_means_n,n_init=10,init='k-means++').fit(X_train_class_0)
centroids_0=kmeansn.cluster_centers_
knn_0=NearestNeighbors(n_neighbors=1).fit(X_train_class_0)
distances_0, indices_0 = knn_0.kneighbors(centroids_0)
x_reduced_0 = np.array(X_train_class_0)[np.squeeze(indices_0)]
y_reduced_0 = np.array(y_train_class_0)[np.squeeze(indices_0)]
#concatenate
x_reduced_kmeans=np.concatenate([x_reduced_1,x_reduced_0])
y_reduced_kmeans=np.concatenate([y_reduced_1,y_reduced_0])
"""Shapes!"""
X_red_kmeans_size=x_reduced_kmeans.shape
print("X_red_kmeans_size=",X_red_kmeans_size)



X_red_kmeans_size= (124, 100)




### Learn and predict GPC / RGPC_kMedoids / RGPC_kMeans

#### 1) The Gaussian Process Classifier with RBF kernel for the original case

In [9]:
kernel =RBF(length_scale=1.0)
# Record the starting time
start_time_GPC = time.time()
# Learn the model
model_GPC = GaussianProcessClassifier(kernel=kernel).fit(X_train, y_train)
# Prediction 
y_pred_GPC=model_GPC.predict(X_test)
# Evaluation
accuracy_GPC = accuracy_score(y_test,y_pred_GPC)
auc_GPC = roc_auc_score(y_test, y_pred_GPC)
f1_score_GPC = f1_score(y_test, y_pred_GPC)
recall_GPC = recall_score(y_test, y_pred_GPC)
f1_score_GPC = f1_score(y_test, y_pred_GPC)
# Record the ending time
end_time_GPC = time.time()
execution_time_GPC=end_time_GPC-start_time_GPC
print("GPC execution time=",execution_time_GPC)

GPC execution time= 23.413249969482422


#### 2) The k-Medoids reduced Gaussian Classifier

In [10]:
# Record the starting time
start_time_RGPC_med = time.time()
# Learn the model
reduced_model_medoid = GaussianProcessClassifier(kernel).fit(X_red_kmedoid, y_red_kmedoid)
# Prediction
y_pred_medoid=reduced_model_medoid.predict(X_test) 
# Evaluation
accuracy_RGPC_kMedoids = accuracy_score(y_test,y_pred_medoid)
auc_RGPC_kMedoids = roc_auc_score(y_test, y_pred_medoid)
recall_RGPC_kMedoids = recall_score(y_test, y_pred_medoid)
f1_score_RGPC_kMedoids = f1_score(y_test, y_pred_medoid)
# Record the ending time
end_time_RGPC_med = time.time()
execution_time_RGPC_med=end_time_RGPC_med-start_time_RGPC_med
print("k-Medoids RGPC execution time=",execution_time_RGPC_med)

k-Medoids RGPC execution time= 0.354783296585083




#### 3) The k_Means reduced Gaussian Classifier

In [13]:
# Record the starting time
start_time_RGPC_mean = time.time()
# Learn the model
reduced_model_mean = GaussianProcessClassifier(kernel, random_state=42).fit(x_reduced_kmeans, y_reduced_kmeans)
# Prediction
y_pred_mean=reduced_model_mean.predict(X_test)
# Evaluation
accuracy_RGPC_kMeans = accuracy_score(y_test,y_pred_mean)
auc_RGPC_kMeans = roc_auc_score(y_test, y_pred_mean)
recall_RGPC_kMeans = recall_score(y_test, y_pred_mean)
f1_score_RGPC_kMeans = f1_score(y_test, y_pred_mean)
# Record the ending time
end_time_RGPC_mean = time.time()
execution_time_RGPC_mean=end_time_RGPC_mean-start_time_RGPC_mean
print("k-Means RGPC execution time=",execution_time_RGPC_mean)

k-Means RGPC execution time= 0.2518439292907715




### Comparison

In [20]:

# Evaluation metrics for GPC
evaluation_GPC = [accuracy_GPC, auc_GPC, f1_score_GPC]
execution_time_GPC = execution_time_GPC

# Evaluation metrics for RGPC_kMeans
evaluation_RGPC_kMeans = [accuracy_RGPC_kMeans, auc_RGPC_kMeans, f1_score_RGPC_kMeans]
execution_time_RGPC_mean = execution_time_RGPC_mean

# Evaluation metrics for RGPC_kMedoids
evaluation_RGPC_kMedoids = [accuracy_RGPC_kMedoids, auc_RGPC_kMedoids, f1_score_RGPC_kMedoids]
execution_time_RGPC_med = execution_time_RGPC_med

# Define the metrics
metrics = ['Accuracy', 'AUC', 'F1-score']

# Create the DataFrame
df = pd.DataFrame({'Metric': metrics,
                   'GPC': evaluation_GPC,
                   'RGPC_kMeans': evaluation_RGPC_kMeans,
                   'RGPC_kMedoids': evaluation_RGPC_kMedoids,
                   'Execution Time ': [execution_time_GPC, execution_time_RGPC_mean, execution_time_RGPC_med]})

# Set the Metric column as index
df.set_index('Metric', inplace=True)

# Display the DataFrame
print(df)


               GPC  RGPC_kMeans  RGPC_kMedoids  Execution Time (s)
Metric                                                            
Accuracy  0.928000     0.888000       0.882667           23.413250
AUC       0.905785     0.862674       0.847970            0.251844
F1-score  0.946535     0.916667       0.914397            0.354783
