In [9]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import cohen_kappa_score
from itertools import product
from sklearn.cluster import AgglomerativeClustering
import skfuzzy as fuzz
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
import scipy.cluster.hierarchy as sch
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from yellowbrick.cluster import KElbowVisualizer

# Hard and Soft Clustering

In [10]:
items_similarity = pd.read_csv("./data/items_similarity.csv")
items_similarity = items_similarity.set_index('item_id')
# print(items_similarity.shape)
# items_similarity

## PCA

In [11]:
X_std = StandardScaler().fit_transform(items_similarity)
pca = decomposition.PCA(n_components=392)
features = pca.fit_transform(items_similarity)

## K-means Clustering

In [12]:
#Applying kmeans to the dataset / Creating the kmeans classifier
kmeans = KMeans(n_clusters = 2, init = 'k-means++', max_iter = 10000, n_init = 50, random_state = 0)
y_kmeans_1_ = kmeans.fit_predict(features)

## Fuzzy Clustering

In [13]:
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(features.T, 2, 2, error=0.005, maxiter=1000, init=None)
cluster_membership = np.argmax(u, axis=0)

# Hierarchical Clustering

In [14]:
hc = AgglomerativeClustering(n_clusters=2,affinity="euclidean",linkage="average")
y_hc = hc.fit_predict(features)

# Clustering Validation

In [22]:
from jqmcvi import base
from sklearn.metrics import davies_bouldin_score, calinski_harabasz_score

## Indice de Dunn

In [20]:
# We store the clusters
from jqmcvi import base
# Kmeans Clusters
kmeans_clus0 = features[y_kmeans_1_ == 0,:]
kmeans_clus1 = features[y_kmeans_1_ == 1,:]
kmeans_clusters = [kmeans_clus0, kmeans_clus1]
# Hierarchical Clusters
agglo_clus0 = features[y_hc == 0,:]
agglo_clus1 = features[y_hc == 1,:]
agglo_clusters = [agglo_clus0, agglo_clus1]
# Fuzzy Clusters
fuzzy_clus0 = features[cluster_membership == 0,:]
fuzzy_clus1 = features[cluster_membership == 1,:]
fuzzy_clusters = [fuzzy_clus0, fuzzy_clus1]
  
print(f"Kemans ==> {base.dunn(kmeans_clusters)} ")
print(f"Agglo ==> {base.dunn(agglo_clusters)}")
print(f"Fuzzy ==> {base.dunn(fuzzy_clusters)}")

Kemans ==> 0.4275280514357695 
Agglo ==> 0.5704708340372251
Fuzzy ==> 0.4275280514357695


## Indice Davies-Bouldin

In [21]:
print(f"Kemans ==> {davies_bouldin_score(features, y_kmeans_1_)}")
print(f"Agglo ==> {davies_bouldin_score(features, y_hc)}")
print(f"Fuzzy ==> {davies_bouldin_score(features, cluster_membership)}")

Kemans ==> 0.5862468799936242
Agglo ==> 0.5766252914501527
Fuzzy ==> 0.5862468799936242


# Indice Calinski Harabasz

In [24]:
print(f"Kemans ==> {calinski_harabasz_score(features, y_kmeans_1_)}")
print(f"Agglo ==> {calinski_harabasz_score(features, y_hc)}")
print(f"Fuzzy ==> {calinski_harabasz_score(features, cluster_membership)}")

Kemans ==> 2413.693346711567
Agglo ==> 2368.281414740965
Fuzzy ==> 2413.693346711567


## silhouette_score

In [18]:
# Comparaison of models
print(f'hierarchical clustering ==> Silhouette Score(n=2): {silhouette_score(features, y_hc)}')
print(f'kmeans clustering ==> Silhouette Score(n=2): {silhouette_score(features, y_kmeans_1_)}')
print(f'fuzzy clustering ==> Silhouette Score(n=2): {silhouette_score(features,cluster_membership)}')
# ==> Enfin, la partition avec le SI le plus élevé est considérée comme optimale

hierarchical clustering ==> Silhouette Score(n=2): 0.620315008119564
kmeans clustering ==> Silhouette Score(n=2): 0.6209031100633661
fuzzy clustering ==> Silhouette Score(n=2): 0.6209031100633661
