In [16]:
import pandas as pd
import numpy as np
from collections import Counter
import math

### 0- Chargement du dataset

In [17]:
def charger_dataset(file_name):
    """Charge un fichier CSV et retourne le df."""
    df = pd.read_csv('Dataset-Exos.txt', delimiter=',', header=None, names=['sepal_length','sepal_width','petal_length', 'petal_width', 'classe'])
    #turn the 4 first columns into float
    df[['sepal_length','sepal_width','petal_length', 'petal_width']] = df[['sepal_length','sepal_width','petal_length', 'petal_width']].astype(float)
    df = df.dropna( )

    return df
    
    
df = charger_dataset('Dataset-Exos.txt')

In [18]:
Q1 = df['sepal_width'].quantile(0.25)
Q3 = df['sepal_width'].quantile(0.75)

IQR = Q3 - Q1 

outliers_threshold = 1.5 * IQR 
max_outliers = Q3 + outliers_threshold
min_outliers = Q1 - outliers_threshold

In [19]:
column2 = df['sepal_width'][(df['sepal_width']< max_outliers) & (df['sepal_width'] > min_outliers)]
df['sepal_width'] = column2
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,classe
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [20]:
classes = df['classe']
# remove column class
df_copy = df.drop(columns=['classe'])
df_copy = df_copy.apply(pd.to_numeric, errors='coerce')

df_copy

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


#### Supression des valeurs null de notre dataset

In [21]:
print(df_copy.isnull().sum())

sepal_length    0
sepal_width     4
petal_length    0
petal_width     0
dtype: int64


In [22]:
df_copy = df_copy.dropna()
print(df_copy.isnull().sum())

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
dtype: int64


### 1- les fonctions permettant de calculer le distance entre deux instances du dataset.

In [23]:
def manhattan_distance(instance_a, instance_b):
        return sum(abs(a - b) for a, b in zip(instance_a, instance_b))

def euclidean_distance(instance_a, instance_b):
        return np.sqrt(sum((a - b)**2 for a, b in zip(instance_a, instance_b)))

def minkowski_distance(instance_a, instance_b, p):
        return sum(abs(a - b)**p for a, b in zip(instance_a, instance_b))**(1/p)

def cosine_distance(instance_a, instance_b):
        return 1 - (sum(a * b for a, b in zip(instance_a, instance_b)) / (np.sqrt(sum(a**2 for a in instance_a)) * np.sqrt(sum(b**2 for b in instance_b))))

def hamming_distance(instance_a, instance_b):
        return sum(a != b for a, b in zip(instance_a, instance_b))

### 2- fonction permettant de calculer le centroïde d’un ensemble d’instances.

In [24]:
def get_centroid(dataset, k):
    centroids = []  # List to store the centroids
    for _ in range(k):  # Loop k times to select k centroids
        centroids.append(dataset.sample(replace=False).iloc[0])  # Randomly select a single instance and add it to centroids
    return pd.DataFrame(centroids).reset_index(drop=True)  # Convert centroids to a DataFrame and reset index

# Get 3 centroids from the 'df_copy' dataset
get_centroid(df_copy, 3)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.8,2.6,4.0,1.2
1,4.9,2.4,3.3,1.0
2,5.1,3.7,1.5,0.4


### 3- fonction permettant de trouver le cluster dont une instance donnée est la plus proche.

In [25]:
def find_closest_cluster(instance, centroids, distance_function):

    centroids_df = pd.DataFrame(centroids)  # Convert centroids list to a DataFrame
    # Calculate distances between the instance and each centroid using the provided distance function
    distances = centroids_df.apply(lambda row: distance_function(row, instance), axis=1)
    return distances.idxmin()  # Return the index of the centroid with the minimum distance

# Find the closest cluster
centroids = get_centroid(df_copy, 3)  # Get 3 centroids
find_closest_cluster(df_copy.iloc[0], centroids, cosine_distance)  # Find the closest cluster for the first instance in 'df_copy'


0

### 5- Implémentation de l’algorithme k-means et déduction des clusters formés.

In [26]:
# Désactiver les messages d'avertissement pour les affectations en chaîne
pd.options.mode.chained_assignment = None  # par défaut='warn'

def kmeans(dataset, k, distance_function, max_iterations=100):
    # Choisir k instances aléatoirement comme centroïdes
    centroids = get_centroid(dataset, k)

    # Initialiser la colonne 'cluster' avec des valeurs aléatoires
    dataset['cluster'] = np.random.randint(0, k+5, size=len(dataset))

    for iteration in range(max_iterations):
        # Affecter chaque instance au groupe le plus proche de son centroïde
        clusters = dataset.apply(lambda row: find_closest_cluster(row[:-1], centroids, distance_function), axis=1)
        dataset['cluster'] = clusters

        # Mettre à jour les centroïdes
        # calculant la moyenne de chaque columns qui a le meme cluster 
        # new centroide position == (donc la moyenne des postions des clusters proche a le centroides "i")
        new_centroids = dataset.groupby('cluster').mean().reset_index(drop=True)
        print(new_centroids)
        # Vérifier la convergence
        if centroids.equals(new_centroids):
            print('Après {} itérations, la convergence atteinte .'.format(iteration))
            break
        centroids = new_centroids

    return dataset

# Exemple d'utilisation
k = 3
result_dataset = kmeans(df_copy,k , cosine_distance)
#print(result_dataset)

   sepal_length  sepal_width  petal_length  petal_width
0      5.425000     2.475000      3.666667     1.133333
1      4.976596     3.365957      1.463830     0.244681
2      6.391954     2.936782      5.093103     1.758621
   sepal_length  sepal_width  petal_length  petal_width
0      5.550000     2.646154      3.957692     1.226923
1      4.976596     3.365957      1.463830     0.244681
2      6.532877     2.964384      5.263014     1.845205
   sepal_length  sepal_width  petal_length  petal_width
0      5.669697     2.660606      4.063636     1.257576
1      4.976596     3.365957      1.463830     0.244681
2      6.577273     2.990909      5.348485     1.895455
   sepal_length  sepal_width  petal_length  petal_width
0      5.727027     2.705405      4.116216     1.278378
1      4.976596     3.365957      1.463830     0.244681
2      6.601613     2.985484      5.400000     1.924194
   sepal_length  sepal_width  petal_length  petal_width
0      5.790476     2.721429      4.183333     1

In [27]:
result_dataset

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,cluster
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,0
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


# Evaluation des résultats en utilisant le coefficient de silhouette. 

In [28]:
def calculate_a_b(cluster, instance, dataset, distance_function):
    a_values = []
    b_values = []

    # Parcourir l'ensemble de données
    for i in range(len(dataset)):
        if dataset.iloc[i]['cluster'] == cluster:
            # Calculer la distance entre l'instance et les autres instances dans le même cluster
            a_values.append(distance_function(instance, dataset.iloc[i, :-1]))
        else:
            # Calculer la distance entre l'instance et les instances des autres clusters
            b_values.append(distance_function(instance, dataset.iloc[i, :-1]))

    # Calculer les valeurs a et b
    a = np.mean(a_values)  # Moyenne des distances à l'intérieur du cluster
    b = np.min(b_values) if len(b_values) > 0 else np.inf  # Distance minimale aux instances d'autres clusters, infini si pas d'autres clusters

    return a, b

In [29]:
def silhouette_score(dataset, distance_function):
    scores = []

    # Parcourir toutes les instances dans l'ensemble de données
    for i in range(len(dataset)):
        instance = dataset.iloc[i, :-1]  # Récupérer l'instance
        cluster = dataset.iloc[i]['cluster']  # Récupérer le cluster de l'instance

        # Calculer les valeurs a et b pour chaque instance
        a, b = calculate_a_b(cluster, instance, dataset, distance_function)

        # Calculer le coefficient de silhouette pour l'instance actuelle
        silhouette_i = (b - a) / max(a, b)
        scores.append(silhouette_i)  # Ajouter le coefficient de silhouette à la liste

    # Calculer la moyenne des coefficients de silhouette pour obtenir le coefficient de silhouette moyen
    silhouette_avg = np.mean(scores)
    return silhouette_avg

In [30]:
# Calcul du coefficient de silhouette moyen pour les clusters obtenus précédemment (result_dataset)
silhouette_avg = silhouette_score(result_dataset, cosine_distance)
print(f'Coefficient de silhouette moyen pour k = {k}: {silhouette_avg}')

Coefficient de silhouette moyen pour k = 3: 0.2201063623575435
