Libraries

In [119]:
import pandas as pd
import numpy as np 
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

Data set reading

In [120]:
df = pd.read_csv('Dry_Bean_Dataset.csv')
print(df.head())
print(df.columns)

    Area  Perimeter  MajorAxisLength  MinorAxisLength  AspectRation  \
0  28395    610.291       208.178117       173.888747      1.197191   
1  28734    638.018       200.524796       182.734419      1.097356   
2  29380    624.110       212.826130       175.931143      1.209713   
3  30008    645.884       210.557999       182.516516      1.153638   
4  30140    620.134       201.847882       190.279279      1.060798   

   Eccentricity  ConvexArea  EquivDiameter    Extent  Solidity  roundness  \
0      0.549812       28715     190.141097  0.763923  0.988856   0.958027   
1      0.411785       29172     191.272750  0.783968  0.984986   0.887034   
2      0.562727       29690     193.410904  0.778113  0.989559   0.947849   
3      0.498616       30724     195.467062  0.782681  0.976696   0.903936   
4      0.333680       30417     195.896503  0.773098  0.990893   0.984877   

   Compactness  ShapeFactor1  ShapeFactor2  ShapeFactor3  ShapeFactor4  Class  
0     0.913358      0.007332  

NULL

In [121]:
# Print the number of missing values in each column
missing_values = df.isnull().sum()
print("\nNumber of missing values in each column:")
print(missing_values)
#NO missing value in this data set


Number of missing values in each column:
Area               0
Perimeter          0
MajorAxisLength    0
MinorAxisLength    0
AspectRation       0
Eccentricity       0
ConvexArea         0
EquivDiameter      0
Extent             0
Solidity           0
roundness          0
Compactness        0
ShapeFactor1       0
ShapeFactor2       0
ShapeFactor3       0
ShapeFactor4       0
Class              0
dtype: int64


Noise

In [122]:
y=df['Class']
df = df.drop(columns=['Class'])
# Initialize the LOF model
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)

# Fit the model to your data
outlier_scores = lof.fit_predict(df)

# Identify outliers
outliers = df[outlier_scores == -1]

# Handle outliers as needed -- Creating new data set
cleaned_df = df[outlier_scores != -1]
cleaned_y = y[outlier_scores != -1]  # Remove corresponding rows from y
cleaned_df.to_csv('cleaned.csv', index=False)
df = cleaned_df
y = cleaned_y  # Update y with the cleaned labels
print(len(y))

12250


Find the number of categories in Target

In [123]:
#remove target beacuse we have clustering
num_unique_classes = len(y.unique())
print("Number of unique classes:", num_unique_classes)

Number of unique classes: 7


Standardization
--> Without scaling the results were worse so I used standardization

In [124]:
scaler = StandardScaler()
# Fitting the scaler 
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data,columns=df.columns)
df=scaled_df

Distance

In [125]:
def euclidean_distance(X, Y):
    return np.sqrt(np.sum(np.power(X - Y, 2), axis=1))

clustering

In [126]:
def cluster_data(solution, data):
    global num_clusters, feature_vector_length
    cluster_centers = []
    all_clusters_dists = []
    clusters = []
    clusters_sum_dist = []

    for clust_idx in range(num_clusters):
        cluster_centers.append(solution[feature_vector_length*clust_idx:feature_vector_length*(clust_idx+1)])
        cluster_center_dists = euclidean_distance(data, cluster_centers[clust_idx])
        all_clusters_dists.append(np.array(cluster_center_dists))

    cluster_centers = np.array(cluster_centers)
    all_clusters_dists = np.array(all_clusters_dists)

    cluster_indices = np.argmin(all_clusters_dists, axis=0)
    for clust_idx in range(num_clusters):
        clusters.append(np.where(cluster_indices == clust_idx)[0])
        if len(clusters[clust_idx]) == 0:
            clusters_sum_dist.append(0)
        else:
            clusters_sum_dist.append(np.sum(all_clusters_dists[clust_idx, clusters[clust_idx]]))

    clusters_sum_dist = np.array(clusters_sum_dist)

    return cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist

Fitness function

In [127]:
def fitness_func(chromosome, data):
    _, _, _, _, clusters_sum_dist = cluster_data(chromosome, data)
    fitness_value = 1.0 / (np.sum(clusters_sum_dist) + 0.00000001)
    return fitness_value

Crossover function

In [128]:
def crossover(parent1, parent2):
    # Ensure the length of parents is same
    assert len(parent1) == len(parent2), "Parents should have the same length"

    # Perform crossover operation
    crossover_point = np.random.randint(0, len(parent1))  # Randomly select crossover point
    offspring1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
    offspring2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])

    return offspring1, offspring2

Mutation function

In [129]:
def mutation(chromosome, mutation_rate):
    for gene_index in range(len(chromosome)):
        if np.random.rand() < mutation_rate:
            chromosome[gene_index] += np.random.uniform(-1, 1)  
    return chromosome

Selection function

In [130]:
def selection(population, fitness_values, num_parents):
    selected_parents = []
    total_fitness = np.sum(fitness_values)
    
    for _ in range(num_parents):
        # Generate a random number between 0 and total fitness
        rand_val = np.random.uniform(0, total_fitness)
        
        # Select the chromosome based on the cumulative fitness
        cumulative_fitness = 0
        for i, fitness_val in enumerate(fitness_values):
            cumulative_fitness += fitness_val
            if cumulative_fitness >= rand_val:
                selected_parents.append(population[i])
                break
    
    return selected_parents

MAIN-PROCESS

In [131]:
num_generations = 100
sol_per_pop = 10
mutation_rate = 0.1
num_parents_mating = 5
best_overall_fitness = -np.inf
best_overall_solution = None
best_num_clusters = None
# Loop through different values of num_clusters
for num_clusters in range(1, num_unique_classes*2):
    feature_vector_length = df.shape[1]
    num_genes = num_clusters * feature_vector_length

    # Initial population
    population = np.random.uniform(0, 1, (sol_per_pop, num_genes))

    for generation in range(num_generations):
        # Calculate fitness for each chromosome
        fitness_values = np.array([fitness_func(chromosome, df.values) for chromosome in population])
       
        # Select parents
        parents = selection(population, fitness_values, num_parents_mating)

        # Crossover
        offspring = [crossover(parents[i % len(parents)], parents[(i + 1) % len(parents)]) for i in range(sol_per_pop)]
        offspring = np.array(offspring).reshape((sol_per_pop * 2, -1))  # Flatten the list of offspring

        # Mutation
        for i in range(len(offspring)):
            offspring[i] = mutation(offspring[i], mutation_rate)

        # Replace the old population with the offspring
        population = offspring
        
    #for comparing with true labels    
    if num_clusters==num_unique_classes:
        equal_solution=population[best_solution_idx] 
    # Get the best solution for this num_clusters
    best_solution_idx = np.argmax(fitness_values)
    best_solution = population[best_solution_idx]
    best_fitness = fitness_values[best_solution_idx]

    print(f"Best solution for num_clusters={num_clusters}: {best_solution}")
    print(f"Best fitness for num_clusters={num_clusters}: {best_fitness}")
    print("---------------------------------------------------------------------------------")

    # Update the overall best solution if the current best fitness is better
    if best_fitness > best_overall_fitness:
        best_overall_fitness = best_fitness
        best_overall_solution = best_solution
        best_num_clusters = num_clusters
print("---------------------------------------------------------------------------------")
print("Best overall solution:", best_overall_solution)
print("Best overall fitness:", best_overall_fitness)
print("Best number of clusters:", best_num_clusters)

Best solution for num_clusters=1: [ 0.86505853  0.11887961  4.04746569  0.28293052  2.59611902  0.11571484
  1.15594305 -0.62842285 -1.70444138 -0.82463472 -0.45096291 -1.99006719
  0.56960481  2.71925377  1.59143003  0.09134793]
Best fitness for num_clusters=1: 1.1754135622962704e-05
---------------------------------------------------------------------------------
Best solution for num_clusters=2: [-0.04345251  4.52069351  1.39278607 -0.9627234   0.98332179  0.25193168
  0.63169903  3.17338566  1.35728692  1.72328044 -0.18082805 -1.07640044
 -3.716643   -0.53541639  0.01397144 -0.16983661  0.49213954 -1.58746574
  1.39700004  0.03310118 -1.85513884 -0.45680022  0.09281794 -0.77995344
  0.52608734  0.05137846 -1.13373684  2.57692362 -1.39463566 -0.65668344
  3.80346022  1.32204457]
Best fitness for num_clusters=2: 1.3422894710277979e-05
---------------------------------------------------------------------------------
Best solution for num_clusters=3: [-4.48648482e-01 -7.72442917e-01 -1

--------------------------------------

The correctness percentage of the algorithm ----> did not reach the full result

----------------------------------------------------

Assign Clusteres

In [132]:
def assign_cluster_labels(data, cluster_centers):
    cluster_labels = []
    for point in data:
        distances = [np.linalg.norm(point - center) for center in cluster_centers]
        closest_cluster = np.argmin(distances)
        cluster_labels.append(closest_cluster)
    return cluster_labels

Assign cluster labels to data points

In [133]:
# Assign cluster labels to data points
cluster_labels = assign_cluster_labels(df.values, equal_solution)
print(equal_solution)
print(len(y))
print(len(cluster_labels))

# Ensure both y and cluster_labels have the same length
if len(y) == len(cluster_labels):
    # Convert numerical cluster labels to string labels
    cluster_labels_str = [str(label) for label in cluster_labels]
    # Calculate F1 score
    f1 = f1_score(y, cluster_labels_str, average='weighted')
    

    # Calculate accuracy
    accuracy = accuracy_score(y, cluster_labels)

    print("F1 Score of Genetic Algorithm:", f1)
    print("Accuracy of Genetic Algorithm:", accuracy)
else:
    print("Error: Number of samples in true labels and cluster labels is not the same.")


[ 6.39009236e-01 -1.80333451e-02 -2.27909729e-02 -1.68875190e-01
  1.12266537e+00 -3.88933411e-02 -4.68568060e-01  1.07917610e+00
 -7.97610067e-01 -1.51876976e+00  2.22224934e+00  3.15641846e+00
  2.87281005e+00 -6.12639044e-01  2.07854063e-02  3.15322507e-01
 -1.85583614e+00  1.75303681e+00 -7.82437568e-01  1.14534047e+00
  1.60851732e-03  7.90338185e-01 -7.01105305e-01  1.59668592e+00
 -1.29053608e+00  1.07895424e+00  1.61050687e+00  2.77852444e+00
 -7.17819352e-01 -2.62385819e-01 -1.84637417e+00  4.08434560e-01
 -1.10320472e+00  4.45857721e+00 -6.20225500e-02 -2.29789528e-01
 -1.10332490e+00  3.31509771e-01 -1.33310043e+00 -1.07187913e+00
  1.08085722e+00  2.19843541e-01 -4.31716272e-01  4.19045923e-02
  2.42384595e+00  1.58935118e+00  1.57132278e+00  6.63350838e-01
  1.36577751e+00  3.72043738e+00 -1.48284328e+00  1.18018683e+00
 -2.70457990e+00  1.29682404e+00  1.87349929e-01  9.80913837e-01
 -3.50014985e+00  1.52589660e+00  9.74165073e-01  1.19300010e+00
  3.47908106e+00  4.35106

--------------------------------------

K-Means

In [134]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Define the range of numbers of clusters to iterate over
min_clusters = 2
max_clusters = 10

best_score = -1
best_num_clusters = -1

for num_clusters in range(min_clusters, max_clusters + 1):
    # Create an instance of KMeans with the desired number of clusters
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)  # Explicitly set n_init to suppress the warning

    # Fit the KMeans model to your data
    kmeans.fit(df)

    # Get the cluster labels for each data point
    cluster_labels = kmeans.labels_

    # Evaluate the clustering using silhouette score
    silhouette_avg = silhouette_score(df, cluster_labels)
    print("Number of clusters:", num_clusters, "| Silhouette Score:", silhouette_avg)

    # Update the best score and number of clusters if necessary
    if silhouette_avg > best_score:
        best_score = silhouette_avg
        best_num_clusters = num_clusters

print("Best number of clusters:", best_num_clusters)


Number of clusters: 2 | Silhouette Score: 0.4089523099010351
Number of clusters: 3 | Silhouette Score: 0.41365656768294096
Number of clusters: 4 | Silhouette Score: 0.3501223015238623
Number of clusters: 5 | Silhouette Score: 0.3602044271591858
Number of clusters: 6 | Silhouette Score: 0.304678124197644
Number of clusters: 7 | Silhouette Score: 0.3044389931235186
Number of clusters: 8 | Silhouette Score: 0.3061474105583051
Number of clusters: 9 | Silhouette Score: 0.3080722891916036
Number of clusters: 10 | Silhouette Score: 0.27200207712803853
Best number of clusters: 3
