In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [3]:
df = pd.read_excel("Dry_Bean_Dataset.xlsx")
df.info

<bound method DataFrame.info of         Area  Perimeter  MajorAxisLength  MinorAxisLength  AspectRation  \
0      28395    610.291       208.178117       173.888747      1.197191   
1      28734    638.018       200.524796       182.734419      1.097356   
2      29380    624.110       212.826130       175.931143      1.209713   
3      30008    645.884       210.557999       182.516516      1.153638   
4      30140    620.134       201.847882       190.279279      1.060798   
...      ...        ...              ...              ...           ...   
13606  42097    759.696       288.721612       185.944705      1.552728   
13607  42101    757.499       281.576392       190.713136      1.476439   
13608  42139    759.321       281.539928       191.187979      1.472582   
13609  42147    763.779       283.382636       190.275731      1.489326   
13610  42159    772.237       295.142741       182.204716      1.619841   

       Eccentricity  ConvexArea  EquivDiameter    Extent  Solidity 

In [4]:
df.columns[df.isna().any()].tolist()

[]

In [14]:
x = df.drop(columns="Class")
y = df["Class"]

In [15]:
def select_mating_pool(pop, fitness, num_parents):
    parents = np.empty((num_parents, pop.shape[1]))
    for parent_num in range(num_parents):
        max_fitness_idx = np.where(fitness == np.max(fitness))
        max_fitness_idx = max_fitness_idx[0][0]
        parents[parent_num, :] = pop[max_fitness_idx, :]
        fitness[max_fitness_idx] = -99999999999
    return parents


In [16]:
def crossover(parents, offspring_size):
    offspring = np.empty(offspring_size)
    crossover_point = np.uint8(offspring_size[1]/2)

    for k in range(offspring_size[0]):
        parent1_idx = k%parents.shape[0]
        parent2_idx = (k+1)%parents.shape[0]
        offspring[k, 0:crossover_point] = parents[parent1_idx, 0:crossover_point]
        offspring[k, crossover_point:] = parents[parent2_idx, crossover_point:]
    return offspring

In [17]:
def mutation(offspring_crossover, num_mutations=1):
    mutations_counter = np.uint8(offspring_crossover.shape[1] / num_mutations)
    for idx in range(offspring_crossover.shape[0]):
        gene_idx = mutations_counter - 1
        for mutation_num in range(num_mutations):
            random_value = np.random.uniform(-1.0, 1.0, 1)
            offspring_crossover[idx, gene_idx] = offspring_crossover[idx, gene_idx] + random_value
            gene_idx = gene_idx + mutations_counter
    return offspring_crossover

In [18]:
def euclidean_distance(X, Y):
    return np.sqrt(np.sum(np.power(X - Y, 2), axis=1))

In [19]:
def cluster_data(solution):
    global num_cluster, data
    feature_vector_length = data.shape[1]
    cluster_centers = []
    all_clusters_dists = []
    clusters = []
    clusters_sum_dist = []

    for clust_idx in range(num_clusters):
        cluster_centers.append(solution[feature_vector_length*clust_idx:feature_vector_length*(clust_idx+1)])
        cluster_center_dists = euclidean_distance(data, cluster_centers[clust_idx])
        all_clusters_dists.append(np.array(cluster_center_dists))

    cluster_centers = np.array(cluster_centers)
    all_clusters_dists = np.array(all_clusters_dists)

    cluster_indices = np.argmin(all_clusters_dists, axis=0)
    for clust_idx in range(num_clusters):
        clusters.append(np.where(cluster_indices == clust_idx)[0])
        if len(clusters[clust_idx]) == 0:
            clusters_sum_dist.append(0)
        else:
            clusters_sum_dist.append(np.sum(all_clusters_dists[clust_idx, clusters[clust_idx]]))

    clusters_sum_dist = np.array(clusters_sum_dist)

    return cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist

In [20]:
def cal_pop_fitness(solutions,method=False):
    fitness = []
    if not method:
        for s in solutions:
            _, _, _, _, clusters_sum_dist = cluster_data(s)

            f = 1.0 / (np.sum(clusters_sum_dist) + 1e-9)
            fitness.append(f)

    return np.array(fitness)

In [22]:
x = x.to_numpy()

In [24]:
number_of_generations = 100
sol_per_pop = 16
num_clusters = 5
num_parents_mating = 4
l = []
for i in range(sol_per_pop):
    r = np.random.randint(0,13605)
    random_solution = data[r:r+5,:]
    l.append(random_solution.flatten())
new_population = np.array(l)


num_weights = num_clusters*data.shape[1]
pop_size = (sol_per_pop,num_weights)

for num_gen in range(number_of_generations):
    print(f"generation {num_gen}")
    fitness = cal_pop_fitness(new_population)
    print("fitness:",fitness)

    parents = select_mating_pool(new_population,fitness,num_parents_mating)
    offspring_crossover = crossover(parents,
                                    offspring_size=(pop_size[0]-parents.shape[0], num_weights))

    offspring_mutation = mutation(offspring_crossover)
    new_population[0:parents.shape[0], :] = parents
    new_population[parents.shape[0]:, :] = offspring_mutation


    

generation 0
fitness: [2.91029604e-09 1.93612593e-09 2.51904732e-09 2.77335959e-09
 2.83515461e-09 2.37699507e-09 2.10582726e-09 2.94756826e-09
 4.14189804e-10 2.71361959e-09 2.69954831e-09 3.34286578e-09
 2.08152634e-09 1.63883730e-09 2.71710994e-09 1.43228449e-09]
generation 1


  offspring_crossover[idx, gene_idx] = offspring_crossover[idx, gene_idx] + random_value


fitness: [3.34286578e-09 2.94756826e-09 2.91029604e-09 2.83515461e-09
 3.20549909e-09 3.06628931e-09 3.76329272e-09 3.88596358e-09
 3.20549909e-09 3.06628931e-09 3.76329271e-09 3.88596359e-09
 3.20549909e-09 3.06628931e-09 3.76329271e-09 3.88596359e-09]
generation 2
fitness: [3.88596359e-09 3.88596359e-09 3.88596358e-09 3.76329272e-09
 3.88596359e-09 3.88596358e-09 2.83515461e-09 2.98415286e-09
 3.88596358e-09 3.88596359e-09 2.83515461e-09 2.98415286e-09
 3.88596358e-09 3.88596355e-09 2.83515461e-09 2.98415286e-09]
generation 3
fitness: [3.88596359e-09 3.88596359e-09 3.88596359e-09 3.88596359e-09
 3.88596358e-09 3.88596358e-09 3.88596359e-09 3.88596357e-09
 3.88596359e-09 3.88596357e-09 3.88596358e-09 3.88596358e-09
 3.88596358e-09 3.88596358e-09 3.88596358e-09 3.88596358e-09]
generation 4
fitness: [3.88596359e-09 3.88596359e-09 3.88596359e-09 3.88596359e-09
 3.88596357e-09 3.88596358e-09 3.88596359e-09 3.88596358e-09
 3.88596358e-09 3.88596358e-09 3.88596359e-09 3.88596359e-09
 3.8859

In [28]:
fitness = cal_pop_fitness(new_population)
best_match_idx = np.argmax(fitness)
print("Best :", fitness[best_match_idx])

Best : 3.885963591197453e-09


In [25]:
K_Means = KMeans(n_clusters=5)
K_Means.fit(x)

  super()._check_params_vs_input(X, default_n_init=10)


In [29]:
centers = K_Means.cluster_centers_.flatten().reshape(1,-1)

In [30]:
print(f"fitness for kmeans: {cal_pop_fitness(centers)[0]}")

fitness for kmeans: 1.1425137274401707e-08
