# Analyzing data

In [232]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [233]:
# Load the dataset from 'iris.csv' (path is relative to the working directory)
# and display it as the cell output.
df = pd.read_csv('iris.csv')
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [234]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [235]:
# Feature matrix as a NumPy array: every column except the last one.
# The selection is positional, so it relies on `variety` being the last column.
data = df.iloc[:, :-1].values
data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [236]:
from sklearn.decomposition import PCA

# Two-component PCA with a fixed random_state (22).
pca = PCA(n_components=2, random_state=22)

# Fit on the feature matrix and project it in a single call; `pca` is reused
# later to project cluster centres into the same 2-D space.
pca_data = pca.fit_transform(data)

In [237]:
# PCA coordinates plus the true variety of each row.
# The column is spelled 'variaty' on purpose: later cells look it up by that name.
df2 = pd.DataFrame(pca_data, columns=['PC1', 'PC2']).assign(variaty=df['variety'])
df2

Unnamed: 0,PC1,PC2,variaty
0,-2.684126,0.319397,Setosa
1,-2.714142,-0.177001,Setosa
2,-2.888991,-0.144949,Setosa
3,-2.745343,-0.318299,Setosa
4,-2.728717,0.326755,Setosa
...,...,...,...
145,1.944110,0.187532,Virginica
146,1.527167,-0.375317,Virginica
147,1.764346,0.078859,Virginica
148,1.900942,0.116628,Virginica


# Visualizing data

In [238]:
# Scatter of the two PCA components, one colour per variety.
fig = px.scatter(x=pca_data[:, 0], y=pca_data[:, 1], color=df['variety'])
# Title and axis labels so the figure stands on its own (without them the
# axes are labelled just "x" and "y"); mirrors the t-SNE figure cell.
fig.update_layout(
    title="PCA visualization of dataset",
    xaxis_title="First principal component",
    yaxis_title="Second principal component",
)
fig.update_traces(marker=dict(size=9))

fig.show()

In [239]:
from sklearn.manifold import TSNE

# Embed the feature matrix in two dimensions with t-SNE (random_state fixed at 42).
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(data)
# Cell output: the KL divergence reported by the fitted model.
tsne.kl_divergence_

0.12473531812429428

In [240]:
# Scatter of the two t-SNE coordinates, one colour per variety.
tsne_layout = {
    "title": "t-SNE visualization of dataset",
    "xaxis_title": "First t-SNE",
    "yaxis_title": "Second t-SNE",
}

fig = px.scatter(x=X_tsne[:, 0], y=X_tsne[:, 1], color=df['variety'])
fig.update_traces(marker=dict(size=8))
fig.update_layout(**tsne_layout)

fig.show()

# Clustering with a genetic algorithm

In [241]:
def euclidean_distance(X, Y):
    """Return the Euclidean distance between each row of ``X`` and ``Y``.

    ``Y`` is combined with ``X`` by ordinary NumPy broadcasting, so it can be a
    single point or an array shaped like ``X``. The squared differences are
    summed along axis 1, which means ``X - Y`` must be at least 2-D.
    """
    delta = X - Y
    squared_sum = np.power(delta, 2).sum(axis=1)
    return np.sqrt(squared_sum)

In [242]:
# Sanity check: distance from every row of `data` to its first row
# (so the first entry is 0).
d = euclidean_distance(data, data[0])
d

array([0.        , 0.53851648, 0.50990195, 0.64807407, 0.14142136,
       0.6164414 , 0.51961524, 0.17320508, 0.92195445, 0.46904158,
       0.37416574, 0.37416574, 0.59160798, 0.99498744, 0.88317609,
       1.1045361 , 0.54772256, 0.1       , 0.74161985, 0.33166248,
       0.43588989, 0.3       , 0.64807407, 0.46904158, 0.59160798,
       0.54772256, 0.31622777, 0.14142136, 0.14142136, 0.53851648,
       0.53851648, 0.38729833, 0.6244998 , 0.80622577, 0.45825757,
       0.37416574, 0.41231056, 0.24494897, 0.8660254 , 0.14142136,
       0.17320508, 1.34907376, 0.76811457, 0.45825757, 0.6164414 ,
       0.59160798, 0.36055513, 0.58309519, 0.3       , 0.2236068 ,
       4.00374824, 3.61662826, 4.16413256, 3.09354166, 3.79209705,
       3.41613817, 3.78549865, 2.34520788, 3.74966665, 2.88790582,
       2.70370117, 3.22800248, 3.14642654, 3.7       , 2.58069758,
       3.62767143, 3.43511281, 3.00998339, 3.76828874, 2.88270706,
       3.85356977, 3.0757113 , 4.04722127, 3.65786823, 3.41613

In [243]:
def euclidean_distance(X, Y):
    """Return the Euclidean distance between each row of ``X`` and ``Y``.

    NOTE(review): an identical function is already defined in an earlier cell;
    this re-definition replaces it with the same behaviour.

    ``Y`` is broadcast against ``X``; squared differences are summed along
    axis 1, so ``X - Y`` must be at least 2-D.
    """
    delta = X - Y
    squared_sum = np.power(delta, 2).sum(axis=1)
    return np.sqrt(squared_sum)

def cluster_data(solution, solution_idx):
    """Decode a chromosome into cluster centres and assign every sample to one.

    Reads the notebook-level globals ``data`` (2-D feature matrix) and
    ``num_clusters``. They are only read, so no ``global`` declaration is
    needed (the previous one also misspelled ``num_clusters``).

    Parameters
    ----------
    solution : sequence of float
        Flat chromosome: ``num_clusters`` consecutive blocks of
        ``data.shape[1]`` genes, one block per cluster centre.
    solution_idx : int
        Not used; kept so the signature matches what ``fitness_func`` passes.

    Returns
    -------
    cluster_centers : ndarray, shape (num_clusters, n_features)
    all_clusters_dists : ndarray, shape (num_clusters, n_samples)
        Distance from every sample to every centre.
    cluster_indices : ndarray, shape (n_samples,)
        Index of the nearest centre for each sample.
    clusters : list of ndarray
        For each cluster, the row indices of the samples assigned to it.
    clusters_sum_dist : ndarray, shape (num_clusters,)
        Sum of member-to-centre distances per cluster (0 for an empty cluster).

    Raises
    ------
    ValueError
        If ``solution`` holds fewer than ``num_clusters * n_features`` genes.
    """
    feature_vector_length = data.shape[1]
    required_genes = num_clusters * feature_vector_length
    if len(solution) < required_genes:
        # A short chromosome would otherwise yield ragged (or silently
        # broadcast) centres.
        raise ValueError(
            f"solution has {len(solution)} genes, expected at least {required_genes}"
        )

    # Cut the flat chromosome into one centre per cluster.
    cluster_centers = np.array([
        solution[feature_vector_length * clust_idx:feature_vector_length * (clust_idx + 1)]
        for clust_idx in range(num_clusters)
    ])

    # Row k holds the distance of every sample to centre k.
    all_clusters_dists = np.array([
        euclidean_distance(data, center) for center in cluster_centers
    ])

    # Each sample belongs to its nearest centre.
    cluster_indices = np.argmin(all_clusters_dists, axis=0)
    clusters = [
        np.where(cluster_indices == clust_idx)[0] for clust_idx in range(num_clusters)
    ]

    clusters_sum_dist = np.array([
        np.sum(all_clusters_dists[clust_idx, members]) if len(members) else 0
        for clust_idx, members in enumerate(clusters)
    ])

    return cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist

def fitness_func(instance, solution, solution_idx):
    """Fitness of a chromosome: inverse of the total within-cluster distance.

    ``instance`` is accepted but not used. ``solution`` and ``solution_idx``
    are forwarded to ``cluster_data``; only its last return value (the
    per-cluster distance sums) is needed. A small constant in the denominator
    avoids division by zero when the total distance is 0.
    """
    clusters_sum_dist = cluster_data(solution, solution_idx)[-1]
    total_distance = np.sum(clusters_sum_dist)
    return 1.0 / (total_distance + 0.00000001)

In [244]:
class GeneticAlgorithm:
    """Simple genetic algorithm that maximises ``fitness_func``.

    An initial random population is created, then each generation breeds one
    new chromosome from the two fittest chromosomes seen so far plus one random
    member of the initial population. Every chromosome ever created is kept in
    ``choromosomes`` with its score in ``fitness``.

    Parameters
    ----------
    number_of_generations : int
        Number of offspring to breed after the initial population.
    population_size : int
        Number of random chromosomes in the initial population.
    number_of_genes : int
        Length of each chromosome.
    mutation_percent : int
        Used as a 0-100 threshold both for deciding whether an offspring is
        mutated and (doubled) for resetting individual genes.
    fitness_func : callable
        Called as ``fitness_func("", chromosome, index)``; larger is better.
    genes_range : array-like, shape (number_of_genes, 2)
        ``[low, high]`` bounds used when drawing a random value for each gene.
    """

    # Gene positions that the swap mutation may exchange with one another.
    # The layout assumes 4 genes per block (positions i, i+4, i+8), which
    # matches a 12-gene chromosome; groups that do not fit the configured
    # chromosome length are ignored.
    _SWAP_GROUPS = ((0, 4, 8), (1, 5, 9), (2, 6, 10), (3, 7, 11))

    def __init__(self, number_of_generations, population_size, number_of_genes, mutation_percent, fitness_func, genes_range):
        self.number_of_generations = number_of_generations
        self.population_size = population_size
        self.number_of_genes = number_of_genes
        self.mutation_percent = mutation_percent
        self.fitness_func = fitness_func
        self.genes_range = genes_range
        # Backing field for the ``best_solution`` property; set by ``run``.
        self._best_solution = None

        total_slots = self.population_size + self.number_of_generations
        self.choromosomes = np.zeros((total_slots, self.number_of_genes))
        # Slots that have not been evaluated yet hold -inf so they can never be
        # picked as "fittest", even when the fitness function returns zero or
        # negative values. The array keeps its original shape (one spare row).
        self.fitness = np.full((total_slots + 1, 1), -np.inf)

    @property
    def best_solution(self):
        """Chromosome with the highest fitness after ``run``; ``None`` before."""
        return self._best_solution

    @best_solution.setter
    def best_solution(self, value):
        self._best_solution = value

    def population(self):
        """Fill the first ``population_size`` slots with random chromosomes and score them."""
        for i in range(self.population_size):
            new_choromosome = []
            for j in range(self.number_of_genes):
                new_choromosome.append(np.random.uniform(self.genes_range[j][0], self.genes_range[j][1]))
            self.choromosomes[i] = new_choromosome
            self.fitness[i] = self.fitness_func("", self.choromosomes[i], i)

    def selection(self):
        """Return the fittest, the second fittest and one random initial chromosome."""
        scores = self.fitness.ravel()
        fittest_index = int(np.argmax(scores))

        # Mask the winner instead of deleting it: deleting shifts every later
        # index down by one, which made the runner-up lookup point at the
        # wrong chromosome.
        remaining = scores.copy()
        remaining[fittest_index] = -np.inf
        second_fittest_index = int(np.argmax(remaining))

        parent_1 = self.choromosomes[fittest_index]
        parent_2 = self.choromosomes[second_fittest_index]

        # The third parent is drawn at random from the initial population.
        parent_3 = self.choromosomes[np.random.randint(0, self.population_size)]

        return parent_1, parent_2, parent_3

    def crossover(self, parent_1, parent_2, parent_3):
        """Build a new chromosome (a list) by picking each gene from one parent.

        For every gene a number in 0..99 is drawn: values up to 50 take the
        gene from ``parent_1``, values up to 88 from ``parent_2``, and the rest
        from ``parent_3``.
        """
        new_choromosome = []
        for i in range(self.number_of_genes):
            chance = np.random.randint(0, 100)
            if chance <= 50:
                new_choromosome.append(parent_1[i])
            elif chance <= 88:
                new_choromosome.append(parent_2[i])
            else:
                new_choromosome.append(parent_3[i])

        return new_choromosome

    def mutation(self, new_choromosome):
        """Mutate ``new_choromosome`` in place and return it.

        One of two mutations is applied:

        * gene reset (draw <= 50): every gene is independently replaced by a
          fresh random value from its range when a 0..99 draw is at most
          ``2 * mutation_percent``;
        * swap (otherwise): two positions of one randomly chosen group in
          ``_SWAP_GROUPS`` exchange their values.
        """
        if np.random.randint(0, 100) <= 50:
            for i in range(self.number_of_genes):
                if np.random.randint(0, 100) <= 2 * self.mutation_percent:
                    new_choromosome[i] = np.random.uniform(self.genes_range[i][0], self.genes_range[i][1])
            return new_choromosome

        # Choose the swap group with its own draw. Re-using the draw above
        # (already known to be > 50) meant the first two groups could never
        # be selected.
        usable_groups = [group for group in self._SWAP_GROUPS if max(group) < self.number_of_genes]
        if usable_groups:
            group = usable_groups[np.random.randint(0, len(usable_groups))]
            first, second = np.random.choice(group, 2, replace=False)
            new_choromosome[first], new_choromosome[second] = new_choromosome[second], new_choromosome[first]

        return new_choromosome

    def run(self):
        """Create the initial population, breed all generations, store the best chromosome."""
        self.population()
        for i in range(self.population_size, self.population_size + self.number_of_generations):
            parent_1, parent_2, parent_3 = self.selection()
            new_choromosome = self.crossover(parent_1, parent_2, parent_3)
            if np.random.randint(0, 100) <= self.mutation_percent:
                new_choromosome = self.mutation(new_choromosome)
            new_choromosome_fitness = self.fitness_func("", new_choromosome, i)
            self.choromosomes[i] = new_choromosome
            self.fitness[i] = new_choromosome_fitness
        self.best_solution = self.choromosomes[np.argmax(self.fitness)]

In [245]:
num_clusters = 3

# [min, max] of each feature column; these bound the random gene values.
sepal_length_range, sepal_width_range, petal_length_range, petal_width_range = (
    [df[column].min(), df[column].max()]
    for column in ('sepal.length', 'sepal.width', 'petal.length', 'petal.width')
)

# One block of four feature ranges per cluster centre (3 x 4 = 12 genes).
per_centre_ranges = [sepal_length_range, sepal_width_range, petal_length_range, petal_width_range]

test = GeneticAlgorithm(
    number_of_generations=1000,
    population_size=30,
    number_of_genes=12,
    mutation_percent=30,
    fitness_func=fitness_func,
    genes_range=np.array(per_centre_ranges * num_clusters),
)

In [246]:
# Build the initial population, breed all generations, and store the fittest
# chromosome on `test.best_solution`.
test.run()

In [247]:
# Decode the best chromosome into centres and per-sample assignments.
# The second argument (solution_idx) is not used by cluster_data.
cluster_centers, all_clusters_dists, cluster_indices, clusters, clusters_sum_dist = cluster_data(test.best_solution, 0)

In [248]:
# Cluster centres decoded from the best chromosome, one row per cluster.
cluster_centers

array([[5.35699128, 3.4580663 , 1.12632932, 0.15573072],
       [6.00444893, 2.93729475, 4.63989111, 1.1571657 ],
       [6.13787546, 2.93606695, 4.99876857, 2.09165913]])

In [249]:
# Project the centres with the already-fitted PCA model so they can be drawn
# on the same PC1/PC2 axes as the samples.
cluster_centers_pca = pca.transform(cluster_centers)

In [250]:
# PCA coordinates and true variety, plus the cluster index chosen by the
# genetic algorithm. `.assign` returns a new frame, which avoids pandas'
# SettingWithCopyWarning that a later `df3['cluster'] = ...` on this
# column-subset of df2 would trigger.
df3 = df2[['PC1', 'PC2', 'variaty']].assign(cluster=cluster_indices)
df3

Unnamed: 0,PC1,PC2,variaty,cluster
0,-2.684126,0.319397,Setosa,0
1,-2.714142,-0.177001,Setosa,0
2,-2.888991,-0.144949,Setosa,0
3,-2.745343,-0.318299,Setosa,0
4,-2.728717,0.326755,Setosa,0
...,...,...,...,...
145,1.944110,0.187532,Virginica,2
146,1.527167,-0.375317,Virginica,2
147,1.764346,0.078859,Virginica,2
148,1.900942,0.116628,Virginica,2


In [251]:
# Map every cluster index to the variety that occurs most often among its members.
genetic_clusters = np.unique(cluster_indices)

dic = {}
for cluster_idx in genetic_clusters:
    # True varieties of the samples assigned to this cluster
    members = df3[df3['cluster'] == cluster_idx]['variaty'].values

    # Distinct varieties (sorted) with their frequencies; on a tie the first
    # one in sorted order wins because argmax returns the first maximum
    names, counts = np.unique(members, return_counts=True)
    dic[cluster_idx] = names[counts.argmax()]

dic

{0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'}

In [252]:
# Translate cluster indices into variety names. Rebinding df3 to a new frame
# (instead of adding the column in place) avoids SettingWithCopyWarning and
# keeps the cell safe to re-run.
df3 = df3.assign(labels=df3['cluster'].map(dic))
df3

Unnamed: 0,PC1,PC2,variaty,cluster,labels
0,-2.684126,0.319397,Setosa,0,Setosa
1,-2.714142,-0.177001,Setosa,0,Setosa
2,-2.888991,-0.144949,Setosa,0,Setosa
3,-2.745343,-0.318299,Setosa,0,Setosa
4,-2.728717,0.326755,Setosa,0,Setosa
...,...,...,...,...,...
145,1.944110,0.187532,Virginica,2,Virginica
146,1.527167,-0.375317,Virginica,2,Virginica
147,1.764346,0.078859,Virginica,2,Virginica
148,1.900942,0.116628,Virginica,2,Virginica


In [253]:
# Samples in PCA space, coloured by the cluster index from the genetic algorithm;
# the true variety is available on hover.
fig = px.scatter(df3, x='PC1', y='PC2', color=cluster_indices, title='PCA visualization of genetic clustring', hover_data=['variaty'])
fig.update_traces(marker=dict(size=12))

# Cluster centres (projected into PCA space) as green crosses. They are added
# after update_traces above — presumably so the size-12 marker setting is not
# applied to them; keep this order.
fig.add_scatter(x=cluster_centers_pca[:, 0], y=cluster_centers_pca[:, 1], mode='markers', marker=dict( symbol='cross',size=10, color='green'))


fig.show()

In [254]:
def report_clustring(dataframe, varieties=('Setosa', 'Versicolor', 'Virginica')):
    """Print overall and per-variety accuracy of a clustering.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``'variaty'`` (true variety) and ``'labels'``
        (variety assigned to the row's cluster). Rows are compared by
        position, so any index is accepted.
    varieties : sequence of str, optional
        Varieties to report individually, in print order.

    Returns
    -------
    None
        The report is written to standard output. For an empty frame the
        accuracy is printed as ``nan`` instead of raising ZeroDivisionError.
    """
    # Positional arrays: label-based lookups such as dataframe['labels'][i]
    # fail on frames whose index is not 0..n-1 (e.g. after filtering).
    actual = np.asarray(dataframe['variaty'], dtype=object)
    predicted = np.asarray(dataframe['labels'], dtype=object)
    matches = actual == predicted

    total = len(dataframe)
    correct = int(matches.sum())
    accuracy = correct / total if total else float('nan')

    print('Total accuracy: ', accuracy)
    print('------------------------------------')
    print('Count of correct predictions: ', correct)
    print('Count of wrong predictions: ', total - correct)

    for variety in varieties:
        # Rows whose true variety is `variety`, split by whether the label matched.
        of_variety = actual == variety
        print('------------------------------------')
        print(f'{variety} correct: ', int((of_variety & matches).sum()))
        print(f'{variety} wrong: ', int((of_variety & ~matches).sum()))

In [255]:
# Accuracy report for the genetic-algorithm clustering.
report_clustring(df3)

Total accuracy:  0.96
------------------------------------
Count of correct predictions:  144
Count of wrong predictions:  6
------------------------------------
Setosa correct:  50
Setosa wrong:  0
------------------------------------
Versicolor correct:  46
Versicolor wrong:  4
------------------------------------
Virginica correct:  48
Virginica wrong:  2


# Clustering with K-Means

In [256]:
from sklearn.cluster import KMeans

# Fit a 3-cluster K-Means model on the feature matrix (random_state fixed at 42).
# NOTE(review): this rebinds the name `KMeans` from the imported class to the
# estimator instance. Later cells read `KMeans.labels_` and
# `KMeans.cluster_centers_`, so renaming it needs a coordinated change.
KMeans = KMeans(n_clusters=3, random_state=42)
KMeans.fit(data)





In [257]:
# Centres of the fitted K-Means model (`KMeans` is the fitted instance here, not the class).
KMeans.cluster_centers_

array([[5.9016129 , 2.7483871 , 4.39354839, 1.43387097],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [6.85      , 3.07368421, 5.74210526, 2.07105263]])

In [258]:
# Centres found by the genetic algorithm, shown again — presumably for
# comparison with the K-Means centres in the previous cell.
cluster_centers

array([[5.35699128, 3.4580663 , 1.12632932, 0.15573072],
       [6.00444893, 2.93729475, 4.63989111, 1.1571657 ],
       [6.13787546, 2.93606695, 4.99876857, 2.09165913]])

In [259]:
# PCA coordinates and true variety, plus the K-Means cluster index.
# `.assign` returns a new frame, which avoids pandas' SettingWithCopyWarning
# that `df4['cluster'] = ...` on this column-subset of df2 would trigger.
df4 = df2[['PC1', 'PC2', 'variaty']].assign(cluster=KMeans.labels_)
df4

Unnamed: 0,PC1,PC2,variaty,cluster
0,-2.684126,0.319397,Setosa,1
1,-2.714142,-0.177001,Setosa,1
2,-2.888991,-0.144949,Setosa,1
3,-2.745343,-0.318299,Setosa,1
4,-2.728717,0.326755,Setosa,1
...,...,...,...,...
145,1.944110,0.187532,Virginica,2
146,1.527167,-0.375317,Virginica,0
147,1.764346,0.078859,Virginica,2
148,1.900942,0.116628,Virginica,2


In [260]:
# Map every K-Means cluster index to the variety that occurs most often in it.
kmeans_clusters = np.unique(KMeans.labels_)

dic2 = {}
for i in kmeans_clusters:
    # True varieties of the samples K-Means placed in cluster i
    members = df['variety'][KMeans.labels_ == i]

    # Distinct varieties (sorted) with their frequencies; argmax returns the
    # first maximum, so ties go to the first variety in sorted order
    names, frequencies = np.unique(members, return_counts=True)
    dic2[i] = names[np.argmax(frequencies)]

dic2

{0: 'Versicolor', 1: 'Setosa', 2: 'Virginica'}

In [261]:
# Translate K-Means cluster indices into variety names. Rebinding df4 to a new
# frame (instead of adding the column in place) avoids SettingWithCopyWarning
# and keeps the cell safe to re-run.
df4 = df4.assign(labels=df4['cluster'].map(dic2))
df4

Unnamed: 0,PC1,PC2,variaty,cluster,labels
0,-2.684126,0.319397,Setosa,1,Setosa
1,-2.714142,-0.177001,Setosa,1,Setosa
2,-2.888991,-0.144949,Setosa,1,Setosa
3,-2.745343,-0.318299,Setosa,1,Setosa
4,-2.728717,0.326755,Setosa,1,Setosa
...,...,...,...,...,...
145,1.944110,0.187532,Virginica,2,Virginica
146,1.527167,-0.375317,Virginica,0,Versicolor
147,1.764346,0.078859,Virginica,2,Virginica
148,1.900942,0.116628,Virginica,2,Virginica


In [262]:
# Project the K-Means centres with the already-fitted PCA model for plotting.
kmeans_center_pca = pca.transform(KMeans.cluster_centers_)

In [263]:
# Samples in PCA space, coloured by K-Means cluster index; the true variety is
# available on hover.
fig = px.scatter(df4, x='PC1', y='PC2', color=KMeans.labels_, title='PCA visualization of kmeans clustring', hover_data=['variaty'])
fig.update_traces(marker=dict(size=12))

# K-Means centres (projected into PCA space) as green crosses. They are added
# after update_traces above — presumably so the size-12 marker setting is not
# applied to them; keep this order.
fig.add_scatter(x=kmeans_center_pca[:, 0], y=kmeans_center_pca[:, 1], mode='markers', marker=dict( symbol='cross',size=10, color='green'))


fig.show()

In [264]:
# Accuracy report for the K-Means clustering.
report_clustring(df4)

Total accuracy:  0.8933333333333333
------------------------------------
Count of correct predictions:  134
Count of wrong predictions:  16
------------------------------------
Setosa correct:  50
Setosa wrong:  0
------------------------------------
Versicolor correct:  48
Versicolor wrong:  2
------------------------------------
Virginica correct:  36
Virginica wrong:  14
