# Computação Natural - Tarefa Computacional 1

## Alexandre Rosseto Lemos
### PPGI (Mestrado)

# Inicialização

## Bibliotecas

In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import differential_evolution
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from numpy import argsort
import random
from numpy.random import randn
from numpy.random import rand
import warnings
warnings.filterwarnings("ignore")

# Datasets
from sklearn.datasets import load_breast_cancer, load_wine, load_iris

pd.options.display.max_columns = 50

## Funções e classes

### data_to_df

In [2]:
def data_to_df(data):
    '''
    Info:
        Essa função lê os dados e transforma eles em um dataframe.
    ----------
    Input:
        data: Dados obtidos através da bibliteca sklearn.datasets.
    ----------
    Output:
        df: Dataframe com os dados
    '''
    # Obtendo as features e target
    feat = data.data
    target = data.target
    target = target.reshape(len(target), 1)
    
    # Concatenando as informacoes
    info = np.hstack((feat, target))
    
    # Obtendo os nomes das features e adiciona o nome da coluna target
    feat_name = list(data.feature_names)
    feat_name.append('target')
    
    # Criando o dataframe
    df = pd.DataFrame(data = info, columns = feat_name)
    
    display(df.head())
    print('Shape:',df.shape)
    
    return df

### data_prep

In [3]:
def data_prep(df):
    '''
    Info:
        Funcao que prepara os dados para serem utilizados pelos modelos
    ----------
    Input:
        df: DataFrame utilizado
    ----------
    Output:
        X_train_norm: Dados de treinamento normalizados
        y_train: Labels de treinamento
        X_test_norm: Dados de teste normalizaods
        y_test: Labels de teste
    '''
    # Separando as features dos labels
    X = df.drop(columns = 'target')
    y = df['target']
    
    columns_data = X.columns

    # Separando os dados em treino e teste
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 0)

    # Normalizando utilizando os dados de treino
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_norm = scaler.transform(X_train)
    X_test_norm = scaler.transform(X_test)
    
    df_x_train_norm = pd.DataFrame(data = X_train_norm, columns = columns_data)
    df_x_test_norm = pd.DataFrame(data = X_test_norm, columns = columns_data)
    
    return df_x_train_norm, y_train, df_x_test_norm, y_test

### calcula_sse

In [4]:
def calcula_sse(dados, centroides):
    '''
    Info:
        Funcao que calcula o SSE do conjunto de dados passado.
    ----------
    Input:
        dados: Conjunto de dados analisado
        centroides: Array com os centroides de cada cluster do conjunto
    ----------
    sse: Soma do erro quadratico dos dados
    '''
    # SSE total dos dados
    sse = 0
    
    # Erro de cada cluster
    err_cluster = 0
    for i in range(centroides.shape[0]):
        
        # Obtendo os dados do cluster
        dados_cluster = dados[dados['cluster_calculado'] == i]
        
        # Remove a coluna do cluster
        dados_cluster.drop(columns = ['cluster_calculado'], inplace = True)
        
        # Obtendo o erro de cada amostra ao centroide do seu cluster
        for ind in dados_cluster.index:
            
            sample = dados_cluster.iloc[i]
            
            # Calcula a distancia da amostra ao centroide
            dist = (np.linalg.norm(sample - centroides[i]) ** 2)
            
            err_cluster = err_cluster + dist
            
        sse = sse + err_cluster
        
    return sse

### KMeans_DE

In [5]:
class KMeans_DE:
    
    def __init__(self, cross_rate, fator_mutacao, num_cluster, X_train, X_test, y_train, y_test, max_iter, verbose = False):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.populacao = []
        self.mut_populacao = []
        self.teste_populacao = []
        self.max_iter = max_iter
        self.num_k = num_cluster
        self.current_fitness = []
        self.fator_mutacao = fator_mutacao
        self.cross_rate = cross_rate
        self.melhor_individuo_geracao = None
        self.melhor_fitness_geracao = None
        self.melhor_individuo_total = None
        self.melhor_fitness_total = 1000000
        self.hist = []
        self.verbose = verbose
        
    def inicializa_populacao(self):
        '''
        Info:
            Funcao que inicializa uma quantidade de vetores onde cada vetor representa uma combinacao de centroides
        ----------
        Input:
            None
        ----------
        Output:
            None
        '''
        dim_data = self.X_train.shape[1]
        
        # Obtem os limites maximos e minimos do conjunto de dados
        max_lim = self.X_train.max().values
        min_lim = self.X_train.min().values
        
        # Gerando a populacoa
        for p in range(10 * dim_data):
        
            clusters = []
            for c in range(self.num_k):
                # Gera um valor aleatório dentro do limite para cada dimensao
                vet = []
                for i in range(dim_data):
                    rand_value = random.uniform(min_lim[i], max_lim[i])

                    # Salva o valor do individuo
                    vet.append(rand_value)
                    
                clusters.append(vet)

            # Salva o vetor na populacao
            np_cluster = np.array(clusters).ravel()
            self.populacao.append(np_cluster)

    def gera_vet_mutante(self):
        '''
        Info:
            Funcao que calcular os vetores mutantes para cada elemento da populacao
        ----------
        Input:
            None
        ----------
        Output:
            None
        '''
        # Cria um vetor mutante para cada vetor alvo
        for vetor in self.populacao:
            
            mut_vet = []
            for x_r1 in vetor:
                
                # Obtendo dois elementos do vetor para calcular o vetor mutante
                vet_aux = vetor.copy()
                vet_aux = list(vet_aux)
                vet_aux.remove(x_r1)
                x_r2 = vet_aux[random.randint(0, len(vet_aux) - 1)]
                vet_aux.remove(x_r2)
                x_r3 = vet_aux[random.randint(0, len(vet_aux) - 1)]
                
                # Calcula o valor para o vetor mutante
                v_i = (x_r1) + (self.fator_mutacao*(x_r2 - x_r1))
                
                mut_vet.append(v_i)
                
            np_vet_mut = np.array(mut_vet)
            self.mut_populacao.append(np_vet_mut)
            self.current_fitness.append(1000000)
            
    def crossover(self):
        '''
        Info:
            Funcao que realiza a geracao dos vetores de teste atraves do crossover
        ----------
        Input:
            None
        ----------
        Output:
            None
        '''
        # Gerando os vetores de teste
        for ind in range(len(self.populacao)):
            t_vet = []
            
            # Seleciona um indice do vetor aleatoriamente
            rand_ind = random.randint(0, len(self.populacao[ind]) - 1)
            for j in range(len(self.populacao[ind])):
                
                if((rand() <= self.cross_rate) | (j == rand_ind)):
                    t_vet.append(self.mut_populacao[ind][j])
                else:
                    t_vet.append(self.populacao[ind][j])
                    
            # Salva o vetor de teste gerado
            self.teste_populacao.append(t_vet)
            
    def fitness(self, vetor):
        '''
        Info:
            Funcao que calcula o SSE de um vetor
        ----------
        Input:
            vetor: Vetor a ser avaliado
        ----------
        Output:
            sse: Calculo da soma do erro quadratico
        '''
        # Alterando a forma do vetor
        centr = np.reshape(vetor, (self.num_k, self.X_train.shape[1]))
        
        # Inicializa o modelo (como max_iter = 1, os pesos nao serao alterados quando executado o fit)
        kmeans = KMeans(n_clusters = self.num_k, random_state = 1234, init = centr, max_iter = 1)
        kmeans.fit(self.X_train, self.y_train)
        
        # Atribuindo os clusters
        y_pred_ = kmeans.predict(self.X_train)
        y_pred = pd.DataFrame(data = y_pred_, columns = ['cluster_calculado'])
        X_train_pred = pd.concat([self.X_train, y_pred], axis = 1)

        # SSE total dos dados
        sse = 0

        # Erro de cada cluster
        err_cluster = 0
        for i in range(centr.shape[0]):

            # Obtendo os dados do cluster
            dados_cluster = X_train_pred[X_train_pred['cluster_calculado'] == i]

            # Remove a coluna do cluster
            dados_cluster.drop(columns = ['cluster_calculado'], inplace = True)

            # Obtendo o erro de cada amostra ao centroide do seu cluster
            for ind in dados_cluster.index:

                sample = dados_cluster.loc[ind]

                # Calcula a distancia da amostra ao centroide
                dist = (np.linalg.norm(sample - centr[i]) ** 2)

                err_cluster = err_cluster + dist

            sse = sse + err_cluster
            
        del X_train_pred

        return sse

    def selec_vet(self):
        '''
        Info:
            Funcao que determina qual vetor segue para a proxima geracao
        ----------
        Input:
            None
        ----------
        Output:
            None
        '''
        # Testando se os vetores de teste sao melhores que os originais
        for i in range(len(self.populacao)):
            
            # Selecionando os vetores a serem testados
            vet_orig = self.populacao[i]
            vet_teste = self.teste_populacao[i]
            
            sse_orig = self.fitness(vet_orig)
            sse_teste = self.fitness(vet_teste)
            
            # Avalia qual vetor obteve o menor SSE e substitui ele na populacao
            if sse_orig < sse_teste:
                self.populacao[i] = vet_orig
                
                # Salva o SSE do vetor
                self.current_fitness[i] = sse_orig
            else:
                self.populacao[i] = vet_teste
        
                # Salva o SSE do vetor
                self.current_fitness[i] = sse_teste
         
    def run(self):
        '''
        Info:
            Funcao de execucao do modelo KMEANS-DE
        ----------
        Input:
            None
        ----------
        Output:
            None
        '''
        # Flag utilizada para determinar quando o algoritmo convergiu (quando o melhor fitness nao muda em 10 iteracoes)
        flag_converge = 0
        
        if self.verbose:        
            print('Inicializando a populacao')
        self.inicializa_populacao()
                
        for i in range(self.max_iter):
            if self.verbose:
                print(f'###-----Execucao {i+1}-----###')
                print('Gerando os vetores mutantes')
            self.gera_vet_mutante()
                
            if self.verbose:
                print('Realizando o crossover')
            self.crossover()
            
            if self.verbose:
                print('Realizando a seleção dos vetores')
            self.selec_vet()
            
            if self.verbose:
                print('Obtendo o melhor vetor da populacao')
            
            # Encontrando qual a menor fitness da populacao
            melhor_sse = min(self.current_fitness)
            self.melhor_fitness_geracao = melhor_sse
            
            # Encontrando qual vetor resultou na menor fitness da populacao
            indice_melhor_vetor = self.current_fitness.index(melhor_sse)
            self.melhor_individuo_geracao = self.populacao[indice_melhor_vetor]
            
            # Armazenando a melhor fitness de cada geracao
            self.hist.append(self.melhor_fitness_geracao)
            
            # Verificando se a melhor fitness da geracao supera a melhor fitness geral
            if self.melhor_fitness_geracao < self.melhor_fitness_total:
                self.melhor_fitness_total = self.melhor_fitness_geracao
                self.melhor_individuo_total = np.reshape(self.melhor_individuo_geracao, (self.num_k, self.X_train.shape[1]))
            
            # Verifica convergencia a partir da segunda execucao
            if i > 0:
                if self.melhor_fitness_geracao == self.hist[-2]:
                    flag_converge = flag_converge + 1
                    
                if flag_converge == 10:
                    self.n_iter_converg = i - 9
                    
            self.mut_populacao = []
            self.teste_populacao = []
                
        if self.verbose:
            print('Fim do Algoritmo')

### KMeans_ES

In [6]:
class KMeans_ES:
    
    def __init__(self, num_cluster, lambda_value, mu, step_size, X_train, X_test, y_train, y_test, max_iter, verbose = False):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.lambda_value = lambda_value
        self.mu = mu
        self.num_filhos = int(lambda_value/mu)
        self.populacao = []
        self.pais = []
        self.filhos = []
        self.step_size = step_size
        self.max_iter = max_iter
        self.num_k = num_cluster
        self.current_fitness = []
        self.melhor_individuo_geracao = None
        self.melhor_fitness_geracao = None
        self.melhor_individuo_total = None
        self.melhor_fitness_total = 1000000
        self.hist = []
        self.verbose = verbose
        
    def inicializa_populacao(self):
        '''
        Info:
            Funcao que inicializa uma quantidade de individuos onde cada individuo representa uma combinacao de centroides.
            Realiza-se o calculo do SSE para cada individuo
        ----------
        Input:
            None
        ----------
        Output:
            None
        '''
        dim_data = self.X_train.shape[1]

        # Obtem os limites maximos e minimos do conjunto de dados
        self.max_lim = self.X_train.max().values
        self.min_lim = self.X_train.min().values

        # Gerando a populacoa
        for p in range(self.lambda_value):

            clusters = []
            for c in range(self.num_k):
                # Gera um valor aleatório dentro do limite para cada dimensao
                vet = []
                for i in range(dim_data):
                    rand_value = random.uniform(self.min_lim[i], self.max_lim[i])

                    # Salva o valor do individuo
                    vet.append(rand_value)

                clusters.append(vet)

            # Salva o individuo na populacao
            np_cluster = np.array(clusters).ravel()
            self.populacao.append(np_cluster)
            
            # Calcula a fitness do individuo e salva na lista de fitness
            self.current_fitness.append(self.fitness(np_cluster))
            
    def fitness(self, individuo):
        '''
        Info:
            Funcao que calcula o SSE de um individuo
        ----------
        Input:
            individuo: Individuo a ser avaliado
        ----------
        Output:
            sse: Calculo da soma do erro quadratico
        '''
        # Alterando a forma do individuo
        centr = np.reshape(individuo, (self.num_k, self.X_train.shape[1]))
        
        # Inicializa o modelo (como max_iter = 1, os pesos nao serao alterados quando executado o fit)
        kmeans = KMeans(n_clusters = self.num_k, random_state = 1234, init = centr, max_iter = 1)
        kmeans.fit(self.X_train, self.y_train)
        
        # Atribuindo os clusters
        y_pred_ = kmeans.predict(self.X_train)
        y_pred = pd.DataFrame(data = y_pred_, columns = ['cluster_calculado'])
        X_train_pred = pd.concat([self.X_train, y_pred], axis = 1)

        # SSE total dos dados
        sse = 0

        # Erro de cada cluster
        err_cluster = 0
        for i in range(centr.shape[0]):

            # Obtendo os dados do cluster
            dados_cluster = X_train_pred[X_train_pred['cluster_calculado'] == i]

            # Remove a coluna do cluster
            dados_cluster.drop(columns = ['cluster_calculado'], inplace = True)

            # Obtendo o erro de cada amostra ao centroide do seu cluster
            for ind in dados_cluster.index:

                sample = dados_cluster.loc[ind]

                # Calcula a distancia da amostra ao centroide
                dist = (np.linalg.norm(sample - centr[i]) ** 2)

                err_cluster = err_cluster + dist

            sse = sse + err_cluster
            
        del X_train_pred

        return sse
    
    def selec_mu_pais(self):
        '''
        Info:
            Funcao que seleciona os mu melhores pais
        ----------
        Input:
            None
        ----------
        Output:
            None
        '''
        # Salvando os dados em dataframes
        sse_df = pd.DataFrame(data = self.current_fitness, columns = ['SSE'])
        pop_df = pd.DataFrame(data = self.populacao)
        
        # Ordenando pelo SSE e obtendo os indices dos 30 individuos que possuem os menores SSE
        sse_ord_ind = list(sse_df.sort_values(by = 'SSE').head(self.mu).index)
        
        # Obtendo os 30 melhores pais
        for ind_p in sse_ord_ind:
            self.pais.append(pop_df.iloc[ind_p].values)
            
        # Obtendo a melhor solucao da geracao
        self.melhor_fitness_geracao = sse_df.iloc[sse_ord_ind[0]].values[0]
        self.melhor_individuo_geracao = pop_df.iloc[sse_ord_ind[0]].values
        
        # Verifica se a melhor solucao da geracao supera a melhor solucao geral
        if self.melhor_fitness_geracao < self.melhor_fitness_total:
            self.melhor_fitness_total = self.melhor_fitness_geracao
            self.melhor_individuo_total = np.reshape(self.melhor_individuo_geracao, (self.num_k, self.X_train.shape[1]))
        
    def gera_filhos(self):
        '''
        Info:
            Funcao que gera lambda/mu filhos por pai
        ----------
        Input:
            None
        ----------
        Output:
            None
        '''
        # Cada pai vai gerar lambda/mu filhos
        for pai in self.pais:
            
            for n in range(self.num_filhos):
                
                # Calcula o filho e salva na lista
                filho = (pai + (randn(self.X_train.shape[1]*self.num_k) * self.step_size))
                self.filhos.append(filho)
                
    def run(self):
        '''
         Info:
            Funcao de execucao do modelo KMEANS-ES
        ----------
        Input:
            None
        ----------
        Output:
            None
        '''
        # Flag utilizada para determinar quando o algoritmo convergiu (quando o melhor fitness nao muda em 10 iteracoes)
        flag_converge = 0
        
        if self.verbose:        
            print('Inicializando a populacao')
        self.inicializa_populacao()
                
        for i in range(self.max_iter):
            if self.verbose:
                print(f'###-----Execucao {i+1}-----###')
                print('Selecionandos os melhores pais')
            self.selec_mu_pais()
                
            if self.verbose:
                print('Gerando os filhos')
            self.gera_filhos()
            
            # Substituindo a populacao pelos filhos
            self.populacao = self.filhos
            
            # Reiniciando as listas de pais e filhos
            self.pais = []
            self.filhos = []
                        
            # Armazenando a melhor fitness de cada geracao
            self.hist.append(self.melhor_fitness_geracao)
                        
            # Verifica convergencia a partir da segunda execucao
            if i > 0:
                if self.melhor_fitness_geracao == self.hist[-2]:
                    flag_converge = flag_converge + 1
                    
                if flag_converge == 10:
                    self.n_iter_converg = i - 9
                
        if self.verbose:
            print('Fim do Algoritmo')

## Carregando os dados

In [7]:
# Obtendo os dados de breast cancer
data_wdbc = load_breast_cancer()
df_wdbd = data_to_df(data_wdbc)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


Shape: (569, 31)


In [8]:
# Obtendo os dados de iris
data_iris = load_iris()
df_iris = data_to_df(data_iris)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


Shape: (150, 5)


In [9]:
# Obtendo os dados de wine
data_wine = load_wine()
df_wine = data_to_df(data_wine)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0


Shape: (178, 14)


In [10]:
# Para o dataset wine
X_train_wine, y_train_wine, X_test_wine, y_test_wine = data_prep(df_wine)

# Para o dataset wdbd
X_train_wdbd, y_train_wdbd, X_test_wdbd, y_test_wdbd = data_prep(df_wdbd)

# Para o dataset iris
X_train_iris, y_train_iris, X_test_iris, y_test_iris = data_prep(df_iris)

#  K-means

Os valores de K para cada dataset será igual à quantidade de classes nas colunas target:

- Wine: k = 3
- Wdbd: k = 2
- Iris: k = 3

In [11]:
# Definindo o numero de clusters a serem formados
k_wdbd = 2
k_wine = 3
k_iris = 3

## Para o dataset wine

In [12]:
# Rodando o algoritmo 5 vezes
kmeans_wine_sse = []
acc_wine = []
for loop in range(5):
    print(f"Executando loop: {loop+1}")
    
    # Inicializa o modelo
    kmeans_wine = KMeans(n_clusters = k_wine, max_iter = 50, n_init = 1, init = 'random')

    # Treina o modelo
    kmeans_wine.fit(X_train_wine, y_train_wine)

    # Obtem a predicao do modelo
    y_pred_wine = kmeans_wine.predict(X_train_wine)
    y_pred = pd.DataFrame(data = y_pred_wine, columns = ['cluster_calculado'])
    
    # Calculando a acuracia
    y_pred_test_wine = kmeans_wine.predict(X_test_wine)
    acc_wine.append(accuracy_score(y_test_wine, y_pred_test_wine))
    
    X_train_wine_pred = pd.concat([X_train_wine, y_pred], axis = 1)

    # Obtendo os clusters gerados
    clusters_wine = kmeans_wine.cluster_centers_
    
    # Calculando o SSE do modelo
    kmeans_wine_sse.append(calcula_sse(X_train_wine_pred, clusters_wine))

Executando loop: 1
Executando loop: 2
Executando loop: 3
Executando loop: 4
Executando loop: 5


## Para o dataset breast cancer

In [13]:
# Rodando o algoritmo 5 vezes
kmeans_wdbd_sse = []
acc_wdbd = []
for loop in range(5):
    print(f"Executando loop: {loop+1}")
    
    # Inicializa o modelo
    kmeans_wdbd = KMeans(n_clusters = k_wdbd, max_iter = 50, n_init = 1, init = 'random')

    # Treina o modelo
    kmeans_wdbd.fit(X_train_wdbd, y_train_wdbd)

    # Obtem a predicao do modelo
    y_pred_wdbd = kmeans_wdbd.predict(X_train_wdbd)
    y_pred = pd.DataFrame(data = y_pred_wdbd, columns = ['cluster_calculado'])
    
    # Calculando a acuracia
    y_pred_test_wdbd = kmeans_wdbd.predict(X_test_wdbd)
    acc_wdbd.append(accuracy_score(y_test_wdbd, y_pred_test_wdbd))
    
    X_train_wdbd_pred = pd.concat([X_train_wdbd, y_pred], axis = 1)

    # Obtendo os clusters gerados
    clusters_wdbd = kmeans_wdbd.cluster_centers_
    
    # Calculando o SSE do modelo
    kmeans_wdbd_sse.append(calcula_sse(X_train_wdbd_pred, clusters_wdbd))

Executando loop: 1
Executando loop: 2
Executando loop: 3
Executando loop: 4
Executando loop: 5


## Para o dataset iris

In [14]:
# Rodando o algoritmo 5 vezes
kmeans_iris_sse = []
acc_iris = []
for loop in range(5):
    print(f"Executando loop: {loop+1}")
    
    # Inicializa o modelo
    kmeans_iris = KMeans(n_clusters = k_iris, max_iter = 50, n_init = 1, init = 'random')

    # Treina o modelo
    kmeans_iris.fit(X_train_iris, y_train_iris)

    # Obtem a predicao do modelo
    y_pred_iris = kmeans_iris.predict(X_train_iris)
    y_pred = pd.DataFrame(data = y_pred_iris, columns = ['cluster_calculado'])
    
    # Calculando a acuracia
    y_pred_test_iris = kmeans_iris.predict(X_test_iris)
    acc_iris.append(accuracy_score(y_test_iris, y_pred_test_iris))
    
    X_train_iris_pred = pd.concat([X_train_iris, y_pred], axis = 1)

    # Obtendo os clusters gerados
    clusters_iris = kmeans_iris.cluster_centers_
    
    # Calculando o SSE do modelo
    kmeans_iris_sse.append(calcula_sse(X_train_iris_pred, clusters_iris))

Executando loop: 1
Executando loop: 2
Executando loop: 3
Executando loop: 4
Executando loop: 5


## Resultados

In [15]:
# Resultados de cada loop
df_kmeans = pd.DataFrame(data = {'Wine': kmeans_wine_sse, 'Breast cancer': kmeans_wdbd_sse, 'Iris': kmeans_iris_sse})

df_kmeans.index = df_kmeans.index.values + 1
df_kmeans.index.name = 'Loop'

## Variáveis estatísticas (média, mediana e desvio padrao)

In [16]:
# Obtendo a media das acuracias
acc_wine_avg = np.mean(acc_wine)
acc_wdbd_avg = np.mean(acc_wdbd)
acc_iris_avg = np.mean(acc_iris)

df_stat_kmeans = pd.DataFrame(data = {'Média': df_kmeans.mean().values, 
                                      'Mediana': df_kmeans.median().values, 
                                      'Desvio padrão': df_kmeans.std().values,
                                      'Acurácia': [acc_wine_avg, acc_wdbd_avg, acc_iris_avg]},
                             index = ['Wine', 'Breast cancer', 'Iris'])
df_stat_kmeans

Unnamed: 0,Média,Mediana,Desvio padrão,Acurácia
Wine,1082.890659,1058.829207,184.338629,0.214815
Breast cancer,11923.77582,15618.409926,5059.086104,0.576608
Iris,324.539152,259.162563,207.117206,0.293333


# K-Means com DE

In [19]:
# Definindo os parâmetros do modelo
fator_mutacao = random.uniform(0.5, 1)
taxa_de_crossover = 0.7
n_iter = 50

In [25]:
fator_mutacao

0.8872069019139298

## Para o dataset wine

In [20]:
# Rodando o algoritmo 5 vezes
kmeans_de_wine_sse = []
acc_de_wine = []
for loop in range(5):
    print(f"Executando loop {loop + 1}")
    
    # Instanciando o modelo
    kmeans_de_wine = KMeans_DE(taxa_de_crossover, fator_mutacao, k_wine, X_train_wine, X_test_wine, y_train_wine, y_test_wine, 
                               n_iter, verbose = False)
    
    # Executando o algoritmo
    kmeans_de_wine.run()
    
    # Salvando o resultado
    kmeans_de_wine_sse.append(kmeans_de_wine.melhor_fitness_total)
    
    # Calculando a acurácia do modelo
    # Inicializando o modelo
    kmeans_wine = KMeans(n_clusters = k_wine, random_state = 1234, 
                         init = kmeans_de_wine.melhor_individuo_total, max_iter = 1).fit(X_train_wine)

    # Fazendo a predicao
    y_pred_wine = kmeans_wine.predict(X_test_wine)

    # Calculando a acurácia
    acc_de_wine.append(accuracy_score(y_test_wine, y_pred_wine))

Executando loop 1
Executando loop 2
Executando loop 3
Executando loop 4
Executando loop 5


## Para o dataset breast cancer

In [21]:
# Rodando o algoritmo 5 vezes
kmeans_de_wdbd_sse = []
acc_de_wdbd = []
for loop in range(5):
    print(f"Executando loop {loop + 1}")
    
    # Instanciando o modelo
    kmeans_de_wdbd = KMeans_DE(taxa_de_crossover, fator_mutacao, k_wdbd, X_train_wdbd, X_test_wdbd, y_train_wdbd, y_test_wdbd, 
                               n_iter, verbose = False)
    
    # Executando o algoritmo
    kmeans_de_wdbd.run()
    
    # Salvando o resultado
    kmeans_de_wdbd_sse.append(kmeans_de_wdbd.melhor_fitness_total)
    
    # Calculando a acurácia do modelo
    # Inicializando o modelo
    kmeans_wdbd = KMeans(n_clusters = k_wdbd, random_state = 1234, 
                         init = kmeans_de_wdbd.melhor_individuo_total, max_iter = 1).fit(X_train_wdbd)

    # Fazendo a predicao
    y_pred_wdbd = kmeans_wdbd.predict(X_test_wdbd)

    # Calculando a acurácia
    acc_de_wdbd.append(accuracy_score(y_test_wdbd, y_pred_wdbd))

Executando loop 1
Executando loop 2
Executando loop 3
Executando loop 4
Executando loop 5


## Para o dataset iris

In [22]:
# Rodando o algoritmo 5 vezes
kmeans_de_iris_sse = []
acc_de_iris = []
for loop in range(5):
    print(f"Executando loop {loop + 1}")
    
    # Instanciando o modelo
    kmeans_de_iris = KMeans_DE(taxa_de_crossover, fator_mutacao, k_iris, X_train_iris, X_test_iris, y_train_iris, y_test_iris, 
                               n_iter, verbose = False)
    
    # Executando o algoritmo
    kmeans_de_iris.run()
    
    # Salvando o resultado
    kmeans_de_iris_sse.append(kmeans_de_iris.melhor_fitness_total)
    
    # Calculando a acurácia do modelo
    # Inicializando o modelo
    kmeans_iris = KMeans(n_clusters = k_iris, random_state = 1234, 
                         init = kmeans_de_iris.melhor_individuo_total, max_iter = 1).fit(X_train_iris)

    # Fazendo a predicao
    y_pred_iris = kmeans_iris.predict(X_test_iris)

    # Calculando a acurácia
    acc_de_iris.append(accuracy_score(y_test_iris, y_pred_iris))

Executando loop 1
Executando loop 2
Executando loop 3
Executando loop 4
Executando loop 5


## Resultados

In [23]:
# Resultados de cada loop
df_kmeans_de = pd.DataFrame(data = {'Wine': kmeans_de_wine_sse, 'Breast cancer': kmeans_de_wdbd_sse, 'Iris': kmeans_de_iris_sse},
                        )

df_kmeans_de.index = df_kmeans_de.index.values + 1
df_kmeans_de.index.name = 'Loop'

In [26]:
# Obtendo a media das acuracias
acc_wine_avg = np.mean(acc_de_wine)
acc_wdbd_avg = np.mean(acc_de_wdbd)
acc_iris_avg = np.mean(acc_de_iris)

# Variáveis estatísticas (média, mediana e desvio padrao)
df_stat_kmeans_de = pd.DataFrame(data = {'Média': df_kmeans_de.mean().values, 
                                         'Mediana': df_kmeans_de.median().values, 
                                         'Desvio padrão': df_kmeans_de.std().values,
                                         'Acurácia': [acc_wine_avg, acc_wdbd_avg, acc_iris_avg]},
                             index = ['Wine', 'Breast cancer', 'Iris'])
df_stat_kmeans_de

Unnamed: 0,Média,Mediana,Desvio padrão,Acurácia
Wine,2212.921964,2208.667176,16.740471,0.322222
Breast cancer,15467.366922,15653.90688,355.241041,0.352047
Iris,317.215116,309.835318,15.001832,0.306667


# K-Means com ES

**Será desenvolvido o (mu, lambda)-ES**

In [27]:
# Parametros do modelo
lambda_value = 210
mu = 30
max_iter = 50
step_size = 0.15

## Para o dataset wine

In [28]:
# Rodando o algoritmo 5 vezes
kmeans_es_wine_sse = []
acc_es_wine = []
for loop in range(5):
    print(f"Executando loop {loop + 1}")
    
    # Instanciando o modelo
    kmeans_de_wine = KMeans_ES(k_wine, lambda_value, mu, step_size,
                               X_train_wine, X_test_wine, y_train_wine, y_test_wine, max_iter)
    
    # Executando o algoritmo
    kmeans_de_wine.run()
    
    # Salvando o resultado
    kmeans_de_wine_sse.append(kmeans_de_wine.melhor_fitness_total)
    
    # Calculando a acurácia do modelo
    # Inicializando o modelo
    kmeans_wine = KMeans(n_clusters = k_wine, random_state = 1234, 
                         init = kmeans_de_wine.melhor_individuo_total, max_iter = 1).fit(X_train_wine)

    # Fazendo a predicao
    y_pred_wine = kmeans_wine.predict(X_test_wine)

    # Calculando a acurácia
    acc_es_wine.append(accuracy_score(y_test_wine, y_pred_wine))

Executando loop 1
Executando loop 2
Executando loop 3
Executando loop 4
Executando loop 5


## Para o dataset breast cancer

In [29]:
# Rodando o algoritmo 5 vezes
kmeans_es_wdbd_sse = []
acc_es_wdbd = []
for loop in range(5):
    print(f"Executando loop {loop + 1}")
    
    # Instanciando o modelo
    kmeans_de_wdbd = KMeans_ES(k_wdbd, lambda_value, mu, step_size,
                               X_train_wdbd, X_test_wdbd, y_train_wdbd, y_test_wdbd, max_iter)
    
    # Executando o algoritmo
    kmeans_de_wdbd.run()
    
    # Salvando o resultado
    kmeans_de_wdbd_sse.append(kmeans_de_wdbd.melhor_fitness_total)
    
    # Calculando a acurácia do modelo
    # Inicializando o modelo
    kmeans_wdbd = KMeans(n_clusters = k_wdbd, random_state = 1234, 
                         init = kmeans_de_wdbd.melhor_individuo_total, max_iter = 1).fit(X_train_wdbd)

    # Fazendo a predicao
    y_pred_wdbd = kmeans_wdbd.predict(X_test_wdbd)

    # Calculando a acurácia
    acc_es_wdbd.append(accuracy_score(y_test_wdbd, y_pred_wdbd))

Executando loop 1
Executando loop 2
Executando loop 3
Executando loop 4
Executando loop 5


## Para o dataset iris

In [30]:
# Rodando o algoritmo 5 vezes
kmeans_es_iris_sse = []
acc_es_iris = []
for loop in range(5):
    print(f"Executando loop {loop + 1}")
    
    # Instanciando o modelo
    kmeans_de_iris = KMeans_ES(k_iris, lambda_value, mu, step_size,
                               X_train_iris, X_test_iris, y_train_iris, y_test_iris, max_iter)
    
    # Executando o algoritmo
    kmeans_de_iris.run()
    
    # Salvando o resultado
    kmeans_de_iris_sse.append(kmeans_de_iris.melhor_fitness_total)
    
    # Calculando a acurácia do modelo
    # Inicializando o modelo
    kmeans_iris = KMeans(n_clusters = k_iris, random_state = 1234, 
                         init = kmeans_de_iris.melhor_individuo_total, max_iter = 1).fit(X_train_iris)

    # Fazendo a predicao
    y_pred_iris = kmeans_iris.predict(X_test_iris)

    # Calculando a acurácia
    acc_es_iris.append(accuracy_score(y_test_iris, y_pred_iris))

Executando loop 1
Executando loop 2
Executando loop 3
Executando loop 4
Executando loop 5


## Resultados

In [31]:
# Resultados de cada loop
df_kmeans_es_mu_lam = pd.DataFrame(data = {'Wine': kmeans_de_wine_sse, 
                                           'Breast cancer': kmeans_de_wdbd_sse, 
                                           'Iris': kmeans_de_iris_sse},
                                  )
df_kmeans_es_mu_lam.index = df_kmeans_es_mu_lam.index.values + 1
df_kmeans_es_mu_lam.index.name = 'Loop'
df_kmeans_es_mu_lam

Unnamed: 0_level_0,Wine,Breast cancer,Iris
Loop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2225.237517,15356.200951,301.614889
2,2234.008472,15769.836275,330.337632
3,2204.742653,14894.544709,309.835318
4,2191.954001,15662.345795,308.33073
5,2208.667176,15653.90688,335.957012
6,4728.387699,56468.472556,472.751728
7,4362.680156,66856.394764,533.951904
8,3483.815982,68783.159384,502.53524
9,4030.623034,65768.649097,493.583138
10,3981.353679,51032.488168,422.813154


In [32]:
# Obtendo a media das acuracias
acc_wine_avg = np.mean(acc_es_wine)
acc_wdbd_avg = np.mean(acc_es_wdbd)
acc_iris_avg = np.mean(acc_es_iris)

# Variáveis estatísticas (média, mediana e desvio padrao)
df_stat_kmeans_es_mu_lam = pd.DataFrame(data = {'Média': df_kmeans_es_mu_lam.mean().values, 
                                                'Mediana': df_kmeans_es_mu_lam.median().values, 
                                                'Desvio padrão': df_kmeans_es_mu_lam.std().values,
                                                'Acurácia': [acc_wine_avg, acc_wdbd_avg, acc_iris_avg]},
                                        index = ['Wine', 'Breast cancer', 'Iris'])
df_stat_kmeans_es_mu_lam

Unnamed: 0,Média,Mediana,Desvio padrão,Acurácia
Wine,3165.147037,2858.912227,1050.364103,0.281481
Breast cancer,38624.599858,33401.162222,24938.867081,0.644444
Iris,401.171075,379.385083,93.202787,0.315556
