In [2]:
import pandas as pd
import numpy as np
import math

In [3]:
#lendo o data set
data = pd.read_csv('./breast-cancer.data', header=None)

In [4]:
#Conferindo o tamanho do data set
print("Numero de instancias: ", len(data.index)) #inicia em 0

Numero de instancias:  286


In [5]:
#Dando nomes as colunas
data.columns = ['class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiant']
display(data.head())

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiant
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [6]:
#separando as colunas
col_class = data.loc[:, 'class'] #Todas as linhas pela a coluna indicada
col_age = data.loc[:, 'age']
col_menopause = data.loc[:,'menopause']
col_tumor_size = data.loc[:,'tumor-size']
col_inv_nodes = data.loc[:,'inv-nodes']
col_node_caps = data.loc[:,'node-caps']
col_deg_malig = data.loc[:,'deg-malig']
col_breast = data.loc[:,'breast']
col_breast_quad = data.loc[:,'breast-quad']
col_irradiant = data.loc[:,'irradiant']


print(col_class.head(5)) #Teste de coluna
print(col_age.head(5)) #Teste de coluna
print(col_irradiant.head(5)) #Teste de coluna

0    no-recurrence-events
1    no-recurrence-events
2    no-recurrence-events
3    no-recurrence-events
4    no-recurrence-events
Name: class, dtype: object
0    30-39
1    40-49
2    40-49
3    60-69
4    40-49
Name: age, dtype: object
0    no
1    no
2    no
3    no
4    no
Name: irradiant, dtype: object


In [7]:
#Metodo de normalização
#Recebe coluna do pandas, os arquimentos antigos para serem subistidutos,
#Argumentos novos que vão subistituir
def normalize(col, old_args =[], new_args =[]):

    #Verfica o tamanho dos arrays, se forem diferentes lancam erro
    if len(old_args)  == len(new_args): 
               
        #redistribuo os valores com o metodo replace do pandas
        return col.replace([*old_args], [*new_args])
    else:
        print("Os numeros de argumentos nos arrays devem ser iguais")
        return

In [8]:
#Normalização dos dados - Binarios
#no-recurrence-events = 0
#recurrence-events = 1
new_col_class = normalize(col_class, ['no-recurrence-events', 'recurrence-events'], [0, 1])

#node-caps
#yes = 0
#no = 1
new_col_node_caps = normalize(col_node_caps, ['yes', 'no'], [0, 1]) 

#Breat
#left = 0
#right = 1
new_col_breast = normalize(col_breast, ['left', 'right'], [0, 1]) 

#Iradiant
#yes = 0
#no = 1
new_col_irradiant = normalize(col_irradiant, ['yes', 'no'], [0, 1])

print(new_col_class.head(3)) #Teste normalizacao
print(new_col_node_caps.head(3)) #Teste normalizacao
print(new_col_breast.head(3)) #Teste normalizacao
print(new_col_irradiant.head(3)) #Teste normalizacao


0    0
1    0
2    0
Name: class, dtype: int64
0    1
1    1
2    1
Name: node-caps, dtype: object
0    0
1    1
2    0
Name: breast, dtype: int64
0    1
1    1
2    1
Name: irradiant, dtype: int64


In [9]:
#Normalização dos dados - Ternario...
#Menopause
#lt40 = -1
#ge40 = 0
#premeno = 1
new_col_menopause = normalize(col_menopause, ['lt40', 'ge40', 'premeno'], [-1, 0, 1])

#deg-malig
#1 = -1
#2 = 0
#3 = 1
new_col_deg_malig = normalize(col_deg_malig, [1, 2, 3], [-1, 0, 1])

#Breast-quad
#left_up  = -1
#left_low = -0,5
#right_up  = 0
#right_low  = 0,5
#central  = 1
new_col_breast_quad = normalize(col_breast_quad, ['left_up', 'left_low', 'right_up', 'right_low', 'central'], [-1, -0.5, 0, 0.5, 1])

print(new_col_menopause.head(5)) #Teste normalizacao
print(new_col_deg_malig.head(5)) #Teste normalizacao
print(new_col_breast_quad.head(5)) #Teste normalizacao

0    1
1    1
2    1
3    0
4    1
Name: menopause, dtype: int64
0    1
1    0
2    0
3    0
4    0
Name: deg-malig, dtype: int64
0   -0.5
1      0
2   -0.5
3     -1
4    0.5
Name: breast-quad, dtype: object


In [10]:
#Sobrescrita do metodo de normalizacao para trabalhar com range
#Trabalha de froma semlhande ao de cima só que subistitui para um range de 0 a 1
def normalize(col, args = [], is_range=True, is_round=False):
    if is_range:
        numero_elementos = len(args)

        def scale_convertion(this_arg):
            if is_round:
                return round(1 / (numero_elementos - 1) * this_arg[0], 2)
            else:
                # maxEscala / (numero_elementos - 1) * argumento -> Menos 1 para acontagem ir até o 1
                return 1 / (numero_elementos - 1) * this_arg[0]

        #Remapea em loop com a função de cima
        new_args = map(scale_convertion, enumerate(args))

        return col.replace([*args], [*new_args]) 

In [11]:

#Age
#10-19 = 0
#a
#90-99 = 1
new_col_age = normalize(col_age, ['10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99'], is_range=True, is_round=True)

print(new_col_age.head(10)) #Teste normalizacao

0    0.25
1    0.38
2    0.38
3    0.62
4    0.38
5    0.62
6    0.50
7    0.62
8    0.38
9    0.38
Name: age, dtype: float64


In [12]:
#Verificando os tipos de dados da coluna
data.drop_duplicates(subset='tumor-size').sort_values(by='tumor-size')


Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiant
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
12,no-recurrence-events,60-69,lt40,10-14,0-2,no,1,left,right_up,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
6,no-recurrence-events,50-59,premeno,25-29,0-2,no,2,left,left_low,no
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
24,no-recurrence-events,50-59,premeno,35-39,0-2,no,2,right,left_up,no
20,no-recurrence-events,50-59,ge40,40-44,0-2,no,2,left,left_low,no
143,no-recurrence-events,40-49,premeno,45-49,0-2,no,2,left,left_low,yes
42,no-recurrence-events,60-69,ge40,5-9,0-2,no,1,left,central,no


In [26]:
#Verificando os tipos de dados da coluna
data.drop_duplicates(subset="inv-nodes").sort_values(by='inv-nodes')

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiant
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
190,no-recurrence-events,40-49,premeno,15-19,12-14,no,3,right,right_low,yes
150,no-recurrence-events,50-59,ge40,25-29,15-17,yes,3,right,left_up,no
267,recurrence-events,60-69,ge40,20-24,24-26,yes,3,left,left_low,yes
131,no-recurrence-events,40-49,premeno,40-44,3-5,yes,3,right,left_up,yes
126,no-recurrence-events,30-39,premeno,30-34,6-8,yes,2,right,right_up,no
129,no-recurrence-events,40-49,premeno,35-39,9-11,yes,2,right,left_up,yes


In [14]:
#Tumor size
#Entradas ['0-4', '5-9', '6-8', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59']
#0-4 = 0
#55-59 = 1
new_col_tumor_size = normalize(col_tumor_size, ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49','50-54', '55-59'], is_range=True, is_round=True)

#Inv nodes
#Entradas ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '24-26', '36-39']
#0-2 = 0
#36-39 = 1
new_col_inv_nodes = normalize(col_inv_nodes, ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '24-26', '36-39'], is_range=True, is_round=True)

print(new_col_tumor_size)
print(new_col_inv_nodes.head(269))

0      0.55
1      0.36
2      0.36
3      0.27
4      0.00
       ... 
281    0.55
282    0.36
283    0.36
284    0.55
285    0.55
Name: tumor-size, Length: 286, dtype: float64
0      0.00
1      0.00
2      0.00
3      0.00
4      0.00
       ... 
264    0.00
265    0.43
266    0.14
267    0.86
268    0.00
Name: inv-nodes, Length: 269, dtype: float64


In [15]:
#output = class
new_data = {
    'age': new_col_age,
    'menopause': new_col_menopause,
    'tumor-size': new_col_tumor_size,
    'inv-nodes': new_col_inv_nodes,
    'node-caps': new_col_node_caps,
    'deg-malig': new_col_deg_malig,
    'breast': new_col_breast,
    'breast-quad': new_col_breast_quad,
    'irradiant': new_col_irradiant,
    'class': new_col_class
}

normalize_data = pd.DataFrame(new_data)
display(normalize_data)

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiant,class
0,0.25,1,0.55,0.00,1,1,0,-0.5,1,0
1,0.38,1,0.36,0.00,1,0,1,0,1,0
2,0.38,1,0.36,0.00,1,0,0,-0.5,1,0
3,0.62,0,0.27,0.00,1,0,1,-1,1,0
4,0.38,1,0.00,0.00,1,0,1,0.5,1,0
...,...,...,...,...,...,...,...,...,...,...
281,0.25,1,0.55,0.00,1,0,0,-1,1,1
282,0.25,1,0.36,0.00,1,1,0,-1,0,1
283,0.62,0,0.36,0.00,1,-1,1,-1,1,1
284,0.38,0,0.55,0.14,1,1,0,-0.5,1,1


In [29]:
normalize_data.drop_duplicates(subset="node-caps")

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiant,class
0,0.25,1,0.55,0.0,1,1,0,-0.5,1,0
126,0.25,1,0.55,0.29,0,0,1,0.0,1,0
145,0.38,1,0.45,0.0,?,0,0,0.5,0,0


In [30]:
normalize_data.drop_duplicates(subset="breast-quad")

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiant,class
0,0.25,1,0.55,0.0,1,1,0,-0.5,1,0
1,0.38,1,0.36,0.0,1,0,1,0,1,0
3,0.62,0,0.27,0.0,1,0,1,-1,1,0
4,0.38,1,0.0,0.0,1,0,1,0.5,1,0
10,0.38,1,0.0,0.0,1,1,0,1,1,0
206,0.5,0,0.55,0.0,1,1,0,?,1,1


In [23]:
normalize_data.to_excel('normalize_data.xls', encoding='utf-8')

In [24]:
newcol = normalize_data[normalize_data['node-caps'] != '?']
normalize_data_drop = newcol[newcol['breast-quad'] != '?']

display(normalize_data_drop)

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiant,class
0,0.25,1,0.55,0.00,1,1,0,-0.5,1,0
1,0.38,1,0.36,0.00,1,0,1,0,1,0
2,0.38,1,0.36,0.00,1,0,0,-0.5,1,0
3,0.62,0,0.27,0.00,1,0,1,-1,1,0
4,0.38,1,0.00,0.00,1,0,1,0.5,1,0
...,...,...,...,...,...,...,...,...,...,...
281,0.25,1,0.55,0.00,1,0,0,-1,1,1
282,0.25,1,0.36,0.00,1,1,0,-1,0,1
283,0.62,0,0.36,0.00,1,-1,1,-1,1,1
284,0.38,0,0.55,0.14,1,1,0,-0.5,1,1


In [25]:
def infos(data, classes =['', ''], info=True):
    class0, class1 = 0, 0
    for d in data: 
        #classe 0 e calsse 1
        if d[-1] == 0:
            class0 += 1
        else:
            class1 += 1

    if info:
        print("Total de amostras:", len(data))
        print("Total " + classes[0] + ":", class0)
        print("Total " + classes[1] + ":", class1)
    
    return [len(data), class0, class1]
    

In [26]:
infos(normalize_data_drop.to_numpy(), ['no-recurrence-events', 'recurrence-events'])#Classe 0 e classe 1

Total de amostras: 277
Total no-recurrence-events: 196
Total recurrence-events: 81


[277, 196, 81]

In [27]:
#Metade da amostra irá para treinamento
training_percent = 0.7

#recebendo a quantidade dos dados possiveis
_, class0_num, class1_num = infos(normalize_data_drop.to_numpy(), info=False)

#Iniciando listas
training, test = [], []

#Maximo de dados que podem ser usados por classe para a massa
max_class0 = int(training_percent * class0_num)
max_class1 = int(training_percent * class1_num)

#Total de cada classe
total_class0, total_class1 = 0, 0

#Percorre os dados normalizados sem os dados ivalidos
for data in normalize_data_drop.to_numpy():
    #Enquando a soma do total for menor q o maximo possivel
    if(total_class0 + total_class1) < (max_class0 + max_class1):
        #Se for igual a 0 adiciona ao teste e vai limitando
        if data[-1] == 0 and total_class0 <= max_class0:
                training.append(data)
                total_class0 += 1
        #Se for igual a 1 adiciona ao teste e vai limitando
        elif data[-1] == 1 and total_class1 <= max_class1:
                training.append(data)
                total_class1 += 1
        #Adciona sem limitar se não for nenhum dos casos
        else:
            test.append(data)
    else:
        #O que sobrar é adicionado ao teste
        test.append(data)

In [28]:
#Distancia euclidiana
def ecludian(p1, p2):
    #Quantidade de dimenções
    dim = len(p1)
    #A soma das potencias das distancias
    total_sum = 0
    #Pecorrendo as dimenções
    for i in range(dim):
        #A distancia do dado na posicao menos o outro dado na mesma posição em potencia
        total_sum += (p1[i] - p2[i]) ** 2
    #Raiz das somas das potencias
    return math.sqrt(total_sum)


In [29]:
#Calculando o knn
def knn(training_data, new_data, k):
    #Index das distancias
    distances = {}
    #Quantidade de dados que vao treinar o algoritimo
    len_trainig = len(training_data)

    #Percorendo os dados para treino e calculando a distancia euclidiana
    for i in range(len_trainig):
        #Distancia no indice i
        d = ecludian(training_data[i], new_data)
        distances[i] = d
    
    k_neighborhoods = sorted(distances, key=distances.get)[:k]

    num_class0, num_class1 = 0, 0

    #Pegando as classe dos vizinhos
    for i in k_neighborhoods:
        if training_data[i][-1] == 0:
            num_class0 += 1
        else:
            num_class1 += 1
    
    if num_class0 > num_class1:
        return 0
    else:
        return 1

In [46]:
print(knn(training, test[83], k=9))

1


In [58]:
corrects = 0
k = 7

for data in test:
    num_class = (knn(training, data, k))
    if data[-1] == num_class:
        corrects += 1

print("Total de treinamento ", len(training))
print("Total de testes ", len(test))
print("Total de acertos ", corrects)
print("Porcentagem de acertos ", 100*corrects/len(test))

Total de treinamento  193
Total de testes  84
Total de acertos  82
Porcentagem de acertos  97.61904761904762
