In [1]:
import math
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [56]:
def prepareDataIris(porcentage_testing):
    data_iris = pd.read_table('../data/iris.csv', sep=',') # three classifications ('Iris-setosa, Iris-versicolor and Iris-virginica')
    data_iris = data_iris.drop(['Id'], axis=1)
    data_iris = data_iris.rename(columns={'SepalLengthCm': 0, 'SepalWidthCm': 1, 'PetalLengthCm': 2, 'PetalWidthCm': 3, 'Species': 4})
    num_testing = (data_iris.shape[0] / 3) * porcentage_testing
    testing_iris = data_iris.head(1)
    data_iris = data_iris.drop([0])
    
    total_testing = num_testing * 3
    
    for index, row in data_iris.iterrows():
        if testing_iris.shape[0] < total_testing:
            if len(data_iris) > index:
                if row[4] == 'Iris-setosa' and (testing_iris.loc[testing_iris[4] == 'Iris-setosa']).shape[0] < num_testing \
                    or row[4] == 'Iris-versicolor' and (testing_iris.loc[testing_iris[4] == 'Iris-versicolor']).shape[0] < num_testing \
                    or row[4] == 'Iris-virginica' and (testing_iris.loc[testing_iris[4] == 'Iris-virginica']).shape[0] < num_testing:
                    testing_iris = testing_iris.append(data_iris.iloc[index]) 
                    data_iris = data_iris.drop([index])

    training_iris = data_iris 
    training_iris = training_iris.reset_index()
    training_iris = training_iris.drop(['index'], axis=1)
    testing_iris = testing_iris.reset_index()
    testing_iris = testing_iris.drop(['index'], axis=1)
    
    return training_iris, testing_iris

In [57]:
def prepareDataSpamBase(porcentage_testing):
    data_spambase = pd.read_table('../data/spambase.data', sep=',', header=None) # two classifications (1 or 0)
    num_testing = (data_spambase.shape[0] / 2) * porcentage_testing
    testing_spambase = data_spambase.head(1)
    data_spambase = data_spambase.drop([0])
    
    total_testing = num_testing * 2
    
    for index, row in data_spambase.iterrows():
        if testing_spambase.shape[0] < total_testing:
            if len(data_spambase) > index:
                if row[57] == 0 and (testing_spambase.loc[testing_spambase[57] == 0]).shape[0] < num_testing \
                or row[57] == 1 and (testing_spambase.loc[testing_spambase[57] == 1]).shape[0] < num_testing:
                    testing_spambase = testing_spambase.append(data_spambase.iloc[index]) 
                    data_spambase = data_spambase.drop([index])
    
    training_spambase = data_spambase
    training_spambase = training_spambase.reset_index()
    training_spambase = training_spambase.drop(['index'], axis=1)
    testing_spambase = testing_spambase.reset_index()
    testing_spambase = testing_spambase.drop(['index'], axis=1)
    
    return training_spambase, testing_spambase

In [58]:
def euclidianDistance(testing, training):
    all_distances = []
    num_columns = testing.shape[1] - 1
    num_rows_testing = testing.shape[0]
    
    for i in range(num_rows_testing):
        all_distances.append([])
    
    for index_train, row_train in training.iterrows():
        for test in range(num_rows_testing):
            distance = 0 
            for i in range(num_columns): # ultima coluna é a classificação
                distance += (float(testing.loc[test][i]) - float(row_train[i])) ** 2
                
            all_distances[test].append([math.sqrt(distance), row_train[num_columns]])

    for i in range(num_rows_testing):
        identification = [[i, testing.loc[i][4]]]
        all_distances[i].sort()
        all_distances[i] = identification + all_distances[i]
        
        
    return all_distances

In [59]:
def classificationKNN(distance_list, k):
    classification = []
    for i in range(1, k+1):
        classification.append(distance_list[i][1])
    
    result = Counter(classification).most_common()[0][0]
    
    return [distance_list[0][1], result]

In [60]:
def confusionMatrix(distances, testing, k):
    num_testing = testing.shape[0]
    num_columns = testing.shape[1]
    column_values = testing[num_columns - 1].drop_duplicates().tolist() 
    
    confusion_matrix = pd.DataFrame(0, index=column_values, columns=column_values)
    input_data = []
    predicted_data = []
    
    for i in range(num_testing):
        result = classificationKNN(distances[i], k)
        input_data.append(result[0])
        predicted_data.append(result[1])
        confusion_matrix.loc[result[0], result[1]] = confusion_matrix.loc[result[0], result[1]] + 1
            
    
    return confusion_matrix, input_data, predicted_data

In [61]:
def analyseMachineLearning(input_data, predicted_data):
    print("\nAcurácia: ", accuracy_score(input_data, predicted_data))
    print("\nRecall: ", recall_score(input_data, predicted_data, average=None))
    print("\nPrecisão: ", precision_score(input_data, predicted_data, average=None))
    print("\nF-score: ", f1_score(input_data, predicted_data, average=None))

In [62]:
def executeKNN(training, testing):
    k_list = [1, 3, 5, 7]
    distances = euclidianDistance(testing, training)
    
    for k in k_list:
        print('\nK = ', k)
        confusion_matrix, input_data, predicted_data = confusionMatrix(distances, testing, k)
        print('\nMatriz de Confusão: ')
        print(confusion_matrix)
        analyseMachineLearning(input_data, predicted_data)
        print('\n---------------------------------------------------------------\n')

In [63]:
porcentage_testing = 0.5
training, testing = prepareDataIris(porcentage_testing)
print('\nIRIS DATA\n')
print('\nTesting with ', porcentage_testing * 100, '% of the data.')
executeKNN(training, testing)


IRIS DATA


Testing with  50.0 % of the data.

K =  1

Matriz de Confusão: 
                 Iris-setosa  Iris-versicolor  Iris-virginica
Iris-setosa               25                0               0
Iris-versicolor            0               11               2
Iris-virginica             0                0              25

Acurácia:  0.9682539682539683

Recall:  [1.         0.84615385 1.        ]

Precisão:  [1.         1.         0.92592593]

F-score:  [1.         0.91666667 0.96153846]

---------------------------------------------------------------


K =  3

Matriz de Confusão: 
                 Iris-setosa  Iris-versicolor  Iris-virginica
Iris-setosa               25                0               0
Iris-versicolor            0               10               3
Iris-virginica             0                0              25

Acurácia:  0.9523809523809523

Recall:  [1.         0.76923077 1.        ]

Precisão:  [1.         1.         0.89285714]

F-score:  [1.         0.86956522 0.943

In [None]:
porcentage_testing = 0.5 
training, testing = prepareDataSpamBase(porcentage_testing)
print('\nSPAM BASE DATA\n')
print('\nTesting with ', porcentage_testing * 100, '% of the data.')
executeKNN(training, testing)