In [291]:
import math
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [234]:
def prepareDataIris(porcentage_testing):
    """Load the iris data set and split it into training and testing frames.

    The CSV is read from ``../data/iris.csv``, the ``Id`` column is dropped and
    the remaining columns are renamed to the integers 0-4 (column 4 holds the
    species). Rows are visited in file order and assigned to the testing set
    until it holds ``porcentage_testing`` of all rows, with every species
    limited to one third of that amount. Every other row goes to training, so
    the two returned frames never share a row.

    Args:
        porcentage_testing: fraction of the rows to hold out (e.g. ``0.1``).

    Returns:
        tuple: ``(training_iris, testing_iris)``, both re-indexed from 0.
    """
    data_iris = pd.read_table('../data/iris.csv', sep=',')  # three classes: Iris-setosa, Iris-versicolor, Iris-virginica
    data_iris = data_iris.drop(['Id'], axis=1)
    data_iris = data_iris.rename(columns={'SepalLengthCm': 0, 'SepalWidthCm': 1, 'PetalLengthCm': 2, 'PetalWidthCm': 3, 'Species': 4})

    total_testing = data_iris.shape[0] * porcentage_testing
    num_testing = total_testing / 3  # per-species quota

    # Select by position on the untouched frame. Mixing iterrows() labels with
    # iloc on a frame that is shrinking picks a different row than the one
    # being removed, which leaves testing rows inside the training set.
    taken_per_class = {}
    testing_positions = []
    for position, species in enumerate(data_iris[4].tolist()):
        if len(testing_positions) >= total_testing:
            break
        already_taken = taken_per_class.get(species, 0)
        if already_taken < num_testing:
            taken_per_class[species] = already_taken + 1
            testing_positions.append(position)

    testing_iris = data_iris.iloc[testing_positions].reset_index(drop=True)
    training_iris = data_iris.drop(data_iris.index[testing_positions]).reset_index(drop=True)

    return training_iris, testing_iris

In [237]:
def prepareDataSpamBase(porcentage_testing):
    """Load the spambase data set and split it into training and testing frames.

    The header-less file ``../data/spambase.data`` is read, so its columns are
    labelled with integers; column 57 holds the class (1 or 0). Rows are
    visited in file order and assigned to the testing set until it holds
    ``porcentage_testing`` of all rows, with each class limited to half of that
    amount. Every other row goes to training, so the two returned frames never
    share a row.

    Args:
        porcentage_testing: fraction of the rows to hold out (e.g. ``0.1``).

    Returns:
        tuple: ``(training_spambase, testing_spambase)``, both re-indexed from 0.
    """
    data_spambase = pd.read_table('../data/spambase.data', sep=',', header=None)  # two classes (1 or 0)

    total_testing = data_spambase.shape[0] * porcentage_testing
    num_testing = total_testing / 2  # per-class quota

    # Select by position on the untouched frame. Mixing iterrows() labels with
    # iloc on a frame that is shrinking picks a different row than the one
    # being removed, which leaves testing rows inside the training set.
    taken_per_class = {}
    testing_positions = []
    for position, label in enumerate(data_spambase[57].tolist()):
        if len(testing_positions) >= total_testing:
            break
        already_taken = taken_per_class.get(label, 0)
        if already_taken < num_testing:
            taken_per_class[label] = already_taken + 1
            testing_positions.append(position)

    testing_spambase = data_spambase.iloc[testing_positions].reset_index(drop=True)
    training_spambase = data_spambase.drop(data_spambase.index[testing_positions]).reset_index(drop=True)

    return training_spambase, testing_spambase

In [269]:
def euclidianDistance(testing, training):
    """Compute the Euclidean distance from every testing row to every training row.

    In both frames the last column is the class label and all preceding
    columns are numeric features.

    Args:
        testing: DataFrame of rows to classify.
        training: DataFrame of labelled reference rows.

    Returns:
        list: one entry per testing row, in order. Each entry is a list whose
        first element is ``[row_position, true_label]`` and whose remaining
        elements are ``[distance, training_label]`` pairs sorted ascending.
    """
    num_columns = testing.shape[1] - 1  # the last column is the class label

    # Pull the data out of pandas once; per-cell .loc lookups inside nested
    # Python loops dominate the run time on anything larger than iris.
    testing_features = testing.iloc[:, :num_columns].to_numpy(dtype=float)
    training_features = training.iloc[:, :num_columns].to_numpy(dtype=float)
    testing_labels = testing.iloc[:, num_columns].tolist()
    training_labels = training.iloc[:, num_columns].tolist()

    all_distances = []
    for position in range(testing.shape[0]):
        # Broadcasting subtracts this testing row from every training row.
        squared = ((training_features - testing_features[position]) ** 2).sum(axis=1)
        neighbours = [
            [math.sqrt(value), label]
            for value, label in zip(squared.tolist(), training_labels)
        ]
        neighbours.sort()
        # The true label is read from the last column, not a fixed column 4,
        # so frames of any width (e.g. spambase with 58 columns) work.
        all_distances.append([[position, testing_labels[position]]] + neighbours)

    return all_distances

In [256]:
def classificationKNN(distance_list, k):
    """Predict a label by majority vote among the first ``k`` neighbours.

    Args:
        distance_list: list whose first element is ``[row_id, true_label]``
            and whose following elements are ``[distance, label]`` pairs,
            expected to be ordered nearest first.
        k: number of neighbours that vote.

    Returns:
        list: ``[true_label, predicted_label]``. When several labels share the
        highest count, the one that was seen first among the neighbours wins.
    """
    votes = Counter(distance_list[position][1] for position in range(1, k + 1))
    ranked = votes.most_common()
    winner = ranked[0][0]

    true_label = distance_list[0][1]
    return [true_label, winner]

In [295]:
def KNN(training, testing, k):
    """Classify every testing row with k-nearest-neighbours and tally the results.

    Args:
        training: DataFrame of labelled reference rows (label in the last column).
        testing: DataFrame of rows to classify (label in the last column).
        k: number of neighbours that vote for each testing row.

    Returns:
        tuple: ``(confusion_matrix, input_data, predicted_data)`` where
        ``confusion_matrix`` is a DataFrame of counts with true labels as rows
        and predicted labels as columns, ``input_data`` is the list of true
        labels and ``predicted_data`` the list of predicted labels, both in
        testing-row order.
    """
    distances = euclidianDistance(testing, training)
    label_column = testing.shape[1] - 1

    # Testing labels come first so the matrix keeps their order of appearance.
    # Training labels are added as well: a prediction is always a training
    # label, and one that never occurs in the testing set would otherwise be
    # missing from the matrix and make the .loc lookup below fail.
    class_labels = (
        pd.concat([testing[label_column], training[label_column]])
        .drop_duplicates()
        .tolist()
    )

    confusion_matrix = pd.DataFrame(0, index=class_labels, columns=class_labels)
    input_data = []
    predicted_data = []

    for neighbours in distances:
        actual, predicted = classificationKNN(neighbours, k)
        input_data.append(actual)
        predicted_data.append(predicted)
        confusion_matrix.loc[actual, predicted] += 1

    return confusion_matrix, input_data, predicted_data

In [287]:
def analyseMachineLearning(input_data, predicted_data):
    """Print accuracy plus per-class recall, precision and F-score.

    Args:
        input_data: sequence of true labels.
        predicted_data: sequence of predicted labels, aligned with ``input_data``.
    """
    per_class = {'average': None}  # one score per class instead of an average
    report = (
        ("\nAcurácia: ", accuracy_score, {}),
        ("\nRecall: ", recall_score, per_class),
        ("\nPrecisão: ", precision_score, per_class),
        ("\nF-score: ", f1_score, per_class),
    )

    for heading, metric, options in report:
        print(heading, metric(input_data, predicted_data, **options))

In [299]:
def executeKNN(training, testing, k_list=(1, 3, 5, 7)):
    """Run KNN for several values of k and print the results of each run.

    For every k the confusion matrix is printed, followed by the metrics
    printed by ``analyseMachineLearning`` and a separator line.

    Args:
        training: DataFrame of labelled reference rows.
        testing: DataFrame of rows to classify.
        k_list: iterable of neighbour counts to evaluate; defaults to the
            values 1, 3, 5 and 7.
    """
    for k in k_list:
        print('K = ', k)
        confusion_matrix, input_data, predicted_data = KNN(training, testing, k)
        print(confusion_matrix)
        analyseMachineLearning(input_data, predicted_data)
        print('\n---------------------------------------------------------------\n')

In [302]:
# Hold out 10% of the iris rows for testing, then evaluate KNN on them.
# `training` and `testing` are notebook-level names.
training, testing = prepareDataIris(0.1)
print('\nIRIS DATA\n')
executeKNN(training, testing)


IRIS DATABASE

K =  1
                 Iris-setosa  Iris-versicolor  Iris-virginica
Iris-setosa                5                0               0
Iris-versicolor            0                5               0
Iris-virginica             0                0               5

Acurácia:  1.0

Recall:  [1. 1. 1.]

Precisão:  [1. 1. 1.]

F-score:  [1. 1. 1.]

---------------------------------------------------------------

K =  3
                 Iris-setosa  Iris-versicolor  Iris-virginica
Iris-setosa                5                0               0
Iris-versicolor            0                5               0
Iris-virginica             0                0               5

Acurácia:  1.0

Recall:  [1. 1. 1.]

Precisão:  [1. 1. 1.]

F-score:  [1. 1. 1.]

---------------------------------------------------------------

K =  5
                 Iris-setosa  Iris-versicolor  Iris-virginica
Iris-setosa                5                0               0
Iris-versicolor            0                5  

In [283]:
# Same evaluation on the spambase data, again with a 10% hold-out. This rebinds
# the notebook-level `training` and `testing` names to the spambase frames.
# NOTE(review): the saved output for this cell is a KeyboardInterrupt, so no
# spambase results are recorded here; re-run to completion before citing any.
training, testing = prepareDataSpamBase(0.1)
print('\nSPAMBASE DATA\n')
executeKNN(training, testing)

KeyboardInterrupt: 

In [None]:
# NOTE(review): in the cells shown, `confusion_matrix` is only ever a local
# variable inside KNN and executeKNN, so on a fresh kernel this line would
# raise NameError unless another cell defines it. TODO: confirm or remove.
confusion_matrix