# Utilização

## 1) Dê play para baixar os dados de entrada do GitHub.

In [None]:
!git clone https://github.com/azhow/Machine_Learning.git

Cloning into 'Machine_Learning'...
remote: Enumerating objects: 5, done.
remote: Counting objects: 100% (5/5), done.
remote: Compressing objects: 100% (4/4), done.
remote: Total 5 (delta 0), reused 2 (delta 0), pack-reused 0
Unpacking objects: 100% (5/5), done.


## 2) Definição dos parâmetros

Defina aqui os parâmetros para execução dos experimentos:

* INPUT_PATH: é o caminho que contém o CSV com os dados de entrada.

* OUTPUT_PATH: é o caminho para onde serão exportados os dados de saída.

* NORMALIZE: lista de valores booleanos que indicam se os atributos devem (True) ou não (False) ser normalizados com o método min-max; um conjunto de experimentos é executado para cada valor.

* DISTANCE_FUNCTIONS: possui as diferentes funções de medida de distância.

* HOLDOUT_PROB: contém as frações (entre 0 e 1) dos dados destinadas ao **treino** em cada split; o restante é usado para teste.

* KS: contém os diferentes valores do hiperparâmetro K para a execução do KNN.

Dê play após a definição dos novos parâmetros para efetivar as mudanças.

In [None]:
def euclidean_distance(v1, v2):
    """
    Return the Euclidean (straight-line) distance between two points.

    v1 and v2 are equal-length sequences of numbers, one coordinate per
    dimension. A length mismatch trips the assertion.
    """
    assert len(v1) == len(v2)
    squared_sum = 0
    # Accumulate term by term so the floating-point rounding order stays
    # exactly the same as a plain running total.
    for a, b in zip(v1, v2):
        squared_sum += (a - b)**2.0

    return math.sqrt(squared_sum)


def manhattan_distance(v1, v2):
    """
    Return the Manhattan (city-block) distance between two points.

    v1 and v2 are equal-length sequences of numbers, one coordinate per
    dimension. A length mismatch trips the assertion.
    """
    assert len(v1) == len(v2)
    total = 0
    # Sum of the absolute per-coordinate differences
    for a, b in zip(v1, v2):
        total += abs(a - b)

    return total


# Define experiment set: one experiment is run for every combination of
# NORMALIZE x DISTANCE_FUNCTIONS x HOLDOUT_PROB x KS.
# CSV file with the input samples
INPUT_PATH = "./Machine_Learning/KNN/breast_cancer_data.csv"
# CSV file that receives one row per experiment
OUTPUT_PATH = "./Machine_Learning/KNN/results.csv"
# Whether to min-max normalize the attributes (each value is tried)
NORMALIZE = [False, True]
# Distance functions to evaluate; each takes two equal-length vectors
DISTANCE_FUNCTIONS = [euclidean_distance, manhattan_distance]
# Fractions of the data assigned to the training set
HOLDOUT_PROB = [0.8]
# Values of k (number of neighbors) to evaluate
KS = [1, 3, 5, 7, 53, 101, 285]

## 3) Execução

Aperte o play para executar o algoritmo KNN com os diversos parâmetros definidos.

In [None]:
from collections import Counter, OrderedDict
import math
import csv
import random

class DataElement():
    """
    One labelled sample (training or test) parsed from a row of the input data
    """
    def __init__(self, csv_row):
        """
        Build the element from a CSV row: the last field is kept as the label,
        every field before it is converted to float and stored as an attribute
        """
        self.label = csv_row[-1]
        self.attributes = list(map(float, csv_row[:-1]))


    def __str__(self):
        shown = (self.label, self.attributes)
        return "Label: %s, Attributes: %s" % shown


    def __repr__(self):
        return self.__str__()


    def normalize(self, attribute_ranges):
        """
        Rescale every attribute in place using the range at the same position
        of attribute_ranges (one range object per attribute)
        """
        assert len(attribute_ranges) == len(self.attributes)
        # Overwrite each slot with its normalized value
        for position, raw_value in enumerate(self.attributes):
            scaler = attribute_ranges[position]
            self.attributes[position] = scaler.calculate_normalized_value(raw_value)


class AttributeRange():
    """
    Holds the observed minimum and maximum of one attribute (column) and
    rescales values of that attribute with min-max normalization
    """
    def __init__(self, attribute_idx, attribute):
        """
        attribute_idx: position of the attribute, stored as `id` for display
        attribute: non-empty iterable with the observed values of the attribute

        Raises ValueError (from max()/min()) when `attribute` is empty
        """
        # Materialize once so a one-shot iterable (e.g. a generator) can be
        # scanned by both max() and min()
        values = list(attribute)
        self.id = attribute_idx
        self.max = max(values)
        self.min = min(values)


    def __str__(self):
        s = "ID: {0} Min: {1} Max: {2}".format(self.id, self.min, self.max)
        return s


    def __repr__(self):
        return str(self)


    def calculate_normalized_value(self, value):
        """
        Map `value` to (value - min) / (max - min).

        When the attribute is constant (max == min) there is no spread to
        scale by, so 0.0 is returned instead of dividing by zero.
        """
        span = self.max - self.min
        if span == 0:
            return 0.0
        return (value - self.min) / span


class TrainingSet():
    """
    Thin container for the elements used as the KNN training set
    """
    def __init__(self, training_elements):
        """
        Store the given training elements as-is (no copy is made)
        """
        self.training_elements = training_elements
        

def read_input_data(input_csv, holdout_prob_training, normalize):
    """
    Load the dataset from a CSV file and split it into stratified training and
    test sets.

    The first CSV row is treated as a header and skipped, blank rows are
    ignored, and the first column of every row is dropped before the rest is
    handed to DataElement (NOTE(review): presumably a sample id - confirm
    against the dataset).

    Parameters:
        input_csv: path of the CSV file to read.
        holdout_prob_training: fraction of the data assigned to training.
        normalize: when truthy, min-max normalize the attributes.

    Returns:
        [training_data, test_data]. Training elements are grouped by label;
        each label contributes round(share * total * holdout_prob_training)
        elements, the remaining elements form the test set.
    """
    # newline='' lets the csv module handle line endings itself
    with open(input_csv, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        # Skip the header row (no-op on an empty file)
        next(csv_reader, None)
        data_elements = [DataElement(row[1:]) for row in csv_reader if row]

    # NOTE(review): normalization is computed over the whole dataset, i.e.
    # before the split, so test samples influence the min/max that is used.
    if normalize:
        data_elements = normalize_attribute_ranges(data_elements)

    random.shuffle(data_elements)

    # Share of each label in the data; this call also sorts data_elements by
    # label in place (stable, so the shuffled order survives within a label)
    label_shares = calculate_distribution(data_elements)
    total = len(data_elements)

    # Number of samples each label must contribute to the training set
    quotas = {}
    training_data = {}
    for label, share in label_shares.items():
        quotas[label] = round(share * (total * holdout_prob_training))
        training_data[label] = []

    test_data = []

    # Single pass: the first quota[label] elements of each label go to
    # training, every other element goes to test
    for el in data_elements:
        bucket = training_data[el.label]
        if len(bucket) < quotas[el.label]:
            bucket.append(el)
        else:
            test_data.append(el)

    final_training_data = []
    for bucket in training_data.values():
        final_training_data += bucket

    return [final_training_data, test_data]


def calculate_distribution(data_elements):
    """
    Return the share of each label in data_elements.

    The input list is sorted by label in place as a side effect. The result
    is an OrderedDict mapping each label (in sorted order) to its fraction of
    the elements; an empty input yields an empty mapping.
    """
    data_elements.sort(key=lambda x: x.label)
    total = len(data_elements)

    # Count occurrences per label, first-seen (i.e. sorted) order
    counts = OrderedDict()
    for element in data_elements:
        counts[element.label] = counts.get(element.label, 0) + 1

    # Turn the counts into fractions of the whole
    return OrderedDict((label, count / total) for label, count in counts.items())


def most_frequent(l):
    """
    Return the element that occurs most often in the list
    """
    top_entries = Counter(l).most_common(1)
    winner, _count = top_entries[0]
    return winner


def k_nearest_neighbors(k, training_set, test_element, distance_function):
    """
    Classify test_element by majority vote among its k nearest training elements.

    Distances are measured with distance_function between the attributes of
    each element in training_set.training_elements and those of test_element.
    """
    target = test_element.attributes
    # One [distance, label] pair per training element
    scored = [
        [distance_function(candidate.attributes, target), candidate.label]
        for candidate in training_set.training_elements
    ]

    # Closest first; the sort is stable, so equal distances keep training order
    scored.sort(key=lambda pair: pair[0])
    votes = [label for _distance, label in scored[:k]]

    return most_frequent(votes)


def calculate_accuracy(test_data, test_results):
    """
    Return the fraction of test elements whose label matches the prediction.

    test_data: elements exposing a `label` attribute.
    test_results: predicted labels, aligned position by position with test_data.

    An empty test set (e.g. when all data was assigned to training) has no
    predictions to score, so 0.0 is returned instead of dividing by zero.
    """
    assert len(test_data) == len(test_results)
    if not test_data:
        return 0.0

    hits = sum(
        1 for element, predicted in zip(test_data, test_results)
        if element.label == predicted
    )

    return hits/len(test_data)


def normalize_attribute_ranges(data_elements):
    """
    Min-max normalize, in place, the attributes of every element.

    The range of each attribute is computed across all the given elements;
    the number of attributes is taken from the first element. Returns the
    same list that was passed in. An empty list is returned untouched.
    """
    # Nothing to measure or rescale (also avoids indexing into an empty list)
    if not data_elements:
        return data_elements

    attribute_count = len(data_elements[0].attributes)

    # One range per attribute, built from that attribute's column of values
    attribute_ranges = [
        AttributeRange(attribute_idx,
            [element.attributes[attribute_idx] for element in data_elements])
        for attribute_idx in range(attribute_count)
    ]

    for element in data_elements:
        element.normalize(attribute_ranges)

    return data_elements


def run_experiment(k, holdout_prob, training_data, test_data, distance_f, normalize):
    """
    Run one KNN experiment and report it as a CSV-ready header/row pair.

    Every element of test_data is classified against training_data with the
    given k and distance function. Returns a dict with "header" (column
    names) and "data" (the matching values): the experiment parameters, the
    set sizes, the accuracy, and the label shares of the whole input, the
    training set and the test set.
    """
    training_set = TrainingSet(training_data)

    test_results = [
        k_nearest_neighbors(k, training_set, element, distance_f)
        for element in test_data
    ]

    # Label shares per set (each call sorts the list it is given by label)
    training_strata = calculate_distribution(training_data)
    test_strata = calculate_distribution(test_data)
    input_strata = calculate_distribution(training_data + test_data)

    header = ["k", "holdout", "distance_f", "normalization", "training_data_size", "test_data_size", "accuracy"]
    header += ["input_strata_"+key for key in input_strata]
    header += ["training_strata_"+key for key in training_strata]
    header += ["test_strata_"+key for key in test_strata]

    accuracy = calculate_accuracy(test_data, test_results)
    data = [k, holdout_prob, distance_f.__name__, normalize, len(training_data), len(test_data),
            accuracy]
    # Same column order as the header: input, training, test
    data.extend(input_strata.values())
    data.extend(training_strata.values())
    data.extend(test_strata.values())

    return { "header": header, "data": data }


def export_results_to_csv(output_path, header, experiment_results):
    """
    Write the experiment results to a CSV file at output_path.

    header: list of column names, written as the first row.
    experiment_results: iterable of rows; every row must have as many values
        as the header (checked with an assertion before anything is written,
        so a malformed row never leaves a partially written file behind).
    """
    # Materialize so the rows can be validated up front and written afterwards
    rows = list(experiment_results)
    for result in rows:
        assert len(header) == len(result)

    # newline='' lets the csv module control line endings; without it extra
    # blank rows appear on platforms that translate '\n' to '\r\n'.
    # The with-block closes the file, no explicit close() is needed.
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        # write the header
        writer.writerow(header)
        # write the data
        writer.writerows(rows)


def run_all_experiments():
    """
    Run the whole experiment grid and export the results.

    One experiment is run for every combination of NORMALIZE,
    DISTANCE_FUNCTIONS, HOLDOUT_PROB and KS; all result rows are then written
    to OUTPUT_PATH together with the header reported by the last experiment.
    """
    print("Running experiments...")

    header = []
    experiment_results = []

    for normalize in NORMALIZE:
        for distance_f in DISTANCE_FUNCTIONS:
            for holdout_prob in HOLDOUT_PROB:
                # The input is re-read and re-split for every combination of
                # normalization, distance function and holdout; all values of
                # k below share that same split
                split = read_input_data(INPUT_PATH, holdout_prob, normalize)
                training_data, test_data = split
                for k in KS:
                    outcome = run_experiment(k, holdout_prob, training_data, test_data, distance_f, normalize)
                    header = outcome["header"]
                    experiment_results.append(outcome["data"])

    print("Exporting experiments...")
    # Write every collected row to the output CSV
    export_results_to_csv(OUTPUT_PATH, header, experiment_results)

    print("Done!")


# Entry point: run the whole experiment grid when this code is executed
# directly, but not when it is imported as a module.
if __name__ == "__main__":
    run_all_experiments()

Running experiments...
Exporting experiments...
Done!


## 4) Saída

A saída deverá estar presente no caminho de saída configurado pelo usuário.

Na saída existem as seguintes colunas:
* k - Valor de K utilizado.
* holdout - Valor de holdout utilizado.
* distance_f - Função de distância utilizada.
* normalization - Indica se os atributos da entrada foram normalizados.
* training_data_size - Quantidade de exemplos utilizados para o treinamento do KNN.
* test_data_size - Quantidade de entradas utilizadas para o teste.
* accuracy - Acurácia (fração de acertos) da predição do algoritmo KNN com os parâmetros dados.
* input/training/test_strata_X - Indica a proporção (entre 0 e 1) do estrato X presente na entrada, nos dados de treino e nos dados de teste.