In [1]:
from random import seed
from random import randrange
from csv import reader
from math import sqrt

In [2]:
# Load a CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows, each with its fields reversed.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of strings. Empty rows are skipped and
        every kept row is reversed, so the file's first column ends up last.
        NOTE(review): presumably the files store the class label in the first
        column and the reversal moves it to ``row[-1]`` -- confirm against
        the data files.
    """
    # newline='' hands raw line endings to the csv module, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # can be altered by the text layer before the parser sees them.
    with open(filename, 'r', newline='') as file:
        return [row[::-1] for row in reader(file) if row]

In [3]:
# Convert a string column to float
def str_column_to_float(dataset, column):
    """Convert one column of every row from string to float, in place.

    Args:
        dataset: List of mutable rows; each ``row[column]`` must be a string.
        column: Index of the column to convert.

    Surrounding whitespace is stripped before conversion. Nothing is
    returned; the rows themselves are modified.
    """
    for record in dataset:
        raw_text = record[column]
        record[column] = float(raw_text.strip())

In [4]:
# Convert a string column to integer codes
def str_column_to_int(dataset, testData, column):
    """Replace the values of one column by integer codes, in place.

    The mapping is built from ``dataset`` only and then applied to both
    ``dataset`` and ``testData``.

    Args:
        dataset: List of mutable rows used to build the value -> code mapping.
        testData: List of mutable rows encoded with the same mapping.
        column: Index of the column to encode.

    Returns:
        The dict mapping each distinct original value to its integer code.

    Raises:
        KeyError: If ``testData`` contains a value that never appears in
            ``dataset``. The check runs before any row is modified, so a
            failure leaves both datasets untouched.
    """
    # Codes follow first-appearance order in `dataset`. Enumerating a set of
    # strings would make the codes change between interpreter runs, because
    # string hashing is randomized per process.
    lookup = dict()
    for row in dataset:
        lookup.setdefault(row[column], len(lookup))

    # Validate up front so an unknown label cannot leave the data half-encoded.
    unseen = {row[column] for row in testData} - lookup.keys()
    if unseen:
        raise KeyError(
            'test data contains values absent from the training data: '
            + ', '.join(sorted(repr(value) for value in unseen))
        )

    for row in dataset:
        row[column] = lookup[row[column]]
    for row in testData:
        row[column] = lookup[row[column]]
    return lookup

In [5]:
# Compute the accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where the two sequences agree.

    Args:
        actual: Sequence of expected values; its length sets how many
            positions are compared.
        predicted: Sequence of predicted values, indexed at the same
            positions as ``actual``.

    Returns:
        A float: matching positions divided by ``len(actual)``, times 100.
    """
    total = len(actual)
    hits = sum(1 for pos in range(total) if actual[pos] == predicted[pos])
    return hits / float(total) * 100.0

In [7]:
# Evaluate an algorithm on a given train/test split
def evaluate_algorithm(train_set, test_set, algorithm, *args):
    """Run ``algorithm`` on a train/test split and return its accuracy.

    Args:
        train_set: Training rows passed to ``algorithm``.
        test_set: Test rows passed to ``algorithm``; the last element of
            each row is taken as the expected value.
        algorithm: Callable invoked as ``algorithm(train_set, test_set, *args)``
            that returns one prediction per test row.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        The accuracy percentage computed by ``accuracy_metric``.
    """
    predicted = algorithm(train_set, test_set, *args)
    actual = [row[-1] for row in test_set]
    return accuracy_metric(actual, predicted)

In [8]:
# Compute the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows, ignoring row1's last slot.

    Args:
        row1: First row; only indices ``0 .. len(row1) - 2`` are used.
        row2: Second row, read at the same indices as ``row1``.

    Returns:
        The square root of the summed squared differences, as a float.
    """
    squared_sum = 0.0
    feature_count = len(row1) - 1  # the final position is left out
    for idx in range(feature_count):
        delta = row1[idx] - row2[idx]
        squared_sum += delta ** 2
    return sqrt(squared_sum)

In [10]:
# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` training rows closest to ``test_row``.

    Args:
        train: Iterable of training rows.
        test_row: Row whose neighbors are wanted.
        num_neighbors: Number of rows to return.

    Returns:
        A list of training rows ordered by increasing ``euclidean_distance``
        to ``test_row``.

    Raises:
        IndexError: If ``num_neighbors`` exceeds the number of training rows.
    """
    scored = [
        (candidate, euclidean_distance(test_row, candidate))
        for candidate in train
    ]
    # The sort is stable, so equally distant rows keep their training order.
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[rank][0] for rank in range(num_neighbors)]

In [11]:
# Make a prediction from the neighborhood
def predict_classification(train, test_row, num_neighbors):
    """Predict the class of ``test_row`` by majority vote among its neighbors.

    Args:
        train: Training rows; the last element of each row is its class.
        test_row: Row to classify.
        num_neighbors: Number of neighbors taking part in the vote.

    Returns:
        The class value that occurs most often among the neighbors' last
        elements.
    """
    nearest = get_neighbors(train, test_row, num_neighbors)
    votes = [neighbor[-1] for neighbor in nearest]
    candidates = set(votes)
    # Pick the candidate with the highest vote count.
    return max(candidates, key=votes.count)

In [12]:
# kNN algorithm
def k_nearest_neighbors(train, test, num_neighbors):
    """Classify every row of ``test`` with k-nearest neighbors.

    Args:
        train: Training rows.
        test: Rows to classify.
        num_neighbors: Number of neighbors used for each prediction.

    Returns:
        A list with one predicted class per row of ``test``, in order.
    """
    return [
        predict_classification(train, sample, num_neighbors)
        for sample in test
    ]

In [13]:
# Input files
trainfilename = 'training.csv'  # training file name
testfilename = 'testing.csv'    # test file name

# Load the training rows, dropping the first row of the file (presumably a
# header -- confirm), then convert every column except the last to float.
train_set = load_csv(trainfilename)[1:]
for i in range(len(train_set[0]) - 1):
    str_column_to_float(train_set, i)

# Same preparation for the test rows.
test_set = load_csv(testfilename)[1:]
for i in range(len(test_set[0]) - 1):
    str_column_to_float(test_set, i)

# Encode the last column of both sets as integers; the returned mapping is
# the cell's displayed value.
str_column_to_int(train_set, test_set, len(train_set[0]) - 1)


{'water': 0, 'impervious': 1, 'grass': 2, 'orchard': 3, 'forest': 4, 'farm': 5}

In [14]:
# Evaluate our algorithm on the prepared train/test split
num_neighbors = 9  # number of neighbors used for each prediction
accuracy = evaluate_algorithm(
    train_set, test_set, k_nearest_neighbors, num_neighbors
)
print('Accuracy: %.3f%%' % accuracy)

Accuracy: 63.667%
