# Chapter 13 - K-Nearest Neighbors

In [11]:
from math import sqrt
from random import seed

from Codes.ch01_load_and_convert_data import load_csv, str_column_to_float, str_column_to_int
from Codes.ch02_scale_data_functions import dataset_minmax, normalize_dataset
from Codes.ch06_algorithm_test_harnesses import evaluate_algorithm_kfold, evaluate_algorithm_kfold_reg

### Euclidean Distance

In [2]:
# Calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows, ignoring the last column.

    Only indices ``0 .. len(row1) - 2`` are compared; the final element of
    ``row1`` (and the element at the same position in ``row2``) is skipped.
    In this notebook that last position holds the row's output value.

    Args:
        row1: First row; its length decides how many columns are compared.
        row2: Second row; needs at least ``len(row1) - 1`` elements.

    Returns:
        float: square root of the summed squared per-column differences.
    """
    total = 0.0
    for col in range(len(row1) - 1):
        delta = row1[col] - row2[col]
        total += delta ** 2
    return sqrt(total)

In [3]:
# Exercise euclidean_distance on a small toy dataset.
# Each row is [x1, x2, label]; the label column is not part of the distance.
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]

# Print the distance from the first row to every row (itself included,
# which is why the first printed value is 0.0).
row0 = dataset[0]
for row in dataset:
    distance = euclidean_distance(row0, row)
    print(distance)

0.0
1.3290173915275787
1.9494646655653247
1.5591439385540549
0.5356280721938492
4.850940186986411
2.592833759950511
4.214227042632867
6.522409988228337
4.985585382449795


### Get Neighbors

In [4]:
# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    """Return the rows of ``train`` that are closest to ``test_row``.

    Rows are ranked by ``euclidean_distance(test_row, train_row)`` in
    ascending order. The sort is stable, so rows at the same distance keep
    the relative order they have in ``train``.

    Args:
        train: Iterable of training rows.
        test_row: Row to find neighbors for.
        num_neighbors: Maximum number of rows to return. If it exceeds the
            number of training rows, every training row is returned
            (previously this raised IndexError). Zero or a negative value
            yields an empty list.

    Returns:
        list: up to ``num_neighbors`` rows from ``train``, nearest first.
    """
    ranked = sorted(
        train,
        key=lambda train_row: euclidean_distance(test_row, train_row),
    )
    # Slice rather than index 0..k-1 so an oversized k is clamped to the
    # rows that exist; max() keeps a negative k meaning "no neighbors"
    # instead of Python's count-from-the-end slice semantics.
    return ranked[:max(num_neighbors, 0)]

In [5]:
# Exercise get_neighbors on the toy dataset.
# Each row is [x1, x2, label].
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]

# The query row is itself in the training data, so it comes back as its
# own nearest neighbor (distance 0).
neighbors = get_neighbors(dataset, dataset[0], 3)
for neighbor in neighbors:
    print(neighbor)

[2.7810836, 2.550537003, 0]
[3.06407232, 3.005305973, 0]
[1.465489372, 2.362125076, 0]


### Make Predictions

#### Classification

In [6]:
# Make a classification prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
    """Predict a class label by majority vote among the nearest neighbors.

    The neighbors come from ``get_neighbors``; each neighbor votes with its
    last element. The label with the most votes is returned. When several
    labels share the top count, the winner is whichever of them ``max``
    meets first while iterating the set of distinct labels.

    Args:
        train: Training rows whose last element is the class label.
        test_row: Row to classify.
        num_neighbors: Number of neighbors requested from ``get_neighbors``.

    Returns:
        The most frequent last-column value among the neighbors.
    """
    nearest = get_neighbors(train, test_row, num_neighbors)
    labels = []
    for neighbor in nearest:
        labels.append(neighbor[-1])
    distinct_labels = set(labels)
    return max(distinct_labels, key=labels.count)

In [7]:
# Exercise predict_classification on the toy dataset.
# Each row is [x1, x2, label].
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]

# Classify the first row with k=3 and compare against its stored label.
prediction = predict_classification(dataset, dataset[0], 3)
print('Expected %d, Got %d.' % (dataset[0][-1], prediction))

Expected 0, Got 0.


In [8]:
# Make a regression prediction with neighbors
def predict_regression(train, test_row, num_neighbors):
    """Predict a numeric value as the mean of the nearest neighbors' outputs.

    The neighbors come from ``get_neighbors``; the last element of each
    neighbor is taken as its output value and the values are averaged.

    Args:
        train: Training rows whose last element is the numeric output.
        test_row: Row to predict for.
        num_neighbors: Number of neighbors requested from ``get_neighbors``.

    Returns:
        float: arithmetic mean of the neighbors' last-column values.

    Raises:
        ZeroDivisionError: if ``get_neighbors`` returns no rows.
    """
    nearest = get_neighbors(train, test_row, num_neighbors)
    targets = []
    for neighbor in nearest:
        targets.append(neighbor[-1])
    neighbor_count = float(len(targets))
    return sum(targets) / neighbor_count

### Abalone Case Study as Classification

In [10]:
# KNN algorithm (classification variant)
def k_nearest_neighbors(train, test, num_neighbors):
    """Classify every row in ``test`` with kNN fitted on ``train``.

    Args:
        train: Training rows passed through to ``predict_classification``.
        test: Iterable of rows to classify.
        num_neighbors: Number of neighbors used for each prediction.

    Returns:
        list: one ``predict_classification`` result per test row, in the
        order the rows appear in ``test``.
    """
    return [
        predict_classification(train, test_row, num_neighbors)
        for test_row in test
    ]

In [12]:
# Evaluate kNN as a classifier on the Abalone dataset
seed(1)  # seed Python's random module before the evaluation runs

# Load the CSV and convert every column from strings to numbers
filename = './data/abalone.csv'
dataset = load_csv(filename)
column_count = len(dataset[0])
for column in range(1, column_count):
    # columns 1..end are converted to floats
    str_column_to_float(dataset, column)
# column 0 is converted to integers
str_column_to_int(dataset, 0)

# Run the k-fold harness with the kNN classifier and report the scores
n_folds = 5
num_neighbors = 5
scores = evaluate_algorithm_kfold(dataset, k_nearest_neighbors, n_folds, num_neighbors)
mean_score = sum(scores) / float(len(scores))
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % mean_score)

Scores: [24.790419161676645, 21.79640718562874, 23.592814371257482, 21.676646706586826, 23.353293413173652]
Mean Accuracy: 23.042%


### Abalone Case Study as Regression

In [13]:
# KNN algorithm (regression variant)
def k_nearest_neighbors(train, test, num_neighbors):
    """Predict a numeric value for every row in ``test`` with kNN regression.

    NOTE: this reuses the name of the classification variant defined earlier
    in the notebook, so once this cell runs, later calls to
    ``k_nearest_neighbors`` use ``predict_regression``.

    Args:
        train: Training rows passed through to ``predict_regression``.
        test: Iterable of rows to predict for.
        num_neighbors: Number of neighbors used for each prediction.

    Returns:
        list: one ``predict_regression`` result per test row, in the order
        the rows appear in ``test``.
    """
    return [
        predict_regression(train, test_row, num_neighbors)
        for test_row in test
    ]

In [14]:
# Evaluate kNN as a regressor on the Abalone dataset
seed(1)  # seed Python's random module before the evaluation runs

# Load the CSV and convert every column from strings to numbers
filename = './data/abalone.csv'
dataset = load_csv(filename)
column_count = len(dataset[0])
for column in range(1, column_count):
    # columns 1..end are converted to floats
    str_column_to_float(dataset, column)
# column 0 is converted to integers
str_column_to_int(dataset, 0)

# Run the regression k-fold harness with the kNN regressor and report the scores
n_folds = 5
num_neighbors = 5
scores = evaluate_algorithm_kfold_reg(dataset, k_nearest_neighbors, n_folds, num_neighbors)
mean_score = sum(scores) / float(len(scores))
print('Scores: %s' % scores)
# NOTE(review): the label below says "Accuracy" with a percent sign, but the
# scores come from the regression harness and are presumably an error metric
# (e.g. RMSE) rather than a percentage - confirm against
# evaluate_algorithm_kfold_reg and relabel if so.
print('Mean Accuracy: %.3f%%' % mean_score)

Scores: [2.170383116929243, 2.2087035241256405, 2.2321118594939215, 2.4013070293283603, 2.2274928845898017]
Mean Accuracy: 2.248%


## Future Work

* Tune KNN. Try larger and larger k values to see if you can improve the performance of the algorithm on the Abalone dataset.
* Regression for Classification. Combine the approach used to make predictions for regression problems (take the mean) with the classification approach to making predictions (return an integer) and see if you can improve results.
* More Distance Measures. Implement other distance measures that you can use to find similar historical data, such as Hamming distance, Manhattan distance and Minkowski distance.
* Data Preparation. Distance measures are strongly affected by the scale of the input data. Experiment with normalization and standardization data preparation methods in order to improve results.
* More Problems. As always, experiment with the technique on more and different classification and regression problems.