In [1]:
import numpy as np
from math import sqrt
from sklearn import datasets, metrics
from sklearn.neighbors import KNeighborsClassifier


Definimos una función que calcula la distancia euclidiana entre dos vectores. Con ella medimos la distancia entre las observaciones para encontrar los k vecinos más cercanos. La última posición de cada fila es la etiqueta de clase y no interviene en el cálculo.

In [2]:
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows, ignoring the last column.

    Only the first ``len(row1) - 1`` positions are compared; the final position
    of each row is skipped (in this notebook it holds the class label).

    Args:
        row1: Indexable sequence of numbers; its length decides how many
            positions are compared.
        row2: Indexable sequence of numbers with at least ``len(row1) - 1``
            positions.

    Returns:
        The square root of the sum of squared differences, as a float.
    """
    feature_count = len(row1) - 1
    squared_total = 0.0
    position = 0
    # Accumulate left to right so the floating-point result is reproducible.
    while position < feature_count:
        squared_total += (row1[position] - row2[position])**2
        position += 1
    return sqrt(squared_total)

In [3]:
# Toy dataset: every row is [feature_1, feature_2, class_label].
data = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]

# First record; the next cell measures every row's distance to it.
row0 = data[0]

Calculamos la distancia entre el primer registro y los demás

In [4]:
# Print the distance from the first record to every record in `data`.
# row0 is itself part of `data`, so the first value printed is 0.0.
for row in data:
    print(euclidean_distance(row0,row))

0.0
1.3290173915275787
1.9494646655653247
1.5591439385540549
0.5356280721938492
4.850940186986411
2.592833759950511
4.214227042632867
6.522409988228337
4.985585382449795


Implementamos la función que nos permite obtener los vecinos más cercanos

In [5]:
def get_neighbors(train, test_row, num_neighbors):
    """Return the rows of ``train`` that are closest to ``test_row``.

    Distances are computed with ``euclidean_distance``. The sort is stable, so
    rows at the same distance keep their relative order from ``train``.

    Args:
        train: Iterable of rows to search.
        test_row: Row whose nearest neighbours are wanted.
        num_neighbors: Maximum number of rows to return. A value larger than
            the number of training rows yields every row (nearest first)
            instead of raising ``IndexError``; zero or a negative value yields
            an empty list.

    Returns:
        A list with up to ``num_neighbors`` rows of ``train``, nearest first.
    """
    scored = [(train_row, euclidean_distance(test_row, train_row))
              for train_row in train]
    scored.sort(key=lambda pair: pair[1])
    # Clamp the request to [0, len(scored)]: asking for more neighbours than
    # there are rows must not index past the end, and a negative request must
    # stay empty rather than being read as a "from the end" slice bound.
    count = max(0, min(num_neighbors, len(scored)))
    return [train_row for train_row, _ in scored[:count]]

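Probamos nuestro algoritmo con el dataset data previo. En este caso utilizamos los 4 vecinos más cercanos; como la fila consultada pertenece al propio conjunto, aparece como su primer vecino (distancia 0).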

In [6]:
# Retrieve the 4 rows of `data` closest to its first row and print them.
neighbors = get_neighbors(data, data[0], 4)
for neighbor in neighbors:
    print(neighbor)

[2.7810836, 2.550537003, 0]
[3.06407232, 3.005305973, 0]
[1.465489372, 2.362125076, 0]
[1.38807019, 1.850220317, 0]


Vamos a implementar la función para hacer una predicción de clasificación utilizando los k vecinos más cercanos

In [None]:
def predict_classification(train, test_row, num_neighbors):
    """Predict the class of ``test_row`` by majority vote of its nearest neighbours.

    The label of a row is its last element. When several labels share the
    highest vote count, the one belonging to the nearest neighbour wins, so the
    prediction does not depend on the iteration order of a set.

    Args:
        train: Iterable of labelled rows (label in the last position).
        test_row: Row to classify.
        num_neighbors: Number of neighbours that take part in the vote.

    Returns:
        The winning label, taken from one of the neighbouring rows.

    Raises:
        ValueError: If there are no neighbours to vote (for example when
            ``num_neighbors`` is 0).
    """
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    if not output_values:
        raise ValueError("cannot predict a class without at least one neighbour")
    # get_neighbors returns rows nearest-first and max() keeps the first of
    # several maximal items, so iterating the list (not a set) breaks ties in
    # favour of the closest neighbour.
    return max(output_values, key=output_values.count)

Implementamos el K-nn a partir de un conjunto de entrenamiento y uno de prueba

In [None]:
def k_nearest_neighbors(train, test, num_neighbors):
    """Classify every row of ``test`` using k-NN over ``train``.

    Args:
        train: Iterable of labelled rows used as the reference set.
        test: Iterable of rows to classify.
        num_neighbors: Number of neighbours consulted for each prediction.

    Returns:
        A list with one predicted label per row of ``test``, in the same order.
    """
    return [predict_classification(train, row, num_neighbors) for row in test]

Hacemos la predicción con el dataset data

In [None]:
# Classify every row of the toy dataset against the dataset itself (k=3) and
# print each stored label next to the predicted one.
predictions = k_nearest_neighbors(data, data, 3)
for i, pred in enumerate(predictions):
    print('Expected %d, Got %d.' % (data[i][-1], pred))

Ahora vamos a utilizar el conjunto de datos de Iris para probar nuestra implementación del knn

In [None]:
iris = datasets.load_iris() # load the iris dataset from the datasets library

In [None]:
# Split the loaded iris object into its feature data (x) and its targets (y).
x = iris.data
y = iris.target

# Show the shape of the label array before it is reshaped in the next cell.
print(y.shape)

In [None]:
# Turn the labels into a single column so they can be appended to the feature
# matrix. reshape(-1, 1) is used instead of np.expand_dims so that re-running
# this cell leaves an already-columnar `y` unchanged, rather than adding one
# more axis each time and breaking the concatenation below.
y = np.reshape(y, (-1, 1))
# Each row of `dataset` is [features..., label]: the label sits in the last
# position, which is where the hand-written k-NN functions read it from.
dataset = np.concatenate((x, y), axis=1)


In [None]:
# Run the hand-written k-NN (k=3) on the iris rows, using the same rows as
# both reference set and queries, and print stored vs. predicted labels.
predictions = k_nearest_neighbors(dataset, dataset, 3)
for i, pred in enumerate(predictions):
    print('Expected %d, Got %d.' % (dataset[i][-1], pred))

In [None]:
# Reference model: scikit-learn's k-NN classifier with the same k=3 used above.
knn = KNeighborsClassifier(n_neighbors=3) 
# ravel() flattens y back to 1-D; an earlier cell reshaped it into a column.
knn.fit(x,y.ravel())

In [None]:
# Predict with the scikit-learn model on the same feature rows and print the
# stored label (last column of `dataset`) next to each prediction.
y_pred = knn.predict(x)
for i, pred in enumerate(y_pred):
    print('Expected %d, Got %d.' % (dataset[i][-1], pred))