# 0. Dependências

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# database
from sklearn.datasets import load_iris

# 1. Introdução 

# 2. Dados

In [2]:
iris = load_iris()

df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['class'] = iris.target
df['class'] = df['class'].map({0:iris.target_names[0], 1:iris.target_names[1], 2:iris.target_names[2]})
df.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [3]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [4]:
x = iris.data
y = iris.target.reshape(-1, 1)

print(x.shape, y.shape)

(150, 4) (150, 1)


In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(105, 4) (105, 1)
(45, 4) (45, 1)


# 3. Implementação

### Métricas de Distância

In [6]:
def l1_distance(a, b):
    return np.sum(np.abs(a - b), axis=1)

def l2_distance(a, b):
    return np.sqrt(np.sum((a - b)**2, axis=1))

### Classificador

In [7]:
class kNearestNeighbor(object):
    def __init__(self, n_neighbors=1, dist_func=l1_distance):
        self.n_neighbors = n_neighbors
        self.dist_func = dist_func

    def fit(self, x, y):
        self.x_train = x
        self.y_train = y

    def predict(self, x):
        y_pred = np.zeros((x.shape[0], 1), dtype=self.y_train.dtype)

        for i, x_test in enumerate(x):
            distances = self.dist_func(self.x_train, x_test)
            nn_index = np.argsort(distances)
            nn_pred = self.y_train[nn_index[:self.n_neighbors]].ravel()
            y_pred[i] = np.argmax(np.bincount(nn_pred))

        return y_pred

## 4. Teste

In [8]:
knn = kNearestNeighbor(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)

print('Acurácia: {:.2f}%'.format(accuracy_score(y_test, y_pred)*100))

Acurácia: 93.33%


In [9]:
knn = kNearestNeighbor()
knn.fit(x_train, y_train)

list_res = []
for p in [1, 2]:
    knn.dist_func = l1_distance if p == 1 else l2_distance   
    
    for k in range(1, 10, 2):
        knn.n_neighbors = k
        y_pred = knn.predict(x_test)
        acc = accuracy_score(y_test, y_pred)*100
        list_res.append([k, 'l1_distance' if p == 1 else 'l2_distance', acc])
        
df = pd.DataFrame(list_res, columns=['k', 'dist. func.', 'acurácia'])
df

Unnamed: 0,k,dist. func.,acurácia
0,1,l1_distance,91.111111
1,3,l1_distance,93.333333
2,5,l1_distance,93.333333
3,7,l1_distance,93.333333
4,9,l1_distance,93.333333
5,1,l2_distance,93.333333
6,3,l2_distance,95.555556
7,5,l2_distance,97.777778
8,7,l2_distance,95.555556
9,9,l2_distance,95.555556


### Comparação com o Scikit-learn

In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, p=2)
knn.fit(x_train, y_train.ravel())

list_res = []
for p in [1, 2]:
    knn.p = p
    
    for k in range(1, 10, 2):
        knn.n_neighbors = k
        y_pred = knn.predict(x_test)
        acc = accuracy_score(y_test, y_pred)*100
        list_res.append([k, 'l1_distance' if p == 1 else 'l2_distance', acc])
        
df = pd.DataFrame(list_res, columns=['k', 'dist. func.', 'acurácia'])
df

Unnamed: 0,k,dist. func.,acurácia
0,1,l1_distance,93.333333
1,3,l1_distance,95.555556
2,5,l1_distance,97.777778
3,7,l1_distance,95.555556
4,9,l1_distance,95.555556
5,1,l2_distance,93.333333
6,3,l2_distance,95.555556
7,5,l2_distance,97.777778
8,7,l2_distance,95.555556
9,9,l2_distance,95.555556
