In [None]:
from sklearn import datasets
import plotly.express as px
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import Normalizer
import math

In [None]:
# Load the iris sample dataset that ships with Plotly Express.
df = px.data.iris()
# Bare last expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3
146,6.3,2.5,5.0,1.9,virginica,3
147,6.5,3.0,5.2,2.0,virginica,3
148,6.2,3.4,5.4,2.3,virginica,3


In [None]:
# Sepal width vs. sepal length, coloured by species, with a box plot on the
# x margin, a violin plot on the y margin and an OLS trendline overlay.
fig = px.scatter(df, x="sepal_width", y="sepal_length", color="species", marginal_y="violin",
           marginal_x="box", trendline="ols", template="simple_white")
fig.show()

In [None]:
# Pairwise scatter plots of the four raw measurements, coloured by species.
fig = px.scatter_matrix(df, dimensions=["sepal_width", "sepal_length", "petal_width", "petal_length"], color="species")
fig.show()

In [None]:
# Transform the four measurement columns; later cells address the result by
# column position (0-3), in the order listed here.
# NOTE(review): per the scikit-learn docs, Normalizer rescales each *sample*
# (row) to unit norm rather than scaling each feature column - confirm that
# row-wise normalisation (and not e.g. per-feature standardisation) is intended.
normalizer = Normalizer()
df_scaled = normalizer.fit_transform(df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

In [None]:
# Scatter matrix of the transformed features. Columns are referenced by
# position, and the colour comes from the numeric species_id column of df.
fig = px.scatter_matrix(df_scaled, dimensions=[0, 1, 2, 3], color=df['species_id'])
fig.show()

In [None]:
# Hold out 25% of the rows as a test set. Features are the transformed
# measurements, labels are the numeric species ids. random_state is fixed so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(df_scaled, 
                                                    df['species_id'].values, 
                                                    test_size=0.25, 
                                                    random_state=1)

In [None]:
class KNN:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    Usage: call ``fit(X_train, y_train, k)`` and then ``predict(X_test)``.
    Samples may be any indexable sequences of numbers (lists, tuples,
    NumPy rows); labels may be any hashable values.

    Attributes:
        X_train: training samples stored by ``fit`` (``None`` before fitting).
        y_train: training labels stored by ``fit`` (``None`` before fitting).
        k: number of neighbours that vote (``None`` before fitting).
        predictions: labels produced by the most recent ``predict`` call.
    """

    def __init__(self):
        self.X_train = None
        self.y_train = None
        self.k = None
        self.predictions = []

    @staticmethod
    def _dist(a, b):
        """Return the Euclidean distance between two equal-length points.

        Every coordinate contributes to the distance, not only the first two.

        Raises:
            ValueError: if ``a`` and ``b`` have different lengths.
        """
        if len(a) != len(b):
            raise ValueError(
                f"points must have the same number of features, got {len(a)} and {len(b)}"
            )
        return sum((x - y) ** 2 for x, y in zip(a, b)) ** 0.5

    @staticmethod
    def score(y_test, predictions):
        """Return the accuracy: the fraction of predictions equal to the true label.

        Args:
            y_test: true labels.
            predictions: predicted labels, in the same order as ``y_test``.

        Raises:
            ValueError: if the inputs differ in length or are empty.
        """
        total = len(y_test)
        if total != len(predictions):
            raise ValueError(
                f"y_test and predictions must have the same length, got {total} and {len(predictions)}"
            )
        if total == 0:
            raise ValueError("cannot compute accuracy on empty input")
        correct = sum(1 for truth, guess in zip(y_test, predictions) if truth == guess)
        return correct / total

    def fit(self, X_train, y_train, k):
        """Store the training data and the number of neighbours to consult.

        Args:
            X_train: training samples.
            y_train: one label per training sample.
            k: number of nearest neighbours that vote, with 1 <= k <= len(X_train).

        Raises:
            ValueError: if the sample and label counts differ or ``k`` is out of range.
        """
        n_samples = len(X_train)
        if n_samples != len(y_train):
            raise ValueError(
                f"X_train and y_train must have the same length, got {n_samples} and {len(y_train)}"
            )
        if k < 1 or k > n_samples:
            raise ValueError(f"k must be between 1 and {n_samples}, got {k}")
        self.X_train = X_train
        self.y_train = y_train
        self.k = k

    def predict(self, X_test):
        """Predict a label for every sample in ``X_test`` by majority vote.

        Only the data given to ``fit`` is used; nothing is read from
        module-level variables. Each call starts from an empty result list, so
        repeated calls do not accumulate earlier predictions.

        Returns:
            A list with one predicted label per test sample; the same list is
            also stored in ``self.predictions``.

        Raises:
            ValueError: if ``fit`` has not been called yet.
        """
        if self.X_train is None or self.y_train is None or self.k is None:
            raise ValueError("fit must be called before predict")

        predictions = []
        for sample in X_test:
            # Sort (distance, index) pairs so equal distances fall back to the
            # lower training index, which keeps the ordering deterministic.
            ranked = sorted(
                (self._dist(sample, train_row), idx)
                for idx, train_row in enumerate(self.X_train)
            )

            votes = {}
            for _, idx in ranked[:self.k]:
                label = self.y_train[idx]
                votes[label] = votes.get(label, 0) + 1

            # On a tied vote, max() keeps the label that was inserted first,
            # i.e. the one whose neighbour is closest.
            predictions.append(max(votes, key=votes.get))

        self.predictions = predictions
        return predictions

In [None]:
# Sweep k over 6..23 and record the accuracy for each value.
# Scores are computed on the test split, so choosing k from this list gives an
# optimistic estimate of how well the model generalises.
n = X_test.shape[0]
metricas = []
# Alternative sweep that was considered: k in range(1, ceil(sqrt(n))).
for i in range(6, 24):
    knn = KNN()
    knn.fit(X_train, y_train, i)
    pred = knn.predict(X_test)
    metricas.append(knn.score(y_test,pred))

# Display the list of accuracies, ordered by k.
metricas

In [None]:
# Fit one model with k set to the rounded square root of the test-set size.
# NOTE(review): the sqrt(n) rule of thumb is usually stated in terms of the
# number of *training* samples - confirm the test-set size is what is wanted.
n = X_test.shape[0]
knn = KNN()
knn.fit(X_train, y_train, round(math.sqrt(n)))
pred = knn.predict(X_test)
# Accuracy on the test split, shown as the cell output.
knn.score(y_test, pred)

0.9736842105263158

In [None]:
# Scatter matrix of the test samples coloured by predicted class. `pred` is
# whatever the most recently executed prediction cell left in the kernel.
fig = px.scatter_matrix(X_test, dimensions=[0, 1, 2, 3], color=pred)
fig.show()