In [25]:
import numpy as np
from typing import Tuple
from scipy.stats import mode

from sklearn.metrics import confusion_matrix
import seaborn as sns

# Path of the CSV file that load_csv() reads further down in the notebook.
csv_path = 'iris.csv'

In [18]:
#pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)
[K     |████████████████████████████████| 293 kB 1.4 MB/s eta 0:00:01
Installing collected packages: seaborn
Successfully installed seaborn-0.12.2
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [26]:
def load_csv(csv_path: str) -> Tuple[np.ndarray, np.ndarray]:
    """Load a comma-separated numeric file and split it into features and labels.

    The rows are shuffled with a fixed seed (42), so every call returns the
    same row order. The shuffle uses a private generator, which leaves the
    global NumPy random state untouched for the rest of the notebook.

    Args:
        csv_path: Path to the CSV file. Every column except the last is
            treated as a feature; the last column is the label.

    Returns:
        A tuple ``(x, y)`` where ``x`` holds all columns but the last and
        ``y`` holds the last column, both in the shuffled row order.
    """
    dataset = np.genfromtxt(csv_path, delimiter=',')
    # RandomState(42) produces the same permutation as np.random.seed(42)
    # followed by np.random.shuffle, but without reseeding the global RNG.
    np.random.RandomState(42).shuffle(dataset)
    x, y = dataset[:, :-1], dataset[:, -1]
    return x, y

In [27]:
# Read the dataset into a feature matrix x and a label vector y.
x,y = load_csv(csv_path)

# Sanity check: both arrays should have the same number of rows.
x.shape, y.shape

((150, 4), (150,))

In [6]:
# Per-column mean and variance; a single NaN in a column turns its result into NaN.
np.mean(x, axis=0),np.var(x, axis=0)

(array([nan, nan, nan, nan]), array([nan, nan, nan, nan]))

In [7]:
# NaN-aware per-column statistics (NaN entries are ignored). Unusually large
# magnitudes here would point to outliers that still need to be removed.
np.nanmean(x, axis=0),np.nanvar(x, axis=0)

(array([ 355.46503497, -280.09189189,    2.95      ,   21.74726027]),
 array([1.73561968e+07, 1.18405444e+07, 1.51049922e+04, 6.11729208e+04]))

In [28]:
# Replace every missing (NaN) feature value in x, in place, with the constant 3.5.
# NOTE(review): 3.5 is a hard-coded fill value applied to all columns alike;
# confirm it is a sensible substitute for each feature.
x[np.isnan(x)] = 3.5

In [29]:
# Drop every sample that has at least one feature value outside [0, 10].
# A single boolean row mask is computed once from x and applied to both x and
# y, so features and labels stay aligned. (Deleting from y in two passes with
# row indices taken from the not-yet-filtered x removes the wrong labels on
# the second pass, because y has already been shortened by then.)
outlier_rows = ((x < 0.0) | (x > 10.0)).any(axis=1)
x, y = x[~outlier_rows], y[~outlier_rows]

x.shape,y.shape

((144, 4), (144,))

In [30]:
def train_test_split(features: np.ndarray,
                     labels: np.ndarray,
                     test_plit_ratio: float) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Split features and labels into a leading train part and a trailing test part.

    No shuffling happens here; the last ``int(len(features) * test_plit_ratio)``
    samples become the test set and everything before them the training set.

    Args:
        features: Samples, indexed along the first axis.
        labels: One label per sample, in the same order as ``features``.
        test_plit_ratio: Fraction of the samples to hold out for testing,
            between 0 and 1 inclusive. (The parameter name is kept as-is so
            existing keyword callers keep working.)

    Returns:
        ``(x_train, y_train, x_test, y_test)``.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length, or if
            ``test_plit_ratio`` lies outside ``[0, 1]``.
    """
    if len(features) != len(labels):
        raise ValueError(
            f"features and labels must have the same length, "
            f"got {len(features)} and {len(labels)}"
        )
    # A ratio outside [0, 1] would produce a negative or oversized slice bound
    # and silently return overlapping or truncated splits.
    if not 0.0 <= test_plit_ratio <= 1.0:
        raise ValueError(f"test_plit_ratio must be within [0, 1], got {test_plit_ratio}")

    test_size = int(len(features) * test_plit_ratio)
    train_size = len(features) - test_size

    x_train, y_train = features[:train_size], labels[:train_size]
    x_test, y_test = features[train_size:], labels[train_size:]

    return x_train, y_train, x_test, y_test

In [31]:
# Hold out the trailing 20% of the samples as the test set; the rest is used for training.
x_train,y_train,x_test,y_test = train_test_split(x, y, 0.2)

In [32]:
def euclidean(points:np.ndarray, element_of_x:np.ndarray) -> np.ndarray:
    """Return the Euclidean distance from each row of ``points`` to ``element_of_x``.

    Args:
        points: 2-D array with one point per row.
        element_of_x: A single point, broadcast against every row of ``points``.

    Returns:
        1-D array holding one distance per row of ``points``.
    """
    offsets = points - element_of_x
    squared_lengths = np.sum(np.square(offsets), axis=1)
    return np.sqrt(squared_lengths)

In [36]:
# Distances from the first test sample to every training sample.
euclidean(x_train, x_test[0])
#x_test[0]

array([3.89615195, 1.96214169, 6.75943785, 3.78021163, 1.43874946,
       2.80356915, 4.98397432, 3.81837662, 3.06920185, 4.70531614,
       0.79372539, 2.7       , 0.93808315, 4.18210473, 5.3244718 ,
       2.94108823, 4.66154481, 5.21056619, 0.97467943, 4.27317212,
       1.2489996 , 5.05865595, 6.53528882, 4.95378643, 5.23259018,
       0.91104336, 1.34164079, 2.42899156, 4.03732585, 1.51986842,
       0.91104336, 4.41927596, 4.01248053, 1.40712473, 1.4       ,
       1.95192213, 4.34165867, 3.92300905, 4.29883705, 1.83847763,
       1.67928556, 2.69443872, 4.40113622, 4.9132474 , 3.96484552,
       6.07124369, 3.34813381, 6.43661402, 3.62904946, 1.34907376,
       6.52226954, 3.19061123, 1.3114877 , 1.3       , 1.72626765,
       2.38117618, 4.31393092, 1.2922848 , 1.4832397 , 2.68514432,
       1.27279221, 3.33916157, 6.72755528, 0.91651514, 3.64005494,
       4.24970587, 2.15174348, 5.02294734, 4.34165867, 3.1685959 ,
       3.60277671, 4.79791621, 3.04795013, 3.71079506, 2.17715

In [14]:
def predict(x_train: np.ndarray,
            y_train:np.ndarray,
            x_test:np.ndarray,
            k:int) -> np.ndarray:
    """Classify each test sample by majority vote among its k nearest training samples.

    Args:
        x_train: Training samples, one per row.
        y_train: Label of each training sample.
        x_test: Samples to classify, one per row.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        Integer array with one predicted label per row of ``x_test``.
    """
    def vote(sample: np.ndarray):
        # Pair every training distance with its label and order the pairs;
        # tuples compare by distance first and by label on equal distances.
        ranked = np.array(sorted(zip(euclidean(x_train, sample), y_train)))
        # Column 1 carries the labels; the most frequent of the first k wins.
        return mode(ranked[:k, 1], keepdims=False).mode

    return np.array([vote(sample) for sample in x_test], dtype=np.int64)

In [22]:
# Predict a label for every test sample using its 3 nearest training samples.
y_preds = predict(x_train, y_train, x_test, 3)


In [38]:
def accuracy(y_test:np.ndarray, y_preds:np.ndarray) -> float:
    """Return the percentage of predictions that equal the true labels.

    Args:
        y_test: True labels (array or any array-like).
        y_preds: Predicted labels with the same shape as ``y_test``.

    Returns:
        Share of matching entries, as a percentage between 0 and 100.

    Raises:
        ValueError: If the two inputs differ in shape or are empty.
    """
    y_true = np.asarray(y_test)
    y_hat = np.asarray(y_preds)
    # Without this check, shapes such as (n, 1) vs (n,) broadcast to (n, n)
    # and the result can exceed 100.
    if y_true.shape != y_hat.shape:
        raise ValueError(
            f"y_test and y_preds must have the same shape, "
            f"got {y_true.shape} and {y_hat.shape}"
        )
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")

    n_correct = np.count_nonzero(y_true == y_hat)
    return n_correct / y_true.size * 100

# Percentage of test samples whose predicted label matches the true label.
accuracy(y_test, y_preds)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True])

In [None]:
def plot_confusion_matrix(y_test:np.ndarray, y_preds:np.ndarray):
    """Draw the confusion matrix of the predictions as an annotated heatmap.

    Args:
        y_test: True labels.
        y_preds: Predicted labels, in the same order as ``y_test``.

    Returns:
        The matplotlib Axes that seaborn drew the heatmap on.
    """
    conf_matrix = confusion_matrix(y_test,y_preds)
    # fmt="d" prints the integer counts verbatim; the default format falls
    # back to scientific notation once a cell reaches three digits.
    ax = sns.heatmap(conf_matrix, annot=True, fmt="d")
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix")
    return ax

