In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [75]:
df = pd.read_csv("./iris_bin.csv")
df

Unnamed: 0,Id,sepal_length,sepal_width,species
0,1,5.1,3.5,Iris-setosa
1,2,4.9,3.0,Iris-setosa
2,3,4.7,3.2,Iris-setosa
3,4,4.6,3.1,Iris-setosa
4,5,5.0,3.6,Iris-setosa
...,...,...,...,...
95,96,5.7,3.0,Iris-versicolor
96,97,5.7,2.9,Iris-versicolor
97,98,6.2,2.9,Iris-versicolor
98,99,5.1,2.5,Iris-versicolor


In [76]:
df = df.sample(frac=1)
df

Unnamed: 0,Id,sepal_length,sepal_width,species
5,6,5.4,3.9,Iris-setosa
96,97,5.7,2.9,Iris-versicolor
55,56,5.7,2.8,Iris-versicolor
18,19,5.7,3.8,Iris-setosa
73,74,6.1,2.8,Iris-versicolor
...,...,...,...,...
81,82,5.5,2.4,Iris-versicolor
68,69,6.2,2.2,Iris-versicolor
58,59,6.6,2.9,Iris-versicolor
82,83,5.8,2.7,Iris-versicolor


In [77]:
df.columns

Index(['Id', 'sepal_length', 'sepal_width', 'species'], dtype='object')

In [79]:
df = df[["sepal_length", "sepal_width", "species"]]
df

Unnamed: 0,sepal_length,sepal_width,species
5,5.4,3.9,Iris-setosa
96,5.7,2.9,Iris-versicolor
55,5.7,2.8,Iris-versicolor
18,5.7,3.8,Iris-setosa
73,6.1,2.8,Iris-versicolor
...,...,...,...
81,5.5,2.4,Iris-versicolor
68,6.2,2.2,Iris-versicolor
58,6.6,2.9,Iris-versicolor
82,5.8,2.7,Iris-versicolor


In [80]:
X = df[["sepal_length", "sepal_width"]]
y = df["species"]

In [81]:
y.unique()

array(['Iris-setosa', 'Iris-versicolor'], dtype=object)

In [82]:
y = y.map({"Iris-setosa" : 0, "Iris-versicolor" : 1})
y

5     0
96    1
55    1
18    0
73    1
     ..
81    1
68    1
58    1
82    1
70    1
Name: species, Length: 100, dtype: int64

In [84]:
# Split
print(len(X) * 0.8)
X_train, X_test = X[:80], X[80:]
y_train, y_test = y[:80], y[80:]

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

80.0
(80, 2) (20, 2)
(80,) (20,)


In [86]:
# Normalization

x_idx = ["sepal_length", "sepal_width"]
u_list = []
std_list = []

for idx in x_idx:
    u = X_train[idx].mean()
    std = X_train[idx].std()

    u_list.append(u)
    std_list.append(std)

    X_train[idx] = (X_train[idx] - u) / std
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[idx] = (X_train[idx] - u) / std


In [87]:
print(u_list)
print(std_list)
X_train.head()

[5.435, 3.1475]
[0.652376376571731, 0.481814870692316]


Unnamed: 0,sepal_length,sepal_width
5,-0.05365,1.561803
96,0.406207,-0.513683
55,0.406207,-0.721231
18,0.406207,1.354255
73,1.01935,-0.721231


## Training

In [88]:
X_train = X_train.values
y_train = y_train.values
type(X_train), type(y_train)

(numpy.ndarray, numpy.ndarray)

In [89]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()
knn_clf = clf.fit(X_train, y_train)

## Eval

In [90]:
# Normalization for Testset
for i in range(len(x_idx)):
    X_test[x_idx[i]] = (X_test[x_idx[i]] - u_list[i]) / std_list[i]

X_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[x_idx[i]] = (X_test[x_idx[i]] - u_list[i]) / std_list[i]


Unnamed: 0,sepal_length,sepal_width
71,1.01935,-0.721231
72,1.325922,-1.343877
74,1.479207,-0.513683
89,0.099636,-1.343877
63,1.01935,-0.513683


In [91]:
y_test.head()

71    1
72    1
74    1
89    1
63    1
Name: species, dtype: int64

In [92]:
X_test = X_test.values
y_test = y_test.values

In [93]:
y_pred = knn_clf.predict(X_test)
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1],
      dtype=int64)

In [94]:
y_test

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1],
      dtype=int64)

In [95]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, y_pred)
acc

1.0