In [79]:
import pandas as pd
import numpy as np
from sklearn import datasets

In [80]:
## Load the iris bundle and unpack the pieces used by the rest of the notebook
iris_in = datasets.load_iris()

# Feature matrix and its column names
iris_data, iris_names = iris_in.data, iris_in.feature_names

# Integer class codes and the class names stored alongside them
iris_target, iris_target_names = iris_in.target, iris_in.target_names

In [81]:
## Print the description text stored on the loaded dataset object
print(iris_in.DESCR)

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

In [82]:
## Assemble features and target into a single pandas dataframe
iris = (
    pd.DataFrame(iris_data, columns=iris_names)
    .assign(target=iris_target)
)

# Lookup from integer class code (0, 1, 2) to the class name
iris_target_labels = {
    code: label for code, label in zip(range(3), iris_target_names)
}

iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [83]:
## Summary statistics for every column of the dataframe (including target)
iris.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,1.0
std,0.828066,0.433594,1.76442,0.763161,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [35]:
## Hold out a random subset of rows for testing; the rest is training data
np.random.seed(3)  # fixed seed so the split is reproducible
n_obs = iris.shape[0]

# Shuffle the integers 0..n_obs-1; positions that receive a value in the
# top 20% of that range are flagged as test rows.
shuffled = np.random.choice(n_obs, size=n_obs, replace=False)
mask = shuffled >= 0.8 * n_obs

train = iris.loc[~mask]
test = iris.loc[mask]

In [36]:
## Predict for test data with a brute-force 1-nearest-neighbour search
# Pull features and labels into NumPy arrays once. The column mask and the
# pandas label lookups do not change inside the loops, so evaluating them for
# every (test row, train row) pair is wasted work.
train_features = train.loc[:, train.columns != 'target'].values
test_features = test.loc[:, test.columns != 'target'].values
train_labels = train["target"].values

y_hat = []
for test_row in test_features:
    min_dist = np.inf
    nearest = -1  # position within train of the closest row seen so far
    for pos, train_row in enumerate(train_features):
        # Euclidean distance between the two feature vectors
        calc_dist = np.linalg.norm(test_row - train_row)
        # Strict '<' keeps the earliest training row when distances tie
        if calc_dist < min_dist:
            min_dist = calc_dist
            nearest = pos
    # The prediction is the label of the single closest training row
    y_hat.append(train_labels[nearest])

In [37]:
## Construct dataframe with actual and predicted values
# "actual" keeps the test rows' index; y_hat is a plain list in the same row order.
predictions = pd.DataFrame(
    {
        "actual": test["target"],
        "prediction": y_hat,
    }
)

In [66]:
## Accuracy
# Build the element-wise comparison once; the mean of a boolean Series is the
# fraction of True values, i.e. the share of test rows predicted correctly.
is_correct = predictions["actual"] == predictions["prediction"]
float(is_correct.mean())

0.9

In [39]:
## Show the actual and predicted target side by side for the test rows
predictions

Unnamed: 0,actual,prediction
14,0,0
17,0,0
20,0,0
22,0,0
26,0,0
32,0,0
40,0,0
44,0,0
48,0,0
56,1,1


In [64]:
## Train knn classifier using 7 neighbors
from sklearn.neighbors import KNeighborsClassifier

neigh_7 = KNeighborsClassifier(n_neighbors=7, n_jobs = -1)
neigh_7.fit(train.loc[:,train.columns!="target"].values, train["target"].values)

# Predict with the model fitted in this cell. This previously called `neigh`,
# a name no visible cell defines, so it would fail on a fresh kernel or use
# whatever model was left in the namespace. Re-run to refresh saved outputs.
y_hat_7 = neigh_7.predict(test.loc[:,test.columns!="target"].values)

# Pair each test row's actual target with the 7-NN prediction
predictions_7 = pd.DataFrame(data = {"actual": test["target"], "prediction": y_hat_7})

In [65]:
## Accuracy
# Build the element-wise comparison once; the mean of a boolean Series is the
# fraction of True values, i.e. the share of test rows predicted correctly.
is_correct_7 = predictions_7["actual"] == predictions_7["prediction"]
float(is_correct_7.mean())

0.9333333333333333

In [60]:
## Show the actual and 7-neighbor predicted target for the test rows
predictions_7

Unnamed: 0,actual,prediction
14,0,0
17,0,0
20,0,0
22,0,0
26,0,0
32,0,0
40,0,0
44,0,0
48,0,0
56,1,1


In [75]:
## Train knn classifier using 13 neighbors
neigh_13 = KNeighborsClassifier(n_neighbors=13, n_jobs = -1)
neigh_13.fit(train.loc[:,train.columns!="target"].values, train["target"].values)

# Predict with the model fitted in this cell. This previously called `neigh`,
# a name no visible cell defines, so it would fail on a fresh kernel or use
# whatever model was left in the namespace. Re-run to refresh saved outputs.
y_hat_13 = neigh_13.predict(test.loc[:,test.columns!="target"].values)

# Pair each test row's actual target with the 13-NN prediction
predictions_13 = pd.DataFrame(data = {"actual": test["target"], "prediction": y_hat_13})

In [76]:
## Accuracy
# Build the element-wise comparison once; the mean of a boolean Series is the
# fraction of True values, i.e. the share of test rows predicted correctly.
is_correct_13 = predictions_13["actual"] == predictions_13["prediction"]
float(is_correct_13.mean())

0.9