In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#print(os.getcwd())

In [2]:
data_path = 'iris/iris.csv'

In [3]:
iris_df = pd.read_csv(data_path)
iris_df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [4]:
'''Splitting our dataset into training and validation. Even though we're not really training our model per se.
We want to be able to classify new points. That is essentially what we'll be using our validation set for. 
'''
#shuffling the dataframe
iris_df_shuffled = iris_df.sample(frac=1, random_state=42).reset_index(drop=True) #random state is something like np.random.seed

#we'll use the 80-20 split for trainining and validation datasets
split_index = int(0.8*len(iris_df_shuffled))
iris_train = iris_df_shuffled.iloc[0:split_index]
iris_val = iris_df_shuffled.iloc[split_index:]

In [5]:
#these are the possible values that we can classify our point into
classes = iris_train['Species'].unique()
classes

array(['Iris-versicolor', 'Iris-setosa', 'Iris-virginica'], dtype=object)

In [6]:
iris_val

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
120,92,6.1,3.0,4.6,1.4,Iris-versicolor
121,42,4.5,2.3,1.3,0.3,Iris-setosa
122,59,6.6,2.9,4.6,1.3,Iris-versicolor
123,91,5.5,2.6,4.4,1.2,Iris-versicolor
124,49,5.3,3.7,1.5,0.2,Iris-setosa
125,89,5.6,3.0,4.1,1.3,Iris-versicolor
126,108,7.3,2.9,6.3,1.8,Iris-virginica
127,125,6.7,3.3,5.7,2.1,Iris-virginica
128,22,5.1,3.7,1.5,0.4,Iris-setosa
129,58,4.9,2.4,3.3,1.0,Iris-versicolor


In [7]:
iris_val_features = iris_val.iloc[:, :-1]
iris_val_features

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
120,92,6.1,3.0,4.6,1.4
121,42,4.5,2.3,1.3,0.3
122,59,6.6,2.9,4.6,1.3
123,91,5.5,2.6,4.4,1.2
124,49,5.3,3.7,1.5,0.2
125,89,5.6,3.0,4.1,1.3
126,108,7.3,2.9,6.3,1.8
127,125,6.7,3.3,5.7,2.1
128,22,5.1,3.7,1.5,0.4
129,58,4.9,2.4,3.3,1.0


In [8]:
iris_train

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,74,6.1,2.8,4.7,1.2,Iris-versicolor
1,19,5.7,3.8,1.7,0.3,Iris-setosa
2,119,7.7,2.6,6.9,2.3,Iris-virginica
3,79,6.0,2.9,4.5,1.5,Iris-versicolor
4,77,6.8,2.8,4.8,1.4,Iris-versicolor
...,...,...,...,...,...,...
115,140,6.9,3.1,5.4,2.1,Iris-virginica
116,62,5.9,3.0,4.2,1.5,Iris-versicolor
117,148,6.5,3.0,5.2,2.0,Iris-virginica
118,80,5.7,2.6,3.5,1.0,Iris-versicolor


In [9]:
def knn(train_df, val_df, k):
    pred_arr = []
    for i in range(len(val_df)):
        dist_arr = []
        ith_sample = val_df.iloc[i]
        val_sepal_len = ith_sample['SepalLengthCm']
        val_sepal_width = ith_sample['SepalWidthCm']
        val_petal_len = ith_sample['PetalLengthCm']
        val_petal_width = ith_sample['PetalWidthCm']
        
        sepal_len_d = train_df['SepalLengthCm'].to_numpy() - val_sepal_len
        sepal_width_d = train_df['SepalWidthCm'].to_numpy() - val_sepal_width
        petal_len_d = train_df['PetalLengthCm'].to_numpy() - val_petal_len
        petal_width_d = train_df['PetalWidthCm'].to_numpy() - val_petal_width
        
        distance = np.sqrt(sepal_len_d**2 + sepal_width_d**2 + petal_len_d**2 + petal_width_d**2)
        
        
        train_df.loc[:, 'Distance'] = distance
        
        top_k = train_df.sort_values(by = 'Distance')[0:k]
        
        pred_label = top_k['Species'].value_counts().index[0]
        
        pred_arr.append(pred_label)
        
    val_df.loc[:, 'Prediction'] = pred_arr
    return val_df

In [10]:
knn(iris_train, iris_val_features, 15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[:, 'Distance'] = distance


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Prediction
120,92,6.1,3.0,4.6,1.4,Iris-versicolor
121,42,4.5,2.3,1.3,0.3,Iris-setosa
122,59,6.6,2.9,4.6,1.3,Iris-versicolor
123,91,5.5,2.6,4.4,1.2,Iris-versicolor
124,49,5.3,3.7,1.5,0.2,Iris-setosa
125,89,5.6,3.0,4.1,1.3,Iris-versicolor
126,108,7.3,2.9,6.3,1.8,Iris-virginica
127,125,6.7,3.3,5.7,2.1,Iris-virginica
128,22,5.1,3.7,1.5,0.4,Iris-setosa
129,58,4.9,2.4,3.3,1.0,Iris-versicolor


In [11]:
iris_val_features['True Label'] = iris_val['Species']
iris_val_features

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Prediction,True Label
120,92,6.1,3.0,4.6,1.4,Iris-versicolor,Iris-versicolor
121,42,4.5,2.3,1.3,0.3,Iris-setosa,Iris-setosa
122,59,6.6,2.9,4.6,1.3,Iris-versicolor,Iris-versicolor
123,91,5.5,2.6,4.4,1.2,Iris-versicolor,Iris-versicolor
124,49,5.3,3.7,1.5,0.2,Iris-setosa,Iris-setosa
125,89,5.6,3.0,4.1,1.3,Iris-versicolor,Iris-versicolor
126,108,7.3,2.9,6.3,1.8,Iris-virginica,Iris-virginica
127,125,6.7,3.3,5.7,2.1,Iris-virginica,Iris-virginica
128,22,5.1,3.7,1.5,0.4,Iris-setosa,Iris-setosa
129,58,4.9,2.4,3.3,1.0,Iris-versicolor,Iris-versicolor


In [12]:
correct_class_df = iris_val_features[iris_val_features['Prediction'] == iris_val_features['True Label']
                                    ]
correct_class_df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Prediction,True Label
120,92,6.1,3.0,4.6,1.4,Iris-versicolor,Iris-versicolor
121,42,4.5,2.3,1.3,0.3,Iris-setosa,Iris-setosa
122,59,6.6,2.9,4.6,1.3,Iris-versicolor,Iris-versicolor
123,91,5.5,2.6,4.4,1.2,Iris-versicolor,Iris-versicolor
124,49,5.3,3.7,1.5,0.2,Iris-setosa,Iris-setosa
125,89,5.6,3.0,4.1,1.3,Iris-versicolor,Iris-versicolor
126,108,7.3,2.9,6.3,1.8,Iris-virginica,Iris-virginica
127,125,6.7,3.3,5.7,2.1,Iris-virginica,Iris-virginica
128,22,5.1,3.7,1.5,0.4,Iris-setosa,Iris-setosa
129,58,4.9,2.4,3.3,1.0,Iris-versicolor,Iris-versicolor


In [13]:
# print(len(correct_class_df))
# print(len(iris_val_features))

accuracy = len(correct_class_df)/len(iris_val_features)
accuracy

0.9666666666666667

In [14]:
from sklearn.neighbors import KNeighborsClassifier

In [15]:
neigh = KNeighborsClassifier(n_neighbors=15)

In [16]:
X = iris_df.iloc[:, :-1].values
y = iris_df.iloc[:, 5].values

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.20)

In [18]:
#normalizing
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
neigh.fit(X_train, Y_train)

In [20]:
y_pred = neigh.predict(X_test)

In [21]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.83      1.00      0.91        10
 Iris-virginica       1.00      0.80      0.89        10

       accuracy                           0.93        30
      macro avg       0.94      0.93      0.93        30
   weighted avg       0.94      0.93      0.93        30

