In [2]:
#### Classification modeling with k-nearest neighbors

import pandas as pd

# Load the iris measurements straight from the hosted .csv file.
iris = pd.read_csv('https://sololearn.com/uploads/files/iris.csv')

# First look: dimensions, then the leading rows (each followed by a blank line).
print(iris.shape, end='\n\n')
print(iris.head(), end='\n\n')

# The 'id' column is redundant, so rebind `iris` to a frame without it.
iris = iris.drop(columns='id')
print(iris.head(), end='\n\n')

# Summary statistics for the numeric columns.
print(iris.describe(), end='\n\n')

# Count the rows per species to check that the class distribution is balanced.
species_counts = iris['species'].value_counts()
print(species_counts)


(150, 6)

   id  sepal_len  sepal_wd  petal_len  petal_wd      species
0   0        5.1       3.5        1.4       0.2  iris-setosa
1   1        4.9       3.0        1.4       0.2  iris-setosa
2   2        4.7       3.2        1.3       0.2  iris-setosa
3   3        4.6       3.1        1.5       0.2  iris-setosa
4   4        5.0       3.6        1.4       0.2  iris-setosa

   sepal_len  sepal_wd  petal_len  petal_wd      species
0        5.1       3.5        1.4       0.2  iris-setosa
1        4.9       3.0        1.4       0.2  iris-setosa
2        4.7       3.2        1.3       0.2  iris-setosa
3        4.6       3.1        1.5       0.2  iris-setosa
4        5.0       3.6        1.4       0.2  iris-setosa

        sepal_len    sepal_wd   petal_len    petal_wd
count  150.000000  150.000000  150.000000  150.000000
mean     5.843333    3.057333    3.758000    1.199333
std      0.828066    0.435866    1.765298    0.762238
min      4.300000    2.000000    1.000000    0.100000
25%      5

In [3]:
from sklearn.model_selection import train_test_split

## Feature selection
# Keep only petal_len and petal_wd: earlier we found these two measurements
# are the most useful to separate the species.
feature_cols = ['petal_len', 'petal_wd']
X = iris[feature_cols]
y = iris['species']

## Train/test split
# Hold out 30% of the rows for testing and train on the remaining 70%.
# Passing `stratify=y` keeps the distribution of species similar in the
# training and testing sets; `random_state` pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=33,
)

# Verify that the species are equally represented in both subsets.
print(y_train.value_counts(), end='\n\n')
print(y_test.value_counts())



iris-versicolor    35
iris-virginica     35
iris-setosa        35
Name: species, dtype: int64

iris-setosa        15
iris-versicolor    15
iris-virginica     15
Name: species, dtype: int64


In [5]:
# "In classifications, stratified sampling is often chosen to ensure that the train and test sets have approximately the same percentage of samples of each target class as the complete set"

# "Remember: Import -> Instantiate -> Fit -> Predict"

## Import
from sklearn.neighbors import KNeighborsClassifier

## Instantiate
# n_neighbors=5 tells the model to check the 5 nearest neighbors.
knn = KNeighborsClassifier(n_neighbors=5)

## Fit
# fit() returns the estimator itself, so printing `knn` after fitting shows the
# same model details as printing the value returned by fit().
knn.fit(X_train, y_train)
print(knn, end='\n\n')

## Predict
# Hard prediction: one class label per test row.
pred = knn.predict(X_test)
# Soft prediction: for each test row, an array with the probability of the
# target being each label.
pred_prob = knn.predict_proba(X_test)

# First 4 hard predictions.
print(pred[:4], end='\n\n')

# First 4 soft predictions.
print(pred_prob[:4], end='\n\n')

# Reading the probabilities printed above:
# - first flower: 100% chance of being setosa
# - second flower: 100% chance of being versicolor
# - third flower: 100% chance of being virginica
# - fourth flower: 20% chance of being versicolor, 80% chance of being virginica.
#   This means that of the five nearest neighbours of the 4th flower in the
#   testing set, 1 is versicolor and 4 are virginica.

# First 4 actual class labels from the test set, for comparison.
print(y_test.head(4))


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

['iris-setosa' 'iris-versicolor' 'iris-virginica' 'iris-virginica']

[[1.  0.  0. ]
 [0.  1.  0. ]
 [0.  0.  1. ]
 [0.  0.2 0.8]]

14         iris-setosa
51     iris-versicolor
130     iris-virginica
149     iris-virginica
Name: species, dtype: object


In [6]:
from sklearn.metrics import accuracy_score

## Model evaluation
# The most straightforward metric is accuracy: the proportion of data points
# whose predicted labels match the observed labels.

is_correct = pred == y_test.values
n_correct = is_correct.sum()
n_total = y_test.size

print(n_correct)  # correctly predicted labels
print(n_total, end='\n\n')  # total labels

# Three ways to arrive at the same accuracy:
print(n_correct / n_total)  # computed by hand
print(knn.score(X_test, y_test))  # .score() outputs accuracy more easily
print(accuracy_score(y_test, pred))  # accuracy_score() does the same thing
# ^ this accuracy score may be slightly skewed by how the data was split, as we will see later.



44
45

0.9777777777777777
0.9777777777777777
0.9777777777777777


In [7]:

from sklearn.metrics import confusion_matrix

# Confusion matrix
#
# "Accuracy alone can be misleading if there is an unequal number of observations in each class or if there are more than two classes in the dataset"
#
# "Calculating a confusion matrix provides a better idea of what the classification is getting right and what types of errors it is making"
#
# "Confusion matrix is a summary of the counts of correct and incorrect predictions, broken down by each class"

conf_mat = confusion_matrix(y_test, pred)
print(conf_mat, end='\n\n')
# To specify the labels explicitly, use instead:
# confusion_matrix(y_test, pred, labels=['iris-setosa','iris-versicolor','iris-virginica'])

# Columns (x axis) = predicted class, rows (y axis) = actual class:
# - 15 setosa correctly labeled, none mislabeled
# - 15 versicolor correctly labeled, none mislabeled
# - 14 virginica correctly labeled, one mislabeled as versicolor


[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]



In [8]:
from sklearn.model_selection import cross_val_score

### K-fold cross validation
#
# The earlier train-test split before fitting the model is a simple type of
# cross validation, also called the holdout method. Because the split is
# random, model performance can be sensitive to how the data is split.
# To overcome this we use K-fold cross validation.
#
# The data is divided into K subsets. The holdout method is then repeated K
# times, such that each time one of the K subsets is used as the test set and
# the other K-1 subsets are combined to train the model.
# The accuracy is then averaged over the K trials to give the total
# effectiveness of the model.

## Instantiate a new knn model
# n_neighbors=3 tells the model to check the 3 nearest neighbors.
knn_cv = KNeighborsClassifier(n_neighbors=3)

## Fit (train) the model with 5-fold cross validation
# Prediction is built in when using cross_val_score().
# `cv=` takes the number of folds; usually 5 or 10 is preferred.
cv_scores = cross_val_score(knn_cv, X, y, cv=5)

## Model evaluation
mean_accuracy = cv_scores.mean()

# Accuracy for each trial.
print(cv_scores)
# Average of the trials: the expected accuracy to report.
print(mean_accuracy, end='\n\n')


[0.96666667 0.96666667 0.9        0.93333333 1.        ]
0.9533333333333334



In [9]:
import numpy as np
from sklearn.model_selection import GridSearchCV

### Tuning the hyperparameter
#
# Tuning the hyperparameter means finding the optimal k to use in a knn model.
#
# We do this using grid search. GridSearchCV trains our model multiple times
# on a range of values and computes cross validation scores, so that we can
# check which value of the tested hyperparameter performed best.
#
# GridSearchCV essentially automates running cross_val_score on the data with
# different n_neighbors values and comparing the results.

## Instantiate a new knn model
# n_neighbors is left unspecified because it is what we are tuning.
knn2 = KNeighborsClassifier()

# Candidate values to test: key 'n_neighbors' maps to the array 2 through 9.
param_grid = dict(n_neighbors=np.arange(2, 10))

# Grid search over every candidate: pass the model, the dict of parameter(s)
# with the values to try, and the number of folds for k-fold cross validation.
knn_gscv = GridSearchCV(estimator=knn2, param_grid=param_grid, cv=5)

## Fit the model to the data
# Prediction is built in when using GridSearchCV.
knn_gscv.fit(X, y)

# `.best_params_` holds the top performing n_neighbors (tells us it's 4).
print(knn_gscv.best_params_)
# `.best_score_` holds the accuracy obtained when n_neighbors is 4.
print(knn_gscv.best_score_)


{'n_neighbors': 4}
0.9666666666666668


In [11]:
### Now we are ready to build the final model

## Import: KNeighborsClassifier is already imported.

## Instantiate and fit
# Use the n_neighbors value that the grid search ranked best and train on the
# full data set. fit() returns the estimator, so the fitted model can be bound
# in a single statement.
best_k = knn_gscv.best_params_['n_neighbors']
knn_final = KNeighborsClassifier(n_neighbors=best_k).fit(X, y)

## Predict
y_pred = knn_final.predict(X)

## Model evaluation
# Accuracy computed on X, y -- the same rows the model was just fitted on --
# so this is a training-set figure, not a held-out estimate.
final_accuracy = knn_final.score(X, y)
print(final_accuracy)


0.9733333333333334


In [13]:
## Label prediction with new data

# `knn_final` was fitted on a DataFrame, so the model was trained with named
# columns. scikit-learn >= 1.0 emits a "X does not have valid feature names"
# UserWarning when such a model is given a bare array or list. Every input
# passed to the model below is therefore wrapped in a DataFrame that carries
# the training column names; the predictions themselves are unchanged.
feature_cols = list(X.columns)

# unknown iris with petal length of 3.76 cm and petal width of 1.20 cm
new_data = np.array([3.76, 1.20])

# The new data must be reshaped from a 1d to a 2d array (one row per flower,
# one column per feature) because that is what the model was trained on and
# expects as input.
new_data = new_data.reshape(1, -1)
print(new_data.shape)
print()

# predict
print(knn_final.predict(pd.DataFrame(new_data, columns=feature_cols)))

# instead of reshaping we can start directly from a 2d list, same result
print(knn_final.predict(pd.DataFrame([[3.76, 1.2]], columns=feature_cols)))
print()


## Probability prediction with new data

# three irises, different lengths, all have the same petal width of 1.2 cm
new_data = np.array([[3.76, 1.2], [5.25, 1.2], [1.58, 1.2]])
new_flowers = pd.DataFrame(new_data, columns=feature_cols)

# predict
print(knn_final.predict(new_flowers))

# probability predict
print(knn_final.predict_proba(new_flowers))
# 100% chance of first flower being versicolor
# 25% chance of second flower being versicolor, 75% chance of it being virginica
# 100% chance of third flower being setosa

# code and comments by github.com/alandavidgrunberg


(1, 2)

['iris-versicolor']
['iris-versicolor']

['iris-versicolor' 'iris-virginica' 'iris-setosa']
[[0.   1.   0.  ]
 [0.   0.25 0.75]
 [1.   0.   0.  ]]
