In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [2]:
import os

# Folder that contains Iris.csv. The default is the original author's local
# directory; set the IRIS_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
data_dir = os.environ.get('IRIS_DATA_DIR', r'C:\Users\Vish\Documents\Data\Dataset')

# The working directory is still switched to the data folder, exactly as before,
# so anything that resolves paths relative to it keeps working.
os.chdir(data_dir)
dataset = pd.read_csv('Iris.csv')

In [3]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
dataset.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Remove the row-identifier column; it is not one of the four measurements used as features.
# Rebinding the result (rather than inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second execution is a no-op instead of raising KeyError.
dataset = dataset.drop(columns=['Id'], errors='ignore')

In [5]:
# Replace the CSV headers with readable names. The assignment is positional, so the
# list must contain exactly one name per remaining column, in column order.
readable_names = [
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
    'species',
]
dataset.columns = readable_names

In [6]:
# Encode the species names as integer codes. pd.factorize returns a pair:
# the codes (one per row) and the unique names, where a name's position is its code.
factor = pd.factorize(dataset['species'])
species_codes, definitions = factor

# Overwrite the text column with its integer codes; `definitions` is kept so the
# codes can be turned back into names later.
dataset['species'] = species_codes

print(dataset['species'].head())
print(definitions)

0    0
1    0
2    0
3    0
4    0
Name: species, dtype: int64
Index(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype='object')


In [7]:
#Splitting the data into independent and dependent variables
# The first four columns are the measurements (features); the fifth is the encoded species (target).
feature_frame = dataset.iloc[:, :4]
target_series = dataset.iloc[:, 4]

# Work with plain NumPy arrays from here on.
X = feature_frame.values
y = target_series.values

print('The independent features set: ')
print(X[:5])
print('The dependent variable: ')
print(y[:5])

The independent features set: 
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
The dependent variable: 
[0 0 0 0 0]


In [8]:
#Creating the Training and Test set from data
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
split = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=21,
)
X_train, X_test, y_train, y_test = split

In [9]:
#Scaling
# Learn the scaling statistics from the training rows only, then apply that same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [10]:
#Model building
# KNeighborsClassifier parameters, for reference:
#   n_neighbors (int, default=5): number of neighbors to use.
#   weights ({"uniform", "distance"} or callable, default="uniform"): weight function used in prediction.
#       "uniform"  - all points in each neighborhood are weighted equally.
#       "distance" - points are weighted by the inverse of their distance, so closer neighbors
#                    of a query point have a greater influence than neighbors further away.
#       callable   - a user-defined function that accepts an array of distances and returns
#                    an array of the same shape containing the weights.
#   algorithm ({"auto", "ball_tree", "kd_tree", "brute"}, default="auto"): algorithm used to compute
#       the nearest neighbors. "ball_tree" uses BallTree, "kd_tree" uses KDTree, "brute" uses a
#       brute-force search, and "auto" picks the most appropriate one from the values passed to fit.
#   leaf_size (int, default=30): leaf size passed to BallTree or KDTree. It can affect the speed of
#       construction and query, as well as the memory required to store the tree.
#   p (int, default=2): power parameter for the Minkowski metric. p = 1 is the Manhattan
#       distance (l1), p = 2 is the Euclidean distance (l2); otherwise minkowski_distance (l_p) is used.
#   metric (str or callable, default="minkowski"): distance metric to use. The default "minkowski"
#       with p = 2 is equivalent to the standard Euclidean metric.
#   n_jobs (int, default=None): number of parallel jobs to run for the neighbors search.
model = KNeighborsClassifier(metric='euclidean', n_neighbors=10)

# Kept as the cell's last expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=10)

In [11]:
#model=KNeighborsRegressor()

In [12]:
#Predictions
# predict() returns one class label per test row. With the uniform weights used here, that is
# the class most common among the row's n_neighbors nearest training points. (A fixed 0.5
# probability cut-off only describes a two-class problem; this target has three classes.)
y_pred = model.predict(X_test)

In [13]:
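# Using our own threshold (binary classification only: column 1 of predict_proba is
# the probability of the positive class, so this does not apply to the 3-class target here)
# decisions = (model.predict_proba(X_test) >= 0.6).astype(int)
# y_pred=decisions[:,1]
# print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
# print("Precision: ", metrics.precision_score(y_test,y_pred))
# print("F1 score: ", metrics.f1_score(y_test,y_pred))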

In [14]:
# Build the code -> species-name lookup from the factorize() definitions.
# enumerate() covers however many classes were found instead of assuming exactly three.
reversefactor = dict(enumerate(definitions))


def _decode_species(code):
    """Return the species name for an integer code; values that are not codes pass through unchanged."""
    return reversefactor.get(code, code)


# y_test and y_pred are overwritten with species names. The pass-through in
# _decode_species keeps this cell re-runnable: already-decoded names stay as they are
# instead of being mapped to None on a second execution.
y_test = np.vectorize(_decode_species)(y_test)
y_pred = np.vectorize(_decode_species)(y_pred)

# Cross-tabulate actual vs. predicted species (rows = actual, columns = predicted).
print(pd.crosstab(y_test, y_pred, rownames=['Actual Species'], colnames=['Predicted Species']))

Predicted Species  Iris-setosa  Iris-versicolor  Iris-virginica
Actual Species                                                 
Iris-setosa                 13                0               0
Iris-versicolor              0               14               1
Iris-virginica               0                2               8


In [15]:
#Evaluating model
# Confusion matrix of actual (rows) against predicted (columns) labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Left as the cell's final expression so the notebook displays the accuracy.
metrics.accuracy_score(y_test, y_pred)

[[13  0  0]
 [ 0 14  1]
 [ 0  2  8]]


0.9210526315789473