In [58]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [9]:
#Import iris dataset
iris = fetch_openml(name='iris')

In [11]:
# Have a look at the dataset
print(iris.data)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [12]:
# Dataset URL
iris.url

'https://www.openml.org/d/61'

In [13]:
# Non graphical EDA
iris.data.shape

(150, 4)

In [14]:
# Non graphical EDA
iris.target.shape

(150,)

In [15]:
# Non graphical EDA
np.unique(iris.target)

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [16]:
# Non graphical EDA
iris.DESCR

"**Author**: R.A. Fisher  \n**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall  \n**Please cite**: [UCI Citation Policy](https://archive.ics.uci.edu/ml/citation_policy.html)  \n\n**Iris Plants Database**  \nThis is perhaps the best known database to be found in the pattern recognition literature.  Fisher's paper is a classic in the field and is referenced frequently to this day.  (See Duda & Hart, for example.)  The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant.  One class is     linearly separable from the other 2; the latter are NOT linearly separable from each other.\n\nPredicted attribute: class of iris plant.  \nThis is an exceedingly simple domain.  \n \n### Attribute Information:\n    1. sepal length in cm\n    2. sepal width in cm\n    3. petal length in cm\n    4. petal width in cm\n    5. class: \n       -- Iris Setosa\n       -- Iris Versicolour\n       -- Iris Virginica\n\n

In [17]:
# Non graphical EDA
iris.details

{'id': '61',
 'name': 'iris',
 'version': '1',
 'format': 'ARFF',
 'creator': 'R.A. Fisher',
 'collection_date': '1936',
 'upload_date': '2014-04-06T23:23:39',
 'language': 'English',
 'licence': 'Public',
 'url': 'https://www.openml.org/data/v1/download/61/iris.arff',
 'file_id': '61',
 'default_target_attribute': 'class',
 'version_label': '1',
 'citation': 'https://archive.ics.uci.edu/ml/citation_policy.html',
 'tag': ['study_1',
  'study_25',
  'study_4',
  'study_41',
  'study_50',
  'study_52',
  'study_7',
  'study_86',
  'study_88',
  'study_89',
  'uci'],
 'visibility': 'public',
 'original_data_url': 'https://archive.ics.uci.edu/ml/datasets/Iris',
 'paper_url': 'http://digital.library.adelaide.edu.au/dspace/handle/2440/15227',
 'status': 'active',
 'processing_date': '2020-11-20 19:02:18',
 'md5_checksum': 'ad484452702105cbf3d30f8deaba39a9'}

In [18]:
# Converting dataset from numpy array to pandas dataframe
x = pd.DataFrame(data=iris.data, 
                 index=np.array(range(1, 151)), 
                 columns=np.array(range(1, 5)))

In [19]:
# Converting dataset from numpy array to pandas dataframe
y = pd.DataFrame(data=iris.target, 
                 index=np.array(range(1, 151)),
                 columns=np.array(range(1, 2)))

In [20]:
# Check unique values in Target class
y[1].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [21]:
# Convering target class into numerical form
y[1] = y[1].replace(['Iris-setosa',
                     'Iris-versicolor','Iris-virginica'], [1,2,3])

In [68]:
# Training KNN on whole dataset
from sklearn.neighbors import KNeighborsClassifier
neigh=KNeighborsClassifier(n_neighbors=5, algorithm = 'brute')
neigh.fit(x,y)

  after removing the cwd from sys.path.


KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [69]:
# Prediction on test dataset
testSet = [[1.4, 3.6, 3.4, 1.2]]
test = pd.DataFrame(testSet)
print(test)
print("predicted:",neigh.predict(test))
print("neighbors",neigh.kneighbors(test))

     0    1    2    3
0  1.4  3.6  3.4  1.2
predicted: [1]
neighbors (array([[3.7067506 , 3.80657326, 3.81706694, 3.8340579 , 3.84317577]]), array([[57,  8, 42, 93, 38]], dtype=int64))


In [70]:
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(x,y,test_size= 0.2)

In [71]:
accs = []
pres = []
recs = []
f1_scores = []

# Training model with Repeated stratified K fold cross validation
rskf = RepeatedStratifiedKFold(n_splits=10,
                               n_repeats = 10, 
                               random_state=36851234)

for train_index, test_index in rskf.split(x, y):
    neigh.fit(x.iloc[train_index], y.iloc[train_index])
    y_pred = neigh.predict(x.iloc[test_index])
    acc_score = accuracy_score(y.iloc[test_index], y_pred)
    prec_score = precision_score(y.iloc[test_index], y_pred, average = 'micro')
    rec_score = recall_score(y.iloc[test_index], y_pred, average = 'micro')
    f1s = f1_score(y.iloc[test_index], y_pred, average = 'micro')
    
    accs.append(acc_score)
    pres.append(prec_score)
    recs.append(rec_score)
    f1_scores.append(f1s)
    
    

  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0]

  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':


In [72]:
print("Accuracy :", np.mean(accs), "\nPrecision :", 
      np.mean(pres), "\nRecall :", np.mean(rec_score), 
      "\nF1 score :", np.mean(f1_scores) )

Accuracy : 0.9660000000000001 
Precision : 0.9660000000000001 
Recall : 0.9333333333333333 
F1 score : 0.9660000000000001
