# Read the data set using Pandas

In [2]:
import pandas as pd 

# Location of the Iris CSV file (resolved relative to the working directory).
url = "iris.csv"

# Column labels to apply to the frame, in file order.
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'Class',
]

# Load the CSV under the labels above. No rows are skipped here, so the
# file's own header line ends up as the first data row of the frame.
dataset = pd.read_csv(filepath_or_buffer=url, names=names)

# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,sepal.length,sepal.width,petal.length,petal.width,variety
1,5.1,3.5,1.4,.2,Setosa
2,4.9,3,1.4,.2,Setosa
3,4.7,3.2,1.3,.2,Setosa
4,4.6,3.1,1.5,.2,Setosa


# Skip the file's header row

In [5]:
# Re-read the CSV, this time dropping the file's first line (its own header)
# so that only the measurements remain under our column labels.
dataset = pd.read_csv(filepath_or_buffer=url, skiprows=1, names=names)

# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


# Assign X (features) and y (labels)

In [8]:
# Split the frame into the feature matrix X and the label vector y.
# Both slices are taken relative to the last column, so they stay consistent
# with each other if the number of feature columns ever changes.
X = dataset.iloc[:, :-1].to_numpy()  # every column except the last
y = dataset.iloc[:, -1].to_numpy()   # the last column only ('Class')

# Split the data into training and test sets

In [10]:
from sklearn.model_selection import train_test_split
# Split dataset into random train and test subsets (33% held out for testing).
# A fixed random_state makes the shuffle, and therefore every metric computed
# from this split, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# preprocessing

In [11]:
from sklearn.preprocessing import StandardScaler
# Fit the standardizer on the training split only; fit() returns the fitted
# scaler itself, so it can be bound directly and echoed as the cell output.
scaler = StandardScaler().fit(X_train)
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

# Scale the training and test sets

In [12]:
# Standardize both splits with the already-fitted scaler.
# NOTE: the results overwrite X_train / X_test, so re-running this cell would
# transform data that has already been transformed.
X_train, X_test = (scaler.transform(split) for split in (X_train, X_test))

# K-Neighbors Classifier

## training

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Build a 5-nearest-neighbours classifier and train it on the training split;
# fit() hands back the fitted estimator, which is echoed as the cell output.
classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
classifier

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## testing

In [14]:
# Classify every sample in the held-out test split.
y_predict = classifier.predict(X=X_test)

## results

In [16]:
from sklearn.metrics import classification_report, confusion_matrix
# Print the confusion matrix followed by the per-class report,
# each on its own line(s), comparing true labels with predictions.
print(
    confusion_matrix(y_test, y_predict),
    classification_report(y_test, y_predict),
    sep="\n",
)

[[17  0  0]
 [ 0 19  1]
 [ 0  1 12]]
              precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00        17
  Versicolor       0.95      0.95      0.95        20
   Virginica       0.92      0.92      0.92        13

    accuracy                           0.96        50
   macro avg       0.96      0.96      0.96        50
weighted avg       0.96      0.96      0.96        50



## accuracy

In [18]:
from sklearn.metrics import accuracy_score

# Predictions are recomputed here (same call as the earlier testing cell),
# then the accuracy score of those predictions against y_test is printed.
y_predict = classifier.predict(X=X_test)
print(accuracy_score(y_true=y_test, y_pred=y_predict))

0.96
