In [279]:
import pandas as pd
from sklearn.model_selection import train_test_split

Use pandas to load the CSV dataset and display the first few rows.  The data set can be found [here](https://www.kaggle.com/uciml/iris) Adding `index_col='Id'` stops pandas from adding it's own id column

In [280]:
iris_dataset = pd.read_csv('datasets/iris/Iris.csv', index_col='Id')
iris_dataset.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


Species is what we want to predict.  Let's look at the different values.

In [281]:
print(iris_dataset.Species.unique())

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


Encode the species (sklearn will expect everything to be numerical)

In [282]:
iris_dataset.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


In [283]:
iris_dataset.tail()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
146,6.7,3.0,5.2,2.3,Iris-virginica
147,6.3,2.5,5.0,1.9,Iris-virginica
148,6.5,3.0,5.2,2.0,Iris-virginica
149,6.2,3.4,5.4,2.3,Iris-virginica
150,5.9,3.0,5.1,1.8,Iris-virginica


In [284]:
iris_dataset.replace(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], [0,1,2], inplace=True)

In [285]:
iris_dataset.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,0
2,4.9,3.0,1.4,0.2,0
3,4.7,3.2,1.3,0.2,0
4,4.6,3.1,1.5,0.2,0
5,5.0,3.6,1.4,0.2,0


In [286]:
iris_dataset.tail()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
146,6.7,3.0,5.2,2.3,2
147,6.3,2.5,5.0,1.9,2
148,6.5,3.0,5.2,2.0,2
149,6.2,3.4,5.4,2.3,2
150,5.9,3.0,5.1,1.8,2


Seperate labels from features

In [287]:
labels = iris_dataset['Species']
labels.head()

Id
1    0
2    0
3    0
4    0
5    0
Name: Species, dtype: int64

Drop the labels from the dataset

In [288]:
iris_dataset.drop('Species', axis=1, inplace=True)
iris_dataset.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,5.1,3.5,1.4,0.2
2,4.9,3.0,1.4,0.2
3,4.7,3.2,1.3,0.2
4,4.6,3.1,1.5,0.2
5,5.0,3.6,1.4,0.2


Split the data into training and testing sets

In [289]:
X_train, X_test, y_train, y_test = train_test_split(iris_dataset, labels, test_size=0.4)

# Evaluation

In [290]:
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

## KNN Model | N = 1

In [291]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
accuracy_score(y_test, knn.predict(X_test))

0.95

## KNN Model | N = 5

In [292]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
accuracy_score(y_test, knn.predict(X_test))

0.9666666666666667

## Logistic Regression

In [293]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.95

## SVM with linear kernal

In [294]:
svm_model = svm.SVC(kernel='linear')
svm_model.fit(X_train, y_train)
accuracy_score(y_test, svm_model.predict(X_test))

0.9666666666666667