# Classifying Iris Plants Dataset

## Importing data

In [2]:
import pandas as pd
# the features (cols) in the dataset
col_header=['sepal length(cm)', 'sepal width(cm)', 'petal length(cm)', 'petal width(cm)', 'iris class(cm)']

# read file with no headers and col names as mentioned in the above array
# the data is in pandas dataframe type
iris_panda_set = pd.read_csv('data/iris.data', header=None, names=col_header)

## Displaying data

In [4]:
iris_panda_set.head(10)

Unnamed: 0,sepal length(cm),sepal width(cm),petal length(cm),petal width(cm),iris class(cm)
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


## Data preprocessing

In [70]:
# change all categorical types to numeric ones
print(iris_panda_set.loc[0, :])
print(iris_panda_set.loc[60, :])
print(iris_panda_set.loc[120, :])

# map iris class types to numeric values
iris_class_map = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2 }
# change the dataset accordingly
iris_panda_set['iris class(cm)'] = iris_panda_set['iris class(cm)'].map(iris_class_map)
print(iris_panda_set.loc[0, :])
print(iris_panda_set.loc[60, :])
print(iris_panda_set.loc[120, :])

sepal length(cm)            5.1
sepal width(cm)             3.5
petal length(cm)            1.4
petal width(cm)             0.2
iris class(cm)      Iris-setosa
Name: 0, dtype: object
sepal length(cm)                  5
sepal width(cm)                   2
petal length(cm)                3.5
petal width(cm)                   1
iris class(cm)      Iris-versicolor
Name: 60, dtype: object
sepal length(cm)               6.9
sepal width(cm)                3.2
petal length(cm)               5.7
petal width(cm)                2.3
iris class(cm)      Iris-virginica
Name: 120, dtype: object
sepal length(cm)    5.1
sepal width(cm)     3.5
petal length(cm)    1.4
petal width(cm)     0.2
iris class(cm)      0.0
Name: 0, dtype: float64
sepal length(cm)    5.0
sepal width(cm)     2.0
petal length(cm)    3.5
petal width(cm)     1.0
iris class(cm)      1.0
Name: 60, dtype: float64
sepal length(cm)    6.9
sepal width(cm)     3.2
petal length(cm)    5.7
petal width(cm)     2.3
iris class(cm)      2.0
Name

## Splitting the data

In [84]:
# Stratified ShuffleSplit cross-validator
# Provides train/test indices to split data in train/test sets
# This cross-validation object is a merge of StratifiedKFold and ShuffleSplit, which returns stratified randomized folds.
# The folds are made by preserving the percentage of samples for each class.
from sklearn.model_selection import StratifiedShuffleSplit # import training test split method from sklearn

In [97]:
# next we define the feature cols and predicted col
feature_col_names = ['sepal length(cm)', 'sepal width(cm)', 'petal length(cm)', 'petal width(cm)']
predicted_class_names = ['iris class(cm)']

# split our data into two data frames one containing the features cols and other with the iris category
X = iris_panda_set[feature_col_names].values # predictor feature cols (4)
# predicated class Iris-setosa: 0, Iris-versicolor: 1, Iris-virginica: 2
Y = iris_panda_set[predicted_class_names].values # one-d array

split_test_size = 0.30 # define the train_test split ratio 30%

sss = StratifiedShuffleSplit(n_splits=3, test_size=split_test_size, random_state=20)

for train_index, test_index in sss.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

In [95]:
#  we check to ensure we have the desired 70% train and 30% test split of the data
#  here df.index is the whole data frame
print('{0:0.2f}% in training set'.format((len(X_train) / len(iris_panda_set.index)) * 100))
print('{0:0.2f}% in test set'.format((len(X_test) / len(iris_panda_set.index)) * 100))

70.00% in training set
30.00% in test set


In [96]:
# We also verify the predicted values were split the same b/w the train & test data sets
num_setosa = len(iris_panda_set.loc[iris_panda_set['iris class(cm)'] == 0.0])
num_versicolor = len(iris_panda_set.loc[iris_panda_set['iris class(cm)'] == 1.0])
num_virginica = len(iris_panda_set.loc[iris_panda_set['iris class(cm)'] == 2.0])
total = num_setosa + num_versicolor + num_virginica
percentage_setosa = (num_setosa / total) * 100
percentage_versicolor = (num_versicolor / total) * 100
percentage_virginica = (num_virginica / total) * 100
print('Number of setosa plants: {0} ({1:2.2f}%)'.format(num_setosa, percentage_setosa))
print('Number of versicolor plants: {0} ({1:2.2f}%)'.format(num_versicolor, percentage_versicolor))
print('Number of virginica plants: {0} ({1:2.2f}%)'.format(num_virginica, percentage_virginica))

num_setosa_in_train = len(Y_train[Y_train[:] == 0.0])
num_versicolor_in_train = len(Y_train[Y_train[:] == 1.0])
num_virginica_in_train = len(Y_train[Y_train[:] == 2.0])
total_in_train = num_setosa_in_train + num_versicolor_in_train + num_virginica_in_train
percentage_setosa_in_train = (num_setosa_in_train / total_in_train) * 100
percentage_versicolor_in_train = (num_versicolor_in_train / total_in_train) * 100
percentage_virginica_in_train = (num_virginica_in_train / total_in_train) * 100

# printing the result
print('Number of setosa plants in train set: {0} ({1:2.2f}%)'.format(num_setosa_in_train, percentage_setosa_in_train))
print('Number of versicolor plants in train set : {0} ({1:2.2f}%)'.format(num_versicolor_in_train, percentage_versicolor_in_train))
print('Number of virginica plants in train set: {0} ({1:2.2f}%)'.format(num_virginica_in_train, percentage_virginica_in_train))

num_setosa_in_test = len(Y_test[Y_test[:] == 0.0])
num_versicolor_in_test = len(Y_test[Y_test[:] == 1.0])
num_virginica_in_test = len(Y_test[Y_test[:] == 2.0])
total_in_test = num_setosa_in_test + num_versicolor_in_test + num_virginica_in_test
percentage_setosa_in_test = (num_setosa_in_test / total_in_test) * 100
percentage_versicolor_in_test = (num_versicolor_in_test / total_in_test) * 100
percentage_virginica_in_test = (num_virginica_in_test / total_in_test) * 100

# printing the result
print('Number of setosa plants in test set: {0} ({1:2.2f}%)'.format(num_setosa_in_test, percentage_setosa_in_test))
print('Number of versicolor plants in test set : {0} ({1:2.2f}%)'.format(num_versicolor_in_test, percentage_versicolor_in_test))
print('Number of virginica plants in test set: {0} ({1:2.2f}%)'.format(num_virginica_in_test, percentage_virginica_in_test))

Number of setosa plants: 50 (33.33%)
Number of versicolor plants: 50 (33.33%)
Number of virginica plants: 50 (33.33%)
Number of setosa plants in train set: 35 (33.33%)
Number of versicolor plants in train set : 35 (33.33%)
Number of virginica plants in train set: 35 (33.33%)
Number of setosa plants in test set: 15 (33.33%)
Number of versicolor plants in test set : 15 (33.33%)
Number of virginica plants in test set: 15 (33.33%)


## Training Naive Bayes 

In [98]:
# import Naive Bayes algorithm from the library
# In case of naive_bayes there are multiple implementations 
# we are using the gaussian algo that assumes that the feature data is distributed in a gaussian 
from sklearn.naive_bayes import GaussianNB

# create Gaussian Naive Bayes model object and train it with data
nb_model = GaussianNB() # our model object

# call the fit method to create a model trained with the training data 
# numpy.ravel returns a contiguous flattened array
nb_model.fit(X_train, Y_train.ravel())

GaussianNB(priors=None)

## Performance of Naive Bayes on Testing data

In [136]:
# to see the accuracy we load the scikit metrics library
# metrics has methods that let us get the statistics on the models predictive performance
from sklearn import metrics
# Now lets predict against the testing data
# X_test is the data we kept aside for testing
nb_predict_test = nb_model.predict(X_test)
# Y_test is the actual output and nb_predict_test is the predicted one 
test_accuracy = metrics.accuracy_score(Y_test, nb_predict_test)
print('Accuracy(%) of NB model on test data: {0: .4f}'.format(test_accuracy * 100))
# the classification report generates statistics based on the values shown in the confusion matrix.
print('\nClassification report of NB model:\n')
print(metrics.classification_report(Y_test, nb_predict_test, labels = [0, 1, 2]))
print('Here:\n 0 -> Iris-setosa \n 1 -> Iris-versicolor \n 2 -> Iris-virginica ')

Accuracy(%) of NB model on test data:  95.5556

Classification report of NB model:

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       0.93      0.93      0.93        15
          2       0.93      0.93      0.93        15

avg / total       0.96      0.96      0.96        45

Here:
 0 -> Iris-setosa 
 1 -> Iris-versicolor 
 2 -> Iris-virginica 


## Training Random Forest

In [109]:
# import random forest from scikit
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state = 54) # Create random forest object
rf_model.fit(X_train, Y_train.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=54, verbose=0, warm_start=False)

## Performance of Random Forest on Testing data

In [135]:
rf_predict_test = rf_model.predict(X_test)
# training metrics
print('Accuracy(%) of RF model on test data: {0:.4f}'.format((metrics.accuracy_score(Y_test, rf_predict_test)) * 100))
print('\nClassification report of RF model:\n')
print(metrics.classification_report(Y_test, rf_predict_test, labels = [0, 1, 2]))
print('Here:\n 0 -> Iris-setosa \n 1 -> Iris-versicolor \n 2 -> Iris-virginica ')

Accuracy(%) of RF model on test data: 93.3333

Classification report of RF model:

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       0.93      0.87      0.90        15
          2       0.88      0.93      0.90        15

avg / total       0.93      0.93      0.93        45

Here:
 0 -> Iris-setosa 
 1 -> Iris-versicolor 
 2 -> Iris-virginica 


## Training Logistic Regression

In [114]:
# scikit learn has an ensemble algorithm that combines logistic regression with cross validation called LogisticRegressionCV
from sklearn.linear_model import LogisticRegressionCV

In [119]:
lr_cv_model = LogisticRegressionCV(Cs=3, refit=True, cv=10,)
lr_cv_model.fit(X_train, Y_train.ravel())

LogisticRegressionCV(Cs=3, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

## Performance of LR on Testing data

In [134]:
lr_cv_predict_test = lr_cv_model.predict(X_test)
# training metrics
print('Accuracy(%) of LRCV model on test data: {0:.4f}'.format((metrics.accuracy_score(Y_test, lr_cv_predict_test)) * 100))
print('\nClassification report of LRCV model:\n ')
print(metrics.classification_report(Y_test, lr_cv_predict_test, labels = [0, 1, 2]))
print('Here:\n 0 -> Iris-setosa \n 1 -> Iris-versicolor \n 2 -> Iris-virginica ')

Accuracy(%) of LRCV model on test data: 95.5556

Classification report of LRCV model:
 
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       0.93      0.93      0.93        15
          2       0.93      0.93      0.93        15

avg / total       0.96      0.96      0.96        45

Here:
 0 -> Iris-setosa 
 1 -> Iris-versicolor 
 2 -> Iris-virginica 


## Training MLP (simple NN)

In [126]:
# import the algorithm
# Multi-layer Perceptron (MLP) is a supervised learning algorithm
# MLPClassifier implements a multi-layer perceptron (MLP) algorithm 
    # that trains using Backpropagation.
from sklearn.neural_network import MLPClassifier
# initialize the MLP classifier model with parameters
nn_model = MLPClassifier(hidden_layer_sizes=((3, 3, 3)), max_iter=20000)
# train the model with the training set inputs and outputs 
nn_model.fit(X_train, Y_train.ravel())

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(3, 3, 3), learning_rate='constant',
       learning_rate_init=0.001, max_iter=20000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

## Performance of MLP on Testing data

In [138]:
nn_predict_test = nn_model.predict(X_test)
# training metrics
print('Accuracy(%) of MLP model on test data: {0:.4f}'.format((metrics.accuracy_score(Y_test, nn_predict_test)) * 100))
print('\nClassification report of MLP model:\n')
print(metrics.classification_report(Y_test, nn_predict_test, labels = [0, 1, 2]))
print('Here:\n 0 -> Iris-setosa \n 1 -> Iris-versicolor \n 2 -> Iris-virginica ')

Accuracy(%) of MLP model on test data: 97.7778

Classification report of MLP model:

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       1.00      0.93      0.97        15
          2       0.94      1.00      0.97        15

avg / total       0.98      0.98      0.98        45

Here:
 0 -> Iris-setosa 
 1 -> Iris-versicolor 
 2 -> Iris-virginica 


## Training SVM

In [128]:
from sklearn import svm
# Create a classifier: a support vector classifier
svm_model = svm.SVC(gamma=0.001)
svm_model.fit(X_train, Y_train.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## Performance of SVM on Testing data

In [137]:
svm_predict_test = svm_model.predict(X_test)
# training metrics
print('Accuracy(%) of SVM model on test data: {0:.4f}'.format((metrics.accuracy_score(Y_test, svm_predict_test)) * 100))
print('\nClassification report of SVM model:\n')
print(metrics.classification_report(Y_test, svm_predict_test, labels = [0, 1, 2]))
print('Here:\n 0 -> Iris-setosa \n 1 -> Iris-versicolor \n 2 -> Iris-virginica ')

Accuracy(%) of SVM model on test data: 93.3333

Classification report of SVM model:

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        15
          1       0.88      0.93      0.90        15
          2       0.93      0.87      0.90        15

avg / total       0.93      0.93      0.93        45

Here:
 0 -> Iris-setosa 
 1 -> Iris-versicolor 
 2 -> Iris-virginica 
