In [1]:
## Based on
#  - https://scikit-learn.org/stable/modules/naive_bayes.html 
#  - https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
#  - https://www.datacamp.com/community/tutorials/random-forests-classifier-python
#  - https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

# Load IRIS dataset

In [2]:
#Import scikit-learn dataset library
from sklearn import datasets

# Import other supporting libs

import pandas as pd
import numpy as np

# Generic for data prep and evaluating
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [3]:
# Load the Iris dataset bundled with scikit-learn. Later cells read its
# .data, .target, .target_names and .feature_names attributes.
iris = datasets.load_iris()

In [4]:
# Show the class labels (species names) first, then the names of the
# four features, each on its own line.
print(iris.target_names, iris.feature_names, sep='\n')

['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [5]:
# Assemble a DataFrame with one column per measurement (taken from the
# columns of iris.data, in order) plus the integer class label from iris.target.
data = pd.DataFrame({
    **{
        column_name: iris.data[:, column_index]
        for column_index, column_name in enumerate(
            ('sepal length', 'sepal width', 'petal length', 'petal width')
        )
    },
    'species': iris.target,
})
data.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [6]:
# Summarize the frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal length  150 non-null    float64
 1   sepal width   150 non-null    float64
 2   petal length  150 non-null    float64
 3   petal width   150 non-null    float64
 4   species       150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [7]:
# Feature matrix: the four measurement columns.
x = data[[
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
]]
# Target vector: the class label column.
y = data['species']

In [8]:
# Hold out 20% of the rows for testing and train on the rest;
# the fixed random_state keeps the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Try different classifier models

In [15]:
# Import models to explore
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [10]:
def train_test_model(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training split and print how well it predicts.

    Calls ``model.fit`` on the training data, predicts both splits, then
    prints the number of mislabeled samples on the test split, the number on
    the training split, and the confusion matrix for the test split.
    Nothing is returned.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
    x_train, x_test : feature tables; ``.shape[0]`` is read for the row count
    y_train, y_test : true labels, compared element-wise with the predictions
    """
    model.fit(x_train, y_train)

    # Predict both splits (training first, then test) before reporting anything
    train_predictions = model.predict(x_train)
    test_predictions = model.predict(x_test)

    # Test split: total rows and count of label/prediction mismatches
    test_total = x_test.shape[0]
    test_errors = (y_test != test_predictions).sum()
    print("Number of mislabeled points out of a total %d points on test: %d"
          % (test_total, test_errors))

    # Training split: same report
    train_total = x_train.shape[0]
    train_errors = (y_train != train_predictions).sum()
    print("Number of mislabeled points out of a total %d points on training: %d"
          % (train_total, train_errors))

    # Confusion matrix on the held-out data (true labels vs. predictions)
    test_confusion = confusion_matrix(y_test, test_predictions)
    print("Confusion Matrix on test:")
    print(test_confusion)
    

In [11]:
# Gaussian Naive Bayes classifier with default settings
model = GaussianNB()
train_test_model(
    model,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

Number of mislabeled points out of a total 30 points on test: 1
Number of mislabeled points out of a total 120 points on training: 6
Confusion Matrix on test:
[[11  0  0]
 [ 0 13  0]
 [ 0  1  5]]


In [12]:
# Gradient boosting classifier: 100 estimators of depth 1,
# learning rate 1.0, fixed seed for repeatable results
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
train_test_model(
    model,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

Number of mislabeled points out of a total 30 points on test: 1
Number of mislabeled points out of a total 120 points on training: 0
Confusion Matrix on test:
[[11  0  0]
 [ 0 13  0]
 [ 0  1  5]]


In [13]:
# Random forest classifier with trees limited to depth 2, fixed seed
model = RandomForestClassifier(
    max_depth=2,
    random_state=0,
)
train_test_model(
    model,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

Number of mislabeled points out of a total 30 points on test: 0
Number of mislabeled points out of a total 120 points on training: 5
Confusion Matrix on test:
[[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]


In [16]:
# Neural network (multi-layer perceptron): fixed seed, at most 300 iterations
model = MLPClassifier(
    random_state=1,
    max_iter=300,
)
train_test_model(
    model,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)

Number of mislabeled points out of a total 30 points on test: 0
Number of mislabeled points out of a total 120 points on training: 4
Confusion Matrix on test:
[[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]


