# Import Libraries

In [1]:
# Load libraries
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import joblib


# Getting Dataset

In [2]:
# Fetch the Iris measurements straight from the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column labels are supplied here rather than read from the file, so the
# exploration cells below can refer to each column by name.
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'class',
]
dataset = pandas.read_csv(url, header=None, names=names)


# Data Exploration

In [None]:
# Dimensions of the loaded frame as (rows, columns).
# Left as a bare last expression so the notebook displays it as the cell output.
dataset.shape

In [None]:
# Preview the first 20 rows.
# A bare last expression lets the notebook render the frame as a formatted
# table instead of the plain-text dump that print() produces.
dataset.head(20)

In [None]:
# Summary statistics for the numeric columns.
# Returned as the cell's last expression so the notebook renders it as a table.
dataset.describe()

In [None]:
# Number of rows per class label, to see how evenly the classes are represented.
# Left as a bare last expression so the notebook displays it as the cell output.
dataset.groupby('class').size()

# Data Visualisation

sharex : boolean, default True if ax is None else False

In case subplots=True, share the x axis and set some x axis labels to invisible. Defaults to True if ax is None, otherwise False if an ax is passed in. Be aware that passing in both an ax and sharex=True will alter all x axis labels for all axes in a figure!

sharey : boolean, default False

In case subplots=True, share the y axis and set some y axis labels to invisible.

In [None]:
# Box-and-whisker plot for each numeric feature, one panel per feature.
# The axes are deliberately not shared, so each panel is scaled to its own
# feature's range. An explicit size and a figure-level title keep the plot
# readable when the notebook is skimmed.
dataset.plot(
    kind='box',
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
    figsize=(8, 6),
    title='Box plots of the Iris features',
)
plt.show()

In [None]:
# Histogram of every numeric feature, one panel per feature.
# An explicit size and a figure-level title keep the plot readable when the
# notebook is skimmed.
dataset.hist(figsize=(8, 6))
plt.suptitle('Histograms of the Iris features')
plt.show()

In [None]:
# Pairwise scatter plots of the numeric features.
# The grid has one row and column per feature, so it is given a larger square
# canvas than the default, plus a figure-level title.
scatter_matrix(dataset, figsize=(8, 8))
plt.suptitle('Scatter-plot matrix of the Iris features')
plt.show()

# Evaluate Algorithms

### Create Validation Dataset

In [3]:
# Hold back a validation split that is kept out of the cross-validation below.
validation_size = 0.20  # fraction of rows reserved for validation
seed = 7                # fixed so the split is the same on every run
array = dataset.values
X = array[:, :4]        # the four measurement columns
Y = array[:, 4]         # the class label column
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

In [4]:
# Evaluation settings shared by the model-comparison cell below.
scoring = 'accuracy'  # metric name handed to cross_val_score
seed = 7              # same value as the train/validation split, restated here

In [5]:
# Spot-check one representative of several classifier families.
# `seed` comes from the "Test options" cell. It pins the decision tree, whose
# split search involves randomness, so the CART score is the same on every run.
# NOTE(review): `multi_class` is reported as deprecated in recent scikit-learn
# releases; confirm against the installed version before removing it.
models = [
    ('LR', LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=200)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# One 10-fold splitter, built once and shared, so every model is scored on the
# same folds. It is left unshuffled and therefore deterministic.
kfold = model_selection.KFold(n_splits=10)

# Evaluate each model in turn, collecting per-fold scores and the model labels.
# NOTE: `names` rebinds the column-name list created in the data-loading cell;
# re-run that cell if the column names are needed again afterwards.
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report the mean and standard deviation of the fold scores for this model.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.983333 (0.033333)
LDA: 0.975000 (0.038188)
KNN: 0.983333 (0.033333)
CART: 0.975000 (0.038188)
NB: 0.975000 (0.053359)
SVM: 0.991667 (0.025000)


# Make Predictions

In [6]:
# Train the chosen SVM on the training split and score it on the held-out
# validation split.
svm = SVC(gamma='auto').fit(X_train, Y_train)
predictions = svm.predict(X_validation)

# Each report is a heading line followed by the metric; the trailing empty
# string emits the blank separator line between consecutive reports.
print("Accuracy:", accuracy_score(Y_validation, predictions), "", sep="\n")
print("Confusion matrix:", confusion_matrix(Y_validation, predictions), "", sep="\n")
print("Classification report:", classification_report(Y_validation, predictions), sep="\n")

Accuracy:
0.9333333333333333

Confusion matrix:
[[ 7  0  0]
 [ 0 10  2]
 [ 0  0 11]]

Classification report:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         7
Iris-versicolor       1.00      0.83      0.91        12
 Iris-virginica       0.85      1.00      0.92        11

       accuracy                           0.93        30
      macro avg       0.95      0.94      0.94        30
   weighted avg       0.94      0.93      0.93        30



In [7]:
# Persist the fitted SVM to disk so it can be reloaded later without retraining.
# The call stays the cell's last expression, so its return value is still shown.
joblib.dump(value=svm, filename='model_svm.pkl')

['model_svm.pkl']