In [None]:
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC



In [None]:
# Load the iris CSV from the UCI repository. The column labels are passed
# explicitly through `names` rather than being read from the file.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
col_names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'class',
]
iris_data = pandas.read_csv(filepath_or_buffer=url, names=col_names)

# Summarize the dataset

In [None]:
# Quick look at the data, printed in order:
#   1. shape   -> number of rows and columns in the dataset
#   2. head    -> the first 20 rows
#   3. describe-> descriptive statistics for the dataset
for summary in (iris_data.shape, iris_data.head(20), iris_data.describe()):
    print(summary)

# Class Distribution

In [None]:
# Count how many rows belong to each value of the 'class' column.
class_counts = iris_data.groupby('class').size()
print(class_counts)

# Data Visualization

In [None]:
# Univariate plots to help us understand each attribute

# Box-and-whisker plots, one panel per attribute arranged in a 2x2 grid with
# independent axes. `subplots` must be the boolean True: the previous string
# value 'False' was not a boolean at all (a non-empty string is truthy, and
# newer pandas releases may reject a non-boolean value here), and the
# `layout` argument only makes sense when each attribute gets its own panel.
iris_data.plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False)
plt.show()

# Histograms of the attributes.
iris_data.hist()
plt.show()

In [None]:
# Multivariate view: a grid of pairwise scatter plots showing how the
# variables interact with one another.
scatter_matrix(frame=iris_data)
plt.show()


# Evaluate Some Algorithms

In [None]:
# Create a Validation Set
# Columns 0-3 of the raw array are the four measurements (features) and
# column 4 is the class label, following the column order given at load time.
iris_array = iris_data.values
X, Y = iris_array[:, :4], iris_array[:, 4]

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable from run to run.
validation_ratio = .2
seed = 7
split_parts = model_selection.train_test_split(
    X,
    Y,
    test_size=validation_ratio,
    random_state=seed,
)
X_train, X_validation, Y_train, Y_validation = split_parts


In [None]:
# Test Harness
# Settings shared by the 10-fold cross-validation runs used to estimate
# accuracy: the random seed and the name of the scoring metric.
seed, scoring = 7, 'accuracy'

# Building Algorithms

In [None]:
# Spot Check Algorithms
# Each entry pairs a short label (used in the printed report and later as a
# plot tick label) with an untrained classifier using default settings.
models_to_use = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Evaluate the models one at a time

# A single 10-fold splitter is built once and reused for every model.
# shuffle=True is required for random_state to have any effect: current
# scikit-learn raises a ValueError when random_state is set without
# shuffling, and older releases silently ignored the seed.
k_fold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

model_results = []  # per-model arrays of fold scores, in evaluation order
names = []          # model labels, kept parallel to model_results
for name, model in models_to_use:
    cv_results = model_selection.cross_val_score(
        model, X_train, Y_train, cv=k_fold, scoring=scoring
    )
    model_results.append(cv_results)
    # Record the label of this model (previously the list appended itself,
    # leaving no usable labels for the comparison plot).
    names.append(name)
    # Report: label, mean score across folds, standard deviation across folds.
    print(name, cv_results.mean(), cv_results.std())

# Compare Algorithms

In [None]:

# Compare Algorithms
# Draw one box per entry of `model_results` on a single set of axes and
# label the x ticks with the entries of `names`.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(1, 1, 1)
ax.boxplot(model_results)
ax.set_xticklabels(names)
plt.show()

# Make Predictions

In [None]:
# Fit k-nearest-neighbours on the training split, predict the held-out
# validation rows, then print accuracy, confusion matrix and the per-class
# classification report, in that order.
knn = KNeighborsClassifier().fit(X_train, Y_train)
predictions = knn.predict(X_validation)
for report in (accuracy_score, confusion_matrix, classification_report):
    print(report(Y_validation, predictions))

In [None]:
# Fit a support vector classifier on the training split, predict the
# held-out validation rows, then print accuracy, confusion matrix and the
# per-class classification report, in that order.
svm = SVC().fit(X_train, Y_train)
predictions = svm.predict(X_validation)
for report in (accuracy_score, confusion_matrix, classification_report):
    print(report(Y_validation, predictions))