In [69]:
import numpy as np
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn import cross_validation

In [70]:
#The iris dataset is a classic and very easy multi-class classification dataset.
#This data sets consists of 3 different types of irises’ (Setosa, Versicolour, and Virginica) petal and sepal length, stored in a 150x4 numpy.ndarray
#The rows being the samples and the columns being: Sepal Length, Sepal Width, Petal Length and Petal Width.
iris = datasets.load_iris()
#samples of the flowers [Sepal length, Sepal width, Petal length, Petal width]
iris_x = iris.data
#clasification of the flowers 0 1 2 each correspond to one of the species (setosa,versicolor,virginica)
iris_y = iris.target
#Finds the unique elements of the array
np.unique(iris_y)

array([0, 1, 2])

In [71]:
# Split iris data in train and test data
# A random permutation, to split the data randomly
#iris_x has to be an np array in order to be indexed
#iris_y has to be an np array in order to be indexed
np.random.seed(0)
#creates a random array with size the x dimension
indices = np.random.permutation(len(iris_x))

#All the indexes from the length of array -10 until the first index
iris_x_train = iris_x[indices[:-10]]
iris_y_train = iris_y[indices[:-10]]
#Ten last indexes counting from the last of the index array
iris_x_test = iris_x[indices[-10:]]
iris_y_test = iris_y[indices[-10:]]

In [72]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(iris_x_train, iris_y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [73]:

rfc.predict(iris_x_test)

array([1, 2, 1, 0, 0, 0, 2, 1, 2, 0])

In [74]:
iris_y_test

array([1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

In [75]:
accuracy_score(iris_y_test, rfc.predict(iris_x_test))


0.90000000000000002

In [76]:
#Error is the total probability minus the accuracy score
print("Error of the classifier based on accuracy value:")
1-accuracy_score(iris_y_test, rfc.predict(iris_x_test))

Error of the classifier based on accuracy value:


0.099999999999999978

In [77]:
from sklearn.linear_model import Perceptron
per = Perceptron()
per.fit(iris_x_train, iris_y_train)

Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)

In [78]:
per.predict(iris_x_test)

array([0, 2, 1, 0, 0, 0, 2, 0, 2, 0])

In [79]:
iris_y_test

array([1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

In [80]:
accuracy_score(iris_y_test, per.predict(iris_x_test))

0.69999999999999996

In [81]:
#Error is the total probability minus the accuracy score
print("Error of the classifier based on accuracy value:")
1-accuracy_score(iris_y_test, per.predict(iris_x_test))

Error of the classifier based on accuracy value:


0.30000000000000004

In [82]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(iris_x_train, iris_y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [83]:
knn.predict(iris_x_test)

array([1, 2, 1, 0, 0, 0, 2, 1, 2, 0])

In [84]:
iris_y_test

array([1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

In [85]:
accuracy_score(iris_y_test, knn.predict(iris_x_test))

0.90000000000000002

In [86]:
#Error is the total probability minus the accuracy score
print("Error of the classifier based on accuracy value:")
1-accuracy_score(iris_y_test, knn.predict(iris_x_test))

Error of the classifier based on accuracy value:


0.099999999999999978

In [87]:
print("Cross classification values:")
print("nearest neighbors")
scores = cross_validation.cross_val_score(knn,iris_x,iris_y)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("random forests")
scores = cross_validation.cross_val_score(rfc,iris_x,iris_y)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("perceptron")
scores = cross_validation.cross_val_score(per,iris_x,iris_y)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross classification values:
nearest neighbors
Accuracy: 0.99 (+/- 0.02)
random forests
Accuracy: 0.97 (+/- 0.02)
perceptron
Accuracy: 0.52 (+/- 0.29)


In [None]:
#In my opinion both nearest neighbors and random forest have similar and very good performances according to cross_validation
#measures. Perceptron predicts much poorly compared with the other two algorithms with the parameters set by default. A tuning 
#must be made to see if the results can be improved for this method. 