In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np 
from matplotlib import pyplot as plt
from sklearn import tree as treeClassifier
from sklearn.ensemble import RandomForestClassifier as forest
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB
import time

In [2]:
datos = load_iris()
datos.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [3]:
datos["target_names"]

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [4]:
dataset = pd.DataFrame(data = datos["data"], columns =  datos["feature_names"])
dataset["especie"] = datos["target"]
dataset

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),especie
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


## Queremos obtener un conjunto de reglas de árbol de decisión para clasificar la especie de flor con las 4 medidas que tenemos

### Separación de datos

In [5]:
x_train, x_test, y_train, y_test = train_test_split(dataset.drop('especie', axis = 1), dataset['especie'], test_size = 0.3)

In [6]:
# arbol de decision
start_arbol = time.time()
arbol = treeClassifier.DecisionTreeClassifier()
arbol.fit(x_train, y_train)
# predicción para los datos de prueba
pred_arbol = arbol.predict(x_test)
t_arbol = time.time() - start_arbol
ac_arbol = accuracy_score(y_pred=pred_arbol, y_true=y_test)
print("Accuracy score: {}".format(accuracy_score(y_pred=pred_arbol, y_true=y_test)))
print("Matriz de confusión:\n{}".format(confusion_matrix(y_pred=pred_arbol, y_true=y_test)))

Accuracy score: 0.9111111111111111
Matriz de confusión:
[[19  0  0]
 [ 0 12  0]
 [ 0  4 10]]


In [7]:
# bosque aleatorio
start_bosque = time.time()
bosque = forest(800)
bosque.fit(x_train, y_train)
# predicción para los datos de prueba
pred_bosque = bosque.predict(x_test)
t_bosque = time.time() - start_bosque
ac_bosque = accuracy_score(y_pred=pred_bosque, y_true=y_test)
print("Accuracy_score: {}".format(accuracy_score(y_pred=pred_bosque, y_true=y_test)))
print("Matriz de confusión:\n{}".format(confusion_matrix(y_pred=pred_bosque, y_true=y_test)))

Accuracy_score: 0.9333333333333333
Matriz de confusión:
[[19  0  0]
 [ 0 12  0]
 [ 0  3 11]]


In [8]:
# red neuronal
start_red = time.time()
redNeuronal = MLPClassifier(hidden_layer_sizes=(10,4,3),
                            max_iter=10000,
                            activation ='logistic',
                            tol= 1e-7)

redNeuronal.fit(x_train, y_train)
pred_rn = redNeuronal.predict(x_test)
t_red = time.time() - start_red
ac_rn = accuracy_score(y_pred=pred_rn, y_true=y_test)
print("Accuracy_score: {}".format(accuracy_score(y_pred=pred_rn, y_true=y_test)))
print("Matriz de confusión:\n{}".format(confusion_matrix(y_pred=pred_rn, y_true=y_test)))

Accuracy_score: 0.9333333333333333
Matriz de confusión:
[[19  0  0]
 [ 0 12  0]
 [ 0  3 11]]




In [9]:
# regresion logistica
start_logi = time.time()
logi = LogisticRegression()
logi.fit(x_train, y_train)
pred_logi = logi.predict(x_test)
t_logi = time.time() - start_logi
ac_logi = accuracy_score(y_pred=pred_logi, y_true=y_test)
print("Accuracy_score: {}".format(accuracy_score(y_pred=pred_logi, y_true=y_test)))
print("Matriz de confusión:\n{}".format(confusion_matrix(y_pred=pred_logi, y_true=y_test)))

Accuracy_score: 0.9777777777777777
Matriz de confusión:
[[19  0  0]
 [ 0 12  0]
 [ 0  1 13]]


In [10]:
# máquina de soporte vectorial
start_svm = time.time()
svm = SVC()
svm.fit(x_train, y_train)
pred_svm = svm.predict(x_test)
t_svm = time.time() - start_svm
ac_svm = accuracy_score(y_pred=pred_svm, y_true=y_test)
print("Accuracy_score: {}".format(accuracy_score(y_pred=pred_svm, y_true=y_test)))
print("Matriz de confusión:\n{}".format(confusion_matrix(y_pred=pred_svm, y_true=y_test)))

Accuracy_score: 0.9555555555555556
Matriz de confusión:
[[19  0  0]
 [ 0 12  0]
 [ 0  2 12]]


In [11]:
# naive bayes
start_nb = time.time()
nb = GaussianNB()
nb.fit(x_train, y_train)
pred_nb = nb.predict(x_test)
t_nb = time.time() - start_nb
ac_nb = accuracy_score(y_pred=pred_nb, y_true=y_test)
print("Accuracy_score: {}".format(accuracy_score(y_pred=pred_nb, y_true=y_test)))
print("Matriz de confusión:\n{}".format(confusion_matrix(y_pred=pred_nb, y_true=y_test)))

Accuracy_score: 0.9555555555555556
Matriz de confusión:
[[19  0  0]
 [ 0 12  0]
 [ 0  2 12]]


In [12]:
resultados = pd.DataFrame({'modelo': ['naive bayes', 'regresión logística', 'arbol', 'bosque', 'red neuronal', 'svm'],
'accuracy': [ac_nb, ac_logi, ac_arbol, ac_bosque, ac_rn, ac_svm],
'tiempo': [t_nb, t_logi, t_arbol, t_bosque, t_red, t_svm]})
resultados

Unnamed: 0,modelo,accuracy,tiempo
0,naive bayes,0.955556,0.00504
1,regresión logística,0.977778,0.048472
2,arbol,0.911111,0.005989
3,bosque,0.933333,2.55313
4,red neuronal,0.933333,10.259902
5,svm,0.955556,0.007978


## Selección de hiperparámetros

In [14]:
# ejemplo de grid search con el random forest
from sklearn.model_selection import GridSearchCV
start_gs = time.time()
bosque = forest()
parametros = {'n_estimators': [100,200,400,800], 'max_depth': [4,8,12,16]}
gs = GridSearchCV(bosque, parametros)
gs.fit(x_train, y_train)
pred_mejor_bosque = gs.predict(x_test)
t_gs = time.time() - start_gs
ac_mb = accuracy_score(y_pred=pred_mejor_bosque, y_true=y_test)
print("Accuracy_score: {}".format(accuracy_score(y_pred=pred_mejor_bosque, y_true=y_test)))
print("Matriz de confusión:\n{}".format(confusion_matrix(y_pred=pred_mejor_bosque, y_true=y_test)))

Accuracy_score: 0.9333333333333333
Matriz de confusión:
[[19  0  0]
 [ 0 12  0]
 [ 0  3 11]]


In [16]:
resultados = pd.DataFrame({'modelo': ['naive bayes', 'regresión logística', 'arbol', 'bosque', 'red neuronal', 'svm', 'bosque con gs'],
'accuracy': [ac_nb, ac_logi, ac_arbol, ac_bosque, ac_rn, ac_svm, ac_mb],
'tiempo': [t_nb, t_logi, t_arbol, t_bosque, t_red, t_svm, t_gs]})
resultados

Unnamed: 0,modelo,accuracy,tiempo
0,naive bayes,0.955556,0.00504
1,regresión logística,0.977778,0.048472
2,arbol,0.911111,0.005989
3,bosque,0.933333,2.55313
4,red neuronal,0.933333,10.259902
5,svm,0.955556,0.007978
6,bosque con gs,0.933333,116.476755
