# Ensemble comparison (Obligatory)
## Christian Berdejo Sánchez


## Imports

In [91]:
import warnings

import numpy as np
import pandas as pd

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import load_wine
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): with no category or module argument this filter hides every
# warning raised anywhere in the notebook, not only the noisy ones — consider
# narrowing it so genuine problems stay visible.
warnings.filterwarnings(action='ignore')                  # Turn off the warnings.

## Load data

In [92]:
# Load the Wine dataset shipped with scikit-learn. The returned object is indexed
# by key in the cells below ('data', 'feature_names', 'target', 'target_names').
data = load_wine()

In [93]:
# Feature matrix (explanatory variables), followed by the names of its columns.
X = data.data
print(data.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [94]:
# Dimensions of the feature matrix as (n_samples, n_features).
X.shape

(178, 13)

In [95]:
# Target vector (response variable).
Y = data.target

# Class names listed in reverse order, i.e. the last class name comes first.
label = list(data.target_names)[::-1]
print(label)

[np.str_('class_2'), np.str_('class_1'), np.str_('class_0')]


In [96]:
# Split into a training set and a test set: 30% of the samples are held out,
# and the fixed seed keeps the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=82,
)

## Base learners (individual classifiers)

In [97]:
# Classification tree tuned with a grid search.
#
# random_state is fixed (same seed as the train/test split) because the tree
# picks at random among candidate splits that improve the criterion equally;
# without a seed the selected model, and therefore the reported accuracy, can
# change from one run to the next.
DTC = DecisionTreeClassifier(random_state=82)

# Hyper-parameter grid, evaluated with 5-fold cross-validation on the training set.
rejilla = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 8, 16],
}

DTCGCV = GridSearchCV(estimator=DTC, param_grid=rejilla, scoring='accuracy', cv=5)
DTCGCV.fit(X_train, y_train)

# Best configuration, already refitted on the full training set by GridSearchCV.
DTC_best_estimator = DTCGCV.best_estimator_

y_pred = DTC_best_estimator.predict(X_test)
# accuracy_score takes (y_true, y_pred), in that order.
print("Tree accuracy : " + str(accuracy_score(y_test, y_pred)))

Tree accuracy : 0.9259259259259259


In [98]:
# Classification with KNN, tuned with a 10-fold grid search over the number of
# neighbours, the vote weighting and the distance metric.
KNN = KNeighborsClassifier()
rejilla = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'cosine', 'manhattan'],
}

KNNGCV = GridSearchCV(estimator=KNN, param_grid=rejilla, scoring='accuracy', cv=10)
KNNGCV.fit(X_train, y_train)
KNN_best_estimator = KNNGCV.best_estimator_

# The predictions must be stored in the same variable that is scored below.
# The previous version assigned them to `Y_pred` and then scored the stale
# `y_pred` left over from the decision-tree cell, so the number printed as
# "KNN accuracy" was really the tree's accuracy.
y_pred = KNN_best_estimator.predict(X_test)
# accuracy_score takes (y_true, y_pred), in that order.
print("KNN accuracy : " + str(accuracy_score(y_test, y_pred)))

KNN accuracy : 0.9259259259259259


In [99]:
# Classification with a support vector classifier.
#
# The instance gets its own name. The previous `SVC = SVC()` rebound the
# imported class to an instance, so running this cell a second time (or calling
# SVC(...) anywhere later) failed with "'SVC' object is not callable".
svc_model = SVC()

# 'degree' is intentionally absent from the grid: it only affects the 'poly'
# kernel, which is not among the candidates, so sweeping it merely repeated
# identical fits. 'gamma' is ignored by the linear kernel and 'coef0' only
# matters for 'sigmoid' here, but both are kept because other kernels in the
# list use them.
rejilla = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'gamma': [0.01, 0.1, 0.2, 0.3],
    'coef0': [0.1, 0.5, 1.0],
}

SVCGCV = GridSearchCV(estimator=svc_model, param_grid=rejilla, scoring='accuracy', cv=5)
SVCGCV.fit(X_train, y_train)

SVC_best_estimator = SVCGCV.best_estimator_

y_pred = SVC_best_estimator.predict(X_test)
# accuracy_score takes (y_true, y_pred), in that order.
print("SVC accuracy : " + str(accuracy_score(y_test, y_pred)))

SVC accuracy : 0.9259259259259259


In [100]:

# Gaussian Naive Bayes: the only tuned hyper-parameter is the variance-smoothing
# term, searched over 100 log-spaced values from 1 down to 1e-9 (5-fold CV).
NB = GaussianNB()
rejilla = {'var_smoothing': np.logspace(0, -9, num=100)}
NBGCV = GridSearchCV(
    estimator=NB,
    param_grid=rejilla,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

NB_best_estimator = NBGCV.best_estimator_

# Score the tuned model on the held-out test set.
y_pred = NB_best_estimator.predict(X_test)
print(f"Naive Bayes accuracy: {accuracy_score(y_test, y_pred)}")

Naive Bayes accuracy: 0.9444444444444444


## Ensemble classifiers

In [101]:
# Hard voting: the tuned tree, KNN and SVC each predict a class label and the
# ensemble returns the majority label.
hard_voters = [
    ('Tree', DTC_best_estimator),
    ('knn', KNN_best_estimator),
    ('SVC', SVC_best_estimator),
]
VCH = VotingClassifier(estimators=hard_voters, voting='hard')
VCH.fit(X_train, y_train)

y_pred = VCH.predict(X_test)
print(f"Voting Classifier Accuracy : {accuracy_score(y_test, y_pred)}")

Voting Classifier Accuracy : 0.9259259259259259


In [102]:
# SOFT VOTING: class probabilities of the base models are averaged and the
# most probable class is returned, so every member must expose predict_proba.
# The third member is the tuned Gaussian Naive Bayes model; it is now labelled
# 'NB' (it was previously registered under the misleading name 'SVC').
VCS = VotingClassifier(
    estimators=[
        ('Tree', DTC_best_estimator),
        ('knn', KNN_best_estimator),
        ('NB', NB_best_estimator),
    ],
    voting='soft',
)
VCS.fit(X_train, y_train)
y_pred = VCS.predict(X_test)
# accuracy_score takes (y_true, y_pred), in that order.
print("Voting Classifier Accuracy : " + str(accuracy_score(y_test, y_pred)))

Voting Classifier Accuracy : 0.9259259259259259


## Accuracy comparison
<table>
    <tr>
        <th>Model</th>
        <th>Accuracy</th>   
    </tr>
    <tr>
        <td>Decision Tree Classifier</td>
        <td>0.907</td>
    </tr>
    <tr>
        <td>KNeighbors Classifier</td>
        <td>0.926</td>
    </tr> 
    <tr>
        <td>SVC</td>
        <td>0.926</td>
    </tr>
    <tr>
        <td>GaussianNB</td>
        <td>0.944</td>
    </tr>
    <tr>
        <td>Hard voting (DTC, KNN, SVC)</td>
        <td>0.944</td>
    </tr>
    <tr>
        <td>Soft voting (DTC, KNN, GNB)</td>
        <td>0.926</td>
    </tr>
</table>

### Explicación
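Como se puede ver en la tabla, los modelos que mejor se comportan son GaussianNB y el Hard voting, ambos con un accuracy de 0.944. No obstante, el accuracy es muy similar en todos los modelos; esto se debe a que el dataset es muy sencillo y los modelos no tienen problemas para clasificarlo. Nota: las salidas de las celdas mostradas arriba (0.926 para el árbol de decisión y para el Hard voting) no coinciden con la tabla (0.907 y 0.944) porque proceden de ejecuciones distintas; los resultados del árbol pueden variar entre ejecuciones si no se fija la semilla.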

---
## Boosting Ensemble

In [104]:
# Base model: a decision stump (a tree of depth 1).
base_model = DecisionTreeClassifier(max_depth=1)

# Hyper-parameter grid for AdaBoost, evaluated with 5-fold cross-validation.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1, 10],
}

# random_state (same seed as the train/test split) is forwarded to the stumps
# built at each boosting round, which makes the search and the reported
# accuracy reproducible across runs.
ada = AdaBoostClassifier(base_model, random_state=82)
grid_search = GridSearchCV(ada, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

best_ada = grid_search.best_estimator_
y_pred = best_ada.predict(X_test)

# accuracy_score takes (y_true, y_pred), in that order.
print("AdaBoost Accuracy:", str(accuracy_score(y_test, y_pred)))


AdaBoost Accuracy: 0.9259259259259259


Al probar un algoritmo de boosting ensemble como AdaBoostClassifier, se obtiene un accuracy de 0.926, lo cual no mejora a los mejores modelos individuales: iguala a KNN y SVC (0.926) y queda por debajo de GaussianNB (0.944). Esto se debe probablemente a que el dataset es muy sencillo y no se necesita un algoritmo de boosting para clasificarlo.

Sería interesante cambiar el dataset por uno más complejo y ver los resultados.