In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import AdaBoostClassifier

## Import Dataset

In [2]:
# Fetch the "penguins" example dataset through seaborn's dataset loader.
df_pen = sns.load_dataset(name='penguins')

In [None]:
# Discard every row that has at least one missing value.
df_pen = df_pen.dropna(axis=0, how='any')

In [None]:
# Shuffle all rows and rebuild a clean 0..n-1 index.
# The seed makes the shuffle repeatable; without it the row order (and therefore
# the train/test split and every metric computed later) changes on each run.
df_pen = df_pen.sample(frac=1, random_state=101).reset_index(drop=True)

In [None]:
# Target: the 'species' column. Predictors: every remaining column.
y = df_pen['species']
X = df_pen.drop(columns=['species'])

## Data Pre-processing

In [None]:
# Predictors that will be one-hot encoded; all other predictor columns are
# collected as the numerical group.
categorical_x = ['island', 'sex']
numerical_x = X.drop(columns=categorical_x).columns

In [None]:
# No imputation is done here: rows with missing values were already removed
# with dropna(). If that step is ever dropped, fill the numerical columns with
# their mean and the categorical columns (and a categorical target) with their
# mode before encoding.

# One-hot encode the categorical predictors (the first level of each is dropped)
# and put the resulting dummy columns in front of the numerical ones.
categoricas = pd.get_dummies(data=X[categorical_x], drop_first=True)
X = pd.concat(objs=[categoricas, X[numerical_x]], axis='columns')

In [None]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=101,
)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the test split.
escalador = StandardScaler().fit(X_train)

X_train, X_test = escalador.transform(X_train), escalador.transform(X_test)

## Model implementation

In [None]:
# Baseline AdaBoost classifier with default hyperparameters.
# A fixed seed (same value as the train/test split) makes the fitted ensemble,
# and therefore the reported metrics, repeatable across kernel restarts.
model = AdaBoostClassifier(random_state=101)

In [None]:
# Train the classifier on the scaled training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels of the baseline model for the held-out test split.
base_pred = model.predict(X=X_test)

## Evaluation

In [None]:
# Predicted labels for the test split, used by the metrics that follow.
y_pred = model.predict(X=X_test)

In [None]:
# Fraction of test samples whose predicted label matches the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Raw confusion matrix: rows are true classes, columns are predicted classes.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Draw the confusion matrix of the model on the test split.
# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)

In [None]:
# Same confusion matrix, normalized over the true labels so each row sums to 1.
# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, normalize='true')

In [None]:
# The report is a pre-formatted text table, so it is printed rather than displayed.
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Show the feature importances exposed by the fitted AdaBoost model.
model.feature_importances_

In [None]:
# Label each importance with its column name from X and rank the features from
# most to least important.
df_feature_importance = (
    pd.DataFrame(
        data=model.feature_importances_,
        index=X.columns,
        columns=['Feature Importance'],
    )
    .sort_values(by='Feature Importance', ascending=False)
)
df_feature_importance

## Grid Search

In [None]:
# Fresh, unfitted AdaBoost classifier used as the base estimator of the grid search.
# A fixed seed keeps the cross-validation scores and the selected parameters
# repeatable from one run to the next.
model = AdaBoostClassifier(random_state=101)

In [None]:
# Search space: only the number of boosting rounds varies (15 to 35 in steps
# of 5); the learning rate is held at a single value.
n_estimators = list(range(15, 36, 5))
learning_rate = [1]

parametros = dict(n_estimators=n_estimators, learning_rate=learning_rate)

In [None]:
# 5-fold cross-validated grid search scored by accuracy, using all CPU cores.
grid_model = GridSearchCV(
    estimator=model,
    param_grid=parametros,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the search on the scaled training split.
grid_model.fit(X=X_train, y=y_train)

In [None]:
# Show the parameter combination the grid search selected as best.
grid_model.best_params_

In [None]:
# Predict the test split with the grid search object; this replaces the earlier y_pred.
y_pred = grid_model.predict(X=X_test)

### Evaluation

In [None]:
# Fraction of test samples whose predicted label matches the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Raw confusion matrix: rows are true classes, columns are predicted classes.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Draw the confusion matrix of the grid-searched model on the test split.
# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(grid_model, X_test, y_test)

In [None]:
# Same confusion matrix, normalized over the true labels so each row sums to 1.
# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
metrics.ConfusionMatrixDisplay.from_estimator(grid_model, X_test, y_test, normalize='true')

In [None]:
# The report is a pre-formatted text table, so it is printed rather than displayed.
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Show the feature importances of the best estimator found by the grid search.
grid_model.best_estimator_.feature_importances_

In [None]:
# Label the best estimator's importances with the column names from X and rank
# the features from most to least important.
df_feature_importance = (
    pd.DataFrame(
        data=grid_model.best_estimator_.feature_importances_,
        index=X.columns,
        columns=['Feature Importance'],
    )
    .sort_values(by='Feature Importance', ascending=False)
)
df_feature_importance

In [None]:
# Bar chart of the feature importances, one bar per feature.
plt.figure(figsize=(14, 6), dpi=200)
sns.barplot(data=df_feature_importance, x=df_feature_importance.index, y='Feature Importance')

# Label the figure so it can be read on its own; the x values come from the
# frame's index, so the axis label is set explicitly.
plt.xlabel('Feature')
plt.title('AdaBoost feature importance')
# Feature names are long, so rotate them; the trailing ';' hides the text repr.
plt.xticks(rotation=90);

## Elbow plot: test error vs. number of estimators

In [None]:
# For each ensemble size, record the test error rate and the absolute number
# of misclassified test samples.
errors = []
misclassifications = []

for n in range(1,64):
    # `rfc` is an AdaBoost model despite the name (kept so later cells that may
    # reference it keep working). The fixed seed makes the curve repeatable.
    rfc = AdaBoostClassifier(n_estimators=n, random_state=101)
    rfc.fit(X_train,y_train)
    preds = rfc.predict(X_test)
    # accuracy_score expects (y_true, y_pred) in that order.
    err = 1 - metrics.accuracy_score(y_test, preds)
    # The element-wise comparison gives a boolean mask (True where the prediction
    # is wrong); summing it counts the wrong predictions.
    n_missed = np.sum(preds != y_test)
    errors.append(err)
    misclassifications.append(n_missed)

In [None]:
# Test error rate for each ensemble size. The x-axis is derived from the
# recorded results (the loop starts at n_estimators=1) instead of repeating the
# loop bounds, so the plot stays in sync if the range changes.
plt.plot(range(1, len(errors) + 1), errors)
plt.xlabel('n_estimators')
plt.ylabel('Test error rate (1 - accuracy)')
plt.title('AdaBoost test error vs. number of estimators');

In [None]:
# Number of misclassified test samples for each ensemble size. The x-axis is
# derived from the recorded results (the loop starts at n_estimators=1) instead
# of repeating the loop bounds, so the plot stays in sync if the range changes.
plt.plot(range(1, len(misclassifications) + 1), misclassifications)
plt.xlabel('n_estimators')
plt.ylabel('Misclassified test samples')
plt.title('AdaBoost misclassifications vs. number of estimators');