In [65]:
import pandas as pd
import numpy as np
from palmerpenguins import load_penguins
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix

In [14]:
data = load_penguins()
data = data.dropna()
data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


## Initial Adaboost Estimator Selection

In [21]:
X = data.drop("species", axis = 1)
y = data["species"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME"))
    ]
)

parameters = {
    "adaboost__estimator": [DecisionTreeClassifier(), RandomForestClassifier(), SVC()]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='matthews_corrcoef')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

In [24]:
print(f"Matthew's Correlation Coefficient scores for \nDecision Tree: {test_scores[0]} \nRandom Forest: {test_scores[1]} \nSVC: {test_scores[2]}")

Matthew's Correlation Coefficient scores for 
Decision Tree: 0.9626212199210565 
Random Forest: 0.9814041447529931 
SVC: 0.0


Given these values, I will be using a random forest as the base model for Adaboost

## Adaboost Model 1: Random Forest, no parameter tuning

In [35]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME", estimator = RandomForestClassifier()))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

In [46]:
y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual Adelie", "Actual Gentoo", "Actual Chinstrap"], columns=["Predicted Adelie", "Predicted Gentoo", "Predicted Chinstrap"])

cm_df

Unnamed: 0,Predicted Adelie,Predicted Gentoo,Predicted Chinstrap
Actual Adelie,146,0,0
Actual Gentoo,0,68,0
Actual Chinstrap,0,0,119


## Adaboost Model 2: Random Forest, number of estimators tuning

In [37]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME", estimator = RandomForestClassifier()))
    ]
)

parameters = {
    "adaboost__n_estimators": [10, 25, 50, 75, 100, 150]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='matthews_corrcoef')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

In [39]:
print(f"MCC for \n10 Estimators: {test_scores[0]} \n25 Estimators: {test_scores[1]} \n50 Estimators: {test_scores[2]} \n75 Estimators: {test_scores[3]} \n100 Estimators: {test_scores[4]} \n150 Estimators: {test_scores[5]}")

MCC for 
10 Estimators: 0.990822492083384 
25 Estimators: 0.9814041447529931 
50 Estimators: 0.9861574617810736 
75 Estimators: 0.9861574617810736 
100 Estimators: 0.990822492083384 
150 Estimators: 0.9861574617810736


In [40]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME", estimator = RandomForestClassifier(), n_estimators = 10))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

In [45]:
y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual Adelie", "Actual Gentoo", "Actual Chinstrap"], columns=["Predicted Adelie", "Predicted Gentoo", "Predicted Chinstrap"])

cm_df

Unnamed: 0,Predicted Adelie,Predicted Gentoo,Predicted Chinstrap
Actual Adelie,146,0,0
Actual Gentoo,0,68,0
Actual Chinstrap,0,0,119


## Adaboost Model 3: Random Forest, learning rate tuning

In [56]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME", estimator = RandomForestClassifier()))
    ]
)

parameters = {
    "adaboost__learning_rate": [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='matthews_corrcoef')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

In [58]:
print(f"MCC for \nLearning Rate = 1e-10: {test_scores[0]} \nLearning Rate = 1e-9: {test_scores[1]} \nLearning Rate = 1e-8: {test_scores[2]} \nLearning Rate = 1e-7: {test_scores[3]} \nLearning Rate = 1e-6: {test_scores[4]} \nLearning Rate = 1e-5: {test_scores[5]} \nLearning Rate = 1e-4: {test_scores[6]} \nLearning Rate = 1e-3: {test_scores[7]} \nLearning Rate = 1e-2: {test_scores[8]} \nLearning Rate = 1e-1: {test_scores[9]} \nLearning Rate = 1e0: {test_scores[10]} \nLearning Rate = 1e1: {test_scores[11]} \nLearning Rate = 1e2: {test_scores[12]} \nLearning Rate = 1e3: {test_scores[13]} \nLearning Rate = 1e4: {test_scores[14]} \nLearning Rate = 1e5: {test_scores[15]}")

MCC for 
Learning Rate = 1e-10: 0.9861574617810736 
Learning Rate = 1e-9: 0.9861574617810736 
Learning Rate = 1e-8: 0.9861574617810736 
Learning Rate = 1e-7: 0.9861574617810736 
Learning Rate = 1e-6: 0.9861574617810736 
Learning Rate = 1e-5: 0.9861574617810736 
Learning Rate = 1e-4: 0.990822492083384 
Learning Rate = 1e-3: 0.9814041447529931 
Learning Rate = 1e-2: 0.9814712913665706 
Learning Rate = 1e-1: 0.9861574617810736 
Learning Rate = 1e0: 0.990822492083384 
Learning Rate = 1e1: 0.990822492083384 
Learning Rate = 1e2: 0.9861574617810736 
Learning Rate = 1e3: 0.9861574617810736 
Learning Rate = 1e4: 0.9861574617810736 
Learning Rate = 1e5: 0.9861574617810736


In [59]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME", estimator = RandomForestClassifier(), learning_rate=1))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

In [60]:
y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual Adelie", "Actual Gentoo", "Actual Chinstrap"], columns=["Predicted Adelie", "Predicted Gentoo", "Predicted Chinstrap"])

cm_df

Unnamed: 0,Predicted Adelie,Predicted Gentoo,Predicted Chinstrap
Actual Adelie,146,0,0
Actual Gentoo,0,68,0
Actual Chinstrap,0,0,119


## Comparison Model: Stacking

In [68]:
# Stacking Classifier
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

estimators = [
    ("DecisionTree", DecisionTreeClassifier()),
    ("RandomForest", RandomForestClassifier()),
    ("ADABoost", AdaBoostClassifier(estimator=RandomForestClassifier(), algorithm="SAMME"))
]

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("stacking", StackingClassifier(n_jobs= -1, estimators = estimators, final_estimator = RandomForestClassifier()))
    ]
)

fitted_pipeline = my_pipeline.fit(X = X_train, y = y_train)

y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual Adelie", "Actual Gentoo", "Actual Chinstrap"], columns=["Predicted Adelie", "Predicted Gentoo", "Predicted Chinstrap"])

cm_df

Unnamed: 0,Predicted Adelie,Predicted Gentoo,Predicted Chinstrap
Actual Adelie,145,1,0
Actual Gentoo,0,68,0
Actual Chinstrap,0,0,119


## Model Tuning Analysis

While it was difficult to determine what tuning steps had the largest effect on the accuracy of the model due to its high baseline accuracy, some patterns could be discerned. To begin with, the best estimator that I found to use with Adaboost was a Random Forest Model. This model was slightly more accurate than a basic Decision Tree. From here, I tuned the number of estimators and the learning rate. It appears that a smaller number of estimators is better, at least to a certain point. This also reduces the amount of time it takes to run the model. For learning rate, it appeared to follow a parabolic shape, where the maximum value came with a learning rate of 1 and the minimums came at the min/max tested values. Comparing to the Stacking model, the Adaboost model is slighly more accurate and did not introduce any false positives. 