In [10]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from palmerpenguins import load_penguins
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

In [2]:
data = load_penguins()
data = data.dropna()
data

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


## Gradient Boosting - No tuning

In [12]:
X = data.drop("species", axis = 1)
y = data["species"]
y = LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [13]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier())
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual Adelie", "Actual Gentoo", "Actual Chinstrap"], columns=["Predicted Adelie", "Predicted Gentoo", "Predicted Chinstrap"])

cm_df

Unnamed: 0,Predicted Adelie,Predicted Gentoo,Predicted Chinstrap
Actual Adelie,145,1,0
Actual Gentoo,0,68,0
Actual Chinstrap,0,0,119


## Gradient Boosting - Step Size Shrinkage and Loss Reduction

In [14]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier())
    ]
)

parameters = {
    "xgb__eta": [.1, .2, .3, .4, .5, .6, .7, .8, .9],
    "xgb__gamma": [0, 1, 2, 5, 10, 15, 25]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='matthews_corrcoef')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

In [15]:
print(f"Mean Matthews Correlation Coefficient: {np.mean(test_scores)}")

Mean Matthews Correlation Coefficient: 0.9637165507770352


In [16]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier(eta=.5, gamma=0))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual Adelie", "Actual Gentoo", "Actual Chinstrap"], columns=["Predicted Adelie", "Predicted Gentoo", "Predicted Chinstrap"])

cm_df

Unnamed: 0,Predicted Adelie,Predicted Gentoo,Predicted Chinstrap
Actual Adelie,145,1,0
Actual Gentoo,0,68,0
Actual Chinstrap,0,0,119


## Gradient Boosting - Lambda, Alpha

In [19]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier())
    ]
)

parameters = {
    "xgb__lambda": [.1, .5, 1, 5, 10],
    "xgb__alpha": [0, .1, .5, 1, 10]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='matthews_corrcoef')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

In [20]:
print(f"Mean Matthews Correlation Coefficient: {np.mean(test_scores)}")

Mean Matthews Correlation Coefficient: 0.9793668998912297


In [22]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier(alpha = .5, reg_lambda = 1))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual Adelie", "Actual Gentoo", "Actual Chinstrap"], columns=["Predicted Adelie", "Predicted Gentoo", "Predicted Chinstrap"])

cm_df

Unnamed: 0,Predicted Adelie,Predicted Gentoo,Predicted Chinstrap
Actual Adelie,145,1,0
Actual Gentoo,0,68,0
Actual Chinstrap,0,0,119


## Gradient Boosting - Tree Methods

In [23]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier())
    ]
)

parameters = {
    "xgb__tree_method": ["auto", "exact", "approx", "hist"]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='matthews_corrcoef')
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

In [24]:
print(f"Mean Matthews Correlation Coefficient: {np.mean(test_scores)}")

Mean Matthews Correlation Coefficient: 0.9814712913665706


In [28]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier(tree_method = "hist"))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual Adelie", "Actual Gentoo", "Actual Chinstrap"], columns=["Predicted Adelie", "Predicted Gentoo", "Predicted Chinstrap"])

cm_df

Unnamed: 0,Predicted Adelie,Predicted Gentoo,Predicted Chinstrap
Actual Adelie,145,1,0
Actual Gentoo,0,68,0
Actual Chinstrap,0,0,119


## Adaboost Comparison

In [29]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("adaboost", AdaBoostClassifier(algorithm="SAMME", estimator = RandomForestClassifier(), learning_rate=1))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

In [31]:
y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual Adelie", "Actual Gentoo", "Actual Chinstrap"], columns=["Predicted Adelie", "Predicted Gentoo", "Predicted Chinstrap"])

cm_df

Unnamed: 0,Predicted Adelie,Predicted Gentoo,Predicted Chinstrap
Actual Adelie,145,1,0
Actual Gentoo,0,68,0
Actual Chinstrap,0,0,119


The xgboost and the adaboost models performed equally well on this dataset. Tuning of the parameters did not change the outcomes of the model but this is likely a result of the dataset being easy for the model to accurately classify.