# Palmer Penguins Modeling

Import the Palmer Penguins dataset and print out the first few rows.

Suppose we want to predict `species` using the other variables in the dataset.

**Dummify** all variables that require this.

In [156]:
# Code Here
import pandas as pd 
import numpy as np
from palmerpenguins import load_penguins
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, label_binarize
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, f1_score, roc_curve

In [None]:
# penguins['Adelie'] = (penguins['species'] == 'Adelie').astype(int)
# penguins['Gentoo'] = (penguins['species'] == 'Gentoo').astype(int)
# penguins['Chinstrap'] = (penguins['species'] == 'Chinstrap').astype(int)

In [23]:
penguins = load_penguins()
penguins = penguins.dropna()
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


In [157]:
X = penguins.drop(["species"], axis = 1)
y = penguins["species"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

Let's use the other variables to predict `species`. Prepare your data and fit the following models on the entire dataset:

* Two kNN models (for different values of K)
* Two decision tree models (for different complexities of trees)

Compute the following, for each of your models, on test data. Keep in mind that you may need to stratify your creation of the training and test data.

* Confusion matrix
* Overall Accuracy
* Precision, Recall, AUC, and F1-score for each species

Create one ROC plot for the species of your choice.

In [158]:
# kNN Model 1 
ct = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

knn_pipeline_1 = Pipeline(
    [
        ("preprocessing", ct),
        ("knn", KNeighborsClassifier(n_neighbors=5))
    ]
)

fitted_pipeline = knn_pipeline_1.fit(X_train, y_train)
y_pred = fitted_pipeline.predict(X_test)

In [160]:
# Confusion Matrix
cmatrix = confusion_matrix(y_true = y_test, y_pred=y_pred, labels = ["Adelie", "Gentoo", "Chinstrap"])

# Overall Accuracy
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)

# Precision
precision = precision_score(y_true=y_test, y_pred=y_pred, average = None)

# Recall
recall = recall_score(y_true = y_test, y_pred = y_pred, average = None)

# AUC (need to fix)
auc = roc_auc_score(y_score = fitted_pipeline.predict_proba(X_test), y_true = y_test, multi_class = "ovr", average="macro")

# F1 Score
f1 = f1_score(y_pred = y_pred, y_true = y_test, average = None)
print(f"Confusion Matrix:\n {cmatrix} \nOverall Accuracy: {accuracy} \nPrecision: {precision} \nRecall: {recall} \nAUC: {auc} \nF1 Score: {f1}")

Confusion Matrix:
 [[35  0  1]
 [ 0 33  0]
 [ 1  0 14]] 
Overall Accuracy: 0.9761904761904762 
Precision: [0.97222222 0.93333333 1.        ] 
Recall: [0.97222222 0.93333333 1.        ] 
AUC: 0.9992275563607086 
F1 Score: [0.97222222 0.93333333 1.        ]


In [129]:
# kNN Model 2
knn_pipeline_2 = Pipeline(
    [
        ("preprocessing", ct),
        ("knn", KNeighborsClassifier(n_neighbors=25))
    ]
)

fitted_pipeline = knn_pipeline_2.fit(X_train, y_train)
y_pred = fitted_pipeline.predict(X_test)

In [130]:
# Confusion Matrix
cmatrix = confusion_matrix(y_true = y_test, y_pred=y_pred, labels = ["Adelie", "Gentoo", "Chinstrap"])

# Overall Accuracy
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)

# Precision
precision = precision_score(y_true=y_test, y_pred=y_pred, average = None)

# Recall
recall = recall_score(y_true = y_test, y_pred = y_pred, average = None)

# AUC (need to fix)
auc = roc_auc_score(y_score = fitted_pipeline.predict_proba(X_test), y_true = y_test, multi_class="ovr", average="macro")

# F1 Score
f1 = f1_score(y_pred = y_pred, y_true = y_test, average = None)
print(f"Confusion Matrix:\n {cmatrix} \nOverall Accuracy: {accuracy} \nPrecision: {precision} \nRecall: {recall} \nAUC: {auc} \nF1 Score: {f1}")

Confusion Matrix:
 [[28  0  0]
 [ 0 38  0]
 [ 1  0 17]] 
Overall Accuracy: 0.9880952380952381 
Precision: [0.96551724 1.         1.        ] 
Recall: [1.         0.94444444 1.        ] 
AUC: 0.9995068313520695 
F1 Score: [0.98245614 0.97142857 1.        ]


In [108]:
# Decision Tree 1
dtree_pipeline_1 = Pipeline(
    [
        ("preprocessing", ct),
        ("dtree", DecisionTreeClassifier(ccp_alpha=.1))
    ]
)

fitted_pipeline = dtree_pipeline_1.fit(X_train, y_train)
y_pred = fitted_pipeline.predict(X_test)

In [109]:
# Confusion Matrix
cmatrix = confusion_matrix(y_true = y_test, y_pred=y_pred, labels = ["Adelie", "Gentoo", "Chinstrap"])

# Overall Accuracy
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)

# Precision
precision = precision_score(y_true=y_test, y_pred=y_pred, average = None)

# Recall
recall = recall_score(y_true = y_test, y_pred = y_pred, average = None)

# AUC (need to fix)
auc = roc_auc_score(y_score = fitted_pipeline.predict_proba(X_test), y_true = y_test, multi_class="ovr", average="macro")

# F1 Score
f1 = f1_score(y_pred = y_pred, y_true = y_test, average = None)
print(f"Confusion Matrix:\n {cmatrix} \nOverall Accuracy: {accuracy} \nPrecision: {precision} \nRecall: {recall} \nAUC: {auc} \nF1 Score: {f1}")

Confusion Matrix:
 [[25  0  3]
 [ 2 36  0]
 [ 1  0 17]] 
Overall Accuracy: 0.9285714285714286 
Precision: [0.89285714 0.85       1.        ] 
Recall: [0.89285714 0.94444444 0.94736842] 
AUC: 0.9629008134238601 
F1 Score: [0.89285714 0.89473684 0.97297297]


In [127]:
# Decision Tree 2
dtree_pipeline_2 = Pipeline(
    [
        ("preprocessing", ct),
        ("dtree", DecisionTreeClassifier(ccp_alpha=1e-5))
    ]
)

fitted_pipeline = dtree_pipeline_2.fit(X_train, y_train)
y_pred = fitted_pipeline.predict(X_test)

In [128]:
# Confusion Matrix
cmatrix = confusion_matrix(y_true = y_test, y_pred=y_pred, labels = ["Adelie", "Gentoo", "Chinstrap"])

# Overall Accuracy
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)

# Precision
precision = precision_score(y_true=y_test, y_pred=y_pred, average = None)

# Recall
recall = recall_score(y_true = y_test, y_pred = y_pred, average = None)

# AUC (need to fix)
auc = roc_auc_score(y_score = fitted_pipeline.predict_proba(X_test), y_true = y_test, multi_class="ovr", average="macro")

# F1 Score
f1 = f1_score(y_pred = y_pred, y_true = y_test, average = None)
print(f"Confusion Matrix:\n {cmatrix} \nOverall Accuracy: {accuracy} \nPrecision: {precision} \nRecall: {recall} \nAUC: {auc} \nF1 Score: {f1}")

Confusion Matrix:
 [[28  0  0]
 [ 1 37  0]
 [ 1  0 17]] 
Overall Accuracy: 0.9761904761904762 
Precision: [0.93333333 1.         1.        ] 
Recall: [1.         0.94444444 0.97368421] 
AUC: 0.9804023948760792 
F1 Score: [0.96551724 0.97142857 0.98666667]
