In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Relative location of the penguins_size.csv dataset read by this notebook.
PENGUINS_CSV = "../input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv"
df = pd.read_csv(PENGUINS_CSV)

In [None]:
# Display the loaded DataFrame as this cell's output.
df

In [None]:
# Print the frame's column summary via DataFrame.info().
df.info()

## EDA

### Missing Data


In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Transposed summary statistics: one row per described column.
df.describe().T

In [None]:
# Density estimate of each measurement column. One loop replaces four
# copy-pasted cells; every column gets its own axes so the curves never share
# a scale or overlap.
kde_columns = ['culmen_length_mm', 'culmen_depth_mm',
               'flipper_length_mm', 'body_mass_g']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, col in zip(axes.ravel(), kde_columns):
    sns.kdeplot(df[col], ax=ax)
    ax.set_title(col)
fig.tight_layout()

In [None]:
df['body_mass_g'] = df['body_mass_g'].fillna(df['body_mass_g'].median())

In [None]:
df.isna().sum()

In [None]:
df[['culmen_length_mm' , 'culmen_depth_mm' , 'flipper_length_mm']] = df[['culmen_length_mm' , 'culmen_depth_mm' , 'flipper_length_mm']].fillna(df[['culmen_length_mm' , 'culmen_depth_mm' , 'flipper_length_mm']].mean())

In [None]:
# Re-count missing values after the mean fill.
df.isna().sum()

In [None]:
# Same missing-value count as the previous cell (duplicate check).
df.isna().sum()

In [None]:
# First rows of the frame after numeric imputation.
df.head()

In [None]:
# Tally each value of 'sex', counting NaN as its own entry.
df['sex'].value_counts(dropna=False)

In [None]:
# Discard rows whose sex is recorded as the placeholder '.'. Rows with a
# missing (NaN) sex compare unequal to '.' and are therefore kept.
# The explicit copy gives `df` its own data, so later column assignments act on
# an independent frame instead of a slice (avoids SettingWithCopyWarning).
df = df[df['sex'] != '.'].copy()

In [None]:
# Smallest body mass recorded within each sex group.
df.groupby('sex')['body_mass_g'].min()

In [None]:
# Rows labelled FEMALE whose body mass exceeds 3900.
df[(df['body_mass_g'] > 3900) & (df['sex'] == 'FEMALE')]

In [None]:
# Rows where sex is missing.
df[df['sex'].isna()]

In [None]:
def determine_sex(row, threshold=3900):
    """Guess a sex label from a record's body mass.

    Parameters
    ----------
    row : mapping-like
        Record exposing a ``'body_mass_g'`` entry (for example a DataFrame row
        passed by ``DataFrame.apply(..., axis=1)``).
    threshold : float, default 3900
        Cut-off in the same unit as ``row['body_mass_g']``. The default keeps
        the original hard-coded value, so existing one-argument calls behave
        exactly as before.

    Returns
    -------
    str
        ``'MALE'`` when the body mass is strictly greater than ``threshold``,
        otherwise ``'FEMALE'``. A NaN body mass compares false and therefore
        also yields ``'FEMALE'``.
    """
    return 'MALE' if row['body_mass_g'] > threshold else 'FEMALE'

In [None]:
# Fill in sex only where it is missing; rows that already carry a label are
# left untouched. Restricting the row-wise call to the missing rows avoids
# running Python code over every row of the frame, and the guard makes the
# cell a no-op when nothing is missing (e.g. when it is re-run).
missing_sex = df['sex'].isna()
if missing_sex.any():
    df.loc[missing_sex, 'sex'] = df.loc[missing_sex].apply(determine_sex, axis=1)

In [None]:
# Rows labelled FEMALE whose body mass exceeds 3900, re-checked after the sex fill.
df[(df['body_mass_g'] > 3900) & (df['sex'] == 'FEMALE')]

In [None]:
# Mean body mass within each sex group.
df.groupby('sex')['body_mass_g'].mean()

In [None]:
# Count missing values in each column after the sex fill.
df.isna().sum()

## Visualization


In [None]:
# First rows of the cleaned frame.
df.head()

In [None]:
# Number of rows per species.
sp = df['species'].value_counts()

In [None]:
# Bar chart of the per-species row counts.
sns.barplot(x=sp.index, y=sp.values)

In [None]:
sns.scatterplot(x='culmen_length_mm', y='culmen_depth_mm',
                data=df, hue='species', palette='Dark2')

In [None]:
sns.pairplot(df, hue='species', palette='Dark2')

In [None]:
sns.catplot(x='species', y='culmen_length_mm', data=df,
            kind='box', col='sex', palette='Dark2')

In [None]:
sns.catplot(x='species', y='body_mass_g', data=df,
            kind='box', col='sex', palette='Dark2')

## Feature Engineering


In [None]:
df.head()

In [None]:
X = pd.get_dummies(df.drop('species', axis=1), drop_first=True)

In [None]:
X

In [None]:
y = df['species']

## Train | Test Split


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=101,
)

In [None]:
# Class proportions in the full target.
y.value_counts(normalize=True)

In [None]:
# Class proportions in the training target.
y_train.value_counts(normalize=True)

In [None]:
# Class proportions in the test target.
y_test.value_counts(normalize=True)

# Decision Tree Classifier

## Default Hyperparameters


In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Baseline tree with default hyperparameters. random_state is pinned (same seed
# as the train/test split) because the fitted tree can otherwise differ between
# runs when several candidate splits score equally, which would make the
# metrics and plots below non-reproducible.
model = DecisionTreeClassifier(random_state=101)
model.fit(X_train, y_train)
base_pred = model.predict(X_test)

In [None]:
# Predicted species labels for the test rows.
base_pred

## Evaluation


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

In [None]:
cm = confusion_matrix(y_test, base_pred)

In [None]:
con = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
con.plot()

In [None]:
# Text report of the classification metrics for the baseline predictions.
print(classification_report(y_test, base_pred))

In [None]:
# First rows of the feature matrix, for reference against the importances below.
X.head()

In [None]:
# Raw importance values of the fitted baseline tree.
model.feature_importances_

In [None]:
# Importances labelled by feature name, shown as a one-column table.
importance_by_feature = pd.Series(model.feature_importances_, index=X.columns)
importance_by_feature.to_frame(name='Feature Importance')

## Visualize the Tree


In [None]:
from sklearn.tree import plot_tree

In [None]:
# Draw the fitted baseline tree. feature_names is passed as a plain list:
# some scikit-learn releases validate this argument and reject a pandas Index.
plt.figure(figsize=(12, 8), dpi=200)
plot_tree(model, feature_names=list(X.columns), filled=True)

## Reporting Model Results

To begin experimenting with hyperparameters, let's create a function that reports back classification results and plots out the tree.


In [None]:
def report_model(model):
    """Print test-set classification metrics for ``model`` and plot its tree.

    Relies on the notebook-level names ``X_test``, ``y_test`` and ``X`` (for
    the feature names), so it must be called after the train/test split cell.

    Parameters
    ----------
    model : fitted tree estimator
        Must provide ``predict`` and be accepted by ``plot_tree``.

    Returns
    -------
    None
        The report is printed and the tree is drawn on a new figure.
    """
    model_preds = model.predict(X_test)
    print(classification_report(y_test, model_preds))
    print('\n')
    plt.figure(figsize=(12, 8), dpi=150)
    # Pass a plain list: some scikit-learn releases validate feature_names and
    # reject a pandas Index.
    plot_tree(model, filled=True, feature_names=list(X.columns))

In [None]:
report_model(model)

In [None]:
pruned_tree = DecisionTreeClassifier(max_depth=2)
pruned_tree.fit(X_train, y_train)

In [None]:
report_model(pruned_tree)

In [None]:
pruned_tree = DecisionTreeClassifier(max_leaf_nodes=3)
pruned_tree.fit(X_train, y_train)

In [None]:
report_model(pruned_tree)

In [None]:
entropy_tree = DecisionTreeClassifier(criterion='entropy')
entropy_tree.fit(X_train, y_train)

In [None]:
report_model(entropy_tree)