In [1]:
# -------------------------------
# Gaussian Naive Bayes on Penguins
# -------------------------------
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# --- Data preparation -------------------------------------------------
# Fetch the penguins dataset, then discard every row that has a missing
# value in any column.
penguins = sns.load_dataset("penguins")
penguins = penguins.dropna()

# The four measurement columns used as predictors; species is the label.
feature_columns = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
X = penguins[feature_columns]
y = penguins['species']

# Map the species labels onto integer class codes.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# --- Hold-out evaluation ----------------------------------------------
# 70% train / 30% test, with a fixed seed so the split is reproducible.
split = train_test_split(X, y_encoded, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split

# Train the Gaussian Naive Bayes classifier on the training portion only.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Error Rate: {error_rate:.4f}")

# --- 5-fold cross-validation ------------------------------------------
# Runs over the full cleaned dataset (not just the training split).
# cross_val_score works on clones of the estimator, so the model fitted
# above is not modified by this step.
cv_scores = cross_val_score(gnb, X, y_encoded, cv=5)
cv_mean_accuracy = cv_scores.mean()
cv_error_rate = 1 - cv_mean_accuracy

print(f"5-fold CV Mean Accuracy: {cv_mean_accuracy:.4f}")
print(f"5-fold CV Mean Error Rate: {cv_error_rate:.4f}")


Test Accuracy: 0.9600
Test Error Rate: 0.0400
5-fold CV Mean Accuracy: 0.9700
5-fold CV Mean Error Rate: 0.0300
