In [1]:
# -------------------------------
# GaussianNB: Effect of Train/Test Split
# -------------------------------
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load dataset, discarding every row that contains a missing value
penguins = sns.load_dataset("penguins").dropna()

# Features (four measurement columns) and target (species name)
X = penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
y = penguins['species']

# Encode target labels as integers
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Test set fractions to explore
test_sizes = [0.1, 0.2, 0.3]

print("Train/Test Split Analysis for GaussianNB\n")
for test_size in test_sizes:
    # A fixed random_state makes each split reproducible between runs.
    # NOTE: no `stratify` argument is passed, so class proportions in the
    # train and test parts are left to the random shuffle.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=test_size, random_state=0
    )

    # Fit a fresh model for every split so that runs do not influence each other
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    err = 1 - acc

    # Use the percent format spec instead of int(test_size*100): int()
    # truncates, so a fraction such as 0.29 (0.29*100 == 28.999999999999996)
    # would be reported as 28%.
    print(f"Test size = {test_size:.0%}")
    print(f"Accuracy: {acc:.4f}, Error rate: {err:.4f}")
    print("-" * 40)


Train/Test Split Analysis for GaussianNB

Test size = 10%
Accuracy: 1.0000, Error rate: 0.0000
----------------------------------------
Test size = 20%
Accuracy: 0.9701, Error rate: 0.0299
----------------------------------------
Test size = 30%
Accuracy: 0.9600, Error rate: 0.0400
----------------------------------------


### GaussianNB: Effect of Train/Test Split

- As the test set size increases:
  - Training data decreases, which may slightly reduce accuracy.
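  - Smaller test sets (e.g., 10%) can produce higher variability in results due to sampling; the perfect score at 10% above is measured on only about 34 samples, so it should not be read as the model being error-free.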
- Accuracy and error rates are relatively stable across reasonable splits (10–30%).
- Extreme splits (very small or very large test sets) can lead to unreliable performance estimates.
