In [1]:
# -------------------------------
# Effect of Feature Scaling on GaussianNB
# -------------------------------
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fetch the breast-cancer data and pull out the feature matrix and labels
data = load_breast_cancer()
X = data.data
y = data.target

# Reserve 30% of the samples for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)

# -------------------------------
# Baseline: GaussianNB on the raw features
# -------------------------------
gnb = GaussianNB().fit(X_train, y_train)  # fit() hands back the fitted estimator
y_pred = gnb.predict(X_test)
acc_unscaled = accuracy_score(y_test, y_pred)
err_unscaled = 1 - acc_unscaled

# One print call, newline-separated, for the whole report
print(
    "GaussianNB without scaling",
    f"Accuracy: {acc_unscaled:.4f}",
    f"Error rate: {err_unscaled:.4f}",
    "-" * 40,
    sep="\n",
)

# -------------------------------
# Comparison: GaussianNB on standardized features
# -------------------------------
# The scaler is fitted on the training split only, then applied to both
# splits, so the test data never influences the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

gnb_scaled = GaussianNB().fit(X_train_scaled, y_train)
y_pred_scaled = gnb_scaled.predict(X_test_scaled)
acc_scaled = accuracy_score(y_test, y_pred_scaled)
err_scaled = 1 - acc_scaled

print(
    "GaussianNB with standard scaling",
    f"Accuracy: {acc_scaled:.4f}",
    f"Error rate: {err_scaled:.4f}",
    "-" * 40,
    sep="\n",
)


GaussianNB without scaling
Accuracy: 0.9240
Error rate: 0.0760
----------------------------------------
GaussianNB with standard scaling
Accuracy: 0.9123
Error rate: 0.0877
----------------------------------------


### Observations

- GaussianNB assumes that, within each class, every feature follows a normal distribution, and it estimates a separate mean and variance for each feature independently of the other features.
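- Feature scaling does **not significantly affect performance** (here 0.9240 vs. 0.9123 accuracy), unlike distance-based classifiers (e.g., k-NN): because each feature's likelihood is modeled with its own mean and variance, a linear rescaling of a feature is largely absorbed by those estimates.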
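- Slight differences may still occur because scikit-learn's `var_smoothing` adds a small fraction of the *largest* feature variance to every feature's variance for numerical stability; on unscaled data, features with very large numerical ranges dominate this term and can distort the variances of small-range features, but accuracy generally remains similar.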
