In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Split the dataset into features (X) and target (y)
X = data.drop(columns=['Outcome'])
y = data['Outcome']

# Gaussian Naive Bayes
'''
Explanation:
Why Use StandardScaler?
Gaussian Naive Bayes assumes that features follow a Gaussian (normal) distribution. StandardScaler standardizes the dataset by centering the mean at 0 and scaling the variance to 1.
Standardizing ensures that each feature's distribution aligns better with the Gaussian assumption, improving the algorithm's performance.
Without standardization, features with larger scales might dominate the calculation of probabilities, violating the algorithm's assumptions.
'''
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)

print("Gaussian Naive Bayes")
print("Accuracy:", accuracy_score(y_test, y_pred_gnb))
print(classification_report(y_test, y_pred_gnb))

# Multinomial Naive Bayes
'''
Why Use MinMaxScaler?
Multinomial Naive Bayes assumes that features represent counts or frequencies. If the dataset contains features on widely different scales 
(e.g., one feature ranges from 0 to 1 while another ranges from 0 to 1000), it might skew the model.
MinMaxScaler scales all feature values to a range of [0, 1], ensuring the features are proportional and normalized while preserving their relative differences.
This step makes the data suitable for Multinomial Naive Bayes, which expects non-negative values and often benefits from normalized ranges.
'''
scaler_minmax = MinMaxScaler()
X_minmax = scaler_minmax.fit_transform(X)
X_train_mnb, X_test_mnb, y_train_mnb, y_test_mnb = train_test_split(X_minmax, y, test_size=0.2, random_state=42)

mnb = MultinomialNB()
mnb.fit(X_train_mnb, y_train_mnb)
y_pred_mnb = mnb.predict(X_test_mnb)

print("\nMultinomial Naive Bayes")
print("Accuracy:", accuracy_score(y_test_mnb, y_pred_mnb))
print(classification_report(y_test_mnb, y_pred_mnb))

# Bernoulli Naive Bayes
'''
Why Convert to Binary?
Bernoulli Naive Bayes works with binary data where each feature indicates the presence (1) or absence (0) of a characteristic.
The dataset X may contain continuous or count-based values, which do not fit Bernoulli Naive Bayes' assumption of binary features.
The transformation (X > 0).astype(int) converts all positive values to 1 (presence) and all zero or negative values to 0 (absence), 
ensuring that the features comply with Bernoulli Naive Bayes' requirements.
'''
X_binary = (X > 0).astype(int)
X_train_bnb, X_test_bnb, y_train_bnb, y_test_bnb = train_test_split(X_binary, y, test_size=0.2, random_state=42)

bnb = BernoulliNB()
bnb.fit(X_train_bnb, y_train_bnb)
y_pred_bnb = bnb.predict(X_test_bnb)

print("\nBernoulli Naive Bayes")
print("Accuracy:", accuracy_score(y_test_bnb, y_pred_bnb))
print(classification_report(y_test_bnb, y_pred_bnb))

Gaussian Naive Bayes
Accuracy: 0.7662337662337663
              precision    recall  f1-score   support

           0       0.83      0.80      0.81        99
           1       0.66      0.71      0.68        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154


Multinomial Naive Bayes
Accuracy: 0.6428571428571429
              precision    recall  f1-score   support

           0       0.64      1.00      0.78        99
           1       0.00      0.00      0.00        55

    accuracy                           0.64       154
   macro avg       0.32      0.50      0.39       154
weighted avg       0.41      0.64      0.50       154


Bernoulli Naive Bayes
Accuracy: 0.6558441558441559
              precision    recall  f1-score   support

           0       0.65      0.99      0.79        99
           1       0.75      0.05      0.10        55

    accuracy              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
