In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the input CSV; point this at your own dataset.
DATA_PATH = 'diabetes.csv'

# Read the file into the DataFrame that the following cells operate on.
df = pd.read_csv(DATA_PATH)

In [None]:
# 1. Detailed Exploratory Data Analysis (EDA)
# Prints a quick textual profile of `df`, then draws a pair plot and a
# correlation heatmap.
print("\nDataset Overview:")
print(df.head())

print("\nData Types:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()

print("\nMissing Values:")
print(df.isnull().sum())

print("\nDescriptive Statistics:")
print(df.describe())

# Visualizations
# Pairwise relationships between columns, coloured by the label column.
sns.pairplot(df, hue='target')  # Replace 'target' with your actual target variable name
plt.show()

# Correlation Heatmap
# Correlate numeric/boolean columns only: recent pandas versions raise on
# non-numeric columns instead of silently dropping them.
numeric_df = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# 2. Data Preprocessing
# Assuming the target column is named 'target'
X = df.drop(columns=['target'])  # Replace 'target' with your actual target column name
y = df['target']

# Split the data into train and test sets (80% / 20%, fixed seed so the
# split is reproducible).
# Stratify on the label so both splits keep the dataset's class proportions.
# Stratification needs at least two samples of every class, so fall back to
# a plain random split when some class is rarer than that.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

In [None]:
# 3. Train Gaussian Naive Bayes Classifier
# fit() returns the estimator itself, so construction and training are
# chained into a single expression.
gnb = GaussianNB().fit(X_train, y_train)

# Predicted class labels for the held-out test split.
y_pred_gnb = gnb.predict(X_test)

In [None]:
# 4. Train Multinomial Naive Bayes Classifier
# MultinomialNB works better with non-negative integer features (e.g., count data)
# NOTE(review): scikit-learn's MultinomialNB is expected to reject negative
# feature values - confirm X contains none before relying on this cell.
# fit() returns the estimator itself, so construction and training are chained.
mnb = MultinomialNB().fit(X_train, y_train)

# Predicted class labels for the held-out test split.
y_pred_mnb = mnb.predict(X_test)

In [None]:
# 5. Check Accuracy Scores
# Score each model's test-set predictions against the true test labels.
gnb_accuracy, mnb_accuracy = (
    accuracy_score(y_test, predictions)
    for predictions in (y_pred_gnb, y_pred_mnb)
)

print("\nGaussian Naive Bayes Accuracy:", gnb_accuracy)
print("Multinomial Naive Bayes Accuracy:", mnb_accuracy)

In [None]:
# 6. Cross-Validation Accuracy and Test-Set Confusion Matrices
# The cross-validation scores are computed on the full dataset (5 folds);
# the confusion matrices below are built from the single train/test split.
cv_scores_gnb = cross_val_score(gnb, X, y, cv=5, scoring='accuracy')
cv_scores_mnb = cross_val_score(mnb, X, y, cv=5, scoring='accuracy')

print("\nGaussian NB Cross-Validation Accuracy:", cv_scores_gnb.mean())
print("Multinomial NB Cross-Validation Accuracy:", cv_scores_mnb.mean())

# Fix the class order explicitly so matrix rows/columns and the heatmap tick
# labels always agree, and so both matrices have the same shape even when a
# class is missing from the test split or from a model's predictions.
class_labels = np.unique(y)


def _show_confusion_matrix(matrix, labels, title, cmap):
    """Draw one confusion matrix as an annotated heatmap with labelled axes.

    Args:
        matrix: square array of counts from confusion_matrix (rows are the
            true classes, columns the predicted classes).
        labels: class labels in the same order as the matrix rows/columns.
        title: figure title.
        cmap: matplotlib colormap name for the heatmap.
    """
    fig, ax = plt.subplots()
    sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap,
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.set_title(title)
    plt.show()


# Confusion Matrix for Gaussian NB
cm_gnb = confusion_matrix(y_test, y_pred_gnb, labels=class_labels)
_show_confusion_matrix(cm_gnb, class_labels, "Gaussian NB Confusion Matrix", 'Blues')

# Confusion Matrix for Multinomial NB
cm_mnb = confusion_matrix(y_test, y_pred_mnb, labels=class_labels)
_show_confusion_matrix(cm_mnb, class_labels, "Multinomial NB Confusion Matrix", 'Greens')

In [None]:
# 7. ROC and AUC for Gaussian NB (Binary classification only)
# Require two classes both in the dataset and in what the model was trained
# on; otherwise predict_proba has no second column to read.
if len(np.unique(y)) == 2 and len(gnb.classes_) == 2:
    # predict_proba columns follow the order of gnb.classes_, so column 1 is
    # the probability of gnb.classes_[1]. Tell roc_curve which label that is:
    # without pos_label it only accepts labels in {0, 1} or {-1, 1} and fails
    # for encodings such as {1, 2} or strings.
    positive_class = gnb.classes_[1]
    y_prob_gnb = gnb.predict_proba(X_test)[:, 1]
    fpr_gnb, tpr_gnb, _ = roc_curve(y_test, y_prob_gnb, pos_label=positive_class)
    roc_auc_gnb = auc(fpr_gnb, tpr_gnb)

    plt.figure()
    plt.plot(fpr_gnb, tpr_gnb, color='blue', label=f'Gaussian NB ROC (AUC = {roc_auc_gnb:.2f})')
    # Diagonal reference line: the ROC of a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.show()

In [None]:
# 8. Interpretation
# Per-class precision / recall / F1 for each model on the test split,
# followed by a short summary.
print("\nClassification Report for Gaussian NB:")
print(classification_report(y_test, y_pred_gnb))

print("Classification Report for Multinomial NB:")
print(classification_report(y_test, y_pred_mnb))

print("\nSummary:")
# Derive the verdict from the measured test accuracies instead of asserting
# a fixed winner; the scores are recomputed here so this cell depends only on
# the test labels and the two prediction arrays.
gnb_test_accuracy = accuracy_score(y_test, y_pred_gnb)
mnb_test_accuracy = accuracy_score(y_test, y_pred_mnb)
if gnb_test_accuracy > mnb_test_accuracy:
    verdict = "Gaussian NB achieved the higher test accuracy"
elif mnb_test_accuracy > gnb_test_accuracy:
    verdict = "Multinomial NB achieved the higher test accuracy"
else:
    verdict = "Gaussian NB and Multinomial NB achieved the same test accuracy"
print(f"{verdict} (Gaussian NB: {gnb_test_accuracy:.3f}, Multinomial NB: {mnb_test_accuracy:.3f}).")
print("Gaussian NB is designed for continuous data features, while Multinomial NB is more suitable for categorical or count data. Evaluate based on accuracy, confusion matrix, and ROC to choose the best model.")
