In [4]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
import numpy as np

# Load the dataset; the rest of the script cannot proceed without it.
try:
    dataset = pd.read_csv('diabetes.csv')
except FileNotFoundError:
    print("Error: 'diabetes.csv' not found. Please upload the dataset to your Colab session.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # injected by the `site` module for interactive shells and is not
    # guaranteed to exist, and it reported success (status 0) on this error
    # path. NOTE(review): under a notebook kernel exit() may only request a
    # shutdown without halting the cell -- raising stops execution right here.
    raise SystemExit(1) from None

# Preview the raw data, followed by a visual separator.
print("--- First 5 rows of the Pima Indians Diabetes dataset ---")
print(dataset.head())
print(f"\n{'=' * 50}\n")

# Separate the target column from the feature columns.
y = dataset['Outcome']
X = dataset.drop(columns='Outcome')

# Report the dimensions of both parts, then another separator.
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}\n")
print(f"\n{'=' * 50}\n")

# Number of folds used for cross-validation.
K = 10

# Classifier under evaluation and the fold splitter; shuffling with a fixed
# seed keeps the fold assignment reproducible between runs.
model = GaussianNB()
kfold = KFold(n_splits=K, shuffle=True, random_state=42)

print(
    f"--- Performing {K}-Fold Cross-Validation ---",
    "Model: Gaussian Naive Bayes",
    f"The dataset will be split into {K} folds.\n",
    sep="\n",
)

# One accuracy score per fold.
results = cross_val_score(estimator=model, X=X, y=y, cv=kfold, scoring='accuracy')

# Per-fold scores, numbered from 1 for display.
print("--- Cross-Validation Results ---")
print(f"Scores for each of the {K} folds:")
for fold_number, fold_accuracy in enumerate(results, start=1):
    print(f"  Fold {fold_number}: {fold_accuracy:.4f}")

# Aggregate the fold scores into a mean and a spread.
mean_accuracy = results.mean()
std_deviation = results.std()

print("\n--- Summary ---")
print(f"Average Accuracy (Mean): {mean_accuracy:.2%}")
print(f"Standard Deviation of Accuracy: {std_deviation:.4f}")

# Plain-language explanation of the two summary figures.
print(
    "\nInterpretation:",
    f"The model is, on average, {mean_accuracy:.2%} accurate.",
    "The standard deviation tells us how much the performance varied across the different test folds.",
    "A lower standard deviation suggests that the model's performance is more stable and reliable.",
    sep="\n",
)



--- First 5 rows of the Pima Indians Diabetes dataset ---
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


Features (X) shape: (768, 8)
Target (y) shape: (768,)



--- Performing 10-Fold Cross-Validation ---
Model: Gaussian Naive Bayes
The dataset will be split into 10 folds.

--- Cross-Validation Results ---
Scores for each of the 10 folds:
  Fold