In [1]:
# Step 1: Import necessary libraries for data handling, preprocessing, and modeling

import os  # Environment lookup for the configurable dataset path

import matplotlib.pyplot as plt  # Plotting
import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation
import seaborn as sns  # Visualization
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report  # Evaluation metrics
from sklearn.model_selection import train_test_split  # Splitting data
from sklearn.naive_bayes import GaussianNB  # Naïve Bayes classifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler  # Feature scaling

# Confirm successful import: this line is only reached when none of the
# import statements earlier in this cell raised an exception
print("Libraries successfully loaded!")

Libraries successfully loaded!


In [2]:
# Step 2: Load the Diabetes dataset

# The CSV location can be overridden through the DIABETES_CSV_PATH environment
# variable so the notebook also runs on machines that do not have this Windows
# profile. When the variable is unset or empty, the original path is used.
DIABETES_CSV_PATH = (
    os.environ.get("DIABETES_CSV_PATH")
    or "C:/Users/dbda.STUDENTSDC/Music/LabPractice/Notebooks/Datasets/Diabetes.csv"
)

df = pd.read_csv(DIABETES_CSV_PATH)  # Load dataset

# Display first two rows for verification (kept as the cell's last expression
# so the notebook renders it)
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [3]:
# Step 3: Define Columns for Scaling

# Columns that will be passed to StandardScaler
standard_scaler_cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'DiabetesPedigreeFunction',
    'Age',
]

# Columns that will be passed to MinMaxScaler
min_max_scaler_cols = [
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Echo both selections so they can be checked against the dataset header
for heading, selected in (
    ("Features for Standard Scaling:", standard_scaler_cols),
    ("Features for Min-Max Scaling:", min_max_scaler_cols),
):
    print(heading, selected)

Features for Standard Scaling: ['Pregnancies', 'Glucose', 'BloodPressure', 'DiabetesPedigreeFunction', 'Age']
Features for Min-Max Scaling: ['SkinThickness', 'Insulin', 'BMI']


In [4]:
# Step 4: Apply Standard Scaling to Selected Features

# Learn the per-column statistics first, then overwrite the selected columns
# of df with their standardized values.
# NOTE(review): the scaler is fit on every row of df, and the train/test
# split only happens in a later cell, so held-out rows influence the scaling
# parameters. Consider fitting on the training rows only.
standard_scaler = StandardScaler().fit(df[standard_scaler_cols])
df[standard_scaler_cols] = standard_scaler.transform(df[standard_scaler_cols])

# Report which columns were standardized
print("Standard scaling applied to:", standard_scaler_cols)

Standard scaling applied to: ['Pregnancies', 'Glucose', 'BloodPressure', 'DiabetesPedigreeFunction', 'Age']


In [5]:
# Step 5: Apply Min-Max Scaling to Selected Features

# Learn each selected column's range first, then overwrite those columns of
# df with their rescaled values.
# NOTE(review): the scaler is fit on every row of df, and the train/test
# split only happens in a later cell, so held-out rows influence the learned
# ranges. Consider fitting on the training rows only.
min_max_scaler = MinMaxScaler().fit(df[min_max_scaler_cols])
df[min_max_scaler_cols] = min_max_scaler.transform(df[min_max_scaler_cols])

# Report which columns were rescaled
print("Min-Max scaling applied to:", min_max_scaler_cols)

Min-Max scaling applied to: ['SkinThickness', 'Insulin', 'BMI']


In [6]:
# Step 5 (continued): Temporarily Display All Columns in DataFrame

# option_context lifts the column limit only inside the with-block and puts
# the previous setting back on exit, even if printing raises. The option is
# therefore never left changed, and a user-customized value is restored
# rather than being reset to the library default.
with pd.option_context('display.max_columns', None):
    print(df.head(2))  # Display first two rows

   Pregnancies   Glucose  BloodPressure  SkinThickness  Insulin       BMI  \
0     0.639947  0.848324       0.149641       0.353535      0.0  0.500745   
1    -0.844885 -1.123396      -0.160546       0.292929      0.0  0.396423   

   DiabetesPedigreeFunction       Age  Outcome  
0                  0.468492  1.425995        1  
1                 -0.365061 -0.190672        0  


In [7]:
# Step 6: Define Features and Target Variable

target_column = 'Outcome'

# Target (dependent variable): the label column on its own
y = df[target_column]

# Features (independent variables): every column except the label
X = df.drop(columns=target_column)

# Print confirmation
print("Features (X) and Target (y) successfully defined!")

Features (X) and Target (y) successfully defined!


In [8]:
# Step 6 (continued): Split the Data into Training and Testing Sets

# train_test_split is already imported in the notebook's first cell, so it is
# not imported again here.

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print dataset sizes for verification
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Testing set: {X_test.shape}, {y_test.shape}")

Training set: (614, 8), (614,)
Testing set: (154, 8), (154,)


In [9]:
# Step 7: Train Naïve Bayes Model

# fit() returns the estimator itself, so building the Gaussian Naïve Bayes
# classifier and training it on the training split chain into one assignment
model = GaussianNB().fit(X_train, y_train)

# Print confirmation
print("Naïve Bayes model successfully trained!")

Naïve Bayes model successfully trained!


In [10]:
# Step 7 (continued): Make Predictions on Test Set

# Predict a label for every row of the held-out features
y_pred = model.predict(X_test)

# Show only a short prefix of the predictions as a sanity check
preview_count = 10
print("Predicted values:\n", y_pred[:preview_count])

Predicted values:
 [0 0 0 0 1 1 0 1 0 1]


In [11]:
# Step 8: Compute Model Evaluation Metrics

# Score the test-set predictions with each metric in turn:
#   accuracy  - overall correctness of the model
#   precision - TP / (TP + FP): how precise are positive predictions?
#   recall    - TP / (TP + FN): how well are actual positives detected?
#   f1        - harmonic mean of precision and recall
accuracy, precision, recall, f1 = [
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]

# Confusion matrix of actual versus predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)

# Emit every result in one call; sep="\n" places each item on its own line,
# matching one print statement per item
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    "\nConfusion Matrix:",
    conf_matrix,
    sep="\n",
)

Accuracy: 0.7662
Precision: 0.6610
Recall: 0.7091
F1 Score: 0.6842

Confusion Matrix:
[[79 20]
 [16 39]]


In [12]:
# Step 8 (continued): Display Model Performance Metrics

# Headline metrics rounded to two decimals, one per line. The trailing "\n"
# on the last entry leaves a blank line before the classification report.
print(
    "Model Performance Metrics:",
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}\n",
    sep="\n",
)

# Per-class precision / recall / F1 / support breakdown
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

# Confusion matrix computed in the previous cell, to show misclassifications
print("Confusion Matrix:\n", conf_matrix)

Model Performance Metrics:
Accuracy: 0.77
Precision: 0.66
Recall: 0.71
F1 Score: 0.68

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81        99
           1       0.66      0.71      0.68        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154

Confusion Matrix:
 [[79 20]
 [16 39]]


In [13]:
# Step 9: Compare Predicted vs. Actual Outcomes for Test Set

print("\nPredicted vs Actual Outcomes (Test Set):")

# Pair each actual label with the prediction at the same position.
# NOTE(review): this prints one line per test row (154 rows with the split
# used earlier). A tabular preview or a mismatch count would be easier to
# read than the full listing.
outcome_pairs = zip(y_test, y_pred)
for actual, predicted in outcome_pairs:
    print(f"Actual: {actual}, Predicted: {predicted}")


Predicted vs Actual Outcomes (Test Set):
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 1
Actual: 1, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 