# Diabetes Prediction using Machine Learning

This notebook implements multiple machine learning models to predict diabetes using the Pima Indians Diabetes Dataset.

## Models Used:
- K-Nearest Neighbors (KNN)
- Naive Bayes
- Random Forest

## Dataset Columns:
- Pregnancies
- Glucose
- Blood Pressure
- Skin Thickness
- Insulin
- BMI (Body Mass Index)
- Diabetes Pedigree Function
- Age
- Outcome (Target Variable: 0 = No Diabetes, 1 = Diabetes)

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

# Notebook-wide plot defaults: seaborn's white-grid theme and a 12x8 default figure
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 2. Load and Explore Data

In [None]:
# Load the dataset (relative path: resolved against the current working directory)
df = pd.read_csv('diabetes.csv')

# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
df.info()

print("\nStatistical Summary:")
print(df.describe())

print("\nMissing Values:")
print(df.isnull().sum())

# NOTE(review): for this dataset a 0 in these measurements is presumably a
# placeholder for "not recorded" rather than a real reading, which isnull()
# cannot detect -- confirm against the data source before trusting the
# "no missing values" result above. Columns absent from the file are skipped.
zero_suspect_cols = [
    col
    for col in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI')
    if col in df.columns
]
if zero_suspect_cols:
    print("\nZero Values (possible hidden missing data):")
    print(df[zero_suspect_cols].eq(0).sum())

print("\nTarget Variable Distribution:")
print(df['Outcome'].value_counts())

## 3. Data Preprocessing

In [None]:
# 'Outcome' is the target; every other column is used as a predictor
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 20% for testing; stratify on y so both splits keep the class ratio,
# and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

# Standardise the features. The scaler learns its statistics from the training
# split only and the same transform is then applied to both splits, so nothing
# from the test split influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeature scaling completed!")

## 4. Model Training and Evaluation

### 4.1 K-Nearest Neighbors (KNN)

In [None]:
# Fit a 5-nearest-neighbours classifier on the standardised training split
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Hard class labels, plus column 1 of predict_proba (kept for the ROC curve)
knn_pred = knn_model.predict(X_test_scaled)
knn_pred_proba = knn_model.predict_proba(X_test_scaled)[:, 1]

# Headline metrics on the test split, followed by the per-class report
print("K-Nearest Neighbors Results:")
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, knn_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, knn_pred))

### 4.2 Naive Bayes

In [None]:
# Fit a Gaussian Naive Bayes classifier on the standardised training split
nb_model = GaussianNB().fit(X_train_scaled, y_train)

# Hard class labels, plus column 1 of predict_proba (kept for the ROC curve)
nb_pred = nb_model.predict(X_test_scaled)
nb_pred_proba = nb_model.predict_proba(X_test_scaled)[:, 1]

# Headline metrics on the test split, followed by the per-class report
print("Naive Bayes Results:")
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, nb_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, nb_pred))

### 4.3 Random Forest

In [None]:
# Fit a 100-tree random forest (seeded for reproducibility) on the
# standardised training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Hard class labels, plus column 1 of predict_proba (kept for the ROC curve)
rf_pred = rf_model.predict(X_test_scaled)
rf_pred_proba = rf_model.predict_proba(X_test_scaled)[:, 1]

# Headline metrics on the test split, followed by the per-class report
print("Random Forest Results:")
for metric_label, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
):
    print(f"{metric_label}: {metric_fn(y_test, rf_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, rf_pred))

## 5. Visualizations

### 5.1 Confusion Matrices

In [None]:
# Draw one confusion-matrix heatmap per model, side by side in a single row
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))

models = [
    ('KNN', knn_pred),
    ('Naive Bayes', nb_pred),
    ('Random Forest', rf_pred),
]

# Pair each model with its own axis instead of indexing into `axes`
for ax, (model_name, y_pred) in zip(axes, models):
    sns.heatmap(
        confusion_matrix(y_test, y_pred),
        annot=True,   # write the count inside every cell
        fmt='d',      # counts are integers
        cmap='Blues',
        ax=ax,
    )
    ax.set_title(f'{model_name} Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

fig.tight_layout()
fig.savefig('confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()
print("Confusion matrices saved as 'confusion_matrices.png'")

### 5.2 ROC Curves

In [None]:
# Overlay the ROC curve of every model on one set of axes
fig, ax = plt.subplots(figsize=(10, 8))

models_proba = [
    ('KNN', knn_pred_proba),
    ('Naive Bayes', nb_pred_proba),
    ('Random Forest', rf_pred_proba),
]

for model_name, scores in models_proba:
    # roc_curve also returns the thresholds; only the two rates are needed here
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, scores)
    area = auc(false_pos_rate, true_pos_rate)
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        label=f'{model_name} (AUC = {area:.3f})',
        linewidth=2,
    )

# Diagonal reference line for comparison with the model curves
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # small headroom so curves touching 1.0 stay visible
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves - Model Comparison', fontsize=14, fontweight='bold')
ax.legend(loc='lower right', fontsize=11)
ax.grid(alpha=0.3)
fig.savefig('roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()
print("ROC curves saved as 'roc_curves.png'")

### 5.3 Feature Importance (Random Forest)

In [None]:
# Pair every input column with the importance the fitted forest assigned to it,
# most important first. The forest was trained on the scaled array, and scaling
# keeps column order, so X.columns lines up with feature_importances_.
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

# Plot feature importance.
# seaborn >= 0.13 deprecates barplot(palette=...) without a `hue` variable, and
# the recommended replacement (hue=..., legend=False) is not accepted by older
# releases. Drawing the bars with matplotlib and sampling the colours from the
# same 'viridis' palette works on every version.
fig, ax = plt.subplots(figsize=(10, 6))
bar_colors = sns.color_palette(
    'viridis',
    n_colors=len(feature_importance),
    desat=0.75,  # same desaturation barplot applies by default
)
ax.barh(
    feature_importance['Feature'],
    feature_importance['Importance'],
    color=bar_colors,
)
ax.invert_yaxis()     # barh draws the first row at the bottom; put rank 1 on top
ax.yaxis.grid(False)  # horizontal grid lines through category labels add noise
ax.set_title('Feature Importance - Random Forest', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance Score', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
fig.tight_layout()
fig.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()
print("Feature importance plot saved as 'feature_importance.png'")

print("\nFeature Importance Ranking:")
print(feature_importance)

## 6. Model Comparison Summary

In [None]:
# Create a comparison table: one row per model, one column per metric.
# Building it from a single model -> predictions mapping keeps the rows and the
# metric columns in step, instead of repeating twelve hand-written calls.
model_predictions = {
    'KNN': knn_pred,
    'Naive Bayes': nb_pred,
    'Random Forest': rf_pred,
}
metric_functions = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}

results_df = pd.DataFrame({
    'Model': list(model_predictions),
    **{
        metric_name: [metric_fn(y_test, pred) for pred in model_predictions.values()]
        for metric_name, metric_fn in metric_functions.items()
    },
})

print("\n" + "="*60)
print("MODEL COMPARISON SUMMARY")
print("="*60)
print(results_df.to_string(index=False))
print("="*60)

# Identify best model by test accuracy (idxmax returns the first row on a tie)
best_model_idx = results_df['Accuracy'].idxmax()
best_model = results_df.loc[best_model_idx, 'Model']
best_accuracy = results_df.loc[best_model_idx, 'Accuracy']

# The trophy emoji is written as an explicit escape so it cannot be corrupted
# by a mis-decoded file (it had degraded to the mojibake "üèÜ").
print(f"\n\U0001F3C6 Best Model: {best_model} with Accuracy: {best_accuracy:.4f}")

## 7. Conclusions

### Key Findings:
1. All three models (KNN, Naive Bayes, Random Forest) were successfully trained and evaluated
2. Random Forest feature importances ranked the inputs that model relied on most when predicting diabetes
3. ROC curves and AUC scores provide insights into model discrimination ability
4. Confusion matrices show the distribution of true positives, true negatives, false positives, and false negatives

### Next Steps:
- Hyperparameter tuning to improve model performance
- Cross-validation for more robust evaluation
- Ensemble methods for potentially better results
- Feature engineering to create new meaningful features
- Handling class imbalance if present