# Diabetes Prediction using Machine Learning Algorithms: A Classification Problem

This project trains several machine-learning classifiers (KNN, SVM, Decision Tree, Random Forest, XGBoost) to predict the likelihood of an individual having diabetes from health-related measurements, and compares the models by accuracy and ROC AUC.

### Dataset Features Used:
- Glucose
- Blood Pressure
- Skin Thickness
- Insulin
- BMI
- Diabetes Pedigree Function
- Age


In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')


In [None]:
# Read the raw data into a DataFrame and preview the first rows.
# The path is relative, so 'diabetes.csv' must be in the working directory
# (normally the folder that holds this notebook).
csv_path = 'diabetes.csv'
df = pd.read_csv(csv_path)
df.head()


## Exploratory Data Analysis (EDA)

Let's explore the dataset to understand distributions, missing values, and correlations between features.


In [None]:
# Quick overview: (rows, columns), per-column count of null entries,
# then summary statistics as the cell's displayed output.
null_counts = df.isnull().sum()

print("Shape of the dataset:", df.shape)
print("\nMissing values in each column:\n", null_counts)
df.describe()


In [None]:
# Histograms of the seven predictor features and the target.
#
# `features` is assigned here because this is the first cell that needs it.
# Previously it was referenced before any cell had defined it, so a fresh
# "Restart Kernel -> Run All" stopped here with a NameError.
features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# One subplot per column, 20 bins each, on a single 15x10 figure.
df[features + ['Outcome']].hist(bins=20, figsize=(15, 10))
plt.suptitle('Feature Distributions')
plt.show()


In [None]:
# Correlation heatmap of the seven predictors and the target.
#
# The columns are listed explicitly so this cell does not rely on a
# `features` variable being assigned by some other cell before it runs.
corr_columns = [
    'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]

plt.figure(figsize=(10, 7))
# annot=True writes each coefficient in its cell, formatted to two decimals.
sns.heatmap(df[corr_columns].corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()


In [None]:
# Bar chart of how many rows fall into each Outcome class.
outcome_ax = sns.countplot(data=df, x='Outcome')
outcome_ax.set_title("Diabetes Outcome Distribution")
# Replace the tick labels at positions 0 and 1 with readable class names.
outcome_ax.set_xticks([0, 1])
outcome_ax.set_xticklabels(['No Diabetes', 'Diabetes'])
plt.show()


In [None]:
# Model inputs: the seven predictor columns named in the introduction.
features = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
target = 'Outcome'

X = df[features]  # predictor matrix
y = df[target]    # target variable


In [None]:
# Hold out 20% of the rows for testing.
# stratify=y keeps the proportion of each Outcome class the same in the train
# and test sets, so the evaluation is not skewed by an unlucky split;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features.
# The scaler is fitted on the training rows only and then applied to the test
# rows, so no statistics from the test set leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Candidate classifiers, keyed by the display name used in the reports.
# Every estimator with a stochastic component gets a fixed random_state so
# the metrics below are identical on each "Restart & Run All"
# (KNN takes no random_state).
models = {
    'KNN': KNeighborsClassifier(),
    # probability=True enables predict_proba, which the AUC computation needs.
    'SVM': SVC(probability=True, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Per-model metrics: {model name: {'Accuracy': float, 'AUC': float}}
results = {}

# Train each model on the scaled training data and evaluate it on the test set
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    # Column 1 of predict_proba is the probability of the second class
    # (Outcome == 1 when the target is coded 0/1).
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    results[name] = {'Accuracy': acc, 'AUC': auc}

    print(f"\n{name} Results:")
    print("Accuracy:", round(acc, 2))
    print("AUC Score:", round(auc, 2))
    print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
# Side-by-side comparison: one row per model, ranked by AUC (best first).
print("\n--- Model Comparison ---")
comparison_df = (
    pd.DataFrame(results)   # one column per model, rows = metric names
    .T                      # flip so each model becomes a row
    .sort_values(by='AUC', ascending=False)
)
comparison_df
