<a href="https://colab.research.google.com/github/ayyanzia/DiabetesPredictionAI/blob/main/DiabetesChecker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Diabetes Project


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install scikit-learn pandas flask joblib




In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load dataset

df = pd.read_csv("/content/drive/MyDrive/datasetColab/diabetes_012_health_indicators_BRFSS2015.csv")  # Updated path  Ensure correct path

# Handle missing values
imputer = SimpleImputer(strategy='mean')
df.iloc[:, :] = imputer.fit_transform(df)
df.columns = df.columns.str.strip().str.lower()  # Remove spaces & lowercase
print("Cleaned Columns:", df.columns.tolist())


# Select features and target
selected_features = [ 'highbp', 'highchol', 'cholcheck', 'bmi', 'smoker', 'stroke', 'heartdiseaseorattack', 'physactivity', 'fruits', 'veggies', 'hvyalcoholconsump', 'anyhealthcare', 'nodocbccost', 'genhlth', 'menthlth', 'physhlth', 'diffwalk', 'sex', 'age', 'education', 'income']


X = df[selected_features]
y = df['diabetes_012']

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("✅ Data Loaded & Preprocessed Successfully!")


Cleaned Columns: ['diabetes_012', 'highbp', 'highchol', 'cholcheck', 'bmi', 'smoker', 'stroke', 'heartdiseaseorattack', 'physactivity', 'fruits', 'veggies', 'hvyalcoholconsump', 'anyhealthcare', 'nodocbccost', 'genhlth', 'menthlth', 'physhlth', 'diffwalk', 'sex', 'age', 'education', 'income']
✅ Data Loaded & Preprocessed Successfully!


In [23]:
# Reduce dataset to 5000 random samples
df_sample = df.sample(n=5000, random_state=42)  # Choose any number < 25,000

# Select features and target again
X = df_sample[selected_features]
y = df_sample["diabetes_012"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("✅ Reduced dataset size to:", len(X_train) + len(X_test))


✅ Reduced dataset size to: 5000


In [24]:
#training and optimization
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define hyperparameters
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Perform Grid Search for best hyperparameters
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Display results
print("🔹 Best Parameters:", grid_search.best_params_)
print("🔹 Best Accuracy:", grid_search.best_score_)


🔹 Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 50}
🔹 Best Accuracy: 0.84175


In [25]:
from sklearn.metrics import classification_report, roc_auc_score

# Predictions
y_pred = best_model.predict(X_test)

# Performance report
print(classification_report(y_test, y_pred))

# Compute AUC Score
auc_score = roc_auc_score(y_test, best_model.predict_proba(X_test), multi_class='ovr') # add multi_class='ovr'
print(f"✅ AUC Score: {auc_score:.2f}")

              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92       857
         1.0       0.00      0.00      0.00        19
         2.0       0.44      0.12      0.19       124

    accuracy                           0.86      1000
   macro avg       0.44      0.37      0.37      1000
weighted avg       0.80      0.86      0.81      1000

✅ AUC Score: 0.70


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
#Saving
import joblib

# Save model & scaler
joblib.dump(best_model, "/content/drive/MyDrive/datasetColab/diabetes_model.pkl")
joblib.dump(scaler, "/content/drive/MyDrive/datasetColab/scaler.pkl")

print("✅ Model saved successfully!")


✅ Model saved successfully!


In [27]:
import joblib
import numpy as np

# Load the trained model and scaler
model = joblib.load("/content/drive/MyDrive/datasetColab/diabetes_model.pkl")
scaler = joblib.load("/content/drive/MyDrive/datasetColab/scaler.pkl")

print("✅ Model and scaler loaded successfully!")


✅ Model and scaler loaded successfully!


In [28]:
df.tail()

Unnamed: 0,diabetes_012,highbp,highchol,cholcheck,bmi,smoker,stroke,heartdiseaseorattack,physactivity,fruits,...,anyhealthcare,nodocbccost,genhlth,menthlth,physhlth,diffwalk,sex,age,education,income
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,2.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0
253679,2.0,1.0,1.0,1.0,25.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,9.0,6.0,2.0


In [39]:
# Example input data (Replace with actual feature values)
#Nikal lauday ->Sohaib
# The input data should have 22 features, not 4
# Example: Assuming the first 4 features are the ones provided:
input_data = np.array([[ 1, 1, 1390, 12555553, 1, 3, 0, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 50, 12, 12],[ 1, 2, 90, 69, 1, 3, 1, 1, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 47, 12, 12]])

# Scale input data
input_data_scaled = scaler.transform(input_data)

# Make prediction
prediction = model.predict(input_data_scaled)
probability = model.predict_proba(input_data_scaled)[:,1]  # Probability of having diabetes

# Show results
print(f"🔍 Prediction: {'Diabetic' if prediction[0] >= 1 else 'Not Diabetic'}")
print(f"📊 Probability of Diabetes: {probability[0]:.2f}")
print(f"🔍 Prediction: {'Diabetic' if prediction[1] >= 1 else 'Not Diabetic'}")
print(f"📊 Probability of Diabetes: {probability[1]:.2f}")

🔍 Prediction: Not Diabetic
📊 Probability of Diabetes: 0.01
🔍 Prediction: Not Diabetic
📊 Probability of Diabetes: 0.02


