# Setup and Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the diabetes dataset from the mounted Drive folder.
csv_path = '/content/drive/MyDrive/diabetes.csv'
df = pd.read_csv(csv_path)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()
# Last expression of the cell: renders the per-column summary statistics table.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
 9   BMI_Age                   768 non-null    float64
 10  Glucose_BMI               768 non-null    float64
 11  Freuency_Age_Ratio        768 non-null    float64
dtypes: float64(9), int64(3)
memory usage: 72.1 KB


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI_Age,Glucose_BMI,Freuency_Age_Ratio
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958,1080.975456,3.744848,0.104992
std,3.369578,30.435949,12.096346,8.790942,85.021108,6.875151,0.331329,11.760232,0.476951,437.83769,1.0544,0.079612
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0,382.2,1.447084,0.0
25%,1.0,99.75,64.0,25.0,121.5,27.5,0.24375,24.0,0.0,744.8,2.96334,0.04
50%,3.0,117.0,72.202592,29.15342,155.548223,32.4,0.3725,29.0,0.0,987.25,3.64321,0.090909
75%,6.0,140.25,80.0,32.0,155.548223,36.6,0.62625,41.0,1.0,1357.2,4.411519,0.160323
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0,2697.0,7.918367,0.358974


In [None]:
# Columns in which a value of 0 is treated as a missing-value placeholder.
zero_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Turn the placeholder zeros into NaN so they are excluded from the column means.
df[zero_columns] = df[zero_columns].replace(0, np.nan)

# Fill every gap with its own column's mean. The result is assigned back to the
# frame instead of calling `df[col].fillna(..., inplace=True)`: that chained
# in-place form operates on a temporary Series and does not update `df` when
# pandas Copy-on-Write is active, which would leave the NaNs in place silently.
df[zero_columns] = df[zero_columns].fillna(df[zero_columns].mean())

# Feature Engineering

In [None]:
# Interaction feature: BMI multiplied by age.
df['BMI_Age'] = df['BMI'].mul(df['Age'])
# Glucose relative to BMI; the +1 offsets the denominator.
df['Glucose_BMI'] = df['Glucose'].div(df['BMI'].add(1))
# Pregnancies relative to age. NOTE: the column name is spelled
# 'Freuency_Age_Ratio'; it is kept as-is because the feature list saved
# alongside the model is taken from these column names.
df['Freuency_Age_Ratio'] = df['Pregnancies'].div(df['Age'].add(1))

# Train-Test Split & Scaling

In [None]:
# Separate the target column from the predictors.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Training

In [None]:
# Baseline random forest with hand-picked hyperparameters, fitted on the scaled training split.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=42,
).fit(X_train_scaled, y_train)

# Evaluate on the held-out split: per-class report, confusion matrix, then overall accuracy.
y_pred = model.predict(X_test_scaled)
for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81        99
           1       0.66      0.64      0.65        55

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154

[[81 18]
 [20 35]]
0.7532467532467533


In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Candidate hyperparameter values; every combination is tried by the grid search.
param_grid = dict(
    n_estimators=[50, 100, 400],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold cross-validated search on the training split, using all cores.
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Report the winning combination followed by its mean cross-validation score.
for summary in (grid_search.best_params_, grid_search.best_score_):
    print(summary)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
0.7785419165667065


In [None]:
# Fresh, unfitted forest used as the template estimator for the randomized search.
rf = RandomForestClassifier(random_state=42)

# Sample 20 combinations from the same grid and score each with 5-fold accuracy.
search_options = dict(
    n_iter=20,
    cv=5,
    scoring='accuracy',
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search = RandomizedSearchCV(rf, param_grid, **search_options)
random_search.fit(X_train_scaled, y_train)

print(
    "Best Parameters from RandomizedSearchCV:",
    random_search.best_params_,
    random_search.best_score_,
    sep="\n",
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters from RandomizedSearchCV:
{'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 10}
0.7785419165667065


In [None]:
# Build a one-point grid from the randomized-search winner: each hyperparameter
# gets a single-element candidate list.
tuned_keys = ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
best_found = random_search.best_params_
refined_model = {key: [best_found[key]] for key in tuned_keys}

# Re-run the 5-fold grid search over that single candidate.
grid_search = GridSearchCV(model, refined_model, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled, y_train)

for summary in (grid_search.best_params_, grid_search.best_score_):
    print(summary)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
0.7785419165667065


# Save Model and Scaler

In [None]:
import joblib
# Persist the tuned estimator and the fitted scaler next to the notebook.
fitted_artifacts = {
    'diabetes_model.pkl': grid_search.best_estimator_,
    'scaler.pkl': scaler,
}
for filename, artifact in fitted_artifacts.items():
    joblib.dump(artifact, filename)

# Also store the training column order so the app can rebuild inputs in that order.
# Kept as the cell's last expression so the written filename is still displayed.
joblib.dump(X.columns.tolist(), 'model_columns.pkl')

['model_columns.pkl']

# Gradio App with Prediction Probability Chart

In [None]:



import gradio as gr
import joblib
import matplotlib.pyplot as plt

# Reload the persisted artifacts written by the "Save Model and Scaler" step:
# the tuned classifier, the fitted scaler, and the training column order.
# NOTE: joblib files are pickles; only load files this notebook produced itself.
model = joblib.load("diabetes_model.pkl")
scaler = joblib.load("scaler.pkl")
columns = joblib.load("model_columns.pkl")

def predict_diabetes(preg, glucose, bp, skin, insulin, bmi, dpf, age):
    """Predict diabetes risk for one patient and chart the class probabilities.

    Args:
        preg: Number of pregnancies.
        glucose: Glucose measurement.
        bp: Blood pressure measurement.
        skin: Skin thickness measurement.
        insulin: Insulin measurement.
        bmi: Body mass index.
        dpf: Diabetes pedigree function value.
        age: Age in years.

    Returns:
        A ``(text, figure)`` tuple. ``text`` holds the predicted label, the risk
        category and any health tips; ``figure`` is a bar chart of the two class
        probabilities. On failure the tuple is ``("Error: <message>", None)``.
    """
    try:
        raw_inputs = {
            'Pregnancies': preg,
            'Glucose': glucose,
            'BloodPressure': bp,
            'SkinThickness': skin,
            'Insulin': insulin,
            'BMI': bmi,
            'DiabetesPedigreeFunction': dpf,
            'Age': age,
        }

        # A field left blank in the UI may arrive as None; report it by name
        # instead of surfacing an opaque arithmetic TypeError.
        blank = [name for name, value in raw_inputs.items() if value is None]
        if blank:
            return f"Error: missing value for {', '.join(blank)}", None

        # Feature engineering (must mirror the training notebook).
        pregnancy_age_ratio = preg / (age + 1)
        features = dict(raw_inputs)
        features['BMI_Age'] = bmi * age
        features['Glucose_BMI'] = glucose / (bmi + 1)
        # Training stored this ratio under the misspelled column name
        # 'Freuency_Age_Ratio'. Providing it under both spellings means the
        # model receives the real value whichever name the saved column list
        # uses, rather than a silent 0.0 placeholder.
        features['Freuency_Age_Ratio'] = pregnancy_age_ratio
        features['Pregnancy_Age_Ratio'] = pregnancy_age_ratio

        # Refuse to guess values for features the app does not know how to
        # compute: a default of 0.0 would yield a confident but wrong prediction.
        unknown = [col for col in columns if col not in features]
        if unknown:
            raise ValueError(
                f"model expects features this app does not compute: {unknown}"
            )

        # Order the values exactly as the scaler/model were trained.
        input_data = [features[col] for col in columns]
        input_scaled = scaler.transform([input_data])

        probas = model.predict_proba(input_scaled)[0]
        prediction = model.predict(input_scaled)[0]

        # Risk category, based on the probability of the positive class.
        if probas[1] >= 0.75:
            risk_level = "🔴 High Risk"
        elif probas[1] >= 0.5:
            risk_level = "🟠 Moderate Risk"
        else:
            risk_level = "🟢 Low Risk"

        # Health tips: simple threshold checks on the raw inputs.
        tips = []
        if glucose > 130:
            tips.append("High glucose level – consider reducing sugar intake.")
        if bmi > 30:
            tips.append("Your BMI is quite high – regular exercise is recommended.")
        if bp < 60 or bp > 120:
            tips.append("Blood pressure is outside normal range – monitor closely.")
        if insulin > 200:
            tips.append("Insulin level is high – may indicate insulin resistance.")
        if dpf > 1:
            tips.append("Family history risk detected – regular screening advised.")

        tips_text = "\n".join(tips) if tips else " No critical health flags based on input."

        # Plot the two class probabilities.
        fig, ax = plt.subplots()
        ax.bar(["Not Diabetic", "Diabetic"], probas, color=['green', 'red'])
        ax.set_ylim([0, 1])
        ax.set_ylabel("Probability")
        ax.set_title("Prediction Confidence")
        fig.tight_layout()
        # Unregister the figure from pyplot so repeated predictions do not
        # accumulate open figures; the returned Figure object stays usable.
        plt.close(fig)

        label = "Likely Diabetic" if prediction == 1 else "Not Diabetic"
        full_result = f"{label} ({risk_level})\n\n Tips:\n{tips_text}"
        return full_result, fig

    except Exception as e:
        # UI boundary: show the failure to the user instead of crashing the app.
        return f"Error: {str(e)}", None

# Gradio interface: one numeric field per raw input, in the order
# predict_diabetes takes its arguments.
input_labels = (
    "Pregnancies",
    "Glucose",
    "Blood Pressure",
    "Skin Thickness",
    "Insulin",
    "BMI",
    "Diabetes Pedigree Function",
    "Age",
)

interface = gr.Interface(
    fn=predict_diabetes,
    inputs=[gr.Number(label=caption) for caption in input_labels],
    outputs=["text", gr.Plot()],
    title="Diabetes Prediction App",
    description="Enter patient data to predict diabetes with confidence chart.",
)

# share=True asks Gradio for a public URL in addition to serving the app.
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1185f1c06ad87b98aa.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


