# Diabetes Detection Model

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [2]:
# Load Dataset
df = pd.read_csv("diabetic_data.csv")
print("Dataset Shape: ", df.shape)
print(df.head())

Dataset Shape:  (101766, 50)
   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No

In [3]:
# check unique/unusual values like "?", null, -, 0 etc.
for col in df.columns:
    print(col, df[col].unique()[:20]) # first 20 unique values

encounter_id [2278392  149190   64410  500364   16680   35754   55842   63768   12522
   15738   28236   36900   40926   42570   62256   73578   77076   84222
   89682  148530]
patient_nbr [  8222157  55629189  86047875  82442376  42519267  82637451  84259809
 114882984  48330783  63555939  89869032  77391171  85504905  77586282
  49726791  86328819  92519352 108662661 107389323  69422211]
race ['Caucasian' 'AfricanAmerican' '?' 'Other' 'Asian' 'Hispanic']
gender ['Female' 'Male' 'Unknown/Invalid']
age ['[0-10)' '[10-20)' '[20-30)' '[30-40)' '[40-50)' '[50-60)' '[60-70)'
 '[70-80)' '[80-90)' '[90-100)']
weight ['?' '[75-100)' '[50-75)' '[0-25)' '[100-125)' '[25-50)' '[125-150)'
 '[175-200)' '[150-175)' '>200']
admission_type_id [6 1 2 3 4 5 8 7]
discharge_disposition_id [25  1  3  6  2  5 11  7 10  4 14 18  8 13 12 16 17 22 23  9]
admission_source_id [ 1  7  2  4  5  6 20  3 17  8  9 14 10 22 11 25 13]
time_in_hospital [ 1  3  2  4  5 13 12  9  7 10  6 11  8 14]
payer_code ['?' 'MC' 'M

# Preprocess data 

In [4]:
# Replace '?' with NaN
df = df.replace('?', np.nan)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      99493 non-null   object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    3197 non-null    object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                61510 non-null   object
 11  medical_specialty         51817 non-null   object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [5]:
# Drop columns with too many missing values and not important to ML prediction

df = df.drop(["encounter_id", "patient_nbr", "weight", "payer_code", "medical_specialty"], axis=1)
# Covert target column to binary
df["readmitted"] = df["readmitted"].apply(lambda x: 1 if x in [">30", "<30"] else 0)
df["readmitted"]

0         0
1         1
2         0
3         0
4         0
         ..
101761    1
101762    0
101763    0
101764    0
101765    0
Name: readmitted, Length: 101766, dtype: int64

In [6]:
# encode categorical variables
cat_cols = df.select_dtypes(include= "object").columns
df = pd.get_dummies(df, columns = cat_cols, drop_first = True)
df.head()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,insulin_Up,glyburide-metformin_No,glyburide-metformin_Steady,glyburide-metformin_Up,glipizide-metformin_Steady,glimepiride-pioglitazone_Steady,metformin-rosiglitazone_Steady,metformin-pioglitazone_Steady,change_No,diabetesMed_Yes
0,6,25,1,1,41,0,1,0,0,0,...,False,True,False,False,False,False,False,False,True,False
1,1,1,7,3,59,0,18,0,0,0,...,True,True,False,False,False,False,False,False,False,True
2,1,1,7,2,11,5,13,2,0,1,...,False,True,False,False,False,False,False,False,True,True
3,1,1,7,2,44,1,16,0,0,0,...,True,True,False,False,False,False,False,False,False,True
4,1,1,7,1,51,0,8,0,0,0,...,False,True,False,False,False,False,False,False,False,True


In [7]:
# Feature-target Classification
x= df.drop("readmitted", axis=1)
y= df["readmitted"]

In [8]:
# train-test split
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size = 0.2, random_state= 42, stratify = y)

In [9]:
# handle imbalance with SMOTE

smote= SMOTE(random_state= 42)

x_train_bal, y_train_bal = smote.fit_resample(x_train, y_train)

print("\nBefore SMOTE:", np.bincount(y_train))
print("After SMOTE:", np.bincount(y_train_bal))


Before SMOTE: [43891 37521]
After SMOTE: [43891 43891]


In [14]:
# Hyperparameter-tuning

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5], 
    "min_samples_leaf": [1, 2]
}

rf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(rf, param_grid, cv = 3, scoring = "f1", n_jobs=1, verbose=1)
grid.fit(x_train, y_train)

print("\n Best Parameters:", grid.best_params_)
best_model = grid.best_estimator_

Fitting 3 folds for each of 24 candidates, totalling 72 fits

 Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}


In [15]:
# Evaluate model
y_pred = model.predict(x_test)

print("Confusion matrix")
print(confusion_matrix(y_test, y_pred))
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion matrix
[[7125 3848]
 [3665 5716]]
Classification Report
              precision    recall  f1-score   support

           0       0.66      0.65      0.65     10973
           1       0.60      0.61      0.60      9381

    accuracy                           0.63     20354
   macro avg       0.63      0.63      0.63     20354
weighted avg       0.63      0.63      0.63     20354



In [17]:
# Subgroup Analysis (Gender)
# -------------------------------
print("\n--- Gender Subgroup Performance ---")

# Gender mapping is already one-hot encoded: gender_Male (1=Male, 0=Female)
male_idx = x_test['gender_Male'] == 1
female_idx = x_test['gender_Male'] == 0

print("\nMale Patients:")
print(classification_report(y_test[male_idx], y_pred[male_idx]))

print("\nFemale Patients:")
print(classification_report(y_test[female_idx], y_pred[female_idx]))


--- Gender Subgroup Performance ---

Male Patients:
              precision    recall  f1-score   support

           0       0.66      0.68      0.67      5270
           1       0.58      0.57      0.57      4162

    accuracy                           0.63      9432
   macro avg       0.62      0.62      0.62      9432
weighted avg       0.63      0.63      0.63      9432


Female Patients:
              precision    recall  f1-score   support

           0       0.66      0.62      0.64      5703
           1       0.61      0.64      0.63      5219

    accuracy                           0.63     10922
   macro avg       0.63      0.63      0.63     10922
weighted avg       0.63      0.63      0.63     10922

