**Project Title:Hospital Readmission Predictor and Decision Support System**

In [1]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option("display.max_columns",200)

**Step 1: Loading the Data and Basic Data Exploration**

In [2]:
#oading dataset
data = pd.read_csv("diabetic_data.csv")

In [3]:
#view the first few rows
data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
#view columns on the dataset
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [5]:
# View general information: data types, non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [6]:
#Get the shape (rows, columns) of the dataset
data.shape

(101766, 50)

**Descriptive Statistics & Missing Data Analysis**

In [7]:
# Generate descriptive statistics for numeric columns
data.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [8]:
# For categorical columns, you can inspect unique values and counts
data['race'].value_counts()
data['gender'].value_counts()

gender
Female             54708
Male               47055
Unknown/Invalid        3
Name: count, dtype: int64

In [9]:
# Check missing values for each feature
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride     

In [11]:
# Drop non-predictive columns (IDs)
data = data.drop(['encounter_id', 'patient_nbr'], axis=1)

In [12]:
data['readmitted'] = data['readmitted'].apply(lambda x: 0 if x == 'NO' else 1)

In [13]:
# Columns with missing values (example: 'weight', 'payer_code', 'medical_specialty')
# Impute categorical columns with 'missing' and numerical with median
data['weight'] = data['weight'].fillna('missing')
data['payer_code'] = data['payer_code'].fillna('missing')
data['medical_specialty'] = data['medical_specialty'].fillna('missing')

# For numerical columns (if any missing):
# data['num_medications'] = data['num_medications'].fillna(data['num_medications'].median())

In [24]:
# Example mapping (replace with actual CCS codes)
def map_diag(code):
    code = str(code)
    if code.startswith(('390', '391', '402')): return 'hypertension'
    elif code.startswith(('250')): return 'diabetes'
    elif code.startswith(('428')): return 'heart_failure'
    else: return 'other'

for col in ['diag_1', 'diag_2', 'diag_3']:
    data[col] = data[col].apply(map_diag)

In [25]:
data['medication_intensity'] = data['num_medications'] / (data['time_in_hospital'] + 1e-5)
data['total_visits'] = data['number_inpatient'] + data['number_outpatient']

In [26]:
from sklearn.model_selection import train_test_split

X = data.drop('readmitted', axis=1)
y = data['readmitted']

# Split data (stratify to preserve class distribution)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [27]:
#Preprocessing Pipeline

In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Categorical columns
cat_cols = ['race', 'gender', 'age', 'weight', 'admission_type_id', 
            'discharge_disposition_id', 'admission_source_id', 'payer_code', 
            'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 
            'A1Cresult', 'metformin', 'insulin', 'change', 'diabetesMed']  # + other medication columns

# Numerical columns
num_cols = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 
            'num_medications', 'number_outpatient', 'number_emergency', 
            'number_inpatient', 'number_diagnoses']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])

# Apply preprocessing
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [29]:
# Handle Class Imbalance with SMOTE

In [31]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_preprocessed, y_train)

In [32]:
#Build a Deep Learning Model

In [33]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

from tensorflow.keras.layers import BatchNormalization

model = Sequential([
    Dense(128, activation='relu', kernel_regularizer='l2', input_dim=input_dim),
    BatchNormalization(),
    Dropout(0.6),
    Dense(64, activation='relu', kernel_regularizer='l2'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [35]:
input_dim = X_train_balanced.shape[1]
print("Input dimension:", input_dim)  # Should match your model’s input layer

Input dimension: 198


In [36]:
model = Sequential([
    Dense(128, activation='relu', kernel_regularizer='l2', input_dim=198),  # Use input_dim=198
    BatchNormalization(),
    Dropout(0.6),
    Dense(64, activation='relu', kernel_regularizer='l2'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid')
])

In [37]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

# Verify class weights
class_counts = np.bincount(y_train_balanced)
print("Class distribution:", class_counts)  # Ensure classes are balanced

history = model.fit(
    X_train_balanced, y_train_balanced,
    epochs=100,
    batch_size=256,
    validation_split=0.2,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)]
)

Class distribution: [43891 43891]
Epoch 1/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.5182 - auc: 0.5078 - loss: 3.1895 - val_accuracy: 0.4263 - val_auc: 0.5917 - val_loss: 2.5682
Epoch 2/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.5286 - auc: 0.5329 - loss: 2.5706 - val_accuracy: 0.5018 - val_auc: 0.6242 - val_loss: 2.1423
Epoch 3/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.5336 - auc: 0.5403 - loss: 2.1634 - val_accuracy: 0.5207 - val_auc: 0.6345 - val_loss: 1.8443
Epoch 4/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.5439 - auc: 0.5551 - loss: 1.8583 - val_accuracy: 0.5291 - val_auc: 0.6428 - val_loss: 1.6236
Epoch 5/100
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.5476 - auc: 0.5607 - loss: 1.6419 - val_accuracy: 0.5273 - val_auc: 0.6461 - va

In [39]:
results = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}, Test AUC: {results[2]}")

ValueError: Invalid dtype: object

In [None]:
# Predict on test data
y_pred_prob = model.predict(X_test_preprocessed).ravel()
y_pred = (y_pred_prob > best_threshold).astype(int)  # Use optimized threshold

# Metrics
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_prob))
print(classification_report(y_test, y_pred))

In [None]:
import shap

# Sample 1000 training instances for SHAP
background = X_train_balanced[:1000]
explainer = shap.DeepExplainer(model, background)
shap_values = explainer.shap_values(X_test_preprocessed[:1000])

# Plot feature importance
shap.summary_plot(shap_values, X_test_preprocessed, feature_names=feature_names)

In [None]:
model.save('readmission_model.h5')
joblib.dump(preprocessor, 'preprocessor.pkl')