# Patient Readmission Prediction

## Model Creation

In [52]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.optimizers import Adam
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### 1. Read the dataset

In [53]:
df = pd.read_csv("..//Dataset/Modified_Actual_Data.csv") # reading the dataset

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,race,admittime,dischtime,hospital_expire_flag,readmission_flag,num_procedures,num_unique_procedures,num_unique_icd_codes,num_unique_drugs,num_prescribed_days,num_prescription_records,num_hospital_admissions,proc_icd_codes,diag_icd_codes,top_drg_code,top_drg_type,top_drg_severity,top_drg_mortality,num_emar_medications,num_abnormal_labevents,num_prev_emergency,num_prev_non_emergency,num_prev_general_practice,num_prev_general_surgery,num_prev_internal_medicine,age,charlson_comorbidity_index,diag_vector,proc_vector,length_of_stay
0,0,18671630,29554955,BLACK/AFRICAN AMERICAN,2198-02-24 12:05:00,2198-03-01 14:35:00,0,0,6.0,5.0,18.0,23.0,78.0,29.0,3,10E0XZZ 10907ZC 0UQMXZZ 0WCG0ZZ 0UB70ZZ 0UB70ZZ,O480 Z370 K661 D62 O721 O26833 O9943 Z20822 Z3...,542.0,APR,3.0,2.0,45.0,44.0,0,2,0,0,0,33,0,[-0.02374649 -0.02005619 -0.05349126 -0.011907...,[ 0.00276054 -0.00779697 -0.01796394 -0.017818...,5
1,1,13118941,23589399,WHITE,2148-04-30 17:55:00,2148-05-02 18:20:00,0,0,2.0,2.0,8.0,12.0,24.0,13.0,2,3201 3391,1628 1961 5180 53081 30002 4430 V103 V4571,136.0,APR,2.0,2.0,0.0,14.0,1,0,0,0,4,55,7,[-0.01768944 -0.00689523 0.01893886 0.011578...,[-5.16450312e-03 -2.06137542e-03 -9.90867801e-...,2
2,2,10556002,24958178,WHITE,2158-02-05 16:29:00,2158-02-07 16:17:00,0,0,1.0,1.0,8.0,11.0,24.0,15.0,2,966,2883 E9361 51889 4928 2722 34590 V441 78079,663.0,APR,2.0,1.0,0.0,15.0,1,0,0,0,1,60,2,[ 0.02376392 -0.01334761 0.00358065 0.006234...,[-0.00515624 -0.00666834 -0.00777684 0.008310...,1
3,3,10730408,27077770,WHITE,2187-05-04 20:49:00,2187-05-06 08:28:00,0,0,1.0,1.0,6.0,13.0,18.0,13.0,3,8611,1737 7824 6959 7823 29590 42731,385.0,APR,2.0,2.0,0.0,21.0,0,2,0,1,4,81,4,[-0.00067335 -0.03270201 0.01627167 -0.009883...,[-0.00764096 0.00698509 0.00660495 -0.003101...,1
4,4,10892159,29607512,HISPANIC OR LATINO,2129-01-24 15:10:00,2129-01-26 15:06:00,0,0,0.0,0.0,4.0,13.0,24.0,16.0,1,,78900 2809 5589 311,251.0,APR,1.0,1.0,0.0,18.0,0,0,0,0,0,62,2,[ 0.01981758 -0.00856469 -0.00994193 -0.013454...,0.0,1


In [55]:
df.drop(['proc_icd_codes', 'diag_icd_codes', 'Unnamed: 0', 'admittime', 'dischtime', 'subject_id', 'hadm_id'], axis=1, inplace=True)

# dropping unncessary columns like dates, IDs, time.

In [None]:
df.head()

Unnamed: 0,race,hospital_expire_flag,readmission_flag,num_procedures,num_unique_procedures,num_unique_icd_codes,num_unique_drugs,num_prescribed_days,num_prescription_records,num_hospital_admissions,top_drg_code,top_drg_type,top_drg_severity,top_drg_mortality,num_emar_medications,num_abnormal_labevents,num_prev_emergency,num_prev_non_emergency,num_prev_general_practice,num_prev_general_surgery,num_prev_internal_medicine,age,charlson_comorbidity_index,diag_vector,proc_vector,length_of_stay
0,BLACK/AFRICAN AMERICAN,0,0,6.0,5.0,18.0,23.0,78.0,29.0,3,542.0,APR,3.0,2.0,45.0,44.0,0,2,0,0,0,33,0,[-0.02374649 -0.02005619 -0.05349126 -0.011907...,[ 0.00276054 -0.00779697 -0.01796394 -0.017818...,5
1,WHITE,0,0,2.0,2.0,8.0,12.0,24.0,13.0,2,136.0,APR,2.0,2.0,0.0,14.0,1,0,0,0,4,55,7,[-0.01768944 -0.00689523 0.01893886 0.011578...,[-5.16450312e-03 -2.06137542e-03 -9.90867801e-...,2
2,WHITE,0,0,1.0,1.0,8.0,11.0,24.0,15.0,2,663.0,APR,2.0,1.0,0.0,15.0,1,0,0,0,1,60,2,[ 0.02376392 -0.01334761 0.00358065 0.006234...,[-0.00515624 -0.00666834 -0.00777684 0.008310...,1
3,WHITE,0,0,1.0,1.0,6.0,13.0,18.0,13.0,3,385.0,APR,2.0,2.0,0.0,21.0,0,2,0,1,4,81,4,[-0.00067335 -0.03270201 0.01627167 -0.009883...,[-0.00764096 0.00698509 0.00660495 -0.003101...,1
4,HISPANIC OR LATINO,0,0,0.0,0.0,4.0,13.0,24.0,16.0,1,251.0,APR,1.0,1.0,0.0,18.0,0,0,0,0,0,62,2,[ 0.01981758 -0.00856469 -0.00994193 -0.013454...,0.0,1


In [None]:
df.readmission_flag.value_counts() # checking the split of the dataset

readmission_flag
0    424009
1    110229
Name: count, dtype: int64

### 2. Handling Categorical Variables

In [58]:
# adding one-hot encodings of the column 'race'

def add_dummies(df, column_name):
    dummies = pd.get_dummies(df[column_name], prefix=column_name).astype(int)
    df.drop(column_name, axis=1, inplace=True)
    df[dummies.columns] = dummies

add_dummies(df, 'race')

In [None]:
df.head()

Unnamed: 0,hospital_expire_flag,readmission_flag,num_procedures,num_unique_procedures,num_unique_icd_codes,num_unique_drugs,num_prescribed_days,num_prescription_records,num_hospital_admissions,top_drg_code,top_drg_type,top_drg_severity,top_drg_mortality,num_emar_medications,num_abnormal_labevents,num_prev_emergency,num_prev_non_emergency,num_prev_general_practice,num_prev_general_surgery,num_prev_internal_medicine,age,charlson_comorbidity_index,diag_vector,proc_vector,length_of_stay,race_AMERICAN INDIAN/ALASKA NATIVE,race_ASIAN,race_ASIAN - ASIAN INDIAN,race_ASIAN - CHINESE,race_ASIAN - KOREAN,race_ASIAN - SOUTH EAST ASIAN,race_BLACK/AFRICAN,race_BLACK/AFRICAN AMERICAN,race_BLACK/CAPE VERDEAN,race_BLACK/CARIBBEAN ISLAND,race_HISPANIC OR LATINO,race_HISPANIC/LATINO - CENTRAL AMERICAN,race_HISPANIC/LATINO - COLUMBIAN,race_HISPANIC/LATINO - CUBAN,race_HISPANIC/LATINO - DOMINICAN,race_HISPANIC/LATINO - GUATEMALAN,race_HISPANIC/LATINO - HONDURAN,race_HISPANIC/LATINO - MEXICAN,race_HISPANIC/LATINO - PUERTO RICAN,race_HISPANIC/LATINO - SALVADORAN,race_MULTIPLE RACE/ETHNICITY,race_NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER,race_OTHER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,0,0,6.0,5.0,18.0,23.0,78.0,29.0,3,542.0,APR,3.0,2.0,45.0,44.0,0,2,0,0,0,33,0,[-0.02374649 -0.02005619 -0.05349126 -0.011907...,[ 0.00276054 -0.00779697 -0.01796394 -0.017818...,5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,2.0,2.0,8.0,12.0,24.0,13.0,2,136.0,APR,2.0,2.0,0.0,14.0,1,0,0,0,4,55,7,[-0.01768944 -0.00689523 0.01893886 0.011578...,[-5.16450312e-03 -2.06137542e-03 -9.90867801e-...,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,1.0,1.0,8.0,11.0,24.0,15.0,2,663.0,APR,2.0,1.0,0.0,15.0,1,0,0,0,1,60,2,[ 0.02376392 -0.01334761 0.00358065 0.006234...,[-0.00515624 -0.00666834 -0.00777684 0.008310...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,1.0,1.0,6.0,13.0,18.0,13.0,3,385.0,APR,2.0,2.0,0.0,21.0,0,2,0,1,4,81,4,[-0.00067335 -0.03270201 0.01627167 -0.009883...,[-0.00764096 0.00698509 0.00660495 -0.003101...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0.0,0.0,4.0,13.0,24.0,16.0,1,251.0,APR,1.0,1.0,0.0,18.0,0,0,0,0,0,62,2,[ 0.01981758 -0.00856469 -0.00994193 -0.013454...,0.0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
add_dummies(df, 'top_drg_type') # adding one hot encoding of 'top_drg_type' column
df.head()

Unnamed: 0,hospital_expire_flag,readmission_flag,num_procedures,num_unique_procedures,num_unique_icd_codes,num_unique_drugs,num_prescribed_days,num_prescription_records,num_hospital_admissions,top_drg_code,top_drg_severity,top_drg_mortality,num_emar_medications,num_abnormal_labevents,num_prev_emergency,num_prev_non_emergency,num_prev_general_practice,num_prev_general_surgery,num_prev_internal_medicine,age,charlson_comorbidity_index,diag_vector,proc_vector,length_of_stay,race_AMERICAN INDIAN/ALASKA NATIVE,race_ASIAN,race_ASIAN - ASIAN INDIAN,race_ASIAN - CHINESE,race_ASIAN - KOREAN,race_ASIAN - SOUTH EAST ASIAN,race_BLACK/AFRICAN,race_BLACK/AFRICAN AMERICAN,race_BLACK/CAPE VERDEAN,race_BLACK/CARIBBEAN ISLAND,race_HISPANIC OR LATINO,race_HISPANIC/LATINO - CENTRAL AMERICAN,race_HISPANIC/LATINO - COLUMBIAN,race_HISPANIC/LATINO - CUBAN,race_HISPANIC/LATINO - DOMINICAN,race_HISPANIC/LATINO - GUATEMALAN,race_HISPANIC/LATINO - HONDURAN,race_HISPANIC/LATINO - MEXICAN,race_HISPANIC/LATINO - PUERTO RICAN,race_HISPANIC/LATINO - SALVADORAN,race_MULTIPLE RACE/ETHNICITY,race_NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER,race_OTHER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN,top_drg_type_APR,top_drg_type_HCFA
0,0,0,6.0,5.0,18.0,23.0,78.0,29.0,3,542.0,3.0,2.0,45.0,44.0,0,2,0,0,0,33,0,[-0.02374649 -0.02005619 -0.05349126 -0.011907...,[ 0.00276054 -0.00779697 -0.01796394 -0.017818...,5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,2.0,2.0,8.0,12.0,24.0,13.0,2,136.0,2.0,2.0,0.0,14.0,1,0,0,0,4,55,7,[-0.01768944 -0.00689523 0.01893886 0.011578...,[-5.16450312e-03 -2.06137542e-03 -9.90867801e-...,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,0,0,1.0,1.0,8.0,11.0,24.0,15.0,2,663.0,2.0,1.0,0.0,15.0,1,0,0,0,1,60,2,[ 0.02376392 -0.01334761 0.00358065 0.006234...,[-0.00515624 -0.00666834 -0.00777684 0.008310...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,0,0,1.0,1.0,6.0,13.0,18.0,13.0,3,385.0,2.0,2.0,0.0,21.0,0,2,0,1,4,81,4,[-0.00067335 -0.03270201 0.01627167 -0.009883...,[-0.00764096 0.00698509 0.00660495 -0.003101...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
4,0,0,0.0,0.0,4.0,13.0,24.0,16.0,1,251.0,1.0,1.0,0.0,18.0,0,0,0,0,0,62,2,[ 0.01981758 -0.00856469 -0.00994193 -0.013454...,0.0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


### 3. Oversampling to handle class imbalance 
As taught in class (test data class distribution == actual data class distribution)

In [62]:
# separate class distributions
df_class_0 = df[df['readmission_flag'] == 0]
df_class_1 = df[df['readmission_flag'] == 1]

# find the minority class size
min_size = min(len(df_class_0), len(df_class_1)) // 2

# downsample majority class (or upsample if needed)
df_class_0_balanced = df_class_0.sample(n=min_size, random_state=42)
df_class_1_balanced = df_class_1.sample(n=min_size, random_state=42)

# create a balanced training set (equal number of class 0 and class 1)
df_train_balanced = pd.concat([df_class_0_balanced, df_class_1_balanced])


### TESTING EQUALIZATION

# get remaining data for testing (keeping original distribution)
df_test = df.drop(df_train_balanced.index)
df_test_class_0 = df_test[df_test['readmission_flag'] == 0]
df_test_class_1 = df_test[df_test['readmission_flag'] == 1]

# create a balanced training set (equal number of class 0 and class 1)
df_test_balanced = pd.concat([df_test_class_0, df_test_class_1])

**Checking if the distribution are as expected after oversampling**

In [None]:
df_train_balanced.shape

(110228, 55)

In [None]:
df_train_balanced.readmission_flag.value_counts(normalize=True)

readmission_flag
0    0.5
1    0.5
Name: proportion, dtype: float64

In [None]:
df_test_balanced.shape

(424010, 55)

In [None]:
df_test_balanced.readmission_flag.value_counts(normalize=True)

readmission_flag
0    0.870015
1    0.129985
Name: proportion, dtype: float64

In [None]:
df.readmission_flag.value_counts(normalize=True)

readmission_flag
0    0.793671
1    0.206329
Name: proportion, dtype: float64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 534238 entries, 0 to 534237
Data columns (total 55 columns):
 #   Column                                          Non-Null Count   Dtype  
---  ------                                          --------------   -----  
 0   hospital_expire_flag                            534238 non-null  int64  
 1   readmission_flag                                534238 non-null  int64  
 2   num_procedures                                  534238 non-null  float64
 3   num_unique_procedures                           534238 non-null  float64
 4   num_unique_icd_codes                            534238 non-null  float64
 5   num_unique_drugs                                534238 non-null  float64
 6   num_prescribed_days                             534238 non-null  float64
 7   num_prescription_records                        534238 non-null  float64
 8   num_hospital_admissions                         534238 non-null  int64  
 9   top_drg_code              

### 4.0 Model Creation with Hyperparameter Tuning

In [None]:
# Define the models
models = {
    'LogisticRegression': LogisticRegression(max_iter=500),
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}

# Hyperparameters to be explored during GridSearchCV
param_grids = {
    'LogisticRegression': {'classifier__C': [0.1, 1, 10]},
    'RandomForest': {'classifier__n_estimators': [50, 100], 'classifier__max_depth': [10, 20]},
    'GradientBoosting': {'classifier__n_estimators': [50, 100], 'classifier__learning_rate': [0.01, 0.1]},
    'XGBoost': {'classifier__n_estimators': [50, 100], 'classifier__learning_rate': [0.01, 0.1]},
    'LightGBM': {'classifier__n_estimators': [50, 100], 'classifier__learning_rate': [0.01, 0.1]},
    'CatBoost': {'classifier__iterations': [50, 100], 'classifier__learning_rate': [0.01, 0.1]}
}

In [None]:
# Splitting the data into training and testing sets
X_train, X_test = df_train_balanced.drop(columns=['readmission_flag']), df_test_balanced.drop(columns=['readmission_flag'])
y_train, y_test = df_train_balanced['readmission_flag'], df_test_balanced['readmission_flag']


best_models = {}
# Cross-validation strategy
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Training and evaluating the best models
for name, model in tqdm(models.items(), desc="Training Models", unit="model"):
    pipeline = Pipeline([('classifier', model)])
    grid_search = GridSearchCV(pipeline, param_grids[name], cv=cv, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_


results = {}
# Evaluating the best models on the test set
for name, model in tqdm(best_models.items(), desc="Evaluating Models", unit="model"):
    y_pred = model.predict(X_test)
    results[name] = classification_report(y_test, y_pred, output_dict=True)

In [None]:
# Display the results
df_results = pd.DataFrame({name: results[name]['weighted avg'] for name in results}).T
df_results['accuracy'] = pd.Series({name: results[name]['accuracy'] for name in results})
print(df_results)

                    precision    recall  f1-score   support  accuracy
LogisticRegression   0.819767  0.714486  0.753779  424010.0  0.714486
RandomForest         0.833075  0.660543  0.715084  424010.0  0.660543
GradientBoosting     0.831537  0.667213  0.720205  424010.0  0.667213
XGBoost              0.834117  0.672034  0.724143  424010.0  0.672034
LightGBM             0.834393  0.670133  0.722684  424010.0  0.670133
CatBoost             0.832278  0.672977  0.724731  424010.0  0.672977
