In [333]:
import pandas as pd
import numpy as np

## Load Data

In [334]:
IDS_MAPPING_FN = "./data/IDS_mapping.csv"
DIABETIC_FN = "./data/diabetic_data.csv"

In [335]:
# read files
mapping = pd.read_csv(IDS_MAPPING_FN, header=None)
df = pd.read_csv(DIABETIC_FN)

In [336]:
MEDIC_COLUMNS = df.columns[23:46].tolist()
PREVIOUS_HOSPITAL_ENCOUNTERS = ["number_outpatient", "number_inpatient", "number_emergency"]

Create dictionary of code-value mappings of `admission_type_id`, `discharge_disposition_id`, and `admission_source_id` using the mapping provided in the data-folder, and map integer values to string values for readability. 

In [337]:
admission_type_dict = {}
discharge_disposition_dict = {}
admission_source_dict = {}

list1 = []
for i, j in zip(mapping[0].values, mapping[1].values):
  if len(str(i))>3:
    feature_name = i
  elif len(str(i))!=3:
    if feature_name == 'admission_type_id':
      admission_type_dict[int(i)] = j
    elif feature_name == 'discharge_disposition_id':
      discharge_disposition_dict[int(i)] = j
    elif feature_name == 'admission_source_id':
      admission_source_dict[int(i)] = j


df['admission_type'] = df['admission_type_id'].map(admission_type_dict).astype("O")
df['discharge_disposition'] = df['discharge_disposition_id'].map(discharge_disposition_dict).astype("O")
df['admission_source'] = df['admission_source_id'].map(admission_source_dict).astype("O")

In [338]:
print(f"Number of unique encounters: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

Number of unique encounters: 101766
Number of columns: 53


## Preprocessing

Some patients have many encounters (up to 40).

We only keep the first observation for each unique patient to treat them as i.i.d random variables.

We filter to only keep observations with `admission_type` $\in$ [Emergency, Urgent, Elective].

In [339]:
df = df.groupby("patient_nbr").agg('first').reset_index()
df = df[df['admission_type'].isin(['Emergency', 'Urgent', 'Elective'])]
print(f"Number of unique encounters after only keeping first encounter for each patient and filtering by admission_type : {df.shape[0]}")

Number of unique encounters after only keeping first encounter for each patient and filtering by admission_type : 63757


#### Feature Engineering: Diabetic Information

To select features as predictor variables we are only interested in certain information about the patients, but not the exact values. Thus we aggregate the information of sets of columns into new columns:


* Blood Glucose Tests

    * `max_glu_serum_flag`: Whether a max glucose serum test was done at the hospital : `max_glu_serum`


    * `A1C_flag`: Whether an AC1 test was performed to monitor blood glucose levels : `AC1result`

    
* Diabetic Medication Information: 

    * `change_dosage`: Whether there was any change ("Up" or "Down") in the diabetic prescription dosages as a result of the hospital visit.
        * *The column `change` indicates if there was a change in diabetic medications (either dosage or generic name). So if `change` is marked as changed but `change_dosage` is none, then there must have been a change in the generic name, i.e. the chemical name of a medicine.*


    * `change_medicine`: Whether there was prescribed any new diabetic medication i.e. a change in the medicament as a result of the hospital visit.


    * `num_diabetic_prescriptions`: How many diabetic prescriptions the patient ongoingly had at the time of hospital visit - a count of entries that are $\in \{\text{"Steady"}, \text{"Up"}, \text{"Down"}\}$ in `MEDIC_COLUMNS`


* Admitted in the hospital within the previous year

    * `prev_year_hospital`: Whether the patient had any admissions in the hospital during the past year. 

* Health Insurance / Coverage

    * `blue_cross`: Patient has private insurance
    * `medicaid`: Patient has medicaid
    * `medicare`: Patient has medicare
    * `self_payed`: Patient payed up front

* Readmitted
    * `readmitted_flag`: Whether the patient was readmitted or not within 30 days based on the columns `readmitted`


In [340]:
#### Blood glucose Tests ####
df['max_glu_serum_flag'] = df['max_glu_serum'].notnull().astype(int) # Max Glucose Serum test flag
df['A1C_flag'] = df['A1Cresult'].notnull().astype(int) # AC1 test flag

#### Diabetic Medication Information ####
# Change in dosage if any of the diabetics prescriptions has entries "Up" or "Down"
df['change_dosage'] = df[MEDIC_COLUMNS].isin(['Up', 'Down']).any(axis=1).astype(int)
# Check if each entry is in the set ['Up', 'Down', 'Steady'] and sum all True entries for each row
df['num_diabetic_prescriptions'] = df[MEDIC_COLUMNS].apply(lambda col: np.isin(col, ['Up', 'Down', 'Steady'])).sum(axis=1).astype(int)
# Change in medicine is assumed to be the case when original change column = 1 but change in dosage = 0
df['change_medicine'] = np.where((df['change'] == 'Ch') & (df['change_dosage'] == 0), 1, 0)

#### Hospital Encounters during preceding year ####
df['prev_year_hospital'] = (df[PREVIOUS_HOSPITAL_ENCOUNTERS] > 0).any(axis=1).astype(int)

#### Insurance Billing ####
df['blue_cross'] = np.where(df["payer_code"]=="BC", 1, 0)
df['medicare'] = np.where(df["payer_code"]=="MC", 1, 0)
df['medicaid'] = np.where(df["payer_code"]=="MD", 1, 0)
df['self_pay'] = np.where(df["payer_code"]=="SP", 1, 0)

#### Readmitted ####
# y_i
# df['readmitted_flag'] = np.where(df['readmitted']=='<30', 1, 0) # Readmitted flag
df['readmitted_flag'] = np.where(df['readmitted']=='NO', 0, 1)

## Define variables

In [341]:
FEATURES = ["time_in_hospital", "admission_type_id", "num_lab_procedures", "num_procedures", 
            "num_medications", 'prev_year_hospital', "number_diagnoses", 'max_glu_serum_flag', 
            'A1C_flag', 'change_dosage', 'change_medicine', 'num_diabetic_prescriptions', "blue_cross", "medicaid", "medicare","self_pay"]
PROTECTED_FEATURES = ['age', 'race', 'gender']
PATIENTS = ["patient_nbr"]  
TARGET = ["readmitted_flag"]

In [342]:
# df_50_60 = df[df["age"]== "[50-60)"]
# df_60_70 = df[df["age"]== "[60-70)"]
# df_70_80 = df[df["age"]== "[70-80)"]
# df_80_90 = df[df["age"]== "[80-90)"]
# df = pd.concat([df_80_90])

In [343]:
df = df[PATIENTS + FEATURES + PROTECTED_FEATURES + TARGET]
print(df.shape, df["readmitted_flag"].value_counts())

(63757, 21) 0    38626
1    25131
Name: readmitted_flag, dtype: int64


In [375]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = df[df['readmitted_flag'] == 0]
df_minority = df[df['readmitted_flag'] == 1]

# Downsample the majority class to have the same number of observations as the minority class
df_majority_downsampled = resample(df_majority,
                                   replace=False,  # Sample without replacement
                                   n_samples=len(df_minority),  # Number of samples to match minority class
                                   random_state=42)  # For reproducibility

# Combine the downsampled majority class with the minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Shuffle the DataFrame to randomize the order of samples
df_downsampled = df_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the number of samples for each class after downsampling
print(df_downsampled[TARGET].value_counts())

readmitted_flag
0                  25131
1                  25131
dtype: int64


In [376]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
# X,y = df[FEATURES].to_numpy(), df[TARGET].to_numpy()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


X,y = df_downsampled[FEATURES].to_numpy(), df_downsampled[TARGET].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [377]:
from keras.regularizers import L1, L2
import keras
import tensorflow as tf

In [384]:
inputs = keras.Input(shape=(16,))
x = keras.layers.Dense(32, activation="relu")(inputs)
x = keras.layers.Dense(8, activation="relu")(x)
outputs = keras.layers.Dense(1, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

In [385]:
model.fit(X_train,y_train, epochs=10, batch_size=100, validation_split=0.2,callbacks=[keras.callbacks.EarlyStopping(patience=2)],shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2f3c847c0>

In [390]:
model.predict(X_test)

np.argmax(model.predict(X_test), axis=1)



array([0, 0, 0, ..., 0, 0, 0])

In [387]:
model.evaluate(X_test, y_test)



[0.7472116947174072, 0.5012434124946594]

In [398]:
# Create LightGBM Datasets for training and validation 
train_data = lgb.Dataset(X_train, label=y_train) 
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data) 
  
# Define hyperparameters and objective for LightGBM 
params = { 
    'objective': 'binary', 
    'metric': 'accuracy', 
    'boosting_type': 'gbdt', 
    'num_leaves': 31, 
    'learning_rate': 0.05, 
    'feature_fraction': 0.9, 
} 

In [423]:
num_round = 100
  
# Train a LightGBM model using defined parameters, training data, and specified number of rounds 
model = lgb.train(params, train_data, 
                  num_round, valid_sets=[test_data]) 

[LightGBM] [Info] Number of positive: 20092, number of negative: 20117
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000471 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 241
[LightGBM] [Info] Number of data points in the train set: 40209, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499689 -> initscore=-0.001244
[LightGBM] [Info] Start training from score -0.001244


In [424]:
y_pred_lgb = np.argmax(np.array(model.predict(X_test)).reshape(-1,1),axis=1)
# print(np.array(y_pred_lgb).reshape(-1,1).shape)
# print(y_test.shape)
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)

In [425]:
accuracy_lgb

0.49875659007261514