In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score, roc_auc_score
from keras.callbacks import TensorBoard
import shap
import joblib

import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Using vaex to pre-process the large datasets
import vaex

In [None]:
# Read the raw CSV into a pandas DataFrame and print its column/dtype summary.
csv_path = '/kaggle/input/patient-survival-detection-dataset/Patient Survival Detection/Dataset.csv'
df = pd.read_csv(csv_path)
df.info()

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Show every column label of the raw frame.
df.columns

In [None]:
# Class balance of the target: number of rows per distinct hospital_death value.
df['hospital_death'].value_counts()

In [None]:
# Display the rows whose target value (hospital_death) is missing.
missing_target = df['hospital_death'].isna()
df[missing_target]

In [None]:
# Number of missing values per column, restricted to columns that have any.
null_counts = df.isnull().sum()
null_counts[null_counts != 0]

In [None]:
# Percentage of missing values in each column.
df.isnull().mean()*100

In [None]:
# Percentage of rows that contain at least one missing value.
# Scaled by 100 so the figure really is a percentage, matching the
# per-column percentages computed with `df.isnull().mean()*100`.
df.isnull().any(axis=1).mean() * 100

In [None]:
# Load the same CSV file a second time, this time through vaex.from_csv_arrow,
# and print the resulting frame's summary.
# NOTE(review): `ndf` is not referenced again in the visible part of the notebook;
# all later processing uses the pandas frame `df` — confirm whether this cell is still needed.
ndf = vaex.from_csv_arrow('/kaggle/input/patient-survival-detection-dataset/Patient Survival Detection/Dataset.csv')
ndf.info()

- Filter the dataset down to a subset of key columns
- Remove the vitamins columns to reduce the dataset size

In [None]:
# Keep only the first 19 columns of the raw frame (positional slice) and summarise them.
kdf = df.iloc[:,:19]
kdf.info()

In [None]:
# Extend the key-column subset with the 'aids' and 'diabetes_mellitus' columns
# taken from the raw frame, placing them side by side.
extra_cols = df[['aids', 'diabetes_mellitus']]
fdf = pd.concat([kdf, extra_cols], axis='columns')
fdf.info()

- Removing redundant columns from the dataset

In [None]:
# Drop the two id columns and readmission_status from the working frame,
# and keep a separate handle on the target column.
id_like_cols = ['encounter_id', 'patient_id', 'readmission_status']
mdf = fdf.drop(columns=id_like_cols)
target = fdf['hospital_death']

In [None]:
# Correlation heatmap of the numeric features.
# mdf still contains object (string) columns at this point — they are plotted and
# ordinal-encoded further down — and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0. Restricting to numeric columns first works on every pandas version.
numeric_corr = mdf.select_dtypes(include='number').corr()
plt.figure(figsize=(10,10))
sns.heatmap(numeric_corr, annot=True, vmin=-1, vmax=1)
plt.show()

## Handling missing values

In [None]:
# Fraction of rows in which albumin_apache is missing.
albumin_missing = mdf['albumin_apache'].isnull()
albumin_missing.sum() / len(albumin_missing)

In [None]:
# Impute missing values in the numeric body-measurement columns with the column mean.
# Every column is filled by plain assignment: calling `.fillna(..., inplace=True)` on
# `mdf['weight']` operates on an intermediate Series and is not guaranteed to write
# back into `mdf` (it is a no-op under pandas copy-on-write).
for col in ['age', 'bmi', 'height', 'weight']:
    mdf[col] = mdf[col].fillna(mdf[col].mean())

In [None]:
# Fill missing admit-source categories with fixed replacement values (the values the
# author identified as the most frequent ones).
# Assignment is used instead of `mdf[col].replace(..., inplace=True)`, which acts on an
# intermediate Series and may not propagate to `mdf` (no-op under copy-on-write).
mdf['hospital_admit_source'] = mdf['hospital_admit_source'].fillna('Emergency Department')
mdf['icu_admit_source'] = mdf['icu_admit_source'].fillna('Accident & Emergency')
# Remove albumin_apache entirely; errors='ignore' keeps the cell re-runnable.
mdf = mdf.drop(columns='albumin_apache', errors='ignore')

In [None]:
# Discard every row that still has at least one missing value, then summarise.
mdf = mdf.dropna(axis=0, how='any')
mdf.info()

## EDA

In [None]:
# One boxplot per non-object column, split by the target class.
# The target itself is excluded: a boxplot of hospital_death grouped by hospital_death
# carries no information. The unused enumerate() index is dropped as well.
non_object_mask = (mdf.dtypes != 'object').to_numpy()
numeric_cols = mdf.columns[non_object_mask].drop('hospital_death', errors='ignore')
for col in numeric_cols:
    plt.figure(figsize=(6,6))
    sns.boxplot(x='hospital_death', y=col, data=mdf)
    plt.show()

In [None]:
# One point plot per object-dtype column: mean hospital_death per category level.
object_mask = (mdf.dtypes == 'object').to_numpy()
categorical_cols = mdf.columns[object_mask]
for col in categorical_cols:
    plt.figure(figsize=(6, 6))
    sns.pointplot(x=col, y='hospital_death', data=mdf)
    plt.xticks(rotation=30)
    plt.show()

### Conclusions:

- Numerical columns are not indicative of the target column
- Categorical columns are more informative about the target variable than numerical columns

### Next steps:

- Feature encoding
- Imbalance data treatment
- Select KBest features
- Feature scaling/transformation 
- NN modeling

In [None]:
# Look at the columns from 'aids' through 'solid_tumor_with_metastasis' (label slice
# of the raw frame) against the death outcome.
# The x values come from `ldf` while the y values come from the cleaned `mdf`,
# which has fewer rows than `ldf` after the earlier dropna.
ldf = df.loc[:, 'aids':'solid_tumor_with_metastasis']
death_flag = mdf['hospital_death']
for col in ldf.columns:
    plt.figure(figsize=(6, 6))
    sns.pointplot(x=col, y=death_flag, data=ldf)
    plt.xticks(rotation=30)
    plt.show()

In [None]:
# Flag columns with incomplete rows removed; this frame is merged into mdf next.
cleand_ldf = ldf.dropna(axis='index', how='any')

In [None]:
# Attach the flag columns to the cleaned feature table.
# Both frames carry the raw frame's row labels, so a plain index-on-index join is enough.
# - how='inner' keeps only rows present in both frames, so no NaNs are introduced
#   (a left join would leave NaNs for rows dropped from cleand_ldf, which the later
#   resampling step cannot handle).
# - Columns already present in mdf ('aids', 'diabetes_mellitus') are not joined a second
#   time, avoiding duplicated features under suffixed names.
# - Passing `on=mdf.index` is avoided: an array-like key typically adds a stray 'key_0'
#   column holding the row labels, which would then be treated as a feature.
duplicate_cols = cleand_ldf.columns.intersection(mdf.columns)
new_df = mdf.join(cleand_ldf.drop(columns=duplicate_cols), how='inner')
new_df.info()

### Encoding categorical columns

In [None]:
# Separate the target column from the feature columns.
tar = new_df['hospital_death']
X_new = new_df.drop(columns='hospital_death')

In [None]:
# Renumber the feature rows 0..n-1, discarding the old row labels.
# reset_index(drop=True) does this in one step without rebinding X_new, so the cell is
# idempotent: re-running the previous two-step version (reset_index() then
# drop('index')) left an extra 'level_0' column behind on the second run.
X_new_rein = X_new.reset_index(drop=True)

In [None]:
# Categorical feature columns to be converted into ordinal integer codes.
columns = [
    'ethnicity',
    'gender',
    'hospital_admit_source',
    'icu_admit_source',
    'icu_stay_type',
    'icu_type',
]

# Fit the encoder and transform in a single pass, then wrap the result in a frame
# that shares X_new_rein's row index so it can be written back column-wise.
encoder = OrdinalEncoder()
encoded_X = encoder.fit_transform(X_new_rein[columns])
encoded_df = pd.DataFrame(encoded_X, columns=columns, index=X_new_rein.index)
encoded_df.info()

In [None]:
# Overwrite the categorical columns in place with their ordinal-encoded values.
X_new_rein[columns] = encoded_df
# X_new_rein is now the final feature frame used for class balancing and modeling.
# Disabled leftover (kept for reference): removal of a 'level_0' column.
#X_nn = X_new_rein.drop('level_0', axis=1)
X_new_rein.info()

In [None]:
# Give the target Series the same row index as the feature frame so that the two
# line up row by row (assigned positionally: the i-th label goes to the i-th target value).
tar.index = X_new_rein.index

## Upsampling to balance the dataset

In [None]:
# Oversample with SMOTE (every class except the majority one) before feature
# selection and modeling.
# NOTE(review): resampling is applied to the full data set here, i.e. before the
# train/test split performed later in the notebook, so synthetic samples can end up
# in the test partition — confirm this is intended.
smote = SMOTE(random_state=42, sampling_strategy='not majority')
X_res, y_res = smote.fit_resample(X=X_new_rein, y=tar)

## Feature selection function

In [None]:
def feature_sel(method, features, X=None, y=None):
    """Keep the `features` highest-scoring columns according to `method`.

    Parameters
    ----------
    method : callable
        Univariate scoring function handed to ``SelectKBest`` (for example
        ``f_classif``, ``chi2`` or ``mutual_info_classif``). Any scoring function
        is accepted, not only the two that used to be special-cased.
    features : int or "all"
        Number of columns to keep; forwarded to ``SelectKBest`` as ``k``.
    X, y : optional
        Feature matrix and target to select on. When omitted, the notebook-level
        resampled data ``X_res`` / ``y_res`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        The selected columns, labelled with the selector's output feature names.

    Raises
    ------
    TypeError
        If `method` is not callable. Previously an unsupported method fell through
        both branches and failed with an unrelated UnboundLocalError.
    """
    if not callable(method):
        raise TypeError(
            f"method must be a scoring function such as f_classif or chi2, got {method!r}"
        )
    # Resolve the defaults lazily so explicit arguments never touch the globals.
    if X is None:
        X = X_res
    if y is None:
        y = y_res
    fse = SelectKBest(method, k=features)
    selected = fse.fit_transform(X, y)
    return pd.DataFrame(selected, columns=fse.get_feature_names_out())

In [None]:
# Keep the 10 best features according to the ANOVA F-score and summarise the result.
X_ne = feature_sel(method=f_classif, features=10)
X_ne.info()

## Modeling using NNs

In [None]:
# Standardize the selected features and keep the result as a labelled DataFrame.
# NOTE(review): the scaler is fitted on all rows, before the train/test split below.
scaler = StandardScaler()
X_fi = pd.DataFrame(scaler.fit_transform(X_ne), columns=X_ne.columns)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_trn, X_tst, y_trn, y_tst = train_test_split(
    X_fi,
    y_res,
    random_state=42,
    test_size=0.25,
)

In [None]:
# Fully connected binary classifier: 64 -> 32 -> 32 -> 32 -> 32 -> 1 (sigmoid).
# The input width is read from the training matrix instead of being hard-coded to 10,
# so the network stays valid if the number of selected features changes.
n_features = X_trn.shape[1]

model = Sequential()
model.add(Dense(64, input_dim=n_features, activation='relu'))
for hidden_units in (32, 32, 32, 32):
    model.add(Dense(hidden_units, activation='relu'))
# Single sigmoid unit: the output is one probability per row.
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Configure training: binary cross-entropy loss, SGD optimizer, accuracy as the metric.
model.compile(
    optimizer='SGD',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Tensorboard Visualization

In [None]:
# Train for 30 epochs in batches of 128, logging to ./logs for TensorBoard
# (update_freq=1 is passed to the callback unchanged).
tb_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs', update_freq=1)
model.fit(
    X_trn,
    y_trn,
    epochs=30,
    batch_size=128,
    callbacks=[tb_callback],
)

In [None]:
# Evaluate on the held-out rows.
# The network ends in a single sigmoid unit, so each prediction is a length-1 vector.
# np.argmax over a length-1 vector is always 0, which labelled every row as class 0;
# the probability has to be thresholded instead.
preds = model.predict(X_tst)
probs = np.asarray(preds).ravel()
pred = (probs >= 0.5).astype(int).tolist()
acc = accuracy_score(y_tst, pred)
# ROC AUC is a ranking metric: score it on the probabilities, not on the hard labels.
roc_auc = roc_auc_score(y_tst, probs)
print("Accuracy score:", acc)
print("ROC AUC score:", roc_auc)

## XAI (Kernel Explainer)

In [None]:
# Initialise SHAP's JavaScript support before the force plot below is rendered.
shap.initjs()

In [None]:
def f(X):
    """Return the model's predictions for `X` flattened to one dimension.

    This is the prediction function handed to the SHAP kernel explainer.
    """
    predictions = model.predict(X)
    return predictions.flatten()

# Use the first 200 test rows both as the explainer's background data and as the
# rows to explain.
background = X_tst.iloc[:200, :]
explainer = shap.KernelExplainer(f, background)
shap_values = explainer.shap_values(background)

In [None]:
# Explain one test row (position 90) and render its force plot.
sample_row = X_tst.iloc[90, :]
shap_value = explainer.shap_values(sample_row)
shap.force_plot(explainer.expected_value, shap_value, sample_row)

## Save the model artifact

In [None]:
# Persist the SHAP explainer object to disk with joblib.
# NOTE(review): the explainer was built around the notebook-level function `f`, which
# in turn uses the global `model`; presumably both must be available again when the
# file is loaded — verify that the dump can be restored in a fresh session.
joblib.dump(explainer, "explainer.joblib")

In [None]:
# Save the trained network in the native Keras format.
# A ".pkl" name is misleading (nothing is pickled) and recent Keras versions reject
# any extension other than ".keras" or ".h5" when saving a model.
model.save("patient_save.keras")