In [44]:
import pandas as pd
import numpy as np
import seaborn as sns

In [45]:
# Load the raw CSV export into the working DataFrame.
# NOTE(review): the saved output echoes this statement, which looks like a
# pandas warning raised at load time (possibly mixed column dtypes) - confirm
# and pin dtypes explicitly if so.
DATA_PATH = "Data_120294_2023-03-29.csv"
df = pd.read_csv(DATA_PATH)


  df = pd.read_csv("Data_120294_2023-03-29.csv")


## Cleaning
- Remove patients who died while on the waiting list
- Remove patients without a valid transplant date (`tx_date`)

In [46]:
# Cohort size before any filtering, and how many rows carry a death date.
# death_date is only reported here; the filtering below relies on the
# cause-of-death columns instead.
n_before = len(df)
n_with_death_date = df['death_date'].notna().sum()
print("Patients pre-cleaning: ", n_before)
print("Deaths in waiting list: ", n_with_death_date)

# Distribution of the donor-history flags.
for label, column in (("cocaine", 'hist_cocaine_don'), ("alcohol", 'alcohol_heavy_don')):
    print(label, df[column].value_counts())

# Drop every patient with a recorded waiting-list cause of death:
#   cod_wl       - WL Candidate Cause of Death for Death Removal
#   cod_ostxt_wl - WL Candidate Cause of Death for Death Removal, Other Specified
died_on_wl = df['cod_wl'].notna() | df['cod_ostxt_wl'].notna()
df = df.drop(index=df.index[died_on_wl])

# Drop patients whose tx_date is missing or is not a 9-character string
# (the length of a '%d%b%Y' date).
tx_date = df['tx_date']
invalid_tx_date = tx_date.isna() | (tx_date.str.len() != 9)
df = df.drop(index=df.index[invalid_tx_date])

print("Patients post-cleaning: ", len(df))

Patients pre-cleaning:  120264
Deaths in waiting list:  8648
cocaine N    43402
U    25789
Y     7900
Name: hist_cocaine_don, dtype: int64
alcohol N    35570
Y     5896
U      702
Name: alcohol_heavy_don, dtype: int64
Patients post-cleaning:  77410


## Transforming
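- Drop redundant columns, keeping only the features used for modelling
- Fill missing PRA values (`pramr`, `prapk`) with the most frequent value
- Derive `wl_time`, the number of days between `init_date` and `tx_date`
- Drop rows that still contain missing or infinite values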

In [47]:
# Columns kept for modelling; init_date and tx_date are only needed to derive
# wl_time and are dropped again afterwards.
desired_columns = ["init_age", "gender", "abo",
                   "pramr", "prapk", "bmi_calc",
                   "init_bmi_calc", "init_date", "tx_date", "hemo_pcw_tcr", "hemo_pa_mn_tcr"]

print("patients: ", df.shape[0])
# Visualize the value counts of hemo_pcw_tcr with seaborn
sns.countplot(x='hemo_pcw_tcr', data=df)

# Impute missing PRA values with the most frequent value of each column.
df['prapk'] = df['prapk'].fillna(df['prapk'].value_counts().idxmax())
df['pramr'] = df['pramr'].fillna(df['pramr'].value_counts().idxmax())

# Take an explicit copy so the column assignment below writes to an
# independent frame (avoids pandas' SettingWithCopyWarning).
df = df[desired_columns].copy()

# Waiting-list time in days. errors='coerce' turns unparseable dates into NaT,
# so those rows are removed by the dropna() below instead of aborting the cell.
tx_dates = pd.to_datetime(df['tx_date'], format='%d%b%Y', errors='coerce')
init_dates = pd.to_datetime(df['init_date'], format='%d%b%Y', errors='coerce')
df['wl_time'] = (tx_dates - init_dates).dt.days

# The raw date columns are no longer needed once wl_time exists.
df = df.drop(columns=['init_date', 'tx_date'])

# Remove any rows with NaN or Inf values (Inf is mapped to NaN first so a
# single dropna() handles both).
df = df.replace([np.inf, -np.inf], np.nan).dropna()

print("patients: ", df.shape[0])
print(df.head())

patients:  77410
patients:  51738
       init_age gender abo  pramr  prapk   bmi_calc  init_bmi_calc  \
42855      54.0      M   O    0.0    0.0  29.270638      29.270701   
42856      10.0      M   A   10.0    0.0  17.954611      18.313703   
42857      55.0      M   O    0.0    0.0  25.325987      25.070862   
42859      43.0      M   O   20.0   25.0  18.363322      18.363322   
42860      48.0      M   O    3.0    3.0  25.327720      24.649885   

       hemo_pcw_tcr  hemo_pa_mn_tcr  wl_time  
42855          18.0            23.0       29  
42856          13.0            18.0        8  
42857          27.0            33.0       83  
42859           7.0            12.0       61  
42860          10.0            18.0       76  


## Encode

In [48]:
# encode categorical variables
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# One encoder per column, so each fitted mapping stays available afterwards
# (re-fitting a single encoder would discard the abo mapping).
abo_encoder = LabelEncoder()
df["abo"] = abo_encoder.fit_transform(df["abo"])
le = LabelEncoder()
df["gender"] = le.fit_transform(df["gender"])

# Binary target: 1 if the patient waited more than WL_THRESHOLD_DAYS
# (180 days, roughly six months) for a transplant, otherwise 0.
# NOTE: this overwrites wl_time in place, so the cell must run exactly once
# per kernel session.
WL_THRESHOLD_DAYS = 30 * 6
df['wl_time'] = (df['wl_time'] > WL_THRESHOLD_DAYS).astype('int64')

# Standardize the numeric features to zero mean and unit variance.
# NOTE(review): the scaler is fit on every row, before the train/test split
# done in the model cell, so test-set statistics leak into training. Consider
# fitting it on the training rows only.
columns = ["init_age", "init_bmi_calc", "bmi_calc", "pramr", "prapk", "hemo_pcw_tcr", "hemo_pa_mn_tcr"]
scaler = StandardScaler()
df[columns] = scaler.fit_transform(df[columns])


## Build model

In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, Conv1D, LSTM, Flatten
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Hold out 20% of the patients for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('wl_time', axis=1), df['wl_time'], test_size=0.2, random_state=42)

# Baseline: logistic regression on the same features.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred))

# Small fully connected binary classifier. The output layer uses a sigmoid so
# it emits probabilities in [0, 1], which is what binary_crossentropy (with its
# default from_logits=False) and the accuracy metric expect.
model = Sequential([
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=30, batch_size=32, verbose=1)

# y_pred holds the predicted probability of the positive class, shape (n, 1);
# threshold at 0.5 to obtain class labels for scoring.
y_pred = model.predict(X_test)
print("Neural Network Accuracy: ", accuracy_score(y_test, (y_pred > 0.5).astype(int).ravel()))



Logistic Regression Accuracy:  0.6867993815229996
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30

KeyboardInterrupt: 