In [37]:
import os
import pandas as pd

# Input/output locations, relative to the notebook's working directory.
base_dir = "../Data"
base_csv_path = os.path.join(base_dir, 'base.csv')
assert os.path.exists(base_csv_path), f"base {base_csv_path} does not exist"

# Destination of the final (imputed, one-hot encoded) dataset.
output_csv_path = os.path.join(base_dir, 'm4_imputed.csv')

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (DtypeWarning).
df = pd.read_csv(base_csv_path, low_memory=False)

# Print all columns whose name contains 'func_stat_tcr'
print(df.columns[df.columns.str.contains('func_stat_tcr')])
# Print the unique values in func_stat_tcr
print(df['func_stat_tcr'].unique())

  df = pd.read_csv(base_csv_path)


Index(['func_stat_tcr'], dtype='object')
[     nan 2.04e+03 2.07e+03 2.08e+03 9.96e+02 2.00e+00 2.09e+03 2.02e+03
 2.06e+03 1.00e+00 2.03e+03 2.05e+03 9.98e+02 4.01e+03 2.01e+03 4.08e+03
 3.00e+00 4.10e+03 4.05e+03 4.09e+03 4.03e+03 4.06e+03 4.02e+03 2.10e+03
 4.07e+03 4.04e+03]


## Feature selection

In [22]:
# Columns kept for modelling. Note: "ethnicity" has been replaced by the ethcat variable.
desired_columns = [
    'thoracic_dgn', 'num_prev_tx', 'tah', 'vas', 'onvent', 'icu', 'inotropic', 'gender', 'abo',
    'wgt_kg_tcr', 'hgt_cm_tcr', 'education', 'ecmo_tcr', 'iabp_tcr', 'inotropes_tcr', 'func_stat_tcr',
    'diab', 'dial_ty_tcr', 'cereb_vasc', 'malig_tcr', 'most_rcnt_creat', 'tot_serum_album',
    'hemo_co_tcr', 'cig_use', 'prior_card_surg_tcr', 'histry_cig_old', 'init_stat', 'init_creat',
    'init_age', 'ethcat', 'init_hgt_cm_calc', 'init_wgt_kg_calc', 'ventilator_tcr', 'lvad_at_listing',
    'rvad_at_listing', 'work_income_tcr', 'academic_level_tcr', 'tx_date', 'init_date',
]

# Restrict the frame to the selected columns (raises KeyError if any is absent).
df = df.loc[:, desired_columns]

In [31]:
# Keep only patients whose init_age is at least 18; rows with a missing
# init_age fail the comparison and are dropped as well.
pre_len = len(df)
is_adult = df['init_age'].ge(18)
df = df.loc[is_adult]
post_len = len(df)
print(f"Removed patients under 18: {pre_len - post_len}/{pre_len} ({(pre_len - post_len) / pre_len * 100:.2f}% removed)")

Removed patients under 18: 10530/77410 (13.60% removed)


## Encoding

In [23]:
n_other_before = int((df["ethcat"] == 6).sum())
print(f'Number of rows with value 6 in ethcat before replacing values: {n_other_before}')

# Coding: 1=1 (White), 2=2 (Black), 4=4 (Hispanic), 5=5 (Asian),
# 6=6 (formerly Amer Ind/Alaskan, now recoded as Other),
# 7=6 (formerly Native Hawaiian, now Other), 9=6 (formerly Multiracial, now Other).
# Code 998 is folded into 6 as well.
ethcat_to_other = {7: 6, 9: 6, 998: 6}
df['ethcat'] = df['ethcat'].replace(ethcat_to_other)

n_other_after = int((df["ethcat"] == 6).sum())
print(f'Number of rows with value 6 in ethcat after replacing values: {n_other_after}')

Number of rows with value 6 in ethcat before replacing values: 450
Number of rows with value 6 in ethcat after replacing values: 1854


In [24]:
# Encode education: collapse the raw codes into three ordered levels.
# Code 1 is left untouched, so it ends up in level 1 together with codes 2 and 3.
# The steps run in ascending order so that a freshly assigned level (1, 2, 3)
# is never picked up again as a raw code by a later step.
df['education'] = df['education'].replace([2, 3], 1)  # High school or less
df['education'] = df['education'].replace([4], 2)  # Some college
df['education'] = df['education'].replace([5, 6], 3)  # College or graduate

# Turn the sentinel codes 996 and 998 into real missing values (NaN).
# presumably "not applicable" / "unknown" -- confirm against the code book.
# Series.mask is used instead of replace(..., None): depending on the pandas
# version, replace with value=None either forward-fills the matches or inserts
# Python None objects (object dtype) rather than NaN.
df['education'] = df['education'].mask(df['education'].isin([996, 998]))

In [25]:
# Encode init_stat into the labels:
#   1 "Status 1A", 2 "Status 1B", 3 "Status 2", 4 "Temp inactive"
# Each entry lists the raw codes that collapse into the label used as its key.
init_stat_groups = {
    1: [2010, 2110, 2120, 2130, 2090],
    2: [2020, 2140],
    3: [2030, 2150, 2160],
    4: [2999],
}

# Apply the groups one after another, in label order.
for status_label, raw_codes in init_stat_groups.items():
    df["init_stat"] = df["init_stat"].replace(raw_codes, status_label)

Index(['func_stat_tcr', 'init_stat'], dtype='object')
[ 1.  3.  2.  4. nan]


In [None]:
# Encode thoracic_dgn into the labels:
#   1 "Cardiomyopathy", 2 "CAD", 3 "Valvular heart disease", 4 "Graft failure",
#   5 "Congenital", 7 "Other"
#
# The mapping is built first and applied in a single replace() call. Applying the
# groups one by one would turn 1007 into 1 (it lies inside the 1000-1099 block)
# before the CAD rule could ever see it.
#
# NOTE(review): the upper bounds are inclusive here (1099, 1199 and 1207 are
# mapped). Python's range() excludes its stop value, so range(1000, 1099),
# range(1100, 1199) and range(1203, 1207) would leave those three codes
# unmapped -- confirm the intended bounds against the code book.
thoracic_dgn_map = {}

# 1 Cardiomyopathy: 1000-1099 plus 1201
thoracic_dgn_map.update({code: 1 for code in range(1000, 1100)})
thoracic_dgn_map[1201] = 1

# 2 CAD: 1007 and 1200 (added after the block above so that 1007 overrides it)
thoracic_dgn_map.update({1007: 2, 1200: 2})

# 3 Valvular heart disease
thoracic_dgn_map[1202] = 3

# 4 Graft failure: 1100-1199
thoracic_dgn_map.update({code: 4 for code in range(1100, 1200)})

# 5 Congenital: 1203-1207
thoracic_dgn_map.update({code: 5 for code in range(1203, 1208)})

# 7 Other
thoracic_dgn_map.update({code: 7 for code in (1208, 1209, 999, 1497, 1498)})

# Codes that are not in the mapping (and missing values) are left unchanged.
df["thoracic_dgn"] = df["thoracic_dgn"].replace(thoracic_dgn_map)

In [26]:
# Remove rows with no transplant date.
# .copy() gives an independent frame, so adding the wl_time column below does
# not write into a slice of the previous frame (SettingWithCopyWarning).
pre_len = len(df.index)
df = df[df['tx_date'].notna()].copy()
post_len = len(df.index)

print(f"Filtered to only transplanted patients: {post_len}/{pre_len} ({post_len / pre_len * 100:.2f}% remaining)")

# Compute the time to transplant in whole days (waitlist time -> wl_time).
# Both date columns are parsed with the '%d%b%Y' format (e.g. 01JAN2010).
df['wl_time'] = (
        pd.to_datetime(df['tx_date'], format='%d%b%Y') - pd.to_datetime(df['init_date'], format='%d%b%Y')).dt.days

# The raw date columns are no longer needed once wl_time has been derived.
df = df.drop(columns=['tx_date', 'init_date'])

# The statistic reported here is the median, so the label says so.
print(f"Median waitlist time: {df['wl_time'].median():.2f} days")


Filtered to only transplanted patients: 77410/120264 (64.37% remaining)
Mean waitlist time: 78.00 days


In [27]:
# Step 1: discard columns that contain nothing but missing values.
pre_len = df.shape[1]
df = df.dropna(axis='columns', how='all')
post_len = df.shape[1]
n_dropped = pre_len - post_len
print(
    f"Removed columns with all missing values: {n_dropped}/{pre_len} ({n_dropped / pre_len * 100:.2f}% removed)")

# Step 2: discard columns that hold exactly one distinct (non-missing) value.
pre_len = df.shape[1]
has_variation = df.nunique().ne(1)
df = df.loc[:, has_variation]
post_len = df.shape[1]
n_dropped = pre_len - post_len
print(
    f"Removed columns with no variance: {n_dropped}/{pre_len} ({n_dropped / pre_len * 100:.2f}% removed)")


Removed columns with all missing values: 1/38 (2.63% removed)
Removed columns with no variance: 2/37 (5.41% removed)


In [28]:
# Write the cleaned dataset to base_cleaned.csv inside base_dir,
# without the index column.
output_csv_path_cleaned = os.path.join(base_dir, 'base_cleaned.csv')
df.to_csv(path_or_buf=output_csv_path_cleaned, index=False)

In [29]:
# Treat every column with fewer than 10 distinct (non-missing) values as categorical.
categorical_columns = [
    column_name
    for column_name, distinct_count in df.nunique().items()
    if distinct_count < 10
]
# Positional index of each categorical column within df.
categorical_column_indexes = [df.columns.get_loc(column_name) for column_name in categorical_columns]

# Report how many categorical columns were found, and which ones.
print(f"Number of categorical columns: {len(categorical_columns)}")
print(f"Categorical columns: {categorical_columns}")

Number of categorical columns: 24
Categorical columns: ['num_prev_tx', 'tah', 'vas', 'onvent', 'icu', 'inotropic', 'gender', 'abo', 'education', 'ecmo_tcr', 'iabp_tcr', 'inotropes_tcr', 'diab', 'dial_ty_tcr', 'cereb_vasc', 'malig_tcr', 'cig_use', 'prior_card_surg_tcr', 'histry_cig_old', 'init_stat', 'ethcat', 'ventilator_tcr', 'work_income_tcr', 'academic_level_tcr']


In [30]:
from sklearn.preprocessing import OrdinalEncoder
from missingpy import MissForest

# Fixed seed so the random-forest based imputation gives the same result on
# every run of the notebook.
IMPUTATION_SEED = 42

ordinal_encoder = OrdinalEncoder()
# Encode the categorical columns as ordinal codes for the imputer
df[categorical_columns] = ordinal_encoder.fit_transform(df[categorical_columns])

imputer = MissForest(random_state=IMPUTATION_SEED)

# Hold wl_time out of the matrix that is passed to the imputer.
wl_time = df.pop('wl_time')

# Recompute the positions of the categorical columns on the frame that is
# actually imputed, so they stay correct regardless of where wl_time was located.
categorical_column_indexes = [df.columns.get_loc(column_name) for column_name in categorical_columns]

# fit_transform returns a plain array; rebuild a DataFrame with the same columns.
df = pd.DataFrame(imputer.fit_transform(df, cat_vars=categorical_column_indexes), columns=df.columns)

# The rebuilt frame has a fresh default index, so wl_time is re-attached by
# position (.values) rather than by index label.
df['wl_time'] = wl_time.values

# Map the ordinal codes back to the original category values.
df[categorical_columns] = ordinal_encoder.inverse_transform(df[categorical_columns])


KeyboardInterrupt



In [None]:
# One-hot encode the categorical columns, then write the resulting frame to
# output_csv_path without the index column.
df = pd.get_dummies(data=df, columns=categorical_columns)
df.to_csv(path_or_buf=output_csv_path, index=False)