In [27]:
import os
import pandas as pd

# The cleaned input table and the imputed output table share one data directory.
base_dir = "../Data"

# Input file: stop right away with a readable message if it is not there.
base_csv_path = os.path.join(base_dir, 'base_cleaned.csv')
base_csv_found = os.path.exists(base_csv_path)
assert base_csv_found, f"base {base_csv_path} does not exist"

# Output file: destination of the processed table saved by this notebook.
output_csv_path = os.path.join(base_dir, 'm1_imputed.csv')

df = pd.read_csv(filepath_or_buffer=base_csv_path)

In [28]:
# A column is treated as categorical when it has fewer than this many distinct
# values, as counted by DataFrame.nunique() (which ignores NaN by default).
CATEGORICAL_NUNIQUE_THRESHOLD = 10

# Count distinct values once and reuse the result: nunique() scans every column
# of the frame, so evaluating it twice doubles the work for no benefit.
unique_counts = df.nunique()
categorical_columns = unique_counts[unique_counts < CATEGORICAL_NUNIQUE_THRESHOLD].index.tolist()

# Positional index of each categorical column. The names were taken from df
# itself, so no extra membership filter is required here.
categorical_column_indexes = [df.columns.get_loc(c) for c in categorical_columns]

# Report how many columns were classified as categorical, and which ones.
print(f"Number of categorical columns: {len(categorical_columns)}")
print(f"Categorical columns: {categorical_columns}")

Number of categorical columns: 25
Categorical columns: ['thoracic_dgn', 'num_prev_tx', 'tah', 'vas', 'onvent', 'icu', 'inotropic', 'gender', 'abo', 'education', 'ecmo_tcr', 'iabp_tcr', 'inotropes_tcr', 'func_stat_tcr', 'diab', 'dial_ty_tcr', 'cereb_vasc', 'malig_tcr', 'cig_use', 'prior_card_surg_tcr', 'histry_cig_old', 'init_stat', 'ethcat', 'ventilator_tcr', 'work_income_tcr']


In [29]:
# Count the NaN cells across the categorical columns before touching them.
missing_before = df[categorical_columns].isnull().sum().sum()
print(
    f"Total number of missing values in categorical columns before replacing missing values with 'missing': {missing_before}")

# Give missing categorical values their own explicit level, the string "missing".
# The fill is done one column at a time so each column is handled independently.
for cat_col in categorical_columns:
    filled_column = df[cat_col].fillna("missing")
    df[cat_col] = filled_column

# Count again to confirm that no NaN is left in the categorical columns.
missing_after = df[categorical_columns].isnull().sum().sum()
print(
    f"Total number of missing values in categorical columns after replacing missing values with 'missing': {missing_after}")


Total number of missing values in categorical columns before replacing missing values with 'missing': 451455
Total number of missing values in categorical columns after replacing missing values with 'missing': 0


In [30]:
# For every non-categorical column (except wl_time), add a companion column named
# "<column>_missing" that is 1 where the original value is missing and 0 otherwise.
MISSING_SUFFIX = "_missing"
# wl_time gets no indicator column.
# NOTE(review): presumably because it is the prediction target - confirm.
NO_INDICATOR_COLUMN = "wl_time"

# Set lookup instead of scanning the categorical list for every column.
categorical_lookup = set(categorical_columns)

# Indicator columns are never used as sources. Without this filter, running the
# cell a second time would produce "<column>_missing_missing" columns.
numerical_columns = [
    c for c in df.columns
    if c not in categorical_lookup and not c.endswith(MISSING_SUFFIX)
]

for col in numerical_columns:
    if col == NO_INDICATOR_COLUMN:
        continue
    indicator_name = col + MISSING_SUFFIX
    # Leave an existing indicator alone: recomputing it after the source column
    # has been imputed would overwrite the real flags with zeros.
    if indicator_name not in df.columns:
        df[indicator_name] = df[col].isnull().astype(int)

# Print the first 15 rows of most_rcnt_creat next to its indicator as a sanity check.
print(df[["most_rcnt_creat", "most_rcnt_creat_missing"]].head(15))

    most_rcnt_creat  most_rcnt_creat_missing
0               0.9                        0
1               NaN                        1
2               NaN                        1
3               NaN                        1
4               NaN                        1
5               NaN                        1
6               1.3                        0
7               NaN                        1
8               NaN                        1
9               NaN                        1
10              NaN                        1
11              0.7                        0
12              NaN                        1
13              NaN                        1
14              0.8                        0


In [31]:
from sklearn.impute import SimpleImputer

# Median-impute every non-categorical column, then re-attach the categorical ones.
print(f"Number of columns before imputation: {len(df.columns)}")
imputer = SimpleImputer(strategy='median')

print(f"Number of missing values before imputation: {df.isnull().sum().sum()}")
df_num = df.drop(columns=categorical_columns)  # Everything that is not categorical
df_cat = df[categorical_columns]  # Categorical columns are passed through untouched

# With a non-constant strategy and default settings, SimpleImputer discards
# columns that contain only missing values. The array it returns would then have
# fewer columns than df_num.columns and the DataFrame constructor below would
# fail with an opaque shape error, so check for that case up front.
all_missing_columns = df_num.columns[df_num.isnull().all()].tolist()
assert not all_missing_columns, (
    f"columns with no observed values cannot be median-imputed: {all_missing_columns}")

# NOTE(review): the imputer returns a single numeric array, so integer columns
# (e.g. the 0/1 "_missing" indicators) presumably come back as floats - confirm
# if downstream code cares about the dtype.
# NOTE(review): the medians are fitted on the full table. If the data is split
# into train/test later, fit the imputer on the training part only - confirm.
df_num = pd.DataFrame(imputer.fit_transform(df_num), columns=df_num.columns,
                      index=df_num.index)  # Impute missing values in df_num
df = pd.concat([df_num, df_cat], axis=1)  # Numeric columns first, then categorical

print(f"Number of missing values after imputation: {df.isnull().sum().sum()}")
print(f"Number of columns after imputation: {len(df.columns)}")

Number of columns before imputation: 42
Number of missing values before imputation: 78072
Number of missing values after imputation: 0
Number of columns after imputation: 42


In [32]:
# Expand each categorical column into one indicator column per level.
# NOTE(review): the dtype of the generated columns depends on the installed
# pandas version, because no dtype is passed here - confirm what the consumer
# of the CSV expects.
df = pd.get_dummies(data=df, columns=categorical_columns)

# Save the fully processed table, without the row index.
df.to_csv(path_or_buf=output_csv_path, index=False)