### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to impute the missing values in a combined dataset of MAYO and PIPENDO using MIDAS.

In [None]:
import numpy as np
import pandas as pd

# Load the cleaned MAYO and PIPENDO tables.
df_MAYO = pd.read_csv("../0.1. Cleaned_data/MAYO_subdag.csv")
df_PIP = pd.read_csv("../0.1. Cleaned_data/PIPENDO_val_correctnames.csv")

# Every MAYO column that PIPENDO lacks is added to PIPENDO as an all-NaN
# column, so PIPENDO ends up exposing every MAYO column name.
mayo_only_columns = [name for name in df_MAYO.columns if name not in df_PIP.columns]
for name in mayo_only_columns:
    df_PIP[name] = np.nan




Select only the evidence columns so that the imputation is not muddied by other variables

In [None]:
# Columns kept for the imputation; all other columns are dropped from both frames.
evidence_columns = [
    "ER",
    "PR",
    "p53",
    "L1CAM",
    "CA125",
    "Platelets",
    "Cytology",
    "MRI_MI",
    "MSI",
    "POLE",
    "PreoperativeGrade",
    "Survival1yr",
    "Survival3yr",
    "Survival5yr",
    "Radiotherapy",
    "Chemotherapy",
    "Recurrence",
    "LVSI",
    "LNM",
]

df_MAYO = df_MAYO.loc[:, evidence_columns]
df_PIP = df_PIP.loc[:, evidence_columns]

# Stack the cohorts row-wise with MAYO first and PIPENDO second; the index is
# rebuilt so the MAYO rows occupy positions 0 .. len(df_MAYO) - 1.
df = pd.concat((df_MAYO, df_PIP), axis="index", ignore_index=True)
df

Initialise and train the MIDAS model

In [None]:
# Impute missing values with MIDAS
from sklearn.impute import SimpleImputer
import MIDASpy as midas

# One-hot encode every column of `df`. `cat_cols_list` holds, per original
# column, the list of dummy-column names created for it (it is used that way
# when the imputations are collapsed back to labels further down).
encoded, cat_cols_list = midas.cat_conv(df)

imputer = midas.Midas(layer_structure=[256,256], vae_layer=True, seed=123, input_drop=0.75)

# Declare each dummy group as a softmax (categorical) variable. Without
# `softmax_columns`, MIDASpy's documented behaviour is to treat every input
# column as continuous, so the dummies of one variable would be modelled as
# unrelated numeric outputs instead of one categorical distribution.
# NOTE(review): this changes the fitted model relative to earlier runs of this
# notebook; previously saved imputations will not be reproduced exactly.
imputer.build_model(encoded, softmax_columns=cat_cols_list)
imputer.train_model(training_epochs=50)


Impute the missing values

In [None]:
# Draw 10 completed datasets from the trained model; they are exposed as the
# list `output_list` on the object returned by `generate_samples`.
sampled_model = imputer.generate_samples(m=10)
imputations = sampled_model.output_list

Give the categorical columns the correct names

In [None]:
# All dummy-column names across every variable, flattened into one list so
# they can be dropped in a single call once the labels have been extracted.
flat_cats = [cat for variable in cat_cols_list for cat in variable]
# Original column names; position k corresponds to cat_cols_list[k].
categorical = df.columns.values

for sample_idx, sample in enumerate(imputations):
    # For each original variable, pick per row the dummy column with the
    # highest value; the result is the (still prefixed) dummy-column name.
    label_columns = {
        column_name: sample[dummy_names].idxmax(axis=1)
        for column_name, dummy_names in zip(categorical, cat_cols_list)
    }
    labels_df = pd.DataFrame(label_columns)
    # Append the label columns and discard the dummy columns they came from.
    combined = pd.concat([sample, labels_df], axis=1)
    imputations[sample_idx] = combined.drop(flat_cats, axis=1)

Remove the prefix from the columns

In [None]:
def _strip_column_prefix(column):
    """Return `column` with the leading "<column name>_" removed from each label.

    Every value must be a string (as produced by the idxmax step); labels
    that do not start with the prefix are returned unchanged.
    """
    prefix = f"{column.name}_"
    return column.map(lambda label: label.removeprefix(prefix))


# Process every imputed dataset rather than a hard-coded count of 10, and
# rebuild each frame column by column. The previous cell-by-cell chained
# assignment (`frame[col][j] = ...`) emits SettingWithCopy warnings and does
# not write through when pandas Copy-on-Write is enabled.
for i, imputation in enumerate(imputations):
    imputations[i] = imputation.apply(_strip_column_prefix)

Keep the last imputation

In [None]:
# Take the last imputed dataset. A negative index keeps this correct when the
# number of generated samples differs from 10.
result = imputations[-1]

Select only the MAYO rows

In [None]:
# MAYO was placed first when the cohorts were concatenated, so its rows are
# the leading len(df_MAYO) rows of the result (279 rows per the original note).
n_mayo_rows = len(df_MAYO)
result_MAYO = result.iloc[:n_mayo_rows]
result_MAYO

Save a dataset with only the CA125 value imputed

In [None]:
# Reload the full cleaned MAYO table (all columns, not just the evidence
# subset) and replace its CA125 column with the one from the imputed MAYO
# rows. Values are matched on the row index.
df_MAYO = pd.read_csv("../0.1. Cleaned_data/MAYO_subdag.csv").assign(
    CA125=result_MAYO["CA125"]
)

# Write the result without the index column.
df_MAYO.to_csv("../0.2. Imputed_data/MayoCA125_wPIP_Preop.csv", index=False)