### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to use MIDASpy to impute the missing values in the MAYO dataset, with the PIPENDO dataset added as extra evidence for the imputation model. The imputed dataset is used in the next steps of the analysis.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import MIDASpy as midas
import tensorflow as tf

# Hide every GPU from TensorFlow so the rest of the notebook runs on the CPU.
tf.config.set_visible_devices([], 'GPU')
# list_physical_devices reports the GPUs present on the machine, including the
# ones hidden above, so also report what TensorFlow is actually allowed to use.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print("Num GPUs Visible to TensorFlow: ", len(tf.config.get_visible_devices('GPU')))



In [None]:
# Locations of the two cleaned cohort files, relative to this notebook.
mayo_csv = "../0.1. Cleaned_data/MAYO_subdag.csv"
pipendo_csv = '../0.1. Cleaned_data/Casper_PIPENDO_Cleaned.csv'

df_MAYO = pd.read_csv(mayo_csv)
# For PIPENDO the 'Unnamed: 0' column is used as the row index instead of data.
df_PIP = pd.read_csv(pipendo_csv, index_col='Unnamed: 0')

Select the columns to use as imputation evidence. Both datasets contain these columns, so the same selection is applied to each.

In [None]:
# Variables used as evidence for the imputation model; both cohorts are
# restricted to exactly these columns, in this order.
evidence_columns = [
    "ER", "PR", "p53", "L1CAM", "CA125", "Platelets", "PreoperativeGrade",
    "LNM", "LVSI", "Chemotherapy", "Radiotherapy",
    "Survival1yr", "Survival3yr", "Survival5yr", "Cytology",
]

df_MAYO = df_MAYO[evidence_columns]
df_PIP = df_PIP[evidence_columns]

# Stack the cohorts on a fresh 0..n-1 index: MAYO rows first, then PIPENDO.
# Later cells split the result back apart by position, so the order matters.
stacked = pd.concat([df_MAYO, df_PIP], axis=0, ignore_index=True)

# Recode 0/1 as 'no'/'yes'.
# NOTE(review): the mapping is applied to every column, so a 0 or 1 in a
# non-binary variable (e.g. a grade or a lab value) is recoded as well — confirm
# that this is intended for all evidence columns.
data = stacked.replace({0: 'no', 1: 'yes'})
data

Convert every column to the pandas `category` dtype so that it can be encoded in the next step

In [None]:
# Cast every column to the pandas 'category' dtype in a single call; the
# conversion is still carried out column by column.
data = data.astype(dict.fromkeys(data.columns, 'category'))


Encode the data

In [None]:
# cat_conv returns two objects: the encoded frame that is fed to the model, and
# a per-variable list of the encoded column names (passed to build_model as
# softmax_columns and reused later to decode the imputations).
# NOTE(review): presumably a one-hot encoding with '<column>_<value>' names,
# since the decoding step strips a '<column>_' prefix — confirm in the MIDASpy docs.
encoded, cat_cols_list = midas.cat_conv(data)

Create the MIDAS model and train it

In [None]:
# Hyperparameters of the MIDAS network, collected in one place for easy tuning.
# NOTE(review): the meaning of each setting (e.g. whether input_drop is a keep or
# a drop probability) is defined by MIDASpy — check its docs before changing them.
midas_settings = dict(
    layer_structure=[256, 256, 256],
    vae_layer=True,
    seed=123,  # fixed seed passed to MIDASpy
    input_drop=0.90,
    latent_space_size=64,
    vae_sample_var=0.8,
    vae_alpha=1,
)
imputer = midas.Midas(**midas_settings)

# Each group of encoded columns in cat_cols_list is declared as a softmax group.
imputer.build_model(encoded, softmax_columns=cat_cols_list)
imputer.train_model(training_epochs=100)

Generate the imputations

In [None]:
# Request m=10 imputed datasets and keep the resulting output_list.
# The decoding cell treats each entry as a DataFrame over the encoded columns;
# only one of the draws is ultimately kept as the completed dataset.
imputations = imputer.generate_samples(m=10).output_list

Decode the imputations

In [None]:
# Decode every imputed dataset from its encoded columns back to one label
# column per original variable.
#
# Differences from the earlier element-by-element version:
#   * no chained-indexing assignment (frame[col][row] = ...), which raises
#     SettingWithCopyWarning and silently does nothing under pandas Copy-on-Write;
#   * the '<column>_' prefix is stripped per column, not per cell;
#   * no hard-coded number of imputations — the loop covers whatever
#     generate_samples produced, and the last draw is selected with [-1].
#
# The cell rebinds `imputations` to the decoded frames, so it must run right
# after the cell that generates the samples (re-running it on already decoded
# frames fails because the encoded columns are gone).

# All encoded column names, flattened across variables.
flat_cats = [cat for variable in cat_cols_list for cat in variable]
# Original variable names, in the same order as the groups in cat_cols_list.
categorical = data.columns.values

decoded_imputations = []
for imputed in imputations:
    decoded_columns = {}
    for col, encoded_cols in zip(categorical, cat_cols_list):
        prefix = col + '_'
        # The encoded column with the highest value wins for each row.
        winners = imputed[encoded_cols].idxmax(axis=1)
        # Turn '<column>_<label>' back into '<label>'.
        decoded_columns[col] = winners.map(
            lambda name, prefix=prefix: name.removeprefix(prefix)
        )
    cat_df = pd.DataFrame(decoded_columns)
    decoded_imputations.append(
        pd.concat([imputed, cat_df], axis=1).drop(flat_cats, axis=1)
    )
imputations = decoded_imputations

# Keep the last imputed dataset as the completed data (index 9 when m=10).
completed_data = imputations[-1]


Split the imputed data back into the original datasets, then check that the imputed data still matches the original data wherever a value was observed.

In [None]:
# Split the completed data back into the two cohorts by position: the first
# len(df_MAYO) rows are MAYO, the remainder PIPENDO. Copies are taken so that
# later column assignments do not write into a slice of completed_data.
n_mayo = len(df_MAYO)
MAYO_part = completed_data.iloc[:n_mayo, :].copy()
PIP_part = completed_data.iloc[n_mayo:, :].copy()

# Sanity check: wherever MAYO had an observed value, the completed data must
# still hold that value. The reference is `data` (the recoded, pre-imputation
# frame), not the raw df_MAYO: the completed data contains decoded string
# labels, so raw 0/1 or numeric values would never compare equal to them.
mayo_reference = data.iloc[:n_mayo]

for col in evidence_columns:
    reference = mayo_reference[col]
    is_observed = reference.notna().to_numpy()
    # Compare as strings and by position, so neither dtype nor index labels
    # can cause a spurious mismatch.
    expected = reference[is_observed].astype(str).to_numpy()
    actual = MAYO_part[col].astype(str).to_numpy()[is_observed]

    if (expected == actual).all():
        print(f"{col} is the same")
    else:
        print(f"{col} is not the same")

Load the original data and add the imputed CA125 to it

In [None]:
# Reload the full MAYO table (all columns) and overwrite its CA125 column with
# the imputed one; rows are matched on the index.
MAYO_w_CA125 = pd.read_csv("../0.1. Cleaned_data/MAYO_subdag.csv")
MAYO_w_CA125['CA125'] = MAYO_part['CA125']

# Carry every MAYO column that was not part of the imputation over to the fully
# imputed frame. A single left join on the index gives a new frame, which avoids
# adding columns one at a time to what may be a slice of completed_data
# (SettingWithCopyWarning).
carried_over = [col for col in MAYO_w_CA125.columns if col not in MAYO_part.columns]
MAYO_part = MAYO_part.join(MAYO_w_CA125[carried_over])


Save the imputed data

In [None]:
# Output file -> frame to write. The first holds the original MAYO table with
# CA125 replaced by the imputed column; the second the fully imputed MAYO frame.
outputs = {
    '../0.2. Imputed_data/MayoCA125_wPIP_MidasPy.csv': MAYO_w_CA125,
    '../0.2. Imputed_data/Mayo_wPIP_fullimp_MidasPy.csv': MAYO_part,
}
for csv_path, frame in outputs.items():
    # index=False: the row index is not written to the file.
    frame.to_csv(csv_path, index=False)