### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to use miceforest (MICE with LightGBM) to impute the missing values in the MAYO dataset. The imputation model is trained on the PIPENDO dataset and then applied to the MAYO dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import miceforest as mf
import random
import sklearn.neighbors._base 
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from sklearn.impute import KNNImputer
import tensorflow as tf
import lightgbm as lgb
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the cleaned MAYO and PIPENDO tables from disk.
df_MAYO = pd.read_csv("../0.1. Cleaned_data/MAYO_subdag.csv")
df_PIP = pd.read_csv('../0.1. Cleaned_data/Casper_PIPENDO_Cleaned.csv')

# Any MAYO column that PIPENDO lacks is added to PIPENDO as an all-NaN column,
# so that every MAYO column name can be looked up in both frames.
absent_from_pip = [name for name in df_MAYO.columns if name not in df_PIP.columns]
for name in absent_from_pip:
    df_PIP[name] = np.nan


Select the columns to be used for the imputation. These are the evidence columns of the Bayesian network; they are listed in the cell below.

In [None]:
# Evidence variables that are kept for the imputation.
evidence_columns = [
    "ER", "PR", "p53", "L1CAM", "CA125",
    "Platelets", "PreoperativeGrade", "LVSI", "MyometrialInvasion",
]

# Restrict both frames to the evidence columns and relabel 0 -> 'no', 1 -> 'yes'.
# NOTE(review): the mapping is applied to every evidence column, so a literal 1
# in a non-binary column (e.g. PreoperativeGrade) would also become 'yes' -
# confirm this is intended for the encoding used in the cleaned CSVs.
df_MAYO, df_PIP = (
    frame.loc[:, evidence_columns].replace({0: 'no', 1: 'yes'})
    for frame in (df_MAYO, df_PIP)
)

# Stack the two cohorts into one frame: MAYO rows first, PIPENDO rows after.
data = pd.concat((df_MAYO, df_PIP), ignore_index=True)

Change the data types of the columns to category. This is needed for the MICEForest imputation.

In [None]:
# Cast every evidence column to a categorical dtype that is SHARED by MAYO,
# PIPENDO and the combined frame.
#
# Casting each frame separately with .astype('category') lets pandas infer the
# category set per frame. When a level occurs in only one cohort (or a column
# is entirely missing in one of them), the same label ends up with a different
# integer code in each frame. The kernel is fitted on df_PIP and later applied
# to df_MAYO, so both must agree on categories; the union of levels found in
# the concatenated `data` frame is therefore used for all three.
for column in evidence_columns:
    data[column] = data[column].astype('category')
    shared_dtype = data[column].dtype  # CategoricalDtype holding the union of levels

    # Every value in df_MAYO / df_PIP also occurs in `data`, so no observed
    # value is turned into NaN by casting to the shared dtype.
    df_MAYO[column] = df_MAYO[column].astype(shared_dtype)
    df_PIP[column] = df_PIP[column].astype(shared_dtype)


Define the imputation model

In [None]:

# Create the miceforest kernel from the PIPENDO frame only; the seed is fixed
# so that repeated runs of the notebook give the same imputations.
kds = mf.ImputationKernel(data=df_PIP, categorical_feature='auto', random_state=42)


Train the imputation model

In [None]:
# Fit the kernel by running 50 MICE iterations; verbose=True asks miceforest
# to report progress while it runs.
kds.mice(50, verbose=True)

Impute the missing values in the MAYO dataset

In [None]:
# Apply the fitted kernel to the MAYO frame. The returned object holds the
# imputations; the filled-in DataFrame is obtained from it via complete_data().
completed_data = kds.impute_new_data(df_MAYO)

In [None]:
# Extract the fully imputed MAYO frame from the imputation result.
MAYO_part = completed_data.complete_data()

# Sanity check: imputation should only fill gaps, never alter observed values.
# For each evidence column, compare the originally observed MAYO values with
# the values at the same rows of the imputed frame.
for col in evidence_columns:
    temp = df_MAYO[col].dropna()

    # temp.index holds row *labels*, so the matching rows are selected with the
    # label-based .loc. The positional .iloc only gives the right rows while
    # the index happens to be a default 0..n-1 RangeIndex.
    temppart = MAYO_part[col].loc[temp.index]

    # Compare as plain objects: `==` between two categorical Series raises a
    # TypeError when their category sets are not identical, which would abort
    # the check instead of reporting a result.
    if temp.astype(object).eq(temppart.astype(object)).all():
        print(f"{col} is the same")
    else:
        print(f"{col} is not the same")


Load the original MAYO dataset and add the imputed values to it.

In [None]:
# Variant 1: the original MAYO table in which only CA125 is taken from the
# imputed frame.
MAYO_w_CA125 = pd.read_csv("../0.1. Cleaned_data/MAYO_subdag.csv").assign(
    CA125=MAYO_part['CA125']
)

# Variant 2: the fully imputed evidence columns, extended with every column of
# the original table that the imputed frame does not contain yet.
carried_over = [name for name in MAYO_w_CA125.columns if name not in MAYO_part.columns]
for name in carried_over:
    MAYO_part[name] = MAYO_w_CA125[name]


Save the imputed datasets

In [None]:
# Write both result tables to CSV without the row index:
# the CA125-only variant first, then the fully imputed variant.
for frame, destination in (
    (MAYO_w_CA125, '../0.2. Imputed_data/MayoCA125_wPIP_MiceForest.csv'),
    (MAYO_part, '../0.2. Imputed_data/Mayo_wPIP_fullimp_MiceForest.csv'),
):
    frame.to_csv(destination, index=False)