### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to use intelligent imputation for the missing CA125 values in the MAYO dataset, to see whether this has a strong effect.



In [401]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Read the cleaned MAYO and PIPENDO tables from disk.
df_MAYO = pd.read_csv("../0.1. Cleaned_data/MAYO_subdag.csv")
df_PIP = pd.read_csv('../0.1. Cleaned_data/Casper_PIPENDO_Cleaned.csv')

# Every MAYO column that PIPENDO lacks is added to PIPENDO as an all-NaN
# column, so that both frames expose the same column names afterwards.
mayo_only_columns = [name for name in df_MAYO.columns if name not in df_PIP.columns]
for name in mayo_only_columns:
    df_PIP[name] = np.nan

Index(['Unnamed: 0', 'PreoperativeGrade', 'PostoperativeGrade',
       'MyometrialInvasion', 'Cytology', 'Platelets', 'ER', 'PR', 'L1CAM',
       'LVSI', 'p53', 'CA125', 'CTMRI', 'LNM', 'Therapy', 'Survival1yr',
       'Survival3yr', 'Survival5yr', 'Chemotherapy', 'Radiotherapy', 'POLE',
       'MSI', 'MRI_MI', 'FIGO', 'Recurrence', 'LNM_micromacro'],
      dtype='object')

Check the column factors of both datasets

In [404]:
# Check that every shared column has the same factors (levels) in both datasets.
# Comparing only the *number* of unique values would miss columns whose levels
# differ but happen to be equally many, so the actual sets of non-missing
# levels are compared, plus whether each dataset contains missing values.
for col in df_MAYO.columns:
    if col not in df_PIP.columns:
        continue

    mayo_col = df_MAYO[col]
    pip_col = df_PIP[col]

    # Missing values are dropped before building the sets and tracked separately
    # through the isna() comparison below.
    same_levels = set(mayo_col.dropna().unique()) == set(pip_col.dropna().unique())
    same_missingness = mayo_col.isna().any() == pip_col.isna().any()

    if not (same_levels and same_missingness):
        MAYO_factors = mayo_col.unique()
        PIP_factors = pip_col.unique()
        print(col)
        print(MAYO_factors)
        print(PIP_factors)
        print("")

POLE
['no' nan 'yes']
[nan]

MSI
['no' 'yes' nan]
[nan]

MRI_MI
[nan 'lt_50' 'ge_50']
[nan]

PreoperativeGrade
['grade 3' 'grade 1' 'grade 2' nan]
['grade 3' 'grade 1' 'grade 2']

FIGO
[nan 'II' 'IA' 'IIIC' 'IVB' 'IB' 'IIIA']
[nan]

Survival1yr
['yes' nan 'no']
['yes' 'no']

Survival3yr
['yes' 'no' nan]
['yes' 'no']

Survival5yr
['no' 'yes' nan]
['yes' 'no']

Radiotherapy
['yes' 'no' nan]
['no' 'yes']

Chemotherapy
['yes' 'no' nan]
['no' 'yes']

Recurrence
['yes' 'no' nan]
[nan]

LVSI
['no' 'yes' nan]
['no' 'yes']

LNM_micromacro
['no' 'yes' nan]
[nan]


Select the columns that are used for the imputation, based on a matching scheme. For every row in MAYO with a missing CA125 value, the PIPENDO dataset is searched for closely matching rows. If a PIPENDO row agrees with the MAYO row on 7 or more of the 8 imputation columns, its CA125 value is added to a list. After the whole PIPENDO dataset has been scanned, the values in the list are counted and the most common one is used to fill in the missing value in the MAYO dataset.

In [405]:
imputation_columns = ["ER", "PR", "p53", "L1CAM", "LVSI", "PreoperativeGrade", "PostoperativeGrade", "MyometrialInvasion"]

# A PIPENDO row is used as a donor when at least this many of the imputation
# columns have the same (non-missing) value as the MAYO row: 7 of the 8 columns.
MIN_MATCHING_COLUMNS = 7

# The donor features and donor CA125 values do not change inside the loop,
# so they are selected once up front.
pip_features = df_PIP[imputation_columns]
pip_ca125 = df_PIP["CA125"]

# Only MAYO rows whose CA125 is missing are imputed. Iterating over the actual
# index labels avoids assuming a 0..n-1 index.
for i in df_MAYO.index[df_MAYO["CA125"].isna()]:
    # Compare this MAYO row with every PIPENDO row at once, column by column.
    # A comparison involving NaN evaluates to False, so a missing value on
    # either side never counts as a match.
    n_matching = pip_features.eq(df_MAYO.loc[i, imputation_columns], axis="columns").sum(axis=1)
    donor_values = pip_ca125[n_matching >= MIN_MATCHING_COLUMNS]

    if donor_values.empty:
        # No sufficiently similar PIPENDO row: leave CA125 missing.
        continue

    # Fill in the most common CA125 value among the donors.
    # NOTE(review): dropna=False keeps NaN in the count, so when most donors
    # have no CA125 themselves the MAYO value stays missing - confirm that
    # this is intended.
    df_MAYO.loc[i, "CA125"] = donor_values.value_counts(dropna=False).index[0]


In [409]:
# Write the MAYO table, including the imputed CA125 values, to the imputed-data
# folder; the row index is not written.
imputed_csv_path = "../0.2. Imputed_data/Informed_imputation_CA125.csv"
df_MAYO.to_csv(imputed_csv_path, index=False)