### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to explore the Brno dataset and generate a cleaned dataset that can be used for model training and testing.

In [None]:
import numpy as np
import pandas as pd

# Location of the raw Brno file and the placeholder strings that stand for "no value"
raw_path = "../0. Data/Brno/Brno_dataset.csv"
missing_markers = ["?", "NaN", "NA", "Na", " ", "", "X"]

df = pd.read_csv(raw_path, parse_dates=True)
# Turn every placeholder into a real NaN so later steps can rely on isna()
df = df.replace(missing_markers, np.nan)
# Show the loaded frame
df

The following codeblock is a standard codeblock to search for column names that contain a certain substring. If set to "", it will return all columns.

In [None]:
# Substring to look for in the column names; "" matches every column.
search_term = ""

# Use the `in` operator instead of calling the __contains__ dunder directly;
# str() keeps the check working for column labels that are not strings.
matching_columns = [col for col in df.columns if search_term in str(col)]
for col in matching_columns:
    print(col)

The following codeblock is used to generate the 1 and 3 year survival columns.

In [None]:
# Create the Survival1yr and Survival3yr columns ("yes" / "no" / NaN).
#
# A patient counts as having survived N years when the recorded death date lies
# at least N full years after the date of diagnosis. Comparing only the calendar
# years (death.year - diagnosis.year) is wrong: a diagnosis on 2019-12-31 followed
# by death on 2020-01-01 would be labelled as one-year survival.
#
# Rows where the death date or the date of diagnosis is missing stay NaN.
# NOTE(review): a missing Death_Date presumably means the patient was still alive
# at last follow-up; those rows are left as NaN here - confirm whether they should
# be derived from a follow-up date instead.
# NOTE(review): dates are parsed with pandas' default inference (no explicit
# format / dayfirst) - confirm the date format used in the file.
both_dates_known = df["Date_Diagnosis"].notna() & df["Death_Date"].notna()

# Parse value by value so that rows written in different date formats are each
# parsed on their own; the outer to_datetime guarantees a datetime dtype even
# when no row has both dates.
diagnosis_dates = pd.to_datetime(df.loc[both_dates_known, "Date_Diagnosis"].map(pd.to_datetime))
death_dates = pd.to_datetime(df.loc[both_dates_known, "Death_Date"].map(pd.to_datetime))

for years, column in ((1, "Survival1yr"), (3, "Survival3yr")):
    survived = death_dates >= diagnosis_dates + pd.DateOffset(years=years)
    # Assign the whole column at once (no chained `df[col][i] = ...` writes);
    # reindex puts NaN on every row where a date was missing.
    df[column] = survived.map({True: "yes", False: "no"}).reindex(df.index)

The following codeblock is used to clean the data, placing it in a format that is clear and easy to use for the model.

In [None]:
# Start translating all the columns.
#
# Every recode assigns its result back with `df[col] = df[col].replace(...)`.
# `df[col].replace(..., inplace=True)` and `df[col][mask] = ...` are chained
# assignments: they act on an intermediate Series, which pandas warns about and
# which no longer writes through to `df` once Copy-on-Write is enabled.

# Make sure the BMI column is numeric
df["BMI"] = pd.to_numeric(df["BMI"])

# Change the FIGO_Clinical and FIGO_Surgical from numeric to the associated string of FIGO stage
figo_stages = {1:"IA", 2:"IB", 3: "II", 4:"IIIA", 5:"IIIB", 6:"IIIC", 7:"IVA", 8:"IVB", "99":np.nan, 99:np.nan, 0:np.nan, "0":np.nan}
for column in ("FIGO_Clinical", "FIGO_Surgical"):
    df[column] = df[column].replace(figo_stages)

# Fix the labels in the MI by US column
df["MI by US"] = df["MI by US"].replace({1:"lt_50", 2:"ge_50", "99":np.nan, 99:np.nan, "5":np.nan, "3":np.nan, "0":np.nan, 5:np.nan, 3:np.nan, 0:np.nan})

# Fix the labels in the preoperative CA 125 column and ensure it is numeric
ca125 = pd.to_numeric(df["Preoperative CA 125"].replace({"NO":np.nan}))
df["Preoperative CA 125"] = ca125

# Create a binary column for the preoperative CA 125; rows without a value stay NaN.
# The column starts as object dtype so the string labels can be written into it directly.
CA125_CUTOFF = 35
df["CA125_bi"] = pd.Series(np.nan, index=df.index, dtype=object)
df.loc[ca125 < CA125_CUTOFF, "CA125_bi"] = "lt_35"
df.loc[ca125 >= CA125_CUTOFF, "CA125_bi"] = "ge_35"

# Code -> label tables for the remaining columns; codes mapped to NaN are treated as missing
yes_no = {1:"yes", 0:"no", "99":np.nan, 99:np.nan}
grades = {"1":"grade 1", "LG":"grade 2", "2":"grade 2", "HG":"grade 3", "3":"grade 3", "99":np.nan, 99:np.nan}
label_maps = {
    # Thrombocytosis
    "Trombocytosis": {1:"ge_400", 0:"lt_400", "99":np.nan, 99:np.nan},
    # Cytology
    "Cytology": {1:"benign", 2:"malignant", 0:np.nan, "99":np.nan, 99:np.nan},
    # Symptomatology
    "Symptomatology": yes_no,
    # Pelvic and paraaortal lymphadenectomy
    "Pelvic lymphadenectomy": yes_no,
    "Paraaortal lymphadenectomy": yes_no,
    # Histology
    "Histology": {"A":"endometrioid", "B":"serous", "C":"clear cell", "D":"mucinous", "E":"carcinosarcoma", "F":"undifferentiated", "99":np.nan, 99:np.nan},
    # Grade preop and grade postop
    "Grade preop": grades,
    "Grade postop": grades,
    # Myometrial invasion
    "MI": {1:"lt_50", 0:"lt_50", 2:"ge_50", "99":np.nan, 99:np.nan},
    # Cervix
    "Cervix": {1:"endocervical glands", 0:"no", 2:"stroma", "99":np.nan, 99:np.nan},
    # LVSI
    "LVSI": {"0":"no", "1":"yes", "2":"yes","focal":"no","99":np.nan, 99:np.nan},
}
for column, mapping in label_maps.items():
    df[column] = df[column].replace(mapping)
# Change L1CAM, ER and PR from a percentage to a binary "positive"/"negative" column.
#
# Differences from the previous per-row loops:
#   * missing values stay NaN (they used to become "negative", because
#     `nan >= 10` is False and fell into the else branch);
#   * only conversion errors are caught instead of a bare `except:`;
#   * each column is assigned as a whole, with no chained `df[col][i] = ...` writes.
POSITIVITY_THRESHOLD = 10  # percentages at or above this value are labelled "positive"


def _threshold_label(percentage):
    """Return "positive" when percentage reaches POSITIVITY_THRESHOLD, else "negative"."""
    return "positive" if percentage >= POSITIVITY_THRESHOLD else "negative"


def _as_number(text):
    """Return text converted with pd.to_numeric, or None when it is not numeric (or is NaN)."""
    try:
        number = pd.to_numeric(text)
    except (ValueError, TypeError):
        return None
    return None if pd.isna(number) else number


def _l1cam_text_label(text):
    """Label a non-numeric L1CAM entry from its wording; unrecognised text is returned unchanged."""
    if "po" in text:
        return "positive"
    if "neg" in text:
        return "negative"
    return text


def _range_label(text):
    """Label a range such as "5-10" by its lower bound; anything else is returned unchanged."""
    if "-" in text:
        low = _as_number(text.split("-")[0])
        if low is not None:
            return _threshold_label(low)
    return text


def _binarize_marker(value, text_fallback):
    """Convert one cell to "positive"/"negative".

    value: the raw cell (string, number or NaN).
    text_fallback: function applied to the "%"-stripped text when it is not a plain number.
    Returns NaN for missing input, a label where one can be derived, otherwise the
    (stripped) input so that unexpected entries remain visible in the data.
    """
    if not isinstance(value, str):
        if pd.isna(value):
            return np.nan
        number = _as_number(value)
        return value if number is None else _threshold_label(number)
    text = value.replace("%", "")
    number = _as_number(text)
    if number is not None:
        return _threshold_label(number)
    return text_fallback(text)


# L1CAM: non-numeric entries are matched on "po"(sitive) / "neg"(ative) wording
df["L1CAM"] = df["L1CAM"].map(lambda value: _binarize_marker(value, _l1cam_text_label))

# ER and PR: non-numeric entries may be ranges; the low end of the range decides
for column in ("ER", "PR"):
    df[column] = df[column].map(lambda value: _binarize_marker(value, _range_label))

# All recodes below assign the result back to the column. The chained
# `df[col].replace(..., inplace=True)` form acts on an intermediate Series and
# does not reach `df` once pandas Copy-on-Write is enabled.

# Fix the labels in the p53 column
df["p53"] = df["p53"].replace({"WT":"wildtype", "MUT":"mutant", "99":np.nan, 99:np.nan})
# Fix the nan labels in the Mol. Cl. column
df["Mol. Cl."] = df["Mol. Cl."].replace({0:np.nan, "0":np.nan, 99:np.nan, "99":np.nan})


def _molecular_flag(value, marker):
    """Return "yes"/"no" depending on whether marker occurs in value; NaN stays NaN."""
    if pd.isna(value):
        return np.nan
    # str() keeps the substring test valid if a non-string code is left in the column
    return "yes" if marker in str(value) else "no"


# Extract the TP53, MSI, NSMP, POLE from the Mol. Cl. column
for flag_column, marker in (("TP53", "p53"), ("MSI", "MMR"), ("NSMP", "NSMP"), ("POLE", "POLE")):
    df[flag_column] = df["Mol. Cl."].apply(_molecular_flag, args=(marker,))

yes_no = {0:"no", 1:"yes", "99":np.nan, 99:np.nan}

# Fix the labels in the SNB and positive SNB columns
for column in ("SNB", "Positive SNB"):
    df[column] = df[column].replace(yes_no)
# Ensure the lymph node count columns are numeric
df["Lymph nodes total"] = pd.to_numeric(df["Lymph nodes total"])
df["Number of positive nodes"] = pd.to_numeric(df["Number of positive nodes"])
# Fix the labels in the positive lymph node column
df["Positive lymph node"] = df["Positive lymph node"].replace(yes_no)
# Fix the labels in the localisation of LNM column
df["localisation of LNM"] = df["localisation of LNM"].replace({0:np.nan, 1:"pelvic", 2:"paraaortal", 3:"both", "99":np.nan, 99:np.nan})

# Fix the labels in the radiotherapy, chemotherapy and chemoradiotherapy columns
# (codes 1-4 all collapse to "yes")
treatment_given = {0:"no", 1:"yes", 2:"yes",3:"yes",4:"yes", "99":np.nan, 99:np.nan}
for column in ("Radiotherapy", "Chemotherapy", "Chemoradiotherapy"):
    df[column] = df[column].replace(treatment_given)

# Create a therapy column that combines the radiotherapy, chemotherapy and chemoradiotherapy columns.
# When several are "yes" the priority is chemoradiotherapy > chemotherapy > radiotherapy.
# Rows where all three source columns are missing stay NaN; any other row without a "yes" becomes "no".
# Built as a separate Series and assigned once, so no chained `df[col][i] = ...` writes are needed.
all_missing = df[["Chemoradiotherapy", "Chemotherapy", "Radiotherapy"]].isna().all(axis=1)

therapy = pd.Series("no", index=df.index, dtype=object)
# Assign from lowest to highest priority so that the later assignments win
for source_column, label in (
    ("Radiotherapy", "radiotherapy"),
    ("Chemotherapy", "chemotherapy"),
    ("Chemoradiotherapy", "chemoradiotherapy"),
):
    therapy.loc[df[source_column] == "yes"] = label
therapy.loc[all_missing] = np.nan

df["Therapy"] = therapy

# Fix the labels in the recurrence columns.
# The result is assigned back to each column: the chained
# `df[col].replace(..., inplace=True)` form does not reach `df` under pandas Copy-on-Write.
recurrence_labels = {0:"no", 1:"yes", "99":np.nan, 99:np.nan}
for column in ("1.Recurrence", "1. Recurrence_Local", "1. Recurrence_Regional", "1. Recurrence_Distant"):
    df[column] = df[column].replace(recurrence_labels)

# Create a binary column for the histology: endometrioid vs. everything else, NaN stays NaN
df["Histology_bi"] = df["Histology"].apply(lambda x: np.nan if pd.isna(x) else ("endometrioid" if x == "endometrioid" else "non-endometrioid"))

# Create a column that combines the recurrence columns.
# Priority when several sites are "yes": local > regional > distant.
# "no" is only assigned when the overall flag and all three site flags are "no";
# every other combination stays NaN.
# Built as a separate Series and assigned once, so no chained `df[col][i] = ...` writes are needed.
recurrence_columns = ["1.Recurrence", "1. Recurrence_Local", "1. Recurrence_Regional", "1. Recurrence_Distant"]
no_recurrence = df[recurrence_columns].eq("no").all(axis=1)

recurrence_location = pd.Series(np.nan, index=df.index, dtype=object)
recurrence_location.loc[no_recurrence] = "no"
# Assign from lowest to highest priority so that the later assignments win
for source_column, label in (
    ("1. Recurrence_Distant", "distant"),
    ("1. Recurrence_Regional", "regional"),
    ("1. Recurrence_Local", "local"),
):
    recurrence_location.loc[df[source_column] == "yes"] = label

df["Recurrence_location"] = recurrence_location

# Fix the labels in the ENDORISK LNM, ENDORISK FU, LNM reality, Surv5y reality and Death of EC? columns.
# The result is assigned back to each column: the chained
# `df[col].replace(..., inplace=True)` form does not reach `df` under pandas Copy-on-Write.
outcome_labels = {0:"no", 1:"yes", "99":np.nan, 99:np.nan}
for column in ("ENDORISK LNM", "ENDORISK FU", "LNM reality", "Surv5y reality", "Death of EC?"):
    df[column] = df[column].replace(outcome_labels)

Save the full cleaned dataset.

In [None]:
# Write the complete cleaned frame to disk, without the index column
full_output_path = "../0.1. Cleaned_data/Cleaned_Brno_full.csv"
df.to_csv(full_output_path, index=False)

Select the following columns for a manageable dataset

In [None]:
# Columns kept for the model dataset, grouped by theme
select_columns = [
    # Identification and patient characteristics
    "Patient_Number", "BMI", "Patient´s age",
    # Staging, imaging and pathology
    "FIGO_Clinical", "FIGO_Surgical", "MI by US", "MI", "Cytology", "Histology_bi",
    # Lab values and biomarkers
    "CA125_bi", "Trombocytosis", "LVSI", "L1CAM", "ER", "PR", "p53",
    # Grading
    "Grade preop", "Grade postop",
    # Treatment
    "Radiotherapy", "Chemotherapy", "Therapy",
    # Recurrence
    "1.Recurrence", "Recurrence_location",
    # Molecular classification
    "TP53", "MSI", "POLE",
    # ENDORISK columns and observed outcomes
    "ENDORISK LNM", "ENDORISK FU", "LNM reality",
    "Survival1yr", "Survival3yr", "Surv5y reality", "Death of EC?",
    "Positive lymph node",
]

# Independent copy, so later edits to select_df leave df untouched
select_df = df.loc[:, select_columns].copy()

The following codeblock renames the columns to the values used in the model and more readable names.

In [None]:
# Rename the selected columns to the names used in the model.
# Only columns whose name actually changes are listed; every other selected
# column keeps its current name. "Positive lymph node" is deliberately left
# under its own name: the LNM column comes from "LNM reality".
model_column_names = {
    "Patient´s age": "Age",
    "FIGO_Clinical": "FIGO_clinical",
    "FIGO_Surgical": "FIGO_surgical",
    "MI by US": "MRI_MI",
    "MI": "MyometrialInvasion",
    "Histology_bi": "Histology",
    "CA125_bi": "CA125",
    "Trombocytosis": "Platelets",
    "Grade preop": "PreoperativeGrade",
    "Grade postop": "PostoperativeGrade",
    "1.Recurrence": "Recurrence",
    "ENDORISK LNM": "ENDORISK_LNM",
    "ENDORISK FU": "ENDORISK_FU",
    "LNM reality": "LNM",
    "Surv5y reality": "Survival5yr",
    "Death of EC?": "Death_by_EC",
}
select_df = select_df.rename(columns=model_column_names)

# Fix the labels in the LNM column.
# Assigned back to the column: the chained `select_df["LNM"].replace(..., inplace=True)`
# form does not reach `select_df` under pandas Copy-on-Write.
select_df["LNM"] = select_df["LNM"].replace({0:"no", 1:"yes", "99":np.nan, 99:np.nan, "0":"no", "1":"yes", "ITC":"no", "micrometastasis":"yes"})
select_df

In [None]:
# Check if survival5yr is disease specific.
#
# A row is inconsistent when Survival5yr says "no" but the death is not recorded
# as caused by EC; such rows are reported and set to "yes" (survived).
# Rows that already say "yes" and rows with a missing Survival5yr are left
# untouched. The previous else-branch overwrote every row that was not
# ("no", "yes") - including missing values - with "yes" and reported it.
# NOTE(review): a "no" with a missing Death_by_EC is still treated as inconsistent
# and set to "yes", as before - confirm this is intended for an unknown cause of death.
reported_not_survived = select_df["Survival5yr"] == "no"
death_by_ec = select_df["Death_by_EC"] == "yes"
inconsistent = reported_not_survived & ~death_by_ec

for row_label in select_df.index[inconsistent]:
    print("inconsistent at index: ", row_label)

# Single .loc write instead of chained `select_df[col][i] = ...` assignments
select_df.loc[inconsistent, "Survival5yr"] = "yes"

In [None]:
# Write the complete model dataset to disk, without the index column
model_output_path = "../0.1. Cleaned_data/Cleaned_Brno_model_complete.csv"
select_df.to_csv(model_output_path, index=False)