### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to explore the extended training set, fix some inconsistencies, and create a new, corrected dataset.



In [None]:
import pandas as pd
import numpy as np

# Placeholder values that both SPSS exports use for "missing"; all of them are turned into NaN
spss_missing_markers = {
    "": np.nan,
    " ": np.nan,
    "99": np.nan,
    999: np.nan,
    "999.0": np.nan,
    "NA": np.nan,
    "NaN": np.nan,
    "<NA>": np.nan,
}

# Extended dataset (value labels are not converted, so labelled variables keep their stored codes)
extended_sav = '../0. Data/Trainingcohort(_wTCGA)/L1CAM_3_database_ENDORISK_JAMA_mergedv6.sav'
df = pd.read_spss(extended_sav, convert_categoricals=False)
df = df.replace(spss_missing_markers)

# Deprecated dataset
deprecated_csv = "../0. Data/Trainingcohort(_wTCGA)/Deprecated/L1CAM_3_database_ENDORISK_JAMA_mergedv4.csv"
deprecated_missing_markers = ("", " ", "99", "NA", 999, "999.0", "<NA>")
df_wrongTCGA = pd.read_csv(deprecated_csv, sep=",", na_values=deprecated_missing_markers)

# Only molecular data
molecular_sav = "../0. Data/Trainingcohort(_wTCGA)/L1CAM_dataset_complete_final version_only complete molecular data.sav"
df_OnlyMol = pd.read_spss(molecular_sav)
df_OnlyMol = df_OnlyMol.replace(spss_missing_markers)

# Original dataset
original_csv = "../0. Data/Trainingcohort(_wTCGA)/Original_model_dataset.csv"
original_missing_markers = ("", " ", "99", "NA", "<NA>")
df_original = pd.read_csv(original_csv, sep=",", na_values=original_missing_markers)


The following codeblock is a standard codeblock to search for column names that contain a certain substring. If set to "", it will return all columns.

In [None]:
# Substring to look for in the column names; "" matches every column
column_query = "FIGO"

for column in df.columns:
    # `in` is the idiomatic spelling of the substring test (no direct dunder call)
    if column_query in column:
        print(column)

Drop the rows that are completely empty

In [None]:
# Keep every row that has at least one non-missing value
df = df.dropna(axis="index", how="all")

Check for duplicate study numbers and remove them from the dataset

In [None]:
# Study numbers that occur in the deprecated dataset
study_numbers = df_wrongTCGA["Study_number"].unique()

# Rows of the extended dataset whose study number does NOT occur in the deprecated dataset
# NOTE(review): this filter is based on membership in the deprecated dataset only; it does not
# look at study numbers that are repeated within df itself — confirm that this is the intent.
known_in_deprecated = df["Study_number"].isin(study_numbers)
df_Dupl = df[~known_in_deprecated]

# Remove those rows and renumber the remaining ones from 0
df = df.drop(index=df_Dupl.index).reset_index(drop=True)
df


Fix labels in the TCGA molecular classification dataset, for consistency

In [None]:
# Rename the copy-number classes to the NSMP / p53 labels.
# The result is assigned back to the column: calling replace(inplace=True) on a selected column
# is chained assignment and is not guaranteed to update df_OnlyMol itself.
tcga_label_map = {"Copy number low": "NSMP", "Copy number high": "p53"}
df_OnlyMol["TCGA"] = df_OnlyMol["TCGA"].replace(tcga_label_map)
df_OnlyMol["TCGA"].value_counts(dropna=False)

# Constructing a new dataset based on these datasets, columns it should have:

- Study number                  (numeric)
- Included in training cohort   (yes/no)
- BMI                           (numeric)
- CA125_PREOP                   (numeric)
- CA125_PREOP_bi                (over or equal 35/under 35)
- Platelets                     (numeric)
- Platelets_bi                  (over or equal 400/under 400)
- L1CAM_PREOP                   (positive/negative)
- ER_PREOP                      (positive/negative)
- PR_PREOP                      (positive/negative)
- p53_PREOP                     (wildtype/mutant)
- Grade_PREOP                   (1/2/3)
- FIGO_clinical                 (IA/IB/II/IIIA/IIIB/IIIC/IVA/IVB)
- FIGO_surgical                 (IA/IB/II/IIIA/IIIB/IIIC/IVA/IVB)
- MSI_mutation                  (yes/no)
- POLE_mutation                 (yes/no)
- TP53_mutation                 (yes/no)
- CTMRI                         (yes/no)
- MRI_MI                        (lt_50/ge_50)
- LVSI                          (yes/no)
- Cytology                      (malignant/benign)
- LNM                           (yes/no)
- LNM (inc. follow-up)          (yes/no)
- Myometrial invasion           (lt_50/ge_50)
- Grade_POSTOP                  (1/2/3)
- Recurrence_location           (local/regional/distant)
- Recurrence                    (yes/no)
- Chemotherapy                  (yes/no)
- Radiotherapy                  (yes/no)
- Chemoradiotherapy             (no/chemo/radio/chemoradio)
- Death_by_EC                   (yes/no)
- one_year_survival             (yes/no)
- one_year_survival (DSS)       (yes/no)
- three_year_survival           (yes/no)
- three_year_survival (DSS)     (yes/no)
- five_year_survival            (yes/no)
- five_year_survival (DSS)      (yes/no)

Construct the dataset step by step selecting columns from the original datasets

In [None]:
## With normal columns


def _binarize(values, threshold, ge_label, lt_label):
    """Label numeric values relative to a threshold.

    Values >= threshold get ``ge_label``, values < threshold get ``lt_label``;
    missing values stay NaN. Returns an object Series on the index of ``values``.
    """
    labels = pd.Series(np.nan, index=values.index, dtype="object")
    labels[values >= threshold] = ge_label
    labels[values < threshold] = lt_label
    return labels


def _empty_label_column(index):
    """Return an all-NaN object column, so text labels can be written into it later without a dtype change."""
    return pd.Series(np.nan, index=index, dtype="object")


# Label maps that are shared by several columns
marker_labels = {1: "positive", 0: "negative"}
figo_labels = {1: "IA", 2: "IB", 3: "II", 4: "IIIA", 5: "IIIB", 6: "IIIC", 7: "IVA", 8: "IVB"}

# NOTE(review): df is read with convert_categoricals=False, so the maps below that are keyed on
# text labels only change a column that actually holds text — confirm per column.

df_constr = pd.DataFrame()
# Metadata columns (a missing inclusion flag counts as "no")
df_constr["Study_number"] = df["Study_number"]
df_constr["Included_in_training_cohort"] = df["included_endorisk"].replace({1: "yes", 0: "no", np.nan: "no"})

# Ensure that the columns are numeric
df_constr["Age"] = pd.to_numeric(df["age_diagnosis"])
df_constr["BMI"] = pd.to_numeric(df["BMI"])
df_constr["Comorbidity_index"] = df["Comorbidity_index"]

# Numeric CA125 plus a binary column, derived from the numeric values
df_constr["CA125_PREOP"] = pd.to_numeric(df["CA125_PREOP"])
df_constr["CA125_PREOP_bi"] = _binarize(df_constr["CA125_PREOP"], 35, "ge_35", "lt_35")

# Numeric platelets plus a binary column, derived from the numeric values
df_constr["Platelets"] = pd.to_numeric(df["Platelets"])
df_constr["Platelets_bi"] = _binarize(df_constr["Platelets"], 400, "ge_400", "lt_400")

# Fix the labels for the histochemical markers
df_constr["L1CAM_PREOP"] = df["L1CAM_expression_preop"].replace(marker_labels)
df_constr["ER_PREOP"] = df["ER_expression_preop"].replace(marker_labels)
df_constr["PR_PREOP"] = df["PR_expression_preop"].replace(marker_labels)
df_constr["p53_PREOP"] = df["p53_expression_preop"].replace({"overexpression": "mutant", "wildtype": "wildtype"})

# Fix the labels for the PreoperativeGrade
df_constr["Grade_PREOP"] = df["Grade_PREOP"].replace({"grade 3 or non-endometrioid": "grade 3", "grade 2": "grade 2", "grade 1": "grade 1"})

# Fix the labels for the FIGO stages
df_constr["FIGO_clinical"] = df["FIGO_clinical"].replace(figo_labels)
df_constr["FIGO_surgical"] = df["FIGO_surgical"].replace(figo_labels)

# Create empty columns for the molecular data (filled from df_OnlyMol further down)
for molecular_column in ("MSI_mutation", "POLE_mutation", "TP53_mutation"):
    df_constr[molecular_column] = _empty_label_column(df_constr.index)

# Fix the labels for the CTMRI, MRI_MI, LVSI and Cytology
df_constr["CTMRI"] = df["CT_or_MRI_LNM"].replace({"lymphadenopathy": "yes", "no extra-uterine disease": "no"})
df_constr["MRI_MI"] = df["MRI_MI"].replace({"<50%": "lt_50", ">50%": "ge_50"})
df_constr["LVSI"] = df["LVSI_bi1"].replace({"LVSI": "yes", "no LVSI": "no"})
# Unknown / not performed cytology is treated as missing
df_constr["Cytology"] = df["Cytology"].astype("string").replace({
    "cytology, malignant cells present": "malignant",
    "cytology, no malignant cells": "benign",
    "cytology, unknown if malignant cells present": np.nan,
    "no cytology": np.nan,
})

# Select two LNM columns
df_constr["LNM"] = df["Positive_nodes_bi1"]
df_constr["LNM (inc. follow-up)"] = df["Positive_nodes_including_followup_BI"]

# Fix the labels for the Myometrial invasion, PostoperativeGrade, Recurrence, Recurrence_location, Chemotherapy, Radiotherapy, Death_by_EC and Histology
# ("no invasion" is grouped with "<50%")
df_constr["Myometrial_invasion"] = df["Myometrial_invasion_L"].replace({">=50%": "ge_50", "<50%": "lt_50", "no invasion": "lt_50"})
df_constr["Grade_POSTOP"] = df["Grade_postop"].replace({3: "grade 3", 2: "grade 2", 1: "grade 1"})
df_constr["Recurrence"] = df["Recurrence"]
df_constr["Recurrence_location"] = df["Recurrence_location"].replace({"yes, distant": "distant", "yes, local": "local", "yes, regional": "regional"})
df_constr["Chemotherapy"] = df["Chemotherapy"].replace({1: "yes", 0: "no"})
# Every kind of radiotherapy (text label or code 1-3) collapses to "yes"
df_constr["Radiotherapy"] = df["Radiotherapy"].replace({"VBT": "yes", "EBRT": "yes", "EBRT+VBT": "yes", "no": "no", 2: "yes", 1: "yes", 3: "yes", 0: "no"})
df_constr["Death_by_EC"] = df["Death_by_EC"]
df_constr["Histology"] = df["Histology_bi_PREOP"]

# Create empty columns for the survival and TCGA
placeholder_columns = (
    "Therapy",
    "one_year_survival",
    "one_year_survival (DSS)",
    "three_year_survival",
    "three_year_survival (DSS)",
    "five_year_survival",
    "five_year_survival (DSS)",
    "TCGA",
)
for placeholder_column in placeholder_columns:
    df_constr[placeholder_column] = _empty_label_column(df_constr.index)

# Reset the index
df_constr = df_constr.reset_index(drop=True)

## Fill Therapy based on Chemotherapy and Radiotherapy
# The whole column is computed at once and assigned in one step; writing single cells through
# df_constr["Therapy"][i] is chained assignment and is not guaranteed to reach df_constr.
received_chemo = df_constr["Chemotherapy"] == "yes"
received_radio = df_constr["Radiotherapy"] == "yes"
either_missing = df_constr["Chemotherapy"].isna() | df_constr["Radiotherapy"].isna()

# Later assignments override earlier ones, giving the priority
# chemoradiotherapy > chemotherapy > radiotherapy > missing > "no".
# A known "yes" therefore wins even when the other treatment is missing.
therapy = pd.Series("no", index=df_constr.index, dtype="object")
therapy[either_missing] = np.nan
therapy[received_radio] = "radiotherapy"
therapy[received_chemo] = "chemotherapy"
therapy[received_chemo & received_radio] = "chemoradiotherapy"

df_constr["Therapy"] = therapy

## Fill in Survivals
# Parse the date columns; entries that cannot be parsed become NaT
for date_column in ("Death_date", "Date_diagnosis", "Last_followup"):
    df[date_column] = pd.to_datetime(df[date_column], errors="coerce")

# Survival column -> number of days the patient must have been observed
survival_horizons_days = {
    "one_year_survival": 365,
    "three_year_survival": 1095,
    "five_year_survival": 1825,
}

# Observed interval: diagnosis until death when a death date is known, otherwise until last follow-up
observation_end = df["Death_date"].fillna(df["Last_followup"])
days_observed = (observation_end - df["Date_diagnosis"]).dt.days

# Without an end date or without a diagnosis date the interval is unknown, so the labels stay missing
# (previously a missing diagnosis date silently produced "no")
interval_unknown = days_observed.isna()

# df is read without converting SPSS value labels, so Death_by_EC holds the stored code
# (1 is relabelled to "yes" further down in this notebook); the text label is accepted as well.
# Comparing only against "yes" never matched, which made every DSS label "yes".
died_of_ec = (df["Death_by_EC"] == 1) | (df["Death_by_EC"] == "yes")

# NOTE(review): a patient whose follow-up ends before the horizon without a death date is labelled
# "no" for overall survival (and "yes" for DSS unless Death_by_EC is set). This keeps the existing
# behaviour; confirm whether such censored patients should be missing instead.
for survival_column, horizon in survival_horizons_days.items():
    reached_horizon = days_observed >= horizon

    overall = pd.Series(np.where(reached_horizon, "yes", "no"), index=df.index, dtype="object")
    # Disease-specific: only a death by EC before the horizon counts as "no"
    disease_specific = pd.Series(np.where(reached_horizon | ~died_of_ec, "yes", "no"), index=df.index, dtype="object")

    overall[interval_unknown] = np.nan
    disease_specific[interval_unknown] = np.nan

    # df_constr was built row by row from df, so the values are assigned by position
    df_constr[survival_column] = overall.to_numpy()
    df_constr[survival_column + " (DSS)"] = disease_specific.to_numpy()

## Filling in the TCGA from the extra dataset
# Assign back instead of replace(inplace=True) on a selected column (chained assignment)
df_OnlyMol["MSI_bi"] = df_OnlyMol["MSI_bi"].replace({"MSI stable": "no", "MSI": "yes"})

# Column in df_constr -> column in df_OnlyMol that provides its value
molecular_sources = {
    "POLE_mutation": "POLE_mutation",
    "MSI_mutation": "MSI_bi",
    "TP53_mutation": "TP53_mutation",
    "TCGA": "TCGA",
}

# The target columns start out as all-NaN placeholders; object dtype lets them hold text labels
for target_column in molecular_sources:
    df_constr[target_column] = df_constr[target_column].astype("object")

# Fill in the TCGA data: every row of df_OnlyMol is written to the rows of df_constr with the
# same study number (a study number that does not occur in df_constr is skipped silently)
for position in range(len(df_OnlyMol)):
    study_number = df_OnlyMol["Study_number"].iloc[position]
    matching_rows = df_constr["Study_number"] == study_number

    for target_column, source_column in molecular_sources.items():
        df_constr.loc[matching_rows, target_column] = df_OnlyMol[source_column].iloc[position]
    

For the patients that are present in the original dataset, insert the survival data from the original dataset, since something went wrong with the merging

In [None]:
# Insert the one, three, and five year survival from the original dataset on the correct patients

# Column in df_original -> the columns in df_constr that receive its value.
# NOTE(review): the overall and the DSS column both receive the same original value — confirm
# that the original dataset has no separate overall-survival columns.
survival_sources = {
    "one_year_survival": ["one_year_survival", "one_year_survival (DSS)"],
    "three_year_survival": ["three_year_survival", "three_year_survival (DSS)"],
    "five_year_survival": ["five_year_survival", "five_year_survival (DSS)"],
}

# Object dtype so the original values can be stored next to the existing text labels
for target_columns in survival_sources.values():
    df_constr[target_columns] = df_constr[target_columns].astype("object")

for position in range(len(df_original)):
    study_number = df_original["Study_number"].iloc[position]
    matching_rows = df_constr["Study_number"] == study_number

    # Check if the study number is in the new dataset, if not report it and go to the next patient
    if not matching_rows.any():
        print("Study number " + str(study_number) + " not found in the new dataset")
        continue

    # Written through .loc: assigning via df_constr[column][rows] is chained assignment and is
    # not guaranteed to reach df_constr
    for source_column, target_columns in survival_sources.items():
        df_constr.loc[matching_rows, target_columns] = df_original[source_column].iloc[position]

Save the dataset

In [None]:
# Write the constructed dataset to CSV without the row index
merged_csv_path = "../0. Data/Trainingcohort(_wTCGA)/Training+JAMA_Merged.csv"
df_constr.to_csv(merged_csv_path, sep=",", index=False)

# Creating a cleaned version for model use with the correct column names

Copy the dataset and rename the columns

In [None]:
# Columns that get a different name in the cleaned dataset; every column that is not listed
# keeps its name. The numeric platelet count moves to "Platelets_numeric" so that the binary
# column can take over the name "Platelets".
model_column_names = {
    "CA125_PREOP_bi": "CA125",
    "Platelets": "Platelets_numeric",
    "Platelets_bi": "Platelets",
    "L1CAM_PREOP": "L1CAM",
    "ER_PREOP": "ER",
    "PR_PREOP": "PR",
    "p53_PREOP": "p53",
    "Grade_PREOP": "PreoperativeGrade",
    "MSI_mutation": "MSI",
    "POLE_mutation": "POLE",
    "TP53_mutation": "TP53",
    "Myometrial_invasion": "MyometrialInvasion",
    "Grade_POSTOP": "PostoperativeGrade",
    "one_year_survival": "one_year_survival (All)",
    "one_year_survival (DSS)": "Survival1yr",
    "three_year_survival": "three_year_survival (All)",
    "three_year_survival (DSS)": "Survival3yr",
    "five_year_survival": "five_year_survival (All)",
    "five_year_survival (DSS)": "Survival5yr",
}

# rename() returns a new frame, so df_constr itself is left untouched
df_constr_clean = df_constr.rename(columns=model_column_names)


# Compare df_constr_clean to df_original on the shared columns, for the same study numbers

In [None]:
# Compare df_constr_clean to df_original
study_nrs = df_original["Study_number"].values

compared_columns = ["CA125", "PR", "ER", "L1CAM", "p53", "PreoperativeGrade", "LNM", "PostoperativeGrade", "Survival1yr", "Survival3yr", "Survival5yr"]

# Row i of df_c_select must describe the same patient as row i of df_original, because the loop
# below (and the write-back that uses study_nrs[i]) pairs the rows by position. The rows are
# therefore looked up by study number in the order of df_original instead of relying on both
# frames being sorted identically. A study number that is missing from df_constr_clean yields
# an all-NaN row; for a repeated study number the first row is used.
clean_by_study_number = df_constr_clean.drop_duplicates(subset="Study_number").set_index("Study_number")
df_c_select = clean_by_study_number.reindex(study_nrs)[compared_columns].reset_index(drop=True)

# Translate the text labels to the numeric coding of the original dataset; object dtype keeps
# every cell writable with whatever value the original dataset holds
df_c_select = df_c_select.replace({"negative":0, "positive":1, "wildtype":0, "mutant":1
                     ,"lt_35":0, "ge_35":1, "lt_400":0, "ge_400":1, "lt_50":0, "ge_50":1,
                     "no":0, "yes":1, "grade 1":1, "grade 2":2, "grade 3":3}).astype("object")

# [column in df_c_select, matching column in df_original]
column_pairs = [["CA125", "CA125_PREOP_bi"], ["ER", "ER_expression_preop"], ["PR", "PR_expression_preop"], ["L1CAM", "L1CAM_expression_preop"], ["p53","p53_expression_preop"], ["PreoperativeGrade","Grade_PREOP"], ["LNM","LNM"],["PostoperativeGrade", "Grade"], ["Survival1yr", "one_year_survival"], ["Survival3yr", "three_year_survival"], ["Survival5yr", "five_year_survival"]]

changes = 0
# Check if all columns are the same; where they differ the value of the original dataset wins
for i in range(len(df_c_select)):
    for contr_column, orig_column in column_pairs:
        constructed_value = df_c_select[contr_column].iloc[i]
        original_value = df_original[orig_column].iloc[i]

        if pd.isna(constructed_value) and pd.isna(original_value):
            continue

        if float(constructed_value) != float(original_value):
            # Report both values BEFORE overwriting, so the printout shows the actual difference
            print("Difference found in " + contr_column + " and " + orig_column + " at index " + str(i))
            print(constructed_value)
            print(original_value)
            print("\n")
            df_c_select.at[i, contr_column] = original_value
            changes += 1

print("Total changes: " + str(changes))

Fix labels in the df_constr_clean dataset

In [None]:
# Revert to the labels of the original dataset
yes_no_labels = {0: "no", 1: "yes"}
marker_status_labels = {0: "negative", 1: "positive"}
grade_labels = {1: "grade 1", 2: "grade 2", 3: "grade 3"}

labels_per_column = {
    "CA125": {0: "lt_35", 1: "ge_35"},
    "ER": marker_status_labels,
    "PR": marker_status_labels,
    "L1CAM": marker_status_labels,
    "p53": {0: "wildtype", 1: "mutant"},
    "PreoperativeGrade": grade_labels,
    "LNM": yes_no_labels,
    "PostoperativeGrade": grade_labels,
    "Survival1yr": yes_no_labels,
    "Survival3yr": yes_no_labels,
    "Survival5yr": yes_no_labels,
}

# Assign back instead of replace(inplace=True) on a selected column (chained assignment)
for label_column, labels in labels_per_column.items():
    df_c_select[label_column] = df_c_select[label_column].replace(labels)

# Write the corrected values back to the rows of df_constr_clean with the matching study number;
# row i of df_c_select is paired with study_nrs[i]
for column in df_c_select.columns:
    # Object dtype so text labels can be stored even when the column still holds numeric codes
    df_constr_clean[column] = df_constr_clean[column].astype("object")

    for i in range(len(df_c_select)):
        nr = study_nrs[i]
        try:
            df_constr_clean.loc[df_constr_clean["Study_number"] == nr, column] = df_c_select[column].iloc[i]
        except Exception as error:
            # Keep going, but show what failed (a bare except would also swallow KeyboardInterrupt)
            print("Error at index " + str(i) + " with column " + column)
            print(df_c_select[column].iloc[i])
            print(repr(error))
            print("\n")


Fix more labels in the df_constr_clean dataset

In [None]:
# Numeric code -> text label, per column of df_constr_clean
yes_no_codes = {0: "no", 1: "yes"}

code_labels_per_column = {
    "Recurrence": yes_no_codes,
    "Recurrence_location": {0: "no", 1: "local", 2: "regional", 3: "distant"},
    "MyometrialInvasion": {0: "lt_50", 1: "lt_50", 2: "ge_50"},
    "Chemotherapy": yes_no_codes,
    "Radiotherapy": yes_no_codes,
    "MRI_MI": {1: "lt_50", 2: "ge_50"},
    "CTMRI": yes_no_codes,
    "LVSI": yes_no_codes,
    # LNM is only relabelled for the patients of the original dataset in the cell above; without
    # this entry the other patients keep the raw 0/1 code and the column mixes both codings
    "LNM": yes_no_codes,
    "LNM (inc. follow-up)": yes_no_codes,
    "Histology": {1: "endometrioid", 2: "non-endometrioid"},
    "Death_by_EC": yes_no_codes,
    # Cytology was cast to text earlier, so its codes can also appear as "0.0" ... "3.0"
    "Cytology": {0: "benign", 1: "benign", 2: "malignant", 3: "malignant", "0.0": "benign", "1.0": "benign", "2.0": "malignant", "3.0": "malignant", "<NA>": np.nan},
}

# Assign back instead of replace(inplace=True) on a selected column: the latter is chained
# assignment and is not guaranteed to update df_constr_clean
for label_column, code_labels in code_labels_per_column.items():
    df_constr_clean[label_column] = df_constr_clean[label_column].replace(code_labels)

Designate the FIGO_surgical column as the FIGO column

In [None]:
# The surgical stage becomes the column named "FIGO"; FIGO_clinical keeps its name
figo_rename = {"FIGO_surgical": "FIGO"}
df_constr_clean = df_constr_clean.rename(columns=figo_rename)

Save the cleaned dataset

In [None]:
# Write the cleaned dataset to CSV without the row index
cleaned_csv_path = "../0.1. Cleaned_data/Training+TCGA+JAMA_cleaned.csv"
df_constr_clean.to_csv(cleaned_csv_path, sep=",", index=False)


Calculate the sensitivity and specificity of the MRI_MI prediction of Myometrial invasion

In [None]:
# Rows: MyometrialInvasion (reference standard); columns: MRI_MI (the prediction being evaluated)
cross_table = pd.crosstab(df_constr_clean["MyometrialInvasion"], df_constr_clean["MRI_MI"], margins=True)

# "ge_50" is the positive class. Cells are read as .loc[reference, prediction].
# Sensitivity = TP / (TP + FN)
# Specificity = TN / (TN + FP)
# FN and FP were previously read from the transposed cells, which made the two ratios the
# positive and negative predictive value instead of sensitivity and specificity.
TP = cross_table.loc["ge_50", "ge_50"]  # invasion >= 50%, MRI says >= 50%
FN = cross_table.loc["ge_50", "lt_50"]  # invasion >= 50%, MRI says < 50%
TN = cross_table.loc["lt_50", "lt_50"]  # invasion < 50%, MRI says < 50%
FP = cross_table.loc["lt_50", "ge_50"]  # invasion < 50%, MRI says >= 50%

sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)

print("Sensitivity: " + str(sensitivity))
print("Specificity: " + str(specificity))