### Author: Ally Sprik
### Last-updated: 25-02-2024

Goal of this notebook is to explore the MAYO dataset and clean it for further analysis.



In [None]:
import numpy as np
import pandas as pd

# Two exports of the MAYO cohort are loaded so their columns can be compared.
# The SPSS file that includes the biomarkers is expected to be the correct one,
# but this is still being verified against the plain CSV export.
# Recoding applied to the SPSS data: the missing-value markers become NaN and
# the grade labels become their numeric grade.
# NOTE(review): the recoding is applied to every column, so a genuine value of
# 99 in any numeric column would also be turned into NaN - confirm none exist.
missing_and_grade_recode = {
    99: np.nan,
    'Unknown': np.nan,
    'Grade 1': 1,
    'Grade 2': 2,
    'Grade 3': 3,
}

df1 = pd.read_spss('../0. Data/MAYO/MAYO cohort 22-9-2023inclbiomarkers.sav')
df1.replace(missing_and_grade_recode, inplace=True)

df2 = pd.read_csv('../0. Data/MAYO/MAYO cohort 22-9-2023.csv', sep=';')

The following codeblock is a standard codeblock to search for column names that contain a certain substring. If set to "", it will return all columns.

In [None]:
# Print every column name of df1 that contains the search term.
# An empty search term matches (and therefore prints) all columns.
search_term = 'micro'
matching_columns = [name for name in df1.columns if search_term in name]
for name in matching_columns:
    print(name)

Create a new column in which only micro- and macrometastases count as lymph node metastasis; ITC (isolated tumor cells) are not counted as metastasis.

In [None]:
def _micro_or_macro_metastasis(finding):
    """Map one Metastasis_macro_micro_itc value to 'yes', 'no' or NaN.

    Missing values stay NaN. 'Macrometastasis' and 'Micrometastasis' give
    'yes'; every other value gives 'no'.
    """
    if pd.isna(finding):
        return np.nan
    if finding in ('Macrometastasis', 'Micrometastasis'):
        return 'yes'
    return 'no'


df1['LNM_micromacro'] = df1['Metastasis_macro_micro_itc'].apply(_micro_or_macro_metastasis)

Check what columns are in the incl. biomarkers dataset, but not in the other dataset

In [None]:
# Columns that exist in the biomarker dataset (df1) but not in the CSV export
# (df2), printed in df1's column order.
only_in_df1 = [name for name in df1.columns if name not in df2.columns]
for name in only_in_df1:
    print(name)

Added columns: 
ER_preop_perc_def
ER_preop_def_bi
PR_preop_perc_def
PR_preop_def_bi
L1CAM_preop_def
p53_preop_def

Copy df1 to df for easier reference, since we will only use the incl. biomarkers dataset

In [None]:
# Work on an independent copy of the biomarker dataset. A plain `df = df1`
# would only give the same DataFrame a second name, so the in-place clean-up
# that follows would silently modify df1 as well.
df = df1.copy()

The following codeblock drops the Study number and Study ID columns, as they are not relevant for the analysis.
It also drops the rows that are completely empty.

In [None]:
# Remove the study identifiers, which are not relevant for the analysis.
# errors='ignore' keeps this cell re-runnable after the columns are gone, and
# the non-inplace call leaves any other reference to the same data untouched.
df = df.drop(columns=['Studynumber', 'StudyID'], errors='ignore')

# Drop the rows in which every remaining column is NaN, then renumber the rows
# 0..n-1: the risk-level cells below address rows by position, which only
# matches the index labels when the index has no gaps.
df = df.dropna(how='all', axis=0).reset_index(drop=True)
df

The following codeblock creates a new column that tries to assess the risk level based on the preoperative grade, PR and p53 status.

In [None]:
def _risk_level_preop(grade, pr, p53):
    """Return the preoperative risk level ('Low', 'High' or 'Unknown') for one row.

    grade: value of Preoperative_grade; pr: value of PR_preop_def_bi;
    p53: value of p53_preop_def.

    Grade 1 gives 'Low' and grade 3 gives 'High'. Grade 2 gives 'High' when
    PR or p53 equals 1 and 'Low' otherwise. A missing grade, or any other
    grade value, gives 'Unknown'.
    """
    if pd.isna(grade):
        return 'Unknown'
    if grade == 1:
        return 'Low'
    if grade == 3:
        return 'High'
    if grade == 2:
        return 'High' if (pr == 1 or p53 == 1) else 'Low'
    return 'Unknown'


# Build the column in a single assignment. Writing row by row through
# df['Risk_level_PREOP'][i] is chained assignment: pandas does not guarantee
# that it reaches df (with Copy-on-Write it never does), and it looks rows up
# by index label while the reads use positions.
df['Risk_level_PREOP'] = [
    _risk_level_preop(grade, pr, p53)
    for grade, pr, p53 in zip(
        df['Preoperative_grade'], df['PR_preop_def_bi'], df['p53_preop_def']
    )
]


The following codeblock creates a new column that tries to assess the risk level based on the postoperative grade, PR and p53 status.

In [None]:
def _risk_level_postop(grade, pr, p53):
    """Return the postoperative risk level ('Low', 'High' or 'Unknown') for one row.

    grade: value of Postoperative_grade; pr: value of PR_preop_def_bi;
    p53: value of p53_preop_def.

    Grade 1 gives 'Low' and grade 3 gives 'High'. Grade 2 gives 'High' when
    PR or p53 is greater than 0 and 'Low' otherwise. A missing grade, or any
    other grade value, gives 'Unknown'.
    """
    if pd.isna(grade):
        return 'Unknown'
    if grade == 1:
        return 'Low'
    if grade == 3:
        return 'High'
    if grade == 2:
        return 'High' if (pr > 0 or p53 > 0) else 'Low'
    # Unrecognised grade: this belongs in the POSTOP column. The earlier version
    # wrote 'Unknown' into Risk_level_PREOP here, overwriting the preoperative
    # result and leaving the postoperative value empty.
    return 'Unknown'


# Build the column in a single assignment instead of writing row by row through
# df['Risk_level_POSTOP'][i]; that chained assignment is not guaranteed to
# reach df (with Copy-on-Write it never does).
df['Risk_level_POSTOP'] = [
    _risk_level_postop(grade, pr, p53)
    for grade, pr, p53 in zip(
        df['Postoperative_grade'], df['PR_preop_def_bi'], df['p53_preop_def']
    )
]


The following codeblock creates a new column that tries to assess the risk level based only on PR, p53 status and CA-125.

In [None]:
def _risk_level_bm(pr, p53, ca125):
    """Return the biomarker-only risk level ('Low', 'High' or 'Unknown') for one row.

    pr: value of PR_preop_def_bi; p53: value of p53_preop_def;
    ca125: value of CA_125_pos_neg.

    'High' when any of the three markers equals 1; otherwise 'Low' when at
    least one of them equals 0; otherwise 'Unknown'.
    """
    # NOTE(review): a later cell relabels CA_125_pos_neg from the values
    # 'Positive'/'Negative'. If the column holds those labels rather than 1/0
    # at this point, the CA-125 comparisons below never match - confirm coding.
    if pr == 1 or p53 == 1 or ca125 == 1:
        return 'High'
    if p53 == 0 or pr == 0 or ca125 == 0:
        return 'Low'
    return 'Unknown'


# Build the column in a single assignment instead of writing row by row through
# df['Risk_level_BM'][i]; that chained assignment is not guaranteed to reach df
# (with Copy-on-Write it never does).
df['Risk_level_BM'] = [
    _risk_level_bm(pr, p53, ca125)
    for pr, p53, ca125 in zip(
        df['PR_preop_def_bi'], df['p53_preop_def'], df['CA_125_pos_neg']
    )
]


The following codeblock creates a new column that tries to assess the risk level based on the TCGA classification.

In [None]:
def _risk_level_tcga(tcga_class):
    """Return the TCGA-based risk level for one MSI_POLE_TP53_NSMP value.

    Class 2 gives 'Low', classes 1 and 4 give 'Medium', class 3 gives 'High';
    a missing or any other value gives 'Unknown'.
    """
    # NOTE(review): presumably 1=MSI, 2=POLE, 3=TP53, 4=NSMP, following the
    # order in the column name - confirm against the codebook.
    if tcga_class == 2:
        return 'Low'
    if tcga_class == 1 or tcga_class == 4:
        return 'Medium'
    if tcga_class == 3:
        return 'High'
    return 'Unknown'


# Build the column in a single assignment instead of writing row by row through
# df['Risk_level_TCGA'][i]; that chained assignment is not guaranteed to reach
# df (with Copy-on-Write it never does).
df['Risk_level_TCGA'] = [
    _risk_level_tcga(tcga_class) for tcga_class in df['MSI_POLE_TP53_NSMP']
]



The following codeblock creates a new column that tries to assess the risk level based on the TCGA classification combined with the biomarkers; if the TCGA classification is missing, only the biomarkers are used.

In [None]:
def _risk_level_tcga_bm(tcga_class, pr, p53, ca125):
    """Return the combined TCGA/biomarker risk level for one row.

    tcga_class: value of MSI_POLE_TP53_NSMP; pr: value of PR_preop_def_bi;
    p53: value of p53_preop_def; ca125: value of CA_125_pos_neg.

    Class 2 gives 'Low' and class 3 gives 'High'. For every other value
    (classes 1 and 4 as well as a missing class) the biomarkers decide:
    'High' when any marker equals 1, otherwise 'Low' when at least one marker
    equals 0, otherwise 'Unknown'.
    """
    if tcga_class == 2:
        return 'Low'
    if tcga_class == 3:
        return 'High'
    # The branch for classes 1/4 and the fallback branch applied exactly the
    # same biomarker rule, so a single copy of it is kept here.
    if pr == 1 or p53 == 1 or ca125 == 1:
        return 'High'
    if p53 == 0 or pr == 0 or ca125 == 0:
        return 'Low'
    return 'Unknown'


# Build the column in a single assignment instead of writing row by row through
# df['Risk_level_TCGA_BM'][i]; that chained assignment is not guaranteed to
# reach df (with Copy-on-Write it never does).
df['Risk_level_TCGA_BM'] = [
    _risk_level_tcga_bm(tcga_class, pr, p53, ca125)
    for tcga_class, pr, p53, ca125 in zip(
        df['MSI_POLE_TP53_NSMP'],
        df['PR_preop_def_bi'],
        df['p53_preop_def'],
        df['CA_125_pos_neg'],
    )
]


The following codeblock creates a new column that tries to assess the risk level based on the TCGA classification; if the TCGA classification is missing, the biomarkers are used instead.

In [None]:
def _risk_level_tcga_bmnan(tcga_class, pr, p53, ca125):
    """Return the TCGA risk level, falling back to biomarkers when it is unavailable.

    tcga_class: value of MSI_POLE_TP53_NSMP; pr: value of PR_preop_def_bi;
    p53: value of p53_preop_def; ca125: value of CA_125_pos_neg.

    Class 2 gives 'Low', classes 1 and 4 give 'Medium', class 3 gives 'High'.
    For any other (including missing) class the biomarkers decide: 'High' when
    any marker equals 1, otherwise 'Low' when at least one marker equals 0,
    otherwise 'Unknown'.
    """
    if tcga_class == 2:
        return 'Low'
    if tcga_class == 1:
        return 'Medium'
    if tcga_class == 3:
        return 'High'
    if tcga_class == 4:
        return 'Medium'
    # No usable TCGA class: fall back to the biomarker rule.
    if pr == 1 or p53 == 1 or ca125 == 1:
        return 'High'
    if p53 == 0 or pr == 0 or ca125 == 0:
        return 'Low'
    return 'Unknown'


# Build the column in a single assignment instead of writing row by row through
# df['Risk_level_TCGA_BMNaN'][i]; that chained assignment is not guaranteed to
# reach df (with Copy-on-Write it never does).
df['Risk_level_TCGA_BMNaN'] = [
    _risk_level_tcga_bmnan(tcga_class, pr, p53, ca125)
    for tcga_class, pr, p53, ca125 in zip(
        df['MSI_POLE_TP53_NSMP'],
        df['PR_preop_def_bi'],
        df['p53_preop_def'],
        df['CA_125_pos_neg'],
    )
]


save the dataframe to a csv file

In [None]:
# Persist the dataset including the risk-level columns.
# index=True is the to_csv default and is spelled out here because the row
# index is written as an extra, unnamed first column of the file.
df.to_csv('../0.1. Cleaned_data/MAYO_with_risk_levels.csv', index=True)

Create a subset of the data for model use

In [None]:
# Reload the cleaned dataset from disk; from here on df holds the values as
# they were written to the CSV file.
df = pd.read_csv(filepath_or_buffer='../0.1. Cleaned_data/MAYO_with_risk_levels.csv')

Generate survival columns based on the days from diagnosis to last follow-up

In [None]:
def _followup_reaches(days, threshold_days):
    """Return 1 when days >= threshold_days, NaN when days is missing, else 0."""
    if days >= threshold_days:
        return 1
    return np.nan if pd.isna(days) else 0


# NOTE(review): the flags are derived from the length of follow-up only; this
# cell does not look at whether a patient with shorter follow-up died or was
# simply not followed longer - confirm this is the intended definition.
followup_days = df["Days_from_diagnosis_to_lastfollowup_mayo"]
survival_thresholds = (
    ("one_year_survival", 365),
    ("three_year_survival", 1095),
    ("five_year_survival", 1825),
)
for column_name, threshold_days in survival_thresholds:
    df[column_name] = followup_days.apply(_followup_reaches, args=(threshold_days,))


Create radiotherapy and chemotherapy columns from the adjuvant therapy column

In [None]:
def _received_therapy(therapy, modality):
    """Return 1 when therapy is `modality` or 'Chemoradiotherapy', NaN when missing, else 0."""
    if therapy == modality or therapy == "Chemoradiotherapy":
        return 1
    if pd.isna(therapy):
        return np.nan
    return 0


# Chemoradiotherapy counts for both the radiotherapy and the chemotherapy flag.
for modality in ("Radiotherapy", "Chemotherapy"):
    df[modality] = df["Adjuvant_therapy"].apply(_received_therapy, args=(modality,))

Create binary columns for the LVSI and platelets

In [None]:
def _lvsi_flag(status):
    """Return 1 for 'Invasion', NaN for a missing value or 'Not mentioned', else 0."""
    if status == "Invasion":
        return 1
    if pd.isna(status) or status == "Not mentioned":
        return np.nan
    return 0


def _platelets_flag(level):
    """Return 1 when level >= 400, NaN when level is missing, else 0."""
    if level >= 400:
        return 1
    return np.nan if pd.isna(level) else 0


df["LVSI_bi"] = df["LVSI"].apply(_lvsi_flag)
df["platelets_bi"] = df["Preoperative_platelets_serumlevel"].apply(_platelets_flag)

Create a binary CT_MRI column from the lymph node findings on CT (Enlarged_LN_CT) and MRI (Enlarged_LN_MRI)

In [None]:
def _enlarged_nodes_on_imaging(ct_finding, mri_finding):
    """Combine the CT and MRI lymph node findings of one row into a single flag.

    Returns 1.0 when either modality reports 'Lymphadenopathy', 0.0 when both
    report 'No metastasis', and NaN in every other case (for example when one
    of the two findings is missing).
    """
    if ct_finding == "Lymphadenopathy" or mri_finding == "Lymphadenopathy":
        return 1.0
    if ct_finding == "No metastasis" and mri_finding == "No metastasis":
        return 0.0
    return np.nan


# Build the column in a single assignment instead of writing row by row through
# df["CT_MRI"][i]; that chained assignment is not guaranteed to reach df (with
# Copy-on-Write it never does).
df["CT_MRI"] = [
    _enlarged_nodes_on_imaging(ct_finding, mri_finding)
    for ct_finding, mri_finding in zip(df["Enlarged_LN_CT"], df["Enlarged_LN_MRI"])
]
		

# Make a subset of the data

In [None]:
# Columns that are carried over into the model dataset, in output order.
selectionCols = [
    # preoperative biomarkers
    "ER_preop_def_bi", "PR_preop_def_bi", "L1CAM_preop_def", "p53_preop_def",
    # molecular markers, serum markers and imaging
    "POLE", "MSI_MRR", "TP53", "CA_125_pos_neg", "MI_MRI", "platelets_bi", "CT_MRI",
    # grade, lymph node status, myometrial invasion and stage
    "Preoperative_grade", "LNM_LNDorSLN", "Postoperative_grade", "Postoperative_MI",
    "Preoperative_FIGOstage",
    # follow-up flags, adjuvant therapy and recurrence
    "one_year_survival", "three_year_survival", "five_year_survival",
    "Radiotherapy", "Chemotherapy", "Recurrence",
    # derived pathology flags
    "LVSI_bi", "LNM_micromacro",
]

# .copy() makes df_subdag an independent DataFrame, so the renaming and
# relabelling that follow operate on df_subdag itself and do not trigger
# SettingWithCopyWarning for writing to a frame derived from df.
df_subdag = df[selectionCols].copy()

Rename the columns for model use and consistency

In [None]:
# Mapping from the dataset's column names to the names used by the model.
# Entries that map a name onto itself leave that column unchanged.
column_aliases = {
    "CT_MRI": "CTMRI",
    "platelets_bi": "Platelets",
    "CA_125_pos_neg": "CA125",
    "ER_preop_def_bi": "ER",
    "PR_preop_def_bi": "PR",
    "p53_preop_def": "p53",
    "TP53": "TP53",
    "L1CAM_preop_def": "L1CAM",
    "POLE": "POLE",
    "MSI_MRR": "MSI",
    "LNM_LNDorSLN": "LNM",
    "MI_MRI": "MRI_MI",
    "Preoperative_grade": "PreoperativeGrade",
    "Postoperative_grade": "PostoperativeGrade",
    "Postoperative_MI": "MyometrialInvasion",
    "Preoperative_FIGOstage": "FIGO",
    "one_year_survival": "Survival1yr",
    "three_year_survival": "Survival3yr",
    "five_year_survival": "Survival5yr",
    "Radiotherapy": "Radiotherapy",
    "Chemotherapy": "Chemotherapy",
    "Recurrence": "Recurrence",
    "LVSI_bi": "LVSI",
}

df_subdag = df_subdag.rename(columns=column_aliases)


Replace numerical groups with the correct labels

In [None]:
# Per-column mapping from the coded values to the labels used by the model.
value_labels = {
    "Platelets": {1: "ge_400", 0: "lt_400"},
    # NOTE(review): the risk-level cells compare this column with 1/0, while
    # the keys here are 'Positive'/'Negative' - confirm how CA-125 is coded.
    "CA125": {"Positive": "ge_35", "Negative": "lt_35"},
    "CTMRI": {1: "yes", 0: "no"},
    "ER": {1: "positive", 0: "negative"},
    "PR": {1: "positive", 0: "negative"},
    "p53": {1: "mutant", 0: "wildtype"},
    # NOTE(review): 'No' -> 'mutant' / 'Yes' -> 'wildtype' looks inverted; the
    # TP53 column is dropped in the next cell, so verify before relying on it.
    "TP53": {"No": "mutant", "Yes": "wildtype"},
    "L1CAM": {1: "positive", 0: "negative"},
    "POLE": {"No": "no", "Yes": "yes"},
    "MSI": {"Stable": "no", "Unstable": "yes"},
    "LNM": {"No": "no", "Yes": "yes"},
    "MRI_MI": {"<50%": "lt_50", ">50%": "ge_50", "Not assessed": np.nan},
    "PreoperativeGrade": {1: "grade 1", 2: "grade 2", 3: "grade 3"},
    "PostoperativeGrade": {1: "grade 1", 2: "grade 2", 3: "grade 3"},
    "MyometrialInvasion": {"No": "lt_50", "Yes": "ge_50", "<50%": "lt_50",
                           "No invasion": "lt_50", ">50%": "ge_50"},
    "Radiotherapy": {1: "yes", 0: "no"},
    "Chemotherapy": {1: "yes", 0: "no"},
    "Recurrence": {"Yes": "yes", "No": "no"},
    "LVSI": {1: "yes", 0: "no"},
    "Survival1yr": {1: "yes", 0: "no"},
    "Survival3yr": {1: "yes", 0: "no"},
    "Survival5yr": {1: "yes", 0: "no"},
}

# Replace each column with its relabelled version. Calling
# df_subdag[col].replace(..., inplace=True) modifies an intermediate Series,
# which pandas does not guarantee to write back to df_subdag (with
# Copy-on-Write it never does), so the result is assigned explicitly instead.
df_subdag = df_subdag.assign(**{
    column: df_subdag[column].replace(labels)
    for column, labels in value_labels.items()
})

In [None]:
# TP53 is not part of the subdag, so the column is removed from the model data.
df_subdag = df_subdag.drop(columns=["TP53"])

Make an informed imputation for the Cytology column, otherwise the bnlearn imputation will not work, only for testing purposes

In [None]:
# Add an otherwise empty Cytology column. It is created with object dtype
# because it receives string labels below; writing strings into a float
# (all-NaN) column is an incompatible-dtype assignment.
df_subdag["Cytology"] = pd.Series(np.nan, index=df_subdag.index, dtype=object)

# Informed imputation of two values so bnlearn will accept the column.
# The rows are addressed by position (277 and 240); this raises IndexError if
# df_subdag has fewer rows than that.
# Both row and column are given in one .iloc call: assigning through
# df_subdag["Cytology"].iloc[...] is chained assignment, which is not
# guaranteed to reach df_subdag (with Copy-on-Write it never does).
cytology_position = df_subdag.columns.get_loc("Cytology")
df_subdag.iloc[277, cytology_position] = "malignant"
df_subdag.iloc[240, cytology_position] = "benign"

drop all rows that are completely empty

In [None]:
# Keep only the rows that have at least one non-missing value.
df_subdag = df_subdag.dropna(axis=0, how="all")

save the dataframe to a csv file

In [None]:
# Write the model dataset to disk without the row index.
df_subdag.to_csv(path_or_buf='../0.1. Cleaned_data/MAYO_subdag.csv', index=False)