### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to explore the PIPENDO dataset, because some inconsistencies were found with respect to the dataset that Casper used.

In [None]:
import numpy as np
import pandas as pd

# Load the four PIPENDO SPSS exports. The file names are Dutch; what each file
# contains beyond its name is not visible here (TODO confirm with the data owner).

# Main working frame used throughout the notebook.
df = pd.read_spss(path='../0. Data/PIPENDO/PIPENDO_voor_ENDORISK.sav')

# Second export, later joined to `df` on the study number.
df2 = pd.read_spss(path='../0. Data/PIPENDO/PIPENDO compleet opgeschoond.sav')

# Key file and an earlier version of the dataset.
key = pd.read_spss(path='../0. Data/PIPENDO/PIPENDO_key.sav')
df_v1 = pd.read_spss(path='../0. Data/PIPENDO/pipendo start met elkerliek zonder extra variabele.sav')


Copy the grade columns into new, consistently named columns so they are easier to work with across the datasets

In [None]:
 # 'Grade' holds the preoperative grade and 'Grade_uterus' the postoperative grade.
# Both are copied into explicitly named working columns; the originals stay untouched.
grade_column_copies = {
    'Grade_PREOP_new': 'Grade',
    'Grade_POSTOP_new': 'Grade_uterus',
}
for new_column, source_column in grade_column_copies.items():
    df[new_column] = df[source_column]

Clean up the postoperative grade column and create a table of the counts to compare to the paper

In [None]:
# Convert to object dtype so labels can be replaced by NaN, then map the
# placeholder values 'unknown' and 99.0 to missing.
# The result is assigned back explicitly: `df[col].replace(..., inplace=True)`
# mutates a temporary Series, which is deprecated and has no effect on `df`
# once pandas Copy-on-Write is active.
df['Grade_POSTOP_new'] = (
    df['Grade_POSTOP_new']
    .astype(object)
    .replace({'unknown': np.nan, 99.0: np.nan})
)

# Row 1: absolute counts, including the missing values.
# Row 2: percentage of the non-missing values (value_counts drops NaN by default),
# rounded and scaled like the other count tables in this notebook.
postop_grade_counts = df['Grade_POSTOP_new'].value_counts(dropna=False).astype(int)
postop_grade_percent = round(df['Grade_POSTOP_new'].value_counts(normalize=True), 3) * 100
pd.DataFrame([postop_grade_counts, postop_grade_percent])

Clean up the preoperative grade column and create a table of the counts to compare to the paper

In [None]:
# Convert to object dtype so labels can be replaced by NaN, then map the
# placeholder labels 'unknown' and 'missing' to missing.
# The result is assigned back explicitly: `df[col].replace(..., inplace=True)`
# mutates a temporary Series, which is deprecated and has no effect on `df`
# once pandas Copy-on-Write is active.
df['Grade_PREOP_new'] = (
    df['Grade_PREOP_new']
    .astype(object)
    .replace({'unknown': np.nan, 'missing': np.nan})
)

# Row 1: absolute counts, including the missing values.
# Row 2: percentage of the non-missing values (value_counts drops NaN by default).
preop_grade_counts = df['Grade_PREOP_new'].value_counts(dropna=False).astype(int)
preop_grade_percent = round(df['Grade_PREOP_new'].value_counts(normalize=True), 3) * 100
pd.DataFrame([preop_grade_counts, preop_grade_percent])


Find all the complete cases

In [None]:
# A complete case is a row without a missing value in any column.
# `df` itself is left unchanged; the filtered rows go into a new frame.
complete_cases = df.dropna(axis=0, how='any')
complete_cases

Create a table for the counts of ER 

In [None]:
# Row 1: absolute counts per ER category, including missing values.
# Row 2: percentage of the non-missing values (value_counts drops NaN by default).
er_values = df['ER_expression_PREOP']
er_counts = er_values.value_counts(dropna=False).astype(int)
er_percent = er_values.value_counts(normalize=True).round(3) * 100
pd.DataFrame([er_counts, er_percent])

Create a table for the counts of PR

In [None]:
# Row 1: absolute counts per PR category, including missing values.
# Row 2: percentage of the non-missing values (value_counts drops NaN by default).
pr_values = df['PR_expression_PREOP']
pr_counts = pr_values.value_counts(dropna=False).astype(int)
pr_percent = pr_values.value_counts(normalize=True).round(3) * 100
pd.DataFrame([pr_counts, pr_percent])

The following codeblock is a standard codeblock to search for column names that contain a certain substring. If set to "", it will return all columns.

In [None]:
# Print every column name that contains the search term (case-sensitive).
# An empty search term matches all columns.
search_term = 'CT'
matching_columns = [name for name in df.columns if search_term in name]
for name in matching_columns:
    print(name)

Generate risk levels per patient preoperatively, based on biomarkers and grade

In [None]:
# Preoperative risk level per patient, derived from the preoperative grade and,
# for grade 2 only, from the PR and p53 biomarkers:
#   grade 1                       -> 'Low'
#   grade 3 or non-endometrioid   -> 'High'
#   grade 2, PR '<10%'            -> 'High'
#   grade 2, p53 'overexpression' -> 'High'
#   grade 2, otherwise            -> 'Low'
#   missing or any other grade    -> 'Unknown'
#
# The column is built in one vectorised step instead of `df[col][i] = value`
# inside a loop: that chained assignment writes to a temporary Series and has
# no effect on `df` once pandas Copy-on-Write is active.
#
# NOTE(review): this cell reads 'Grade_PREOP', whereas the earlier cells create
# 'Grade_PREOP_new' - confirm that 'Grade_PREOP' exists in the loaded file.
preop_grade = df['Grade_PREOP'].astype(object)
pr_below_10 = (df['PR_expression_PREOP'].astype(object) == '<10%')
p53_overexpressed = (df['p53_expression_PREOP'].astype(object) == 'overexpression')

is_grade_2 = (preop_grade == 'grade 2')
biomarker_high = pr_below_10 | p53_overexpressed

# np.select picks the first matching condition, mirroring an if/elif chain.
preop_conditions = [
    (preop_grade == 'grade 1').to_numpy(),
    (preop_grade == 'grade 3 or non-endometrioid').to_numpy(),
    (is_grade_2 & biomarker_high).to_numpy(),
    is_grade_2.to_numpy(),
]
preop_levels = ['Low', 'High', 'High', 'Low']

df['Risk_level_PREOP'] = pd.Series(
    np.select(preop_conditions, preop_levels, default='Unknown'),
    index=df.index,
    dtype=object,
)


Generate risk levels per patient postoperatively, based on biomarkers and grade

In [None]:
# Postoperative risk level per patient, derived from the grade and, for
# grade '2' / 'other (sarcoma)', from the PR and p53 biomarkers:
#   'unknown' or 'missing'                        -> 'Unknown'
#   '1'                                           -> 'Low'
#   '3'                                           -> 'High'
#   '2' / 'other (sarcoma)', PR '<10%'            -> 'High'
#   '2' / 'other (sarcoma)', p53 'overexpression' -> 'High'
#   '2' / 'other (sarcoma)', otherwise            -> 'Low'
#   anything else (including NaN)                 -> 'Unknown'
#
# Fixes compared to the previous loop:
#   * every branch now writes to 'Risk_level_POSTOP'; the grade-2 and fallback
#     branches used to overwrite 'Risk_level_PREOP' instead;
#   * the label 'unkown' was a typo for 'unknown';
#   * no chained assignment (`df[col][i] = value`), which has no effect on `df`
#     once pandas Copy-on-Write is active.
#
# NOTE(review): an earlier cell states that 'Grade' is the preoperative grade and
# 'Grade_uterus' the postoperative one - confirm that 'Grade' is the intended
# input here, and that 'other (sarcoma)' should follow the grade-2 rules.
postop_grade = df['Grade'].astype(object)
pr_below_10 = (df['PR_expression_PREOP'].astype(object) == '<10%')
p53_overexpressed = (df['p53_expression_PREOP'].astype(object) == 'overexpression')

uses_biomarkers = postop_grade.isin(['2', 'other (sarcoma)'])
biomarker_high = pr_below_10 | p53_overexpressed

# np.select picks the first matching condition, mirroring an if/elif chain.
postop_conditions = [
    postop_grade.isin(['unknown', 'missing']).to_numpy(),
    (postop_grade == '1').to_numpy(),
    (postop_grade == '3').to_numpy(),
    (uses_biomarkers & biomarker_high).to_numpy(),
    uses_biomarkers.to_numpy(),
]
postop_levels = ['Unknown', 'Low', 'High', 'High', 'Low']

df['Risk_level_POSTOP'] = pd.Series(
    np.select(postop_conditions, postop_levels, default='Unknown'),
    index=df.index,
    dtype=object,
)


Generate risk levels per patient based on biomarkers

In [None]:
# Biomarker-only risk level per patient:
#   'High'    if PR is '<10%', or p53 is 'overexpression', or CA125 is abnormal;
#   'Low'     else if p53 is 'wildtype', or PR is '>10%, unclear percentage',
#             or CA125 is normal;
#   'Unknown' otherwise.
#
# The column is built in one vectorised step instead of `df[col][i] = value`
# inside a loop: that chained assignment writes to a temporary Series and has
# no effect on `df` once pandas Copy-on-Write is active.
#
# NOTE(review): only the PR label '>10%, unclear percentage' counts towards
# 'Low' - confirm whether other non-low PR categories exist in the data.
pr_expression = df['PR_expression_PREOP'].astype(object)
p53_expression = df['p53_expression_PREOP'].astype(object)
ca125_status = df['CA125_PREOP_bi'].astype(object)

any_high_marker = (
    (pr_expression == '<10%')
    | (p53_expression == 'overexpression')
    | (ca125_status == '=/>35 U/mL (=abnormal)')
)
any_low_marker = (
    (p53_expression == 'wildtype')
    | (pr_expression == '>10%, unclear percentage')
    | (ca125_status == '<35 U/mL (=normal)')
)

# 'High' is listed first, so it wins when both a high and a low marker are present.
df['Risk_level_BM'] = pd.Series(
    np.select(
        [any_high_marker.to_numpy(), any_low_marker.to_numpy()],
        ['High', 'Low'],
        default='Unknown',
    ),
    index=df.index,
    dtype=object,
)


### Data cleaning

Select the PIPENDO patients: they must have a preoperative grade, at least 3 molecular biomarkers, and 1 clinical peroperative marker, for consistency with Casper Reijnen.

dataframe 1

In [None]:
# Keep only the patients with at least three measured molecular biomarkers
# out of ER, PR, p53 and L1CAM.
biomarker_columns = [
    'ER_expression_PREOP',
    'PR_expression_PREOP',
    'p53_expression_PREOP',
    'L1CAM_expression_PREOP',
]

# Number of non-missing biomarkers per row, in row order.
measured_per_row = df[biomarker_columns].notna().sum(axis=1).to_numpy()

# Row positions with fewer than three measured biomarkers are dropped.
rows_to_drop = np.flatnonzero(measured_per_row < 3)
df = df.drop(df.index[rows_to_drop])

dataframe 2

In [None]:
# Drop the rows of `df` without a revised preoperative grade.
# NOTE(review): this statement filters `df`, not `df2`; no grade filter is
# applied to `df2` in this cell - confirm that this is intended.
df = df.dropna(subset=['Grade_rev_PREOP_with_unknown'])

# Keep only the patients in `df2` with at least three measured molecular
# biomarkers out of ER, PR, p53 and L1CAM.
biomarker_columns_df2 = ['ER', 'PR', 'p53', 'L1CAM']

# Number of non-missing biomarkers per row, in row order.
measured_per_row_df2 = df2[biomarker_columns_df2].notna().sum(axis=1).to_numpy()

# Row positions with fewer than three measured biomarkers are dropped.
rows_to_drop_df2 = np.flatnonzero(measured_per_row_df2 < 3)
df2 = df2.drop(df2.index[rows_to_drop_df2])


Merge the two dataframes, on study number

In [None]:
# Merge the two dataframes
df_merge = df.merge(df2, how='left', left_on='Study_number_OC', right_on='RecordId')
df_merge

Create binary column for platelets, change CT_result to normal and abnormal, and create a column for the postoperative grade

In [None]:
# Binary platelet indicator: 1 when the count is >= 400, 0 when it is < 400,
# and NaN otherwise (a missing value fails both comparisons).
df_merge['Platelets_bi'] = df_merge['Platelets'].apply(
    lambda platelets: 1 if platelets >= 400 else (0 if platelets < 400 else np.nan)
)

# Collapse the CT findings into 'normal' / 'abnormal'.
# The result is assigned back explicitly, because `df[col].replace(..., inplace=True)`
# has no effect on the frame once pandas Copy-on-Write is active. The column is
# converted to object dtype first so that the new labels do not depend on the
# categories of a categorical column.
# NOTE(review): 'inconclusive' is mapped to 'normal' - confirm this is intended.
ct_result_mapping = {
    'no extra uterine disease': 'normal',
    'inconclusive': 'normal',
    'lymph node metastasis suspected': 'abnormal',
    'distant metastasis suspected': 'abnormal',
    'lymph node and distant metastasis suspected': 'abnormal',
}
df_merge['CT_result'] = df_merge['CT_result'].astype(object).replace(ct_result_mapping)

# Postoperative grade: take 'Grade_hyst_rev_tri_y' and, where it is missing, fall
# back to the value of 'Grade_uterus_y' in the same row.
# The previous lambda returned the entire 'Grade_uterus_y' column for every
# missing value instead of that row's value.
revised_postop_grade = df_merge['Grade_hyst_rev_tri_y'].astype(object)
fallback_postop_grade = df_merge['Grade_uterus_y'].astype(object)
df_merge['Grade_postop'] = revised_postop_grade.where(
    revised_postop_grade.notna(), fallback_postop_grade
)


Select the columns that are relevant for the analysis

In [None]:
# Columns kept for the analysis, in output order.
selection_columns = [
    'Grade_rev_PREOP_with_unknown_y',
    'ER',
    'PR',
    'L1CAM',
    'p53',
    'CA_125_ELEV_bi',
    'Platelets_bi',
    'CT_result',
    'Lymph_nodes',
    'Dood_EC_bi_x',
]
df_selection = df_merge.loc[:, selection_columns]

# Check if the columns are correct
- Grade_rev_PREOP_with_unknown_y is correct
- ER is correct
- PR is correct
- L1CAM is correct
- p53 is correct
- CA_125_ELEV_bi is correct
- Platelets_bi is wrong
- CT_result is wrong
- Dood EC_bi is wrong --> has to be 330, is 337
- Lymph_nodes are unclear

In [None]:
# Export the filtered working frame (including the risk-level columns added above).
risk_levels_path = '../0.1. Cleaned_data/Pipendo_with_risk_levels.csv'
df.to_csv(path_or_buf=risk_levels_path)

# Export the column selection taken from the merged frame.
selection_path = '../0.1. Cleaned_data/Pipendo_selection_val.csv'
df_selection.to_csv(path_or_buf=selection_path)

# Since there are still some inconsistencies, check the Casper_PIPENDO notebook for a comparison, and use that PIPENDO dataset