### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to compare the most current version of the PIPENDO dataset with the version Casper used, since there seemed to be inconsistencies in the data.


In [None]:
import pandas as pd
import numpy as np

# Read the current PIPENDO validation export (Excel) and the previously
# cleaned selection (CSV).
df = pd.read_excel("../0. Data/PIPENDO/PIPENDO_validation_set_manual_prediction_5YRS_v2_only_included.xlsx")
df_old = pd.read_csv("../0.1. Cleaned_data/Pipendo_selection_val.csv")

# Clean up labels: every placeholder string below is mapped to NaN, using one
# shared mapping so both frames are normalised identically.
missing_markers = dict.fromkeys(["", " ", "Na", "NA", "NAN", "NaN"], np.nan)
df = df.replace(missing_markers)
df_old = df_old.replace(missing_markers)

The following code selects the rows that have a non-missing value in at least 3 of the following columns: ER, PR, L1CAM, p53. This is done to make sure that the data is not too sparse.

In [None]:
# Select only the rows that have at least 3 of ER PR L1CAM p53 recorded
# (non-missing). The count is computed for all rows at once instead of
# iterating row by row.
BIOMARKER_COLUMNS = ["ER", "PR", "L1CAM", "p53"]
MIN_BIOMARKERS = 3

biomarker_count = df[BIOMARKER_COLUMNS].notna().sum(axis=1)

# .copy() makes selected_df an independent frame, so later cells can add or
# rename columns without pandas treating it as a view of df
# (SettingWithCopyWarning). The original row labels of df are preserved.
selected_df = df.loc[biomarker_count >= MIN_BIOMARKERS].copy()

The following code block creates a binary column for the platelets, where 1 is for platelets >= 400 and 0 is for platelets < 400. Rows with a missing platelet value stay missing.

In [None]:
# Binarise the platelet column 'Pl': 1 when >= threshold, 0 when < threshold,
# NaN when neither comparison holds (i.e. the value is missing).
PLATELET_THRESHOLD = 400
platelet_values = selected_df['Pl']

# assign() returns a new frame instead of writing a column into a frame that
# was derived by indexing another one, which avoids SettingWithCopyWarning.
selected_df = selected_df.assign(
    platelets_bi=np.select(
        [platelet_values >= PLATELET_THRESHOLD, platelet_values < PLATELET_THRESHOLD],
        [1.0, 0.0],
        default=np.nan,
    )
)

The following code renames the columns to make them correspond to model node names.

In [None]:
# Map dataset column names onto the node names used by the model.
# 'Therapy' maps to itself, so that entry leaves the column name unchanged.
node_names = {
    'Therapy': 'Therapy',
    "Primarytumor": "PreoperativeGrade",
    "Histology": "PostoperativeGrade",
    "platelets_bi": "Platelets",
    "LVSIb": "LVSI",
    "MI": "MyometrialInvasion",
    "X1YR": "Survival1yr",
    "X3YR": "Survival3yr",
    "X5YR": "Survival5yr",
}
selected_df = selected_df.rename(columns=node_names)


Select the columns that are used in the model 

In [None]:
# Columns (already renamed to model node names) that are kept for the model.
data_selection = [
    "PreoperativeGrade",
    "PostoperativeGrade",
    "MyometrialInvasion",
    "Cytology",
    "Platelets",
    "ER",
    "PR",
    "L1CAM",
    "LVSI",
    "p53",
    "CA125",
    "CTMRI",
    "LNM",
    "Therapy",
    "Survival1yr",
    "Survival3yr",
    "Survival5yr",
]

# .copy() makes subdag an independent frame rather than a slice of
# selected_df, so later cells can recode and add columns without
# SettingWithCopyWarning and without touching selected_df.
subdag = selected_df[data_selection].copy()

Replace the values in the columns to make them correspond to the model values

In [None]:
# Translate the numeric codes of each column into the state labels used by the
# model. Values that are not listed for a column are left unchanged.
VALUE_LABELS = {
    "PreoperativeGrade": {1: "grade 1", 2: "grade 2", 3: "grade 3"},
    "PostoperativeGrade": {1: "grade 1", 2: "grade 2", 3: "grade 3"},
    # Codes 0 and 1 both map to "lt_50"; only code 2 maps to "ge_50".
    "MyometrialInvasion": {0: "lt_50", 1: "lt_50", 2: "ge_50"},
    "Cytology": {0: "benign", 1: "malignant"},
    "LVSI": {0: "no", 1: "yes"},
    "ER": {1: "positive", 0: "negative"},
    "PR": {1: "positive", 0: "negative"},
    "L1CAM": {1: "positive", 0: "negative"},
    "p53": {1: "mutant", 0: "wildtype"},
    "CA125": {0: "lt_35", 1: "ge_35"},
    "Platelets": {0: "lt_400", 1: "ge_400"},
    "CTMRI": {0: "no", 1: "yes"},
    "LNM": {0: "no", 1: "yes"},
    # NOTE(review): Survival1yr and Survival3yr map 1 -> "no" / 0 -> "yes",
    # whereas Survival5yr maps 0 -> "no" / 1 -> "yes". The mappings are kept
    # exactly as they were; confirm against the data dictionary whether this
    # inversion is intentional, as it could itself cause inconsistencies.
    "Survival1yr": {1: "no", 0: "yes"},
    "Survival3yr": {1: "no", 0: "yes"},
    "Survival5yr": {0: "no", 1: "yes"},
    "Therapy": {0: "no", 1: "chemotherapy", 2: "radiotherapy", 3: "chemoradiotherapy"},
}

# `subdag[col].replace(..., inplace=True)` is chained assignment through an
# inplace method: pandas warns about it and, with Copy-on-Write enabled, it
# silently leaves subdag unchanged. Work on an explicit copy and assign each
# recoded column back instead.
subdag = subdag.copy()
for column, labels in VALUE_LABELS.items():
    subdag[column] = subdag[column].replace(labels)

The following code block creates the columns for the chemotherapy and radiotherapy, based on the Therapy column.

In [None]:
# Create chemo and radiotherapy columns from Therapy.
#
# The previous loop read each row by position (iloc[i]) but wrote the result
# by index label (.at[i, ...]). Once rows have been filtered out the labels no
# longer run 0..n-1, so values ended up on the wrong rows and labels that did
# not exist were appended as new rows. Deriving both columns from the Therapy
# column directly keeps every value aligned with its own row.
therapy = subdag["Therapy"]
received_chemo = therapy.isin(["chemotherapy", "chemoradiotherapy"])
received_radio = therapy.isin(["radiotherapy", "chemoradiotherapy"])

# Any other Therapy value (including a missing one) yields 'no' for both
# columns, matching the final else-branch of the original loop.
subdag = subdag.assign(
    Chemotherapy=np.where(received_chemo, "yes", "no"),
    Radiotherapy=np.where(received_radio, "yes", "no"),
)

In [None]:
# Write the cleaned frame to CSV; the row index is written as well
# (to_csv default).
cleaned_output_path = "../0.1. Cleaned_data/Casper_PIPENDO_Cleaned.csv"
subdag.to_csv(cleaned_output_path)