### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to examine how BMI correlates with other variables and with the risk groups.



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the three cleaned cohort tables, relative to this notebook.
cohort_csv = {
    "train": '../0.1. Cleaned_data/Training_TCGA_Risk_levels.csv',
    "mayo": '../0.1. Cleaned_data/MAYO_with_risk_levels.csv',
    "pipendo": '../0.1. Cleaned_data/Pipendo_with_risk_levels.csv',
}

# One DataFrame per cohort; every later cell works on these three names.
df_train = pd.read_csv(cohort_csv["train"])
df_MAYO = pd.read_csv(cohort_csv["mayo"])
df_PIP = pd.read_csv(cohort_csv["pipendo"])

Select the following columns in each dataset and rename them to be the same
- BMI
- Preoperative grade
- Postoperative grade
- PR
- ER 
- L1CAM
- P53

In [None]:
# Source column names per cohort. All three lists share one order:
# BMI, preoperative grade, postoperative grade, p53, ER, PR, L1CAM.
Train_cols = [
    "BMI", "Grade_PREOP", "Grade",
    "p53_expression_preop", "ER_expression_preop", "PR_expression_preop", "L1CAM_expression_preop",
]
Mayo_cols = [
    "BMI", "Preoperative_grade", "Postoperative_grade",
    "p53_preop_def", "ER_preop_def_bi", "PR_preop_def_bi", "L1CAM_preop_def",
]
Pip_cols = [
    "BMI", "Rev_preop_grade", "Final_diagnosis_grade",
    "p53_expression_PREOP", "ER_expression_PREOP", "PR_expression_PREOP", "L1CAM_expression_PREOP",
]

# Harmonised names, position-aligned with the three lists above, so each
# rename mapping can be built by pairing the lists up ("BMI" maps to itself).
common_cols = ["BMI", "Preop_Grade", "Postop_grade", "p53", "ER", "PR", "L1CAM"]

# Keep only the columns of interest and give them the shared names.
df_train = df_train[Train_cols].rename(columns=dict(zip(Train_cols, common_cols)))
df_MAYO = df_MAYO[Mayo_cols].rename(columns=dict(zip(Mayo_cols, common_cols)))
df_PIP = df_PIP[Pip_cols].rename(columns=dict(zip(Pip_cols, common_cols)))



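The following code cleans the BMI column of the training set: values written with a decimal comma (e.g. "27,5") are rewritten with a decimal point using a regular expression and then converted to numbers. Afterwards, BMI is cast to float in all three datasets.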

In [None]:
import re

# Matches a number written with a decimal comma, e.g. "27,5".
p = re.compile(r'(\d+),(\d+)')


def _comma_to_point(value):
    """Rewrite decimal commas as decimal points in strings; pass other values through."""
    if isinstance(value, str):
        return p.sub(r'\1.\2', value)
    return value


# Only the training set gets the comma fix; anything that still cannot be
# parsed as a number afterwards becomes NaN (errors='coerce').
df_train['BMI'] = pd.to_numeric(df_train['BMI'].apply(_comma_to_point), errors='coerce')

# Make sure BMI has the same (float) dtype in every dataset
for cohort_frame in (df_train, df_MAYO, df_PIP):
    cohort_frame["BMI"] = cohort_frame["BMI"].astype(float)


The following code encodes labels into numerical categories

In [None]:
# Encode the PIPENDO text labels as numbers.
#
# The replacements are assigned back to the frame instead of calling
# `df_PIP[col].replace(..., inplace=True)`: that form is chained assignment
# through an inplace method, which warns in recent pandas and does not modify
# `df_PIP` at all once Copy-on-Write is active. The strings would then survive
# and the float cast below would fail.
df_PIP = df_PIP.assign(
    # Preoperative grade values 5 and 18 are mapped to missing.
    Preop_Grade=df_PIP["Preop_Grade"].replace({"grade 1": 1, "grade 2": 2, "grade 3": 3, 5: np.nan, 18: np.nan}),
    Postop_grade=df_PIP["Postop_grade"].replace({"I": 1, "II": 2, "III": 3}),
    p53=df_PIP["p53"].replace({"wildtype": 0, "overexpression": 1}),
    ER=df_PIP["ER"].replace({">10%, unclear percentage": 1, "<10%": 0}),
    PR=df_PIP["PR"].replace({">10%, unclear percentage": 1, "<10%": 0}),
    L1CAM=df_PIP["L1CAM"].replace({"<10%": 0, "=/> 10%": 1}),
)

# Force all as float (raises if any label was left unmapped above)
df_train = df_train.astype(float)
df_MAYO = df_MAYO.astype(float)
df_PIP = df_PIP.astype(float)


Select only the patients with a low postoperative grade (grade 1 or 2), since this is the population of interest.

In [None]:
# Drop every row whose postoperative grade is 3, leaving grades 1 and 2.
# Note: rows with a missing (NaN) grade also pass this filter, because
# NaN compares as "not equal" to 3.
df_train = df_train.loc[df_train["Postop_grade"].ne(3)]
df_MAYO = df_MAYO.loc[df_MAYO["Postop_grade"].ne(3)]
df_PIP = df_PIP.loc[df_PIP["Postop_grade"].ne(3)]


Create BMI categories

In [None]:
def _bmi_category(bmi):
    """Map one BMI value to its category label.

    Returns "<25", ">=25 and <30" or ">=30". A missing BMI returns NaN:
    NaN fails both `<` comparisons, so without this guard every patient
    with an unknown BMI would be counted in the ">=30" group.
    """
    if pd.isna(bmi):
        return np.nan
    if bmi < 25:
        return "<25"
    if bmi < 30:
        return ">=25 and <30"
    return ">=30"


# Divide BMI into 3 categories. `assign` returns a new frame, so the column is
# not written into the filtered slices produced by the grade-selection step.
df_train = df_train.assign(BMI_cat=df_train["BMI"].apply(_bmi_category))
df_MAYO = df_MAYO.assign(BMI_cat=df_MAYO["BMI"].apply(_bmi_category))
df_PIP = df_PIP.assign(BMI_cat=df_PIP["BMI"].apply(_bmi_category))

# Drop the original BMI and pre and postop grades (training set only; the MAYO
# and PIPENDO frames keep these columns).
df_train = df_train.drop(columns=["BMI", "Postop_grade", "Preop_Grade"])

The following code creates bar charts per BMI category for the training set

In [None]:
def _train_marker_counts(labelled):
    """Count each label (missing values included) per histochemical marker.

    Returns a table with one row per label and one column per marker.
    """
    markers = ["p53", "ER", "PR", "L1CAM"]
    return pd.DataFrame(
        [labelled[marker].value_counts(dropna=False) for marker in markers],
        index=markers,
    ).T


# One label mapping for all three BMI groups. The >=30 group used to be
# labelled "Low"/"High", so the third panel did not match the other two.
# NOTE(review): here 1 is shown as "Abnormal" for every marker, whereas the
# MAYO and PIPENDO cells show 1 as "Normal" for ER and PR - confirm how ER/PR
# are coded in the training set.
_train_labels = {0: "Normal", 1: "Abnormal"}

# Define a dataframe for only the <25 group, with the counts for the histochemical markers
df_train_low = df_train[df_train["BMI_cat"] == "<25"].drop(columns=["BMI_cat"]).replace(_train_labels)
df_train_low_plot = _train_marker_counts(df_train_low)

# Define a dataframe for the >=25 and <30 group with the counts for the histochemical markers
df_train_mid = df_train[df_train["BMI_cat"] == ">=25 and <30"].drop(columns=["BMI_cat"]).replace(_train_labels)
df_train_mid_plot = _train_marker_counts(df_train_mid)

# Define a dataframe for the >=30 group with the counts for the histochemical markers
df_train_high = df_train[df_train["BMI_cat"] == ">=30"].drop(columns=["BMI_cat"]).replace(_train_labels)
df_train_high_plot = _train_marker_counts(df_train_high)

# Plot the generated dataframes.
fig, ax = plt.subplots(1,3, figsize=(20,10))
fig.suptitle("Training set: marker status per BMI category")

# Reindex the dataframes to make sure the order is the same for all plots
_train_bar_order = ["Normal", "Abnormal", np.nan]
df_train_low_plot.reindex(_train_bar_order).plot(kind="bar", ax=ax[0], rot=0, title="BMI < 25")
df_train_mid_plot.reindex(_train_bar_order).plot(kind="bar", ax=ax[1], rot=0, title="BMI >= 25 and < 30")
df_train_high_plot.reindex(_train_bar_order).plot(kind="bar", ax=ax[2], rot=0, title="BMI >= 30")


The following code creates bar charts per BMI category for the MAYO set

In [None]:
# Create bar charts to show the distribution of all expression parameters

def _mayo_label_markers(group):
    """Turn the 0/1 codes of one BMI group into "Normal"/"Abnormal" labels.

    Every 1 in the frame becomes "Normal" and every 0 "Abnormal"; for p53 and
    L1CAM the two labels are then swapped, so that 1 reads "Abnormal" for
    those two markers.

    The swap is assigned back to the frame. Calling
    `frame[col].replace(..., inplace=True)` is chained assignment through an
    inplace method and leaves the frame untouched under pandas Copy-on-Write,
    which would plot p53 and L1CAM with inverted labels.
    """
    labelled = group.drop(columns=["BMI_cat"]).replace({1: "Normal", 0: "Abnormal"})
    swap = {"Abnormal": "Normal", "Normal": "Abnormal"}
    return labelled.assign(
        p53=labelled["p53"].replace(swap),
        L1CAM=labelled["L1CAM"].replace(swap),
    )


def _mayo_marker_counts(labelled):
    """Count each label (missing values included) per histochemical marker."""
    markers = ["p53", "ER", "PR", "L1CAM"]
    return pd.DataFrame(
        [labelled[marker].value_counts(dropna=False) for marker in markers],
        index=markers,
    ).T


# Labelled frame and marker counts for the <25 group
df_MAYO_low = _mayo_label_markers(df_MAYO[df_MAYO["BMI_cat"] == "<25"])
df_MAYO_low_plot = _mayo_marker_counts(df_MAYO_low)

# Labelled frame and marker counts for the >=25 and <30 group
df_MAYO_mid = _mayo_label_markers(df_MAYO[df_MAYO["BMI_cat"] == ">=25 and <30"])
df_MAYO_mid_plot = _mayo_marker_counts(df_MAYO_mid)

# Labelled frame and marker counts for the >=30 group
df_MAYO_high = _mayo_label_markers(df_MAYO[df_MAYO["BMI_cat"] == ">=30"])
df_MAYO_high_plot = _mayo_marker_counts(df_MAYO_high)

# Plot the generated dataframes.
fig, ax = plt.subplots(1,3, figsize=(20,10))
fig.suptitle("MAYO set: marker status per BMI category")

# Reindex the dataframes to make sure the order is the same for all plots
_mayo_bar_order = ["Normal", "Abnormal", np.nan]
df_MAYO_low_plot.reindex(_mayo_bar_order).plot(kind="bar", ax=ax[0], rot=0, title="BMI < 25")
df_MAYO_mid_plot.reindex(_mayo_bar_order).plot(kind="bar", ax=ax[1], rot=0, title="BMI >= 25 and < 30")
df_MAYO_high_plot.reindex(_mayo_bar_order).plot(kind="bar", ax=ax[2], rot=0, title="BMI >= 30")



The following code creates bar charts per BMI category for the PIPENDO set

In [None]:
# Create bar charts to show the distribution of all expression parameters

def _pip_label_markers(group):
    """Turn the 0/1 codes of one BMI group into "Normal"/"Abnormal" labels.

    Every 1 in the frame becomes "Normal" and every 0 "Abnormal"; for p53 and
    L1CAM the two labels are then swapped, so that 1 reads "Abnormal" for
    those two markers.

    The swap is assigned back to the frame. Calling
    `frame[col].replace(..., inplace=True)` is chained assignment through an
    inplace method and leaves the frame untouched under pandas Copy-on-Write,
    which would plot p53 and L1CAM with inverted labels.
    """
    labelled = group.drop(columns=["BMI_cat"]).replace({1: "Normal", 0: "Abnormal"})
    swap = {"Abnormal": "Normal", "Normal": "Abnormal"}
    return labelled.assign(
        p53=labelled["p53"].replace(swap),
        L1CAM=labelled["L1CAM"].replace(swap),
    )


def _pip_marker_counts(labelled):
    """Count each label (missing values included) per histochemical marker."""
    markers = ["p53", "ER", "PR", "L1CAM"]
    return pd.DataFrame(
        [labelled[marker].value_counts(dropna=False) for marker in markers],
        index=markers,
    ).T


# Labelled frame and marker counts for the <25 group
df_PIP_low = _pip_label_markers(df_PIP[df_PIP["BMI_cat"] == "<25"])
df_PIP_low_plot = _pip_marker_counts(df_PIP_low)

# Labelled frame and marker counts for the >=25 and <30 group
df_PIP_mid = _pip_label_markers(df_PIP[df_PIP["BMI_cat"] == ">=25 and <30"])
df_PIP_mid_plot = _pip_marker_counts(df_PIP_mid)

# Labelled frame and marker counts for the >=30 group
df_PIP_high = _pip_label_markers(df_PIP[df_PIP["BMI_cat"] == ">=30"])
df_PIP_high_plot = _pip_marker_counts(df_PIP_high)

# Plot the generated dataframes.
fig, ax = plt.subplots(1,3, figsize=(20,10))
fig.suptitle("PIPENDO set: marker status per BMI category")

# Reindex the dataframes to make sure the order is the same for all plots
_pip_bar_order = ["Normal", "Abnormal", np.nan]
df_PIP_low_plot.reindex(_pip_bar_order).plot(kind="bar", ax=ax[0], rot=0, title="BMI < 25")
df_PIP_mid_plot.reindex(_pip_bar_order).plot(kind="bar", ax=ax[1], rot=0, title="BMI >= 25 and < 30")
df_PIP_high_plot.reindex(_pip_bar_order).plot(kind="bar", ax=ax[2], rot=0, title="BMI >= 30")