### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to generate crosstables for the PIPENDO dataset.



In [None]:
from matplotlib_inline.backend_inline import set_matplotlib_formats
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Load the cleaned PIPENDO dataset (path is relative to the notebook's folder).
df = pd.read_csv('../../Cleaned_data/Pipendo_with_risk_levels.csv')

# Turn the textual placeholders in 'Grade' into real missing values.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column: the chained in-place form operates on a temporary object, emits a
# FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Grade'] = df['Grade'].replace({'unknown': np.nan, 'missing': np.nan})

# Create a more manageable dataframe. The explicit copy makes `selection`
# independent of df, so relabelling its columns in later cells does not raise
# SettingWithCopyWarning and never writes through to df.
selection = df[
    ['Grade_PREOP_new', 'Grade_POSTOP_new', 'p53_expression_PREOP', 'PR_expression_PREOP', 'CA125_PREOP_bi']
].copy()

# Set the output format for the plots and the dpi
set_matplotlib_formats('png', 'pdf')
plt.rcParams['figure.dpi'] = 150
sns.set_style('darkgrid')



The code below generates the crosstable between the pre- and postoperative grade. The crosstable is normalized so that each cell shows the percentage of all patients for whom both grades are known. The crosstable is then plotted as a heatmap using seaborn.

In [None]:
# Count the number of patients for whom both grades are known
N = len(selection[['Grade_PREOP_new', 'Grade_POSTOP_new']].dropna())

# Replace the grade 3 or non-endometrioid with grade 3.
# `selection` is rebound to a new frame rather than mutated through a chained
# replace(..., inplace=True): the chained form acts on a temporary column
# object, so the relabelling can be silently lost (pandas Copy-on-Write) and it
# raises warnings on pandas 2.x. Rebinding is also safe to re-run.
selection = selection.assign(
    Grade_PREOP_new=selection['Grade_PREOP_new'].replace({'grade 3 or non-endometrioid': 'grade 3'})
)

# Create the crosstable; normalize=True divides every cell by the grand total,
# so after scaling the values are percentages of all counted patients.
crosstab = (pd.crosstab(selection['Grade_PREOP_new'], selection['Grade_POSTOP_new'], normalize=True) * 100).round(1)

# Sort the rows (pre-op grade labels) in descending order
crosstab = crosstab.sort_index(ascending=False)

# Plot the crosstable as a heatmap; keep the Axes so labels are set explicitly
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between pre- and post-op grade [%] (N = {N})')

# Set the labels (trailing ';' hides the repr of the returned Text object)
ax.set_ylabel('Pre-op grade')
ax.set_xlabel('Post-op grade');



The code below generates the crosstable between the preoperative PR expression and the postoperative grade, following the same steps as above.

In [None]:
# Count the number of patients with both PR expression and post-op grade known
N = len(selection[['PR_expression_PREOP', 'Grade_POSTOP_new']].dropna())

# Shorten the PR expression labels. `selection` is rebound to a new frame
# instead of using a chained replace(..., inplace=True), which acts on a
# temporary column object and can silently leave `selection` unchanged
# (pandas Copy-on-Write) while raising warnings on pandas 2.x.
selection = selection.assign(
    PR_expression_PREOP=selection['PR_expression_PREOP'].replace(
        {'<10%': 'Negative', '>10%, unclear percentage': 'Positive'}
    )
)

# Crosstable as percentages of the grand total, rounded to one decimal
crosstab = (pd.crosstab(selection['PR_expression_PREOP'], selection['Grade_POSTOP_new'], normalize=True) * 100).round(1)

# Plot the crosstable as a heatmap and label it through the returned Axes
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between PR expression and post-op grade [%] (N = {N})')
ax.set_ylabel('PR expression pre-op')
ax.set_xlabel('Post-op grade');


The code below generates the crosstable between the preoperative p53 expression and the postoperative grade, following the same steps as above.

In [None]:
# Count the number of patients with both p53 expression and post-op grade known
N = len(selection[['p53_expression_PREOP', 'Grade_POSTOP_new']].dropna())

# Relabel the p53 categories. `selection` is rebound to a new frame instead of
# using a chained replace(..., inplace=True), which acts on a temporary column
# object and can silently leave `selection` unchanged (pandas Copy-on-Write)
# while raising warnings on pandas 2.x.
selection = selection.assign(
    p53_expression_PREOP=selection['p53_expression_PREOP'].replace(
        {'overexpression': 'Mutant', 'wildtype': 'Wildtype'}
    )
)

# Crosstable as percentages of the grand total, rounded to one decimal.
# Stored under its own name so the `crosstab` variable of other cells is untouched.
p53_crosstab = (
    pd.crosstab(selection['p53_expression_PREOP'], selection['Grade_POSTOP_new'], normalize=True) * 100
).round(1)

# Plot the crosstable as a heatmap and label it through the returned Axes
ax = sns.heatmap(p53_crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between p53 expression and post-op grade [%] (N = {N})')
ax.set_ylabel('p53 expression pre-op')
ax.set_xlabel('Post-op grade');


The code below generates the crosstable between the preoperative CA125 level and the postoperative grade, following the same steps as above.

In [None]:
# Count the number of patients with both CA125 level and post-op grade known
N = len(selection[['CA125_PREOP_bi', 'Grade_POSTOP_new']].dropna())

# Shorten the CA125 labels. `selection` is rebound to a new frame instead of
# using a chained replace(..., inplace=True), which acts on a temporary column
# object and can silently leave `selection` unchanged (pandas Copy-on-Write)
# while raising warnings on pandas 2.x.
selection = selection.assign(
    CA125_PREOP_bi=selection['CA125_PREOP_bi'].replace(
        {'<35 U/mL (=normal)': '< 35', '=/>35 U/mL (=abnormal)': '>= 35'}
    )
)

# Crosstable as percentages of the grand total, rounded to one decimal
crosstab = (pd.crosstab(selection['CA125_PREOP_bi'], selection['Grade_POSTOP_new'], normalize=True) * 100).round(1)

# Sort the rows (CA125 labels) in descending order
crosstab = crosstab.sort_index(ascending=False)

# Plot the crosstable as a heatmap and label it through the returned Axes
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between CA125 and post-op grade [%] (N = {N})')
ax.set_ylabel('CA125 level pre-op')
ax.set_xlabel('Post-op grade');