### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to generate crosstables for the MAYO (Rochester) dataset.

In [None]:
from matplotlib_inline.backend_inline import set_matplotlib_formats
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Columns needed for the crosstables in this notebook.
SELECTION_COLUMNS = ['Preoperative_grade', 'p53_preop_def', 'PR_preop_def_bi', 'CA_125_pos_neg',
                     'Postoperative_grade', 'MSI_POLE_TP53_NSMP', 'LNM_LNDorSLN']
# The code 9 (stored either as a number or as text) is recoded to a missing value.
MISSING_CODES = {9: np.nan, '9': np.nan}

df = pd.read_csv('../../Cleaned_data/MAYO_with_risk_levels.csv')

# Replace the missing-value codes in the columns of interest. The result is assigned
# back explicitly: a chained `df[column].replace(..., inplace=True)` only touches a
# temporary object under pandas Copy-on-Write and would leave the data unchanged.
df[SELECTION_COLUMNS] = df[SELECTION_COLUMNS].replace(MISSING_CODES)

# Create a smaller more manageable dataframe. It is taken *after* the recoding so the
# missing codes are gone from `selection` as well, and copied so that later cells can
# modify it without affecting `df`.
selection = df[SELECTION_COLUMNS].copy()

# Set the format for printed figures, and set the figure dpi
set_matplotlib_formats('png', 'pdf')
plt.rcParams['figure.dpi'] = 150
# Set the seaborn style for the plots
sns.set_style('darkgrid')


The code below generates the crosstable between the pre- and postoperative grade. The crosstable is normalized to show each category as a percentage of the total number of patients. The crosstable is then plotted as a heatmap using seaborn.


In [None]:
# Number of patients for whom both the pre- and postoperative grade are available
N = selection[['Preoperative_grade', 'Postoperative_grade']].dropna().shape[0]

# Cross-tabulate pre-op against post-op grade; every cell is expressed as a
# percentage of the grand total and rounded to one decimal
crosstab = pd.crosstab(
    selection['Preoperative_grade'],
    selection['Postoperative_grade'],
    normalize=True,
)
crosstab = (crosstab * 100).round(1)

# Put the rows in descending order of pre-op grade
crosstab = crosstab.sort_values('Preoperative_grade', ascending=False)

# Draw the heatmap and give it a title
heatmap_title = f'Correlation between pre- and post-op grade [%] (N = {N})'
x = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40).set_title(heatmap_title)

# Axis labels
plt.ylabel('Pre-op grade')
plt.xlabel('Post-op grade')




The code below generates the crosstable between the PR marker and the postoperative grade.

In [None]:
# NOTE(review): the row index is cast to int here; why this is needed is not evident
# from this cell - confirm before removing.
selection.index = selection.index.astype(int)

# Correct labels: blanks and the code 9 (numeric or text) become missing, 0 and 2 are
# both shown as 'Negative', 1 as 'Positive'.
# The recoded column is assigned back explicitly instead of using a chained
# `selection[col].replace(..., inplace=True)`, which is deprecated and does not update
# `selection` under pandas Copy-on-Write.
pr_labels = {' ': np.nan, '9': np.nan, 9: np.nan, 0: 'Negative', 1: 'Positive', 2: 'Negative'}
selection = selection.assign(PR_preop_def_bi=selection['PR_preop_def_bi'].replace(pr_labels))

# Count the number of patients with both a PR value and a post-op grade
N = len(selection[['PR_preop_def_bi', 'Postoperative_grade']].dropna())

# Generate the crosstable as percentages of the grand total, rounded to one decimal
crosstab = (pd.crosstab(selection['PR_preop_def_bi'], selection['Postoperative_grade'], normalize=True) * 100).round(1)

# Plot the crosstable as a heatmap
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between PR expression and post-op grade [%] (N = {N})')

# Set the labels for the axes (trailing ';' hides the text repr in the notebook output)
ax.set_ylabel('PR expression pre-op')
ax.set_xlabel('Post-op grade');



The code below generates the crosstable between the p53 marker and the postoperative grade, following the same steps as above.

In [None]:
# Correct labels: blanks and the code 9 (numeric or text) become missing, 0 is shown as
# 'Wildtype' and 1 as 'Mutant'.
# The recoded column is assigned back explicitly instead of using a chained
# `selection[col].replace(..., inplace=True)`, which is deprecated and does not update
# `selection` under pandas Copy-on-Write.
p53_labels = {' ': np.nan, '9': np.nan, 9: np.nan, 0: 'Wildtype', 1: 'Mutant'}
selection = selection.assign(p53_preop_def=selection['p53_preop_def'].replace(p53_labels))

# Count the number of patients with both a p53 value and a post-op grade
N = len(selection[['p53_preop_def', 'Postoperative_grade']].dropna())

# Generate the crosstable as percentages of the grand total, rounded to one decimal
crosstab = (pd.crosstab(selection['p53_preop_def'], selection['Postoperative_grade'], normalize=True) * 100).round(1)

# Plot the crosstable as a heatmap
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between p53 expression and post-op grade [%] (N = {N})')

# Set the labels for the axes; the x label matches the other grade heatmaps
ax.set_ylabel('p53 expression pre-op')
ax.set_xlabel('Post-op grade');



The code below generates the crosstable between the CA125 marker and the postoperative grade, following the same steps as above.

In [None]:
# Correct labels: the code 9 (numeric or text) becomes missing, 1 is shown as '< 35'
# and 0 as '>= 35'.
# NOTE(review): confirm against the codebook that 1 really means '< 35' for
# CA_125_pos_neg; the mapping is kept exactly as it was.
# The recoded column is assigned back explicitly instead of using a chained
# `selection[col].replace(..., inplace=True)`, which is deprecated and does not update
# `selection` under pandas Copy-on-Write.
ca125_labels = {'9': np.nan, 9: np.nan, 1: '< 35', 0: '>= 35'}
selection = selection.assign(CA_125_pos_neg=selection['CA_125_pos_neg'].replace(ca125_labels))

# Count the number of patients with both a CA125 value and a post-op grade
N = len(selection[['CA_125_pos_neg', 'Postoperative_grade']].dropna())

# Generate the crosstable as percentages of the grand total, rounded to one decimal
crosstab = (pd.crosstab(selection['CA_125_pos_neg'], selection['Postoperative_grade'], normalize=True) * 100).round(1)
# Order the rows by descending label, so '>= 35' comes before '< 35'
crosstab = crosstab.sort_index(ascending=False)

# Plot the crosstable as a heatmap
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between CA125 and post-op grade [%] (N = {N})')

# Set the labels for the axes (trailing ';' hides the text repr in the notebook output)
ax.set_ylabel('CA125 level pre-op')
ax.set_xlabel('Post-op grade');


The code below generates the crosstable between the TCGA molecular classification and lymph node metastasis (LNM), following the same steps as above.

In [None]:
# Correct labels. The code 9 (numeric or text) becomes missing in both columns.
# The recoded columns are assigned back explicitly instead of using a chained
# `selection[col].replace(..., inplace=True)`, which is deprecated and does not update
# `selection` under pandas Copy-on-Write.
tcga_labels = {9: np.nan, '9': np.nan, 1: 'MSI', 2: 'POLE', 3: 'p53', 4: 'NSMP'}
lnm_labels = {9: np.nan, '9': np.nan, 0: 'No', 1: 'Yes'}
selection = selection.assign(
    MSI_POLE_TP53_NSMP=selection['MSI_POLE_TP53_NSMP'].replace(tcga_labels),
    LNM_LNDorSLN=selection['LNM_LNDorSLN'].replace(lnm_labels),
)

# Count the number of patients with both a molecular class and an LNM status
N = len(selection[['MSI_POLE_TP53_NSMP', 'LNM_LNDorSLN']].dropna())

# Generate the crosstable as percentages of the grand total, rounded to one decimal
crosstab = (pd.crosstab(selection['MSI_POLE_TP53_NSMP'], selection['LNM_LNDorSLN'], normalize=True) * 100).round(1)

# Fix the row order and switch to the display names. The rows must be selected by the
# labels that are actually stored in `selection` ('p53', 'MSI'); reindexing directly
# with the display names 'TP53' / 'MSI/MMR' yields all-NaN rows and drops the real ones.
tcga_display = {'p53': 'TP53', 'MSI': 'MSI/MMR', 'POLE': 'POLE', 'NSMP': 'NSMP'}
crosstab = crosstab.reindex(list(tcga_display)).rename(index=tcga_display)

# Plot the crosstable as a heatmap
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between MSI, POLE, P53 and NSMP \n and lymph node metastasis [%] (N = {N})')

# Set the labels for the axes (trailing ';' hides the text repr in the notebook output)
ax.set_ylabel('TCGA molecular classification')
ax.set_xlabel('Lymph node metastasis');

The code below generates the crosstable between the TCGA molecular classification and LNM where lymphadenectomy was performed, following the same steps as above.

In [None]:
# Correlation between TCGA molecular classification and LNM where lymphadenectomy was performed
# NOTE(review): no filter on lymphadenectomy is applied in this cell, so it uses the
# same rows as an unfiltered TCGA-vs-LNM crosstab. Confirm whether a non-missing
# LNM_LNDorSLN already implies nodal assessment, or add the intended filter.

# Correct labels: the code 9 (numeric or text) becomes missing, 0 is shown as 'No' and
# 1 as 'Yes'. The recoded column is assigned back explicitly instead of using a chained
# `selection[col].replace(..., inplace=True)`, which is deprecated and does not update
# `selection` under pandas Copy-on-Write.
lnm_labels = {9: np.nan, '9': np.nan, 0: 'No', 1: 'Yes'}
selection = selection.assign(LNM_LNDorSLN=selection['LNM_LNDorSLN'].replace(lnm_labels))

# Count the number of patients with both a molecular class and an LNM status
N = len(selection[['MSI_POLE_TP53_NSMP', 'LNM_LNDorSLN']].dropna())

# Generate the crosstable as percentages of the grand total, rounded to one decimal
crosstab = (pd.crosstab(selection['MSI_POLE_TP53_NSMP'], selection['LNM_LNDorSLN'], normalize=True) * 100).round(1)

# Fix the row order and switch to the display names. The rows must be selected by the
# labels that are actually stored in `selection` ('p53', 'MSI'); reindexing directly
# with the display names 'TP53' / 'MSI/MMR' yields all-NaN rows and drops the real ones.
tcga_display = {'p53': 'TP53', 'MSI': 'MSI/MMR', 'POLE': 'POLE', 'NSMP': 'NSMP'}
crosstab = crosstab.reindex(list(tcga_display)).rename(index=tcga_display)

# Plot the crosstable as a heatmap
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between MSI, POLE, P53 and NSMP and LNM,\n Lymphadenectomy (all) [%] (N = {N})')

# Set the labels for the axes (trailing ';' hides the text repr in the notebook output)
ax.set_ylabel('TCGA molecular classification')
ax.set_xlabel('Lymph node metastasis');
