### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to generate cross-tabulations (crosstabs) for the Tübingen dataset and plot them as heatmaps.



In [None]:
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Load the risk-group data; the first CSV column is used as the index
df = pd.read_csv('../../Cleaned_data/Tubingen_risk_groups.csv', index_col=0)

# Rename some labels. The result is assigned back to the column instead of
# calling `.replace(..., inplace=True)` on `df['MolClass']`: that form mutates
# an intermediate Series, which pandas warns about and which leaves `df`
# unchanged when Copy-on-Write is enabled.
df['MolClass'] = df['MolClass'].replace({'MMRd': 'MSI', 'p53abn': 'p53'})

# Select a more manageable subset of the data. `.copy()` makes `selection` an
# independent frame rather than a slice of `df`.
selection = df[
    ['Grade_POSTOP', 'Grade_PREOP', 'PR_PREOP', 'p53_PREOP', 'Pre-operative CA-125 level ', 'MolClass', 'LNM_obs']
].copy()

# Set the output format for the plots and the dpi
set_matplotlib_formats('png', 'pdf')
plt.rcParams['figure.dpi'] = 150
# Set style
sns.set_style('darkgrid')


The code below generates the crosstab between the pre- and postoperative grade. The crosstab is normalized so that each cell shows the percentage of the total number of patients, and it is then plotted as a heatmap using seaborn.

In [None]:
# Count the patients for whom both the pre-op and the post-op grade are available
N = len(selection[['Grade_PREOP', 'Grade_POSTOP']].dropna())

# Create the crosstable; normalize=True divides by the overall total, so after
# scaling each cell holds a percentage of all cross-tabulated patients
crosstab = (pd.crosstab(selection['Grade_PREOP'], selection['Grade_POSTOP'], normalize=True) * 100).round(1)

# Sort the rows by pre-op grade, descending. 'Grade_PREOP' is the index of the
# crosstab, so sort_index expresses this directly (and without in-place mutation)
crosstab = crosstab.sort_index(ascending=False)

# Plot the crosstable as a heatmap; sns.heatmap returns the Axes it drew on
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between pre- and post-op grade [%] (N = {N})')

# Set the labels on that Axes (trailing ';' hides the Text repr in the cell output)
ax.set_ylabel('Pre-op grade')
ax.set_xlabel('Post-op grade');



The code below generates the crosstab between pre-operative PR expression and postoperative grade, following the same steps as above.

In [None]:
# Count the patients for whom both PR status and post-op grade are available
N = len(selection[['PR_PREOP', 'Grade_POSTOP']].dropna())

# Replace the numerical values with the corresponding labels. This builds a new
# Series instead of calling `.replace(..., inplace=True)` on a column of
# `selection`: the in-place form mutates an intermediate object (pandas warns
# about it, and it has no effect under Copy-on-Write) and changes shared state
# between cells. The Series keeps the name 'PR_PREOP'.
pr_status = selection['PR_PREOP'].replace({0: 'Negative', 1: 'Positive'})

# Create the crosstable as percentages of all cross-tabulated patients
crosstab = (pd.crosstab(pr_status, selection['Grade_POSTOP'], normalize=True) * 100).round(1)

# Create the heatmap; sns.heatmap returns the Axes it drew on
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between PR expression and post-op grade [%] (N = {N})')

# Set the labels (trailing ';' hides the Text repr in the cell output)
ax.set_ylabel('PR expression pre-op')
ax.set_xlabel('Post-op grade');


The code below generates the crosstab between pre-operative p53 expression and postoperative grade, following the same steps as above.

In [None]:
# Count the patients for whom both p53 status and post-op grade are available
N = len(selection[['p53_PREOP', 'Grade_POSTOP']].dropna())

# Replace the numerical values with the corresponding labels. This builds a new
# Series instead of calling `.replace(..., inplace=True)` on a column of
# `selection`: the in-place form mutates an intermediate object (pandas warns
# about it, and it has no effect under Copy-on-Write) and changes shared state
# between cells. The Series keeps the name 'p53_PREOP'.
p53_status = selection['p53_PREOP'].replace({1: 'Mutant', 0: 'Wildtype'})

# Create the crosstable as percentages of all cross-tabulated patients
crosstab = (pd.crosstab(p53_status, selection['Grade_POSTOP'], normalize=True) * 100).round(1)

# Plot the heatmap; sns.heatmap returns the Axes it drew on
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between p53 expression and post-op grade [%] (N = {N})')

# Set the labels; the x label is set explicitly so this figure matches the
# other heatmaps in the notebook (trailing ';' hides the Text repr)
ax.set_ylabel('p53 expression pre-op')
ax.set_xlabel('Post-op grade');



The code below generates the crosstab between the pre-operative CA125 level and postoperative grade, following the same steps as above.

In [None]:
# Column holding the pre-operative CA-125 level (the trailing space is part of the name)
ca125_col = 'Pre-operative CA-125 level '

# Number of patients with both a CA-125 level and a post-op grade
N = selection[[ca125_col, 'Grade_POSTOP']].dropna().shape[0]

# Cross-tabulate as fractions of the overall total, then convert to rounded percentages
fractions = pd.crosstab(selection[ca125_col], selection['Grade_POSTOP'], normalize=True)
crosstab = (fractions * 100).round(1)

# Order the rows by CA-125 level, descending
crosstab = crosstab.sort_values(ca125_col, ascending=False)

# Draw the heatmap and title it
heatmap_ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
heatmap_ax.set_title(f'Correlation between CA125 and post-op grade [%] (N = {N})')

# Axis labels
heatmap_ax.set_ylabel('CA125 level pre-op')
heatmap_ax.set_xlabel('Post-op grade')


The code below generates the crosstab between the molecular classification and lymph node metastasis (LNM), following the same steps as above.

In [None]:
# Count the patients for whom both molecular class and LNM status are available
N = len(selection[['MolClass', 'LNM_obs']].dropna())

# Create the crosstable as percentages of all cross-tabulated patients
crosstab = (pd.crosstab(selection['MolClass'], selection['LNM_obs'], normalize=True) * 100).round(1)

# Put the rows in a fixed display order. 'p53' and 'MSI' are the renamed forms
# of 'p53abn' and 'MMRd' produced when the data is loaded; a class that does not
# occur in the crosstab is added by reindex as an all-NaN row.
crosstab = crosstab.reindex(['p53', 'MSI', 'NSMP', 'POLE'])

# Plot the heatmap; sns.heatmap returns the Axes it drew on
ax = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between DNA mutation and \n lymph node metastasis [%] (N = {N})')

# Set the labels (trailing ';' hides the Text repr in the cell output)
ax.set_ylabel('TCGA molecular classification')
ax.set_xlabel('LNM');
