### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to generate crosstabs (cross-tabulations) for the training dataset.



In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
from matplotlib_inline.backend_inline import set_matplotlib_formats

# Load the cleaned dataset
dfTCGA = pd.read_csv('../../0.1. Cleaned_data/Training+TCGA+JAMA_cleaned.csv')

# Create a more manageable dataset containing only the variables used in the crosstabs
columns = ['PostoperativeGrade', 'PreoperativeGrade', 'PR', 'p53', 'CA125', 'MSI', 'POLE',
           'LNM', "MRI_MI", "MyometrialInvasion"]

# Take an explicit copy: later cells relabel values and add a column on `selection`.
# Without .copy() pandas treats the subset as a possible view of dfTCGA and raises
# SettingWithCopyWarning on those writes.
selection = dfTCGA[columns].copy()

# Set the output format for the plots and the dpi
set_matplotlib_formats('png', 'pdf')
plt.rcParams['figure.dpi'] = 150
sns.set_style('darkgrid')


The code below generates the crosstab between the pre- and postoperative grade. The crosstab is normalized over all patients, so each cell shows the percentage of the total number of patients that falls in that combination of categories. The crosstab is then plotted as a heatmap using seaborn.

In [None]:
# Pre-operative vs post-operative grade, as a percentage of all patients
# for whom both grades are known.
grade_pair = selection[['PreoperativeGrade', 'PostoperativeGrade']]

# Number of patients with both values present (reported in the title)
n_patients = len(grade_pair.dropna())

# Joint distribution over the whole table, in percent with one decimal
grade_pct = pd.crosstab(selection['PreoperativeGrade'], selection['PostoperativeGrade'], normalize=True)
grade_pct = (grade_pct * 100).round(1)

# Order the rows by pre-operative grade, descending
grade_pct = grade_pct.sort_values('PreoperativeGrade', ascending=False)

# Draw the heatmap on a fixed 0-40 colour scale and title it
ax = sns.heatmap(grade_pct, annot=True, fmt='.1f', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between pre- and post-op grade [%] (N = {n_patients})')

# Axis labels
ax.set_ylabel('Pre-op grade')
ax.set_xlabel('Post-op grade')


The code below generates the crosstab between PR expression and the postoperative grade, following the same steps as above.

In [None]:
# PR vs post-operative grade, as a percentage of all patients with both values known.

# Count the number of patients with both values present (reported in the title)
n_patients = len(selection[['PR', 'PostoperativeGrade']].dropna())

# Relabel numeric codes for display on a derived Series. The previous
# `selection['PR'].replace(..., inplace=True)` was a chained in-place update: it is
# deprecated in recent pandas and has no effect under Copy-on-Write, and it changed
# `selection` for every later cell. Values not in the mapping are left as they are.
# NOTE(review): this maps 1 -> 'Negative' and 0 -> 'Positive'; confirm the coding
# against the data dictionary.
pr_labels = selection['PR'].replace({1: 'Negative', 0: 'Positive'})

# Joint distribution over the whole table, in percent with one decimal
pr_pct = (pd.crosstab(pr_labels, selection['PostoperativeGrade'], normalize=True) * 100).round(1)

# Plot the heatmap on a fixed 0-40 colour scale
ax = sns.heatmap(pr_pct, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between PR expression and post-op grade [%] (N = {n_patients})')

# Set the plot labels (the x label defaults to the crosstab column name)
ax.set_ylabel('PR expression pre-op')



The code below generates the crosstab between p53 expression and the postoperative grade, following the same steps as above.

In [None]:
# p53 vs post-operative grade, as a percentage of all patients with both values known.

# Count the number of patients with both values present (reported in the title)
n_patients = len(selection[['p53', 'PostoperativeGrade']].dropna())

# Relabel numeric codes for display on a derived Series rather than with a chained
# `selection['p53'].replace(..., inplace=True)`. That form is deprecated in recent
# pandas and has no effect under Copy-on-Write. It also rewrote `selection['p53']`,
# which the F_TCGA cell further down compares against the raw value 'mutant'.
p53_labels = selection['p53'].replace({1: 'Mutant', 0: 'Wildtype'})

# Joint distribution over the whole table, in percent with one decimal
p53_pct = (pd.crosstab(p53_labels, selection['PostoperativeGrade'], normalize=True) * 100).round(1)

# Plot the heatmap on a fixed 0-40 colour scale
ax = sns.heatmap(p53_pct, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between p53 expression and post-op PostoperativeGrade [%] (N = {n_patients})')

# Set the plot labels (the x label defaults to the crosstab column name)
ax.set_ylabel('p53 expression pre-op')



The code below generates the crosstab between the CA125 level and the postoperative grade, following the same steps as above.

In [None]:
# CA125 vs post-operative grade, as a percentage of all patients with both values known.

# Count the number of patients with both values present (reported in the title)
n_patients = len(selection[['CA125', 'PostoperativeGrade']].dropna())

# Relabel numeric codes for display on a derived Series rather than with a chained
# `selection['CA125'].replace(..., inplace=True)`, which is deprecated in recent pandas,
# has no effect under Copy-on-Write, and mutated `selection` for all later cells.
ca125_labels = selection['CA125'].replace({1: '> 35', 0: '=< 35'})

# Joint distribution over the whole table, in percent with one decimal
ca125_pct = (pd.crosstab(ca125_labels, selection['PostoperativeGrade'], normalize=True) * 100).round(1)

# Sort the rows by CA125 label, descending ('CA125' is the crosstab's row index)
ca125_pct = ca125_pct.sort_index(ascending=False)

# Plot the heatmap on a fixed 0-40 colour scale
ax = sns.heatmap(ca125_pct, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between CA125 and post-op PostoperativeGrade [%] (N = {n_patients})')

# Set the plot labels (the x label defaults to the crosstab column name)
ax.set_ylabel('CA125 level pre-op')



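The code below generates a single TCGA molecular classification column (`F_TCGA`) from the MSI, POLE and p53 columns according to clinical guidelines. The first matching class is assigned in the order MSI, POLE, p53; patients matching none of these are labelled NSMP, and patients for whom all three markers are missing are left without a class.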

In [None]:
# Create the F_TCGA column from the 3 columns with the TCGA molecular classification.
# Precedence (first match wins): MSI, POLE, p53, otherwise NSMP.
# Rows where MSI, POLE and p53 are all missing get NaN instead of a class.
#
# Vectorised instead of a row-by-row loop: the loop looked values up with
# `selection[col][i]`, which only works with a default 0..n-1 index, and its `.at`
# writes could start the new column as float NaN before inserting strings.
# NOTE(review): matching is exact and case-sensitive ('yes', 'mutant'); values coded
# differently (e.g. 'Mutant' or 1) fall through to the next rule - confirm against
# the cleaned data.
is_msi = selection['MSI'] == 'yes'
is_pole = selection['POLE'] == 'yes'
is_p53 = selection['p53'] == 'mutant'
all_markers_missing = selection[['MSI', 'POLE', 'p53']].isna().all(axis=1)

# np.select takes the first condition that is True for each row
tcga_class = np.select([is_msi, is_pole, is_p53], ['MSI', 'POLE', 'p53'], default='NSMP')

# Blank out the rows without any marker information
f_tcga = pd.Series(tcga_class, index=selection.index, dtype=object).mask(all_markers_missing)

# Rebind instead of writing into the existing frame, so the cell can be re-run safely
selection = selection.assign(F_TCGA=f_tcga)

The code below generates the crosstab between the TCGA molecular classification and lymph node metastasis (LNM), following the same steps as above.

In [None]:
# TCGA molecular class vs lymph node metastasis, as a percentage of all patients
# with both values known.

# Count the number of patients with both values present (reported in the title)
n_patients = len(selection[['F_TCGA', 'LNM']].dropna())

# Relabel for display on derived Series. The previous chained
# `selection[col].replace(..., inplace=True)` is deprecated in recent pandas and has
# no effect under Copy-on-Write; 'MSI' would then never become 'MMRd' and the
# reindex below would silently drop that row.
tcga_labels = selection['F_TCGA'].replace({1: 'POLE', "MSI": 'MMRd', 3: 'p53', 4: 'NSMP'})
lnm_labels = selection['LNM'].replace({1: 'Yes', 0: 'No'})

# Joint distribution over the whole table, in percent with one decimal
tcga_lnm_pct = (pd.crosstab(tcga_labels, lnm_labels, normalize=True) * 100).round(1)

# Fixed row order; a class that is absent from the data shows up as an empty row
tcga_lnm_pct = tcga_lnm_pct.reindex(['p53', 'MMRd', 'POLE', 'NSMP'], axis=0)

# Set the figure size
fig, ax = plt.subplots(figsize=(5, 5))

# Generate the heatmap on a fixed 0-40 colour scale
sns.heatmap(tcga_lnm_pct, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40, ax=ax)
ax.set_title(f'Training correlation between TCGA molecular\n classification and lymph node metastasis [%] (N = {n_patients})')

# Set the plot labels
ax.set_ylabel('TCGA molecular classification')
ax.set_xlabel('Lymph node metastasis')


The code below generates the crosstab between MRI_MI and myometrial invasion, following the same steps as above.

In [None]:
# MRI_MI vs MyometrialInvasion, as a percentage of all patients with both values known.
mi_pair = selection[['MRI_MI', 'MyometrialInvasion']]

# Number of patients with both values present (reported in the title)
n_patients = len(mi_pair.dropna())

# Joint distribution over the whole table, in percent with one decimal
mi_pct = pd.crosstab(selection['MRI_MI'], selection['MyometrialInvasion'], normalize=True)
mi_pct = (mi_pct * 100).round(1)

# Fix the category order: rows run ge_50 then lt_50, columns run lt_50 then ge_50
mi_pct = mi_pct.reindex(index=['ge_50', 'lt_50'], columns=['lt_50', 'ge_50'])

# Replace the category codes with readable labels on both axes
display_names = {'lt_50': '< 50', 'ge_50': '>= 50'}
mi_pct = mi_pct.rename(index=display_names, columns=display_names)

# Draw the heatmap on a fixed 0-40 colour scale and title it
ax = sns.heatmap(mi_pct, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
ax.set_title(f'Correlation between MRI_MI and\n myometrial invasion [%] (N = {n_patients})')

# Axis labels
ax.set_ylabel('MRI_MI')
ax.set_xlabel('Myometrial invasion')

