### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to generate crosstables (cross-tabulations) for the Brno dataset.

In [None]:
from matplotlib_inline.backend_inline import set_matplotlib_formats
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


# Plot configuration: emit inline figures as both PNG and PDF (the PNG output can be
# copied straight out of the notebook) and render them at a higher resolution.
set_matplotlib_formats('png', 'pdf')
plt.rcParams['figure.dpi'] = 150

# Location of the cleaned Brno dataset, relative to this notebook.
DATA_PATH = "../../0.1. Cleaned_data/Cleaned_Brno_model_complete.csv"
df = pd.read_csv(DATA_PATH)


The code block below combines the individual TCGA variables (MSI, POLE and p53) into a single variable according to the clinical guidelines.

In [None]:
# Create a combined TCGA algorithm column.
#
# Classification priority: MSI > POLE > p53 > NSMP. The labels are written in reverse
# priority order so that a higher-priority class overwrites a lower-priority one.
# Rows in which MSI, POLE and p53 are all missing cannot be classified and stay NaN.
#
# The column is built as an object Series and assigned in one go: writing strings
# cell-by-cell into a column that was initialised with np.nan (float dtype) is a
# deprecated upcast in pandas 2.x, and the vectorised form also does not depend on
# the DataFrame having a default 0..n-1 index.
tcga = pd.Series('NSMP', index=df.index, dtype=object)
tcga.loc[df['p53'] == 'mutant'] = 'p53'
tcga.loc[df['POLE'] == 'yes'] = 'POLE'
tcga.loc[df['MSI'] == 'yes'] = 'MSI'
# None of the three markers is available -> no classification possible.
tcga.loc[df[['MSI', 'POLE', 'p53']].isna().all(axis=1)] = np.nan

df['TCGA'] = tcga


The code below generates the crosstable between the TCGA groups and lymph node metastasis.

In [None]:
# Figure size; worth tuning when exporting for presentations or papers.
plt.figure(figsize=(5, 5))

# Number of rows that have a value for both TCGA and LNM (shown in the title).
N = df[['TCGA', 'LNM']].dropna().shape[0]

# Display order of the TCGA classes on the y-axis.
tcga_order = ['p53', 'MSI', 'POLE', 'NSMP']

# Crosstable as percentages of all counted rows, rounded to one decimal and put in the
# desired row order. Classes that do not occur in the data (POLE, after the clinical
# TCGA classification) come out of the reindex as NaN and are filled with 0 so that
# they are still displayed.
crosstab = (
    pd.crosstab(df['TCGA'], df['LNM'], normalize=True)
    .mul(100)
    .round(1)
    .reindex(tcga_order, axis=0)
    .fillna(0)
)

# Draw the heatmap, then attach the title to its axes.
heatmap_axes = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
x = heatmap_axes.set_title(
    f'Brno correlation between TCGA molecular\n classification and lymph node metastasis [%] (N = {N})'
)

# Axis labels
plt.ylabel('TCGA molecular classification')
plt.xlabel('Lymph node metastasis')


The following code block generates the crosstable and heatmap for US_MI and myometrial invasion. US_MI is called MRI_MI in this dataset, since this is the cleaned dataset prepared for plugging into the model.

In [None]:
# Crosstable of US_MI (column MRI_MI) against myometrial invasion.

# Number of rows that have a value for both columns (shown in the title).
N = df[['MRI_MI', 'MyometrialInvasion']].dropna().shape[0]

# Human-readable labels for the two invasion-depth categories.
depth_labels = {'lt_50': '< 50', 'ge_50': '>= 50'}

# Percentages of all counted rows, rounded to one decimal; rows are ordered
# ge_50 -> lt_50 and columns lt_50 -> ge_50 before the labels are replaced.
crosstab = (
    pd.crosstab(df['MRI_MI'], df['MyometrialInvasion'], normalize=True)
    .mul(100)
    .round(1)
    .reindex(['ge_50', 'lt_50'])
    .reindex(['lt_50', 'ge_50'], axis=1)
    .rename(columns=depth_labels, index=depth_labels)
)

# Seaborn heatmap with the title attached to its axes.
heatmap_axes = sns.heatmap(crosstab, annot=True, fmt='g', cmap='Oranges', vmin=0, vmax=40)
x = heatmap_axes.set_title(f'Correlation between US_MI and\n myometrial invasion [%] (N = {N})')

# Axis labels
plt.ylabel('US_MI')
plt.xlabel('Myometrial invasion')

