### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to explore the risk groups that I defined at the beginning of the internship in the other data exploration notebooks.



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the four cohort tables (each already carries its risk-level columns).
# The order of the paths matches the order of the names on the left-hand side.
Training, MAYO, PIP, Tubingen = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Cleaned_data/Training_TCGA_Risk_levels.csv',
        '../Cleaned_data/MAYO_with_risk_levels.csv',
        '../Cleaned_data/Pipendo_with_risk_levels.csv',
        '../Cleaned_data/Tubingen_risk_groups.csv',
    )
)

# Risk-level columns that are compared across the datasets below.
columns = [
    'Risk_level_PREOP',
    'Risk_level_POSTOP',
    'Risk_level_BM',
]


Compare the risk levels through a bar graph for all datasets

In [None]:
# Compare the normalized risk-level distribution of all datasets: one panel per
# risk-level definition, the three panels side by side in a single figure.

# Set the resolution BEFORE creating the figure. rcParams only affect figures
# created afterwards, so setting it at the end of the cell made the first run and
# a re-run of this cell render differently.
plt.rcParams['figure.dpi'] = 150


def risk_level_percentages(column):
    """Return the percentage of patients per risk level for every dataset.

    Parameters
    ----------
    column : str
        Name of the risk-level column to summarise (must exist in all datasets).

    Returns
    -------
    pd.DataFrame
        Rows are the risk levels found in `column`, columns are the datasets
        (Training, MAYO, PIP, Tubingen). Each dataset column is normalized on
        its own, so it sums to 100 over that dataset's counted values.
    """
    cohorts = {'Training': Training, 'MAYO': MAYO, 'PIP': PIP, 'Tubingen': Tubingen}
    return pd.DataFrame({
        name: cohort[column].value_counts(normalize=True) * 100
        for name, cohort in cohorts.items()
    })


PreOP = risk_level_percentages('Risk_level_PREOP')
PostOp = risk_level_percentages('Risk_level_POSTOP')
Mol = risk_level_percentages('Risk_level_BM')

panels = [
    (PreOP, 'Risk level distribution through PREOP grade and biomarkers'),
    (PostOp, 'Risk level distribution through POSTOP grade and biomarkers'),
    (Mol, 'Risk level distribution through biomarker analysis'),
]

# Pass an explicit axes to pandas: DataFrame.plot.bar() without `ax` opens a new
# figure of its own, which previously left an empty plt.figure() behind for every
# plot and silently ignored the requested figure size.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, (percentages, title) in zip(axes, panels):
    percentages.plot.bar(ax=ax)
    ax.set_title(title)
    ax.set_ylabel('Normalized value counts (%)')
    ax.set_ylim([0, 100])
fig.tight_layout()

Compare the risk levels within the extended training dataset.

In [None]:
# Compare the risk levels within the Training dataset: one group of bars per risk
# level, one bar per risk-level definition (values in % of the counted patients).
cols = ['Risk_level_PREOP', 'Risk_level_POSTOP', 'Risk_level_BM', 'Risk_level_TCGA']

# Explicit figure/axes, same layout as the Tubingen and MAYO cells below.
fig, axes = plt.subplots(1,1, figsize=(10, 10))
data = pd.DataFrame(
    {'Risk_level_PREOP': Training['Risk_level_PREOP'].value_counts(normalize=True)*100,
     'Risk_level_POSTOP': Training['Risk_level_POSTOP'].value_counts(normalize=True)*100,
     'Risk_level_BM': Training['Risk_level_BM'].value_counts(normalize=True)*100,
     'Risk_level_TCGA': Training['Risk_level_TCGA'].value_counts(normalize=True)*100,
     'Risk_level_TCGA_BMNaN' : Training['Risk_level_TCGA_BMNaN'].value_counts(normalize=True)*100,
     'Risk_level_TCGA_BM' : Training['Risk_level_TCGA_BM'].value_counts(normalize=True)*100,
     })
# Derive N from the loaded data instead of hard-coding it, so the title cannot go
# stale when the training CSV changes.
N = len(Training)
data.plot.bar(ax=axes)
plt.ylim([0, 100])
plt.ylabel('Normalized value counts (%)')
plt.title(f'Risk level distribution in Training dataset (N={N})')


Compare the risk levels within the Tubingen dataset.

In [None]:
# Tubingen: share of patients per risk level for each risk-level definition.
cols = ['Risk_level_PREOP', 'Risk_level_POSTOP', 'Risk_level_BM', 'Risk_level_TCGA']

fig, axes = plt.subplots(1,1, figsize=(10, 10))

# One column per risk-level definition; values are percentages of that column's
# counted entries. The two extra TCGA variants come after the four base columns.
data = pd.DataFrame({
    risk_col: Tubingen[risk_col].value_counts(normalize=True).mul(100)
    for risk_col in cols + ['Risk_level_TCGA_BMNaN', 'Risk_level_TCGA_BM']
})

# Number of rows in the dataset, shown in the title.
N = len(Tubingen)

data.plot.bar(ax=axes)
axes.set_ylim([0, 100])
axes.set_ylabel('Normalized value counts (%)')
axes.set_title(f'Risk level distribution in Tubingen dataset (N={N})')

Compare the risk levels within the MAYO dataset.

In [None]:
# MAYO: share of patients per risk level for each risk-level definition.
cols = ['Risk_level_PREOP', 'Risk_level_POSTOP', 'Risk_level_BM', 'Risk_level_TCGA']

fig, axes = plt.subplots(1,1, figsize=(10, 10))

# Build the percentage table column by column; the column order determines the
# order of the bars within each risk-level group.
mayo_percentages = {}
for risk_col in cols + ['Risk_level_TCGA_BMNaN', 'Risk_level_TCGA_BM']:
    mayo_percentages[risk_col] = MAYO[risk_col].value_counts(normalize=True) * 100
data = pd.DataFrame(mayo_percentages)

# Number of rows in the dataset, shown in the title.
N = len(MAYO)

data.plot.bar(ax=axes)
axes.set_ylim([0, 100])
axes.set_ylabel('Normalized value counts (%)')
axes.set_title(f'Risk level distribution in MAYO dataset (N={N})')

Create cross-tabulations of the risk groups in the training dataset.

In [None]:
# Heatmap of Risk_level_TCGA_BM (rows) against Risk_level_POSTOP (columns) in the
# Training dataset. Cells are percentages of all cross-tabulated patients.
crosstab = (
    pd.crosstab(
        Training['Risk_level_TCGA_BM'],
        Training['Risk_level_POSTOP'],
        normalize=True,
    )
    .mul(100)
    # Fix the column order; a level that is absent from the data becomes NaN.
    .reindex(columns=['Low', 'High', 'Unknown'])
)

# The colour scale is pinned to 0-40 % rather than derived from the data.
sns.heatmap(crosstab, vmin=0, vmax=40, cmap='Oranges', annot=True, fmt='.1f')

# NOTE(review): N=952 is hard-coded in the title - confirm it still matches the data.
plt.title('Risk level distribution in Training dataset (N=952)')
plt.ylabel('Risk level TCGA and BM')
plt.xlabel('Risk level Postoperative grade and BM')

Generic code block to search for a column name

In [None]:
# Print every Training column whose name contains the search term, one per line.
for col in filter(lambda name: 'TCGA' in name, Training.columns):
    print(col)

Create a harmonised lymph node metastasis column `LNM_final_bi` ('No'/'Yes') in all datasets; in the PIP dataset, missing values are labelled 'Unknown'.

In [None]:
# Harmonise the lymph node metastasis status into one 'LNM_final_bi' column per
# dataset. Each entry is (dataset, source column, recoding of the source values).
# Values that are not listed in a recoding are left as they are by `replace`;
# only the PIP recoding turns missing values into 'Unknown'.
for cohort, source_col, recoding in (
    (Training, 'LNM_incl_followup_bi', {0: 'No', 1: 'Yes'}),
    (Tubingen, 'LNM_obs', {'negative': 'No', 'positive': 'Yes'}),
    (MAYO, 'LNM_LNDorSLN', {0: 'No', 1: 'Yes'}),
    (PIP, 'Pathology_Lymphnodes', {'No Pathology Nodes': 'No', 'Nodes involved': 'Yes', np.nan:'Unknown'}),
):
    cohort['LNM_final_bi'] = cohort[source_col].replace(recoding)


Cross-tabulation of `LNM_final_bi` against `Risk_level_TCGA_BMNaN` for the Training, Tubingen and MAYO datasets

In [None]:
# Heatmaps of lymph node metastasis (LNM_final_bi) against Risk_level_TCGA_BMNaN,
# one figure per dataset (Training, Tubingen, MAYO).


def plot_lnm_risk_heatmap(dataset, dataset_name):
    """Draw the LNM x risk-level heatmap of one dataset in a new figure.

    Parameters
    ----------
    dataset : pd.DataFrame
        Must contain the columns 'LNM_final_bi' and 'Risk_level_TCGA_BMNaN'.
    dataset_name : str
        Name used in the figure title.

    Returns
    -------
    (pd.DataFrame, int)
        The plotted table (rows 'No'/'Yes', columns 'Low'/'Medium'/'High', values
        in % of all cross-tabulated patients) and the number of cross-tabulated
        patients, i.e. the denominator of those percentages.
    """
    # Work from raw counts so that the N in the title is exactly the denominator
    # of the percentages. Previously N was the number of non-missing LNM values
    # for Training but the full row count for Tubingen and MAYO.
    counts = pd.crosstab(dataset['LNM_final_bi'], dataset['Risk_level_TCGA_BMNaN'])
    n_tabulated = int(counts.to_numpy().sum())

    # Same values as pd.crosstab(..., normalize=True) * 100; max() only guards
    # against a division by zero when nothing could be tabulated.
    percentages = counts / max(n_tabulated, 1) * 100

    # Fix row/column order. Categories outside these labels stay part of the
    # denominator but are not displayed; absent categories show up as NaN.
    percentages = percentages.reindex(index=['No', 'Yes'], columns=['Low', 'Medium', 'High'])

    plt.figure()
    # The colour scale is pinned to 0-40 % so the three heatmaps are comparable.
    sns.heatmap(percentages, annot=True, cmap='Oranges', fmt='.1f', vmin=0, vmax=40)
    plt.title(f'Risk level distribution in {dataset_name} dataset N={n_tabulated}')
    plt.ylabel('Lymph node metastasis')
    plt.xlabel('Risk level TCGA and BM')
    return percentages, n_tabulated


# `crosstab` and `L` keep their original names; after the loop they hold the
# MAYO results, as in the original cell.
for dataset_name, dataset in [('Training', Training), ('Tubingen', Tubingen), ('MAYO', MAYO)]:
    crosstab, L = plot_lnm_risk_heatmap(dataset, dataset_name)

