### Author: Ally Sprik
### Last-updated: 25-02-2024

The goal of this notebook is to generate results for my paper, specifically by retrieving information about the cohorts and the distribution of the variables within each cohort. These results are used to generate the tables in the paper.

In [None]:
import numpy as np
import pandas as pd

# Enable pandas Copy-on-Write so frames derived from one another (such as the
# filtered training cohort below) behave as independent copies.
# Technical detail: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html#
pd.options.mode.copy_on_write = True

# All cleaned cohort files live in the same folder; keep that path in one place.
CLEANED_DATA_DIR = '../../../0. Source_files/0.2. Cleaned_data/'

# Combined Training+TCGA+JAMA file, parsed from disk only once.
df_952 = pd.read_csv(CLEANED_DATA_DIR + 'Training+TCGA+JAMA_cleaned.csv')
# Training cohort: rows of the combined file flagged as included in training.
# Boolean-mask selection yields a new frame, so df_952 itself is left untouched.
df_763 = df_952[df_952['Included_in_training_cohort'] == 'yes']

# Brno and Tubingen cohorts.
df_brno = pd.read_csv(CLEANED_DATA_DIR + 'Cleaned_Brno_model_complete.csv')
df_tubingen = pd.read_csv(CLEANED_DATA_DIR + 'Tubingen_Validation_wMSI.csv')

Print the column statistics for each cohort.

In [None]:
# Number of rows in each cohort.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    print(cohort_label, len(cohort_df))


In [None]:
# Mean age and standard deviation per cohort, each rounded to 2 decimal places.
# The Tubingen file stores age under 'age_at_dx' instead of 'Age'.
for cohort_label, cohort_df, age_column in (
    ('763:', df_763, 'Age'),
    ('952:', df_952, 'Age'),
    ('Brno:', df_brno, 'Age'),
    ('Tubingen:', df_tubingen, 'age_at_dx'),
):
    ages = cohort_df[age_column]
    print(cohort_label, round(ages.mean(), 2), round(ages.std(), 2))

In [None]:
# PreoperativeGrade per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    grade = cohort_df['PreoperativeGrade']
    grade_counts = grade.value_counts(dropna=False)
    grade_percent = grade.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, grade_counts, grade_percent)


In [None]:
# ER status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    er_status = cohort_df['ER']
    er_counts = er_status.value_counts(dropna=False)
    er_percent = er_status.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, er_counts, er_percent)

In [None]:
# PR status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    pr_status = cohort_df['PR']
    pr_counts = pr_status.value_counts(dropna=False)
    pr_percent = pr_status.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, pr_counts, pr_percent)

In [None]:
# L1CAM status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    l1cam_status = cohort_df['L1CAM']
    l1cam_counts = l1cam_status.value_counts(dropna=False)
    l1cam_percent = l1cam_status.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, l1cam_counts, l1cam_percent)

In [None]:
# p53 status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    p53_status = cohort_df['p53']
    p53_counts = p53_status.value_counts(dropna=False)
    p53_percent = p53_status.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, p53_counts, p53_percent)

In [None]:
# CA125 status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    ca125_status = cohort_df['CA125']
    ca125_counts = ca125_status.value_counts(dropna=False)
    ca125_percent = ca125_status.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, ca125_counts, ca125_percent)


In [None]:
# Platelets per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    platelets = cohort_df['Platelets']
    platelet_counts = platelets.value_counts(dropna=False)
    platelet_percent = platelets.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, platelet_counts, platelet_percent)

In [None]:
# MSI status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    msi_status = cohort_df['MSI']
    msi_counts = msi_status.value_counts(dropna=False)
    msi_percent = msi_status.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, msi_counts, msi_percent)

In [None]:
# POLE status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    pole_status = cohort_df['POLE']
    pole_counts = pole_status.value_counts(dropna=False)
    pole_percent = pole_status.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, pole_counts, pole_percent)

In [None]:
# MRI_MI status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
# Tubingen is left out on purpose: that dataset doesn't have an MRI_MI column.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
):
    mri_mi_status = cohort_df['MRI_MI']
    mri_mi_counts = mri_mi_status.value_counts(dropna=False)
    mri_mi_percent = mri_mi_status.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, mri_mi_counts, mri_mi_percent)


In [None]:
# Cytology per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
# Tubingen is left out on purpose: that dataset doesn't seem to have a Cytology column.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
):
    cytology = cohort_df['Cytology']
    cytology_counts = cytology.value_counts(dropna=False)
    cytology_percent = cytology.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, cytology_counts, cytology_percent)


In [None]:
# Histology per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    histology = cohort_df['Histology']
    histology_counts = histology.value_counts(dropna=False)
    histology_percent = histology.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, histology_counts, histology_percent)


In [None]:
# MyometrialInvasion status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    invasion = cohort_df['MyometrialInvasion']
    invasion_counts = invasion.value_counts(dropna=False)
    invasion_percent = invasion.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, invasion_counts, invasion_percent)


In [None]:
# PostoperativeGrade per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    post_grade = cohort_df['PostoperativeGrade']
    post_grade_counts = post_grade.value_counts(dropna=False)
    post_grade_percent = post_grade.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, post_grade_counts, post_grade_percent)


In [None]:
# FIGO stage per cohort: raw counts followed by percentages (2 decimal places),
# both sorted by stage label. The stage column is named differently in each
# source file, so it is listed next to the cohort it belongs to.
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df, stage_column in (
    ('763:', df_763, 'FIGO'),
    ('952:', df_952, 'FIGO'),
    ('Brno:', df_brno, 'FIGO_surgical'),
    ('Tubingen:', df_tubingen, 'FIGO Stage'),
):
    stage = cohort_df[stage_column]
    stage_counts = stage.value_counts(dropna=False).sort_index()
    stage_percent = stage.value_counts(normalize=True).mul(100).round(2).sort_index()
    print(cohort_label, stage_counts, stage_percent)

In [None]:
# LVSI status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    lvsi_status = cohort_df['LVSI']
    lvsi_counts = lvsi_status.value_counts(dropna=False)
    lvsi_percent = lvsi_status.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, lvsi_counts, lvsi_percent)


In [None]:
# LNM status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    lnm_status = cohort_df['LNM']
    lnm_counts = lnm_status.value_counts(dropna=False)
    lnm_percent = lnm_status.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, lnm_counts, lnm_percent)


In [None]:
# Survival5yr status per cohort: raw counts followed by percentages (2 decimal places).
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    survival = cohort_df['Survival5yr']
    survival_counts = survival.value_counts(dropna=False)
    survival_percent = survival.value_counts(normalize=True).mul(100).round(2)
    print(cohort_label, survival_counts, survival_percent)


In [None]:
# Therapy per cohort: raw counts followed by percentages (2 decimal places),
# both sorted by therapy label.
# Counts keep missing values (dropna=False); the percentages use value_counts'
# default dropna=True, so they are computed over non-missing entries only.
for cohort_label, cohort_df in (
    ('763:', df_763),
    ('952:', df_952),
    ('Brno:', df_brno),
    ('Tubingen:', df_tubingen),
):
    therapy = cohort_df['Therapy']
    therapy_counts = therapy.value_counts(dropna=False).sort_index()
    therapy_percent = therapy.value_counts(normalize=True).mul(100).round(2).sort_index()
    print(cohort_label, therapy_counts, therapy_percent)
