# Annotate cohort data

For each cohort, annotate the number of samples and mutations after filtering. 

Generate the table "cohorts_samples_mutations.txt"

In [1]:
import os

import pandas as pd

In [2]:
main_directory = ''
input_directory_filtered = os.path.join(main_directory, 'inputs', 'data', 'cohorts_filtered')
output_directory = os.path.join(main_directory, 'inputs', 'tables')

In [3]:
lines = []
for entry in os.scandir(input_directory_filtered): 
    if entry.name.endswith('.in.gz'): 
        cohort_name = entry.name.split('.')[0]
        cohort_data = [cohort_name]
        
        cohort_df = pd.read_csv(entry.path, sep='\t', header=0, low_memory=False)
        filtered_samples = len(cohort_df['SAMPLE'].unique())
        filtered_mutations = len(cohort_df)
        
        cohort_data += [filtered_samples]
        cohort_data += [filtered_mutations]
        for mutype in ['snv', 'mnv', 'del', 'ins']: 
            count_mutype = cohort_df['MUTYPE'].to_list().count(mutype)
            cohort_data += [count_mutype]
        
        lines.append(pd.DataFrame([cohort_data]))
        print(cohort_name)

annot_df = pd.concat(lines)
annot_df.columns = ['COHORT', 'SAMPLES', 'MUTATIONS_TOTAL', 'MUTATIONS_SNV', 'MUTATIONS_MNV', 'MUTATIONS_INS', 'MUTATIONS_DEL']

HARTWIG_MESOTHELIOMA
HARTWIG_NERVOUS_SYSTEM_GLIOBLASTOMA_MULTIFORME
PCAWG_WGS_UTERUS_ADENOCA
PCAWG_WGS_THY_ADENOCA
PCAWG_WGS_SKIN_MELANOMA
PCAWG_WGS_PROST_ADENOCA
PCAWG_WGS_STOMACH_ADENOCA
HARTWIG_NET_LUNG
HARTWIG_SKIN_SKIN_SQUAMOUS_CELL_CARCINOMA
PCAWG_WGS_ESO_ADENOCA
HARTWIG_STOMACH
PCAWG_WGS_CNS_MEDULLO
HARTWIG_UTERUS_CERVICAL
PCAWG_WGS_CERVIX_SCC
HARTWIG_SKIN_MELANOMA
HARTWIG_UTERUS_ENDOMETRIAL
HARTWIG_THYROID
HARTWIG_HEAD_AND_NECK
HARTWIG_URINARY_TRACT
HARTWIG_SMALL_INTESTINE
PCAWG_WGS_LUNG_ADENOCA
PCAWG_WGS_HEAD_SCC
HARTWIG_KIDNEY_RENAL_CELL
PCAWG_WGS_CNS_GBM
D_CM
PCAWG_WGS_LIVER_HCC
PCAWG_WGS_BREAST_ADENOCA
PCAWG_WGS_BREAST_LOBULARCA
HARTWIG_PROSTATE
D_ALL
PCAWG_WGS_KIDNEY_RCC
PCAWG_WGS_COLORECT_ADENOCA
PCAWG_WGS_KIDNEY_CHRCC
HARTWIG_BONESOFT_TISSUE
D_AML
D_RHBDS
D_ACC
PCAWG_WGS_CNS_OLIGO
PCAWG_WGS_CNS_PILOASTRO
D_HGG
HARTWIG_BILIARY
HARTWIG_COLONRECTUM
PEDCBIOP_WGS_ES_IOCURIE_2014
HARTWIG_PANCREAS
HARTWIG_NET_PANCREATIC
HARTWIG_OVARY
HARTWIG_SKIN_BASAL_CELL_CARCINOMA
HARTWIG_ES

In [4]:
annot_df.head()

Unnamed: 0,COHORT,SAMPLES,MUTATIONS_TOTAL,MUTATIONS_SNV,MUTATIONS_MNV,MUTATIONS_INS,MUTATIONS_DEL
0,HARTWIG_MESOTHELIOMA,33,111336,100261,1310,6031,3734
0,HARTWIG_NERVOUS_SYSTEM_GLIOBLASTOMA_MULTIFORME,54,371363,335086,1413,24920,9944
0,PCAWG_WGS_UTERUS_ADENOCA,38,331657,288049,1288,30366,11954
0,PCAWG_WGS_THY_ADENOCA,48,63303,59334,245,2801,923
0,PCAWG_WGS_SKIN_MELANOMA,98,7339443,7140676,121007,57184,20576


In [5]:
output_f = os.path.join(output_directory, 'cohorts_samples_mutations.txt')
annot_df.to_csv(output_f, sep='\t', header=True, index=False)