# Merge cohorts into cancer types

Write somatic mutations from individual cohorts into cancer type files

In [2]:
import os
from collections import defaultdict

import pandas as pd

In [3]:
main_dir = ''

In [4]:
ctypes_annotations = os.path.join(main_dir, 'tables', 'cancer_types_annotated.tsv')
cohorts_annotations = os.path.join(main_dir, 'tables', 'cohorts_annotated.tsv')

#### Read cancer types for analysis and their cohorts

In [5]:
ctypes_for_analysis_df = pd.read_csv(ctypes_annotations, sep='\t', header=0)
ctypes_for_analysis_df.head()

Unnamed: 0,CANCER_TYPE,CANCER_TYPE_LONG,LEVEL_ONCOTREE,LEVEL_ANALYSIS,SAMPLES,COHORTS,MUTATIONS_TOTAL,MUTATIONS_SNV,MUTATIONS_MNV,MUTATIONS_INS,MUTATIONS_DEL
0,ACC,Adrenocortical Carcinoma (ACC),2,A,20,D_ACC,16649,16614,0,26,9
1,ALL,Acute lymphoblastic leukemia (ALL),3,A,278,D_ALL,154660,154095,198,145,222
2,LNM,Lymphoid Neoplasm (LNM),2,B,1420,"D_ALL, PCAWG_WGS_LYMPH_CLL, HARTWIG_LYMPHOID, ...",1723396,1596077,6705,74059,46555
3,EPM,Ependymoma (EPM),3,A,39,D_EPD,9158,9114,0,34,10
4,BRAIN,CNS/Brain (BRAIN),1,B,475,"D_EPD, HARTWIG_NERVOUS_SYSTEM_GLIOBLASTOMA_MUL...",958034,877915,2716,50214,27189


In [6]:
cohorts_annotations_df = pd.read_csv(cohorts_annotations, sep='\t', header=0)
all_cohorts = cohorts_annotations_df['COHORT'].tolist()
len(all_cohorts)

78

In [7]:
cohorts_per_cancertype = {}
for _, row in ctypes_for_analysis_df.iterrows(): 
    cohorts_per_cancertype[row['CANCER_TYPE']] = row['COHORTS'].split(',')

In [8]:
cohorts_per_cancertype['BRAIN']

['D_EPD',
 ' HARTWIG_NERVOUS_SYSTEM_GLIOBLASTOMA_MULTIFORME',
 ' PCAWG_WGS_CNS_GBM',
 ' D_HGG',
 ' PCAWG_WGS_CNS_OLIGO',
 ' D_LGG',
 ' PCAWG_WGS_CNS_MEDULLO',
 ' D_MB',
 ' PCAWG_WGS_CNS_PILOASTRO']

In [9]:
# Add PANCANCER
cohorts_per_cancertype['PANCANCER'] = all_cohorts

#### Write qmap file

In [10]:
code = os.path.join(main_dir, 'code', 'merge_cohorts.py')
map_file = os.path.join(main_dir, 'code', '4_merge_cohorts.map')

In [11]:
info = [
    '[params]',
    'cores=1',
    'memory=8G\n',
    '[pre]',
    '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
    'conda activate hotspots_framework\n',
    '[jobs]',
]

In [12]:
mutations_subdirectories = [
    (os.path.join(main_dir, 'data', 'cohorts_filtered'), os.path.join(main_dir, 'data', 'cancertypes_filtered'), '.filtered.in.gz')
]

In [13]:
with open(map_file, 'w') as ofd: 
    
    for line in info: 
        ofd.write(f'{line}\n')
    
    for input_directory, output_directory, sufix in mutations_subdirectories: 
        for cancertype, cohorts in cohorts_per_cancertype.items(): 
            output_file = os.path.join(output_directory, f'{cancertype}{sufix}')
            # Load cohorts
            cohorts_string = ''
            for cohort in cohorts: 
                cohorts_string += f'-co {cohort} '
            
            ofd.write(f'python {code} -i {input_directory} -a {cohorts_annotations} -ct {cancertype} -o {output_file} {cohorts_string}\n')