# Run filter cohorts 

Run "filter_cohorts.py" to filter files containing somatic mutations (mapped to hg38 reference genome) from different cancer cohorts.  

Filtering includes: 
1. Keep mutations in autosomal and sex chromosomes
2. Remove mutations where ref == alt
3. Keep mutations where ref == bgreference
4. Keep mutations with no N in ref, alt, or the 5-mer context
5. Remove complex indels
6. Remove mutations overlapping low mappability regions
7. Remove mutations overlapping population variants

Detailed information can be found in the Methods section "Pre-processing of cohorts".

In [1]:
import os

In [1]:
# Root directory from which the data and code paths in this notebook are built.
# Left empty here, so the derived paths are relative.
main_dir = ""

In [2]:
# Input directory holding the raw per-cohort mutation files, and output
# directory for the filtered files. Built with os.path.join (rather than
# string concatenation) so `main_dir` works with or without a trailing
# path separator.
mutations_dir = os.path.join(main_dir, 'data', 'cohorts_raw')
output_dir = os.path.join(main_dir, 'data', 'cohorts_filtered')

In [4]:
# Paths to the filtering script and to the jobs map file written further
# down; both live in the `code` folder under `main_dir`.
code, map_file = (
    os.path.join(main_dir, 'code', file_name)
    for file_name in ('filter_cohorts.py', '1_filter_cohorts.map')
)

In [5]:
# Header lines of the jobs map file, grouped by section. Entries ending in
# '\n' carry an extra newline of their own.
# NOTE(review): the section names look like a job-scheduler map format;
# confirm which tool consumes this file.
info = (
    # Resources requested for the jobs.
    ['[params]', 'cores=1', 'memory=8G\n']
    # Commands that load conda and activate the environment.
    + [
        '[pre]',
        '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
        'conda activate hotspots_framework\n',
    ]
    # Start of the jobs section; the job commands are appended when the
    # map file is written.
    + ['[jobs]']
)

In [6]:
# Write the jobs map file: the header lines from `info` first, then one
# `python {code} -i <input> -o <output>` command per gzipped cohort file
# found directly inside `mutations_dir`. Each cohort name is printed as its
# job is written.
with open(map_file, 'w') as ofd:

    # Each header entry goes on its own line.
    ofd.writelines(f'{line}\n' for line in info)

    # The context manager closes the directory iterator even if a write
    # fails part-way through the loop.
    with os.scandir(mutations_dir) as entries:
        for entry in entries:
            # Only regular files ending in '.gz' are treated as cohort
            # inputs; skip everything else before doing any other work.
            if not (entry.name.endswith('.gz') and entry.is_file()):
                continue

            cohort_mutations_file = entry.path
            # The cohort name is the file name up to its first dot.
            cohort = entry.name.split('.')[0]
            cohort_output_file = os.path.join(output_dir, f'{cohort}.filtered.in.gz')
            ofd.write(f'python {code} -i {cohort_mutations_file} -o {cohort_output_file}\n')
            print(cohort)

PCAWG_WGS_PROST_ADENOCA
PCAWG_WGS_SKIN_MELANOMA
PCAWG_WGS_STOMACH_ADENOCA
PCAWG_WGS_THY_ADENOCA
PCAWG_WGS_UTERUS_ADENOCA
PEDCBIOP_WGS_ES_IOCURIE_2014
TARGET_WGS_NBL_US
TARGET_WGS_WT_US
HARTWIG_NET_PANCREATIC
HARTWIG_NET_SMALL_INTESTINAL
HARTWIG_OVARY
HARTWIG_PANCREAS
HARTWIG_PROSTATE
HARTWIG_SKIN_BASAL_CELL_CARCINOMA
HARTWIG_SKIN_MELANOMA
HARTWIG_BILIARY
HARTWIG_BONESOFT_TISSUE
HARTWIG_BREAST
HARTWIG_COLONRECTUM
HARTWIG_ESOPHAGUS
HARTWIG_HEAD_AND_NECK
HARTWIG_KIDNEY_RENAL_CELL
HARTWIG_SKIN_SKIN_SQUAMOUS_CELL_CARCINOMA
HARTWIG_SMALL_INTESTINE
HARTWIG_STOMACH
HARTWIG_THYROID
HARTWIG_URINARY_TRACT
HARTWIG_UTERUS_CERVICAL
HARTWIG_UTERUS_ENDOMETRIAL
PCAWG_WGS_BREAST_ADENOCA
PCAWG_WGS_BREAST_LOBULARCA
PCAWG_WGS_CERVIX_SCC
PCAWG_WGS_CNS_GBM
PCAWG_WGS_CNS_MEDULLO
PCAWG_WGS_CNS_OLIGO
PCAWG_WGS_CNS_PILOASTRO
PCAWG_WGS_COLORECT_ADENOCA
PCAWG_WGS_ESO_ADENOCA
PCAWG_WGS_HEAD_SCC
PCAWG_WGS_KIDNEY_CHRCC
PCAWG_WGS_KIDNEY_RCC
PCAWG_WGS_LIVER_HCC
PCAWG_WGS_LUNG_ADENOCA
D_ACC
D_ALL
D_AML
D_CM
D_EPD
D_HGG
