# Count methylated/mutated sites per bin

In [1]:
import os

import pandas as pd

In [2]:
# Root directory that every path in this notebook is built from.
# NOTE(review): it is empty here, so the derived paths all start at '/';
# presumably it has to be set to the project root before running — confirm.
main_dir = ""

In [3]:
# Script invoked by every job line written to the map files below.
code_f = (
    f'{main_dir}/methylation/code/count_methy_mut_status.py'
)

In [4]:
# The per-chromosome input tables and the count tables produced by the jobs
# live in the same directory, so both names point at the same path.
cpg_dir = f'{main_dir}/methylation/data/fractional_methylation'
output_dir = cpg_dir

In [5]:
# Directory holding the per-bin-size genomic bin files (bin ids and
# trinucleotide-per-bin JSON) referenced by the job lines.
bins_dir = (
    f'{main_dir}/genomic_bins/data'
)

In [6]:
# Header lines shared by every map file. Each entry is written followed by a
# newline, so the entries that already end in '\n' leave a blank line after
# their section.
info = (
    # Resources requested for each job.
    ['[params]', 'cores=1', 'memory=32G\n']
    # Shell commands run before the jobs: load conda and activate the env.
    + [
        '[pre]',
        '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
        'conda activate hotspots_framework\n',
    ]
    # The job command lines are appended after this section marker.
    + ['[jobs]']
)

In [7]:
# Chromosome names chr1 .. chr22; one job is generated per autosome.
first_autosome, last_autosome = 1, 22
autosomes = [
    f'chr{number}'
    for number in range(first_autosome, last_autosome + 1)
]

## Count tissue-matched methylation & MUTATIONS

In [8]:
# Cancer type -> epigenome identifiers passed to the counting script via `-e`.
# NOTE(review): the E### values look like Roadmap Epigenomics sample ids —
# confirm against the script's expectations.
epigenomes_to_run = dict(
    COADREAD=['E084', 'E106'],
    ESOPHA_STOMACH=['E079', 'E094'],
    NSCLC=['E017', 'E096'],
)

In [9]:
# Bin sizes to generate jobs for, from the largest to the smallest bins.
binsizes = [
    f'{kilobases}kb'
    for kilobases in (1000, 500, 250, 100, 50, 25, 10)
]

In [11]:
# Map file that will list the mutation-count jobs.
map_f = (
    f'{main_dir}/methylation/code/count_methy_mut_status.map'
)

In [12]:
# Write the map file for the mutation counts: first the shared header in
# `info`, then one `python <code_f> ...` command line per
# (bin size, cancer type, autosome) combination. A progress line is printed
# once all chromosomes of a (bin size, cancer type) pair have been written.
with open(map_f, 'w') as ofd:
    ofd.writelines(f'{line}\n' for line in info)
    for bin_size in binsizes:
        # Bin files depend only on the bin size, so build their paths once.
        bins_f = f'{bins_dir}/hg38_{bin_size}_bin.nodrivers.filtered.mappable_positions.autosomes.binids.txt'
        bins_trinuc_f = f'{bins_dir}/hg38_{bin_size}_bin.nodrivers.filtered.mappable_positions.autosomes.trinuc_per_bin.json'
        for ctype, epigenomes in epigenomes_to_run.items():
            if not epigenomes:
                raise ValueError(f'No epigenomes configured for {ctype}')
            # One `-e` flag per epigenome, so the whole list is forwarded
            # instead of only its first two entries.
            # NOTE(review): assumes the script accepts any number of `-e`
            # flags (it is already called with two) — confirm.
            epigenome_flags = ' '.join(f'-e {epigenome}' for epigenome in epigenomes)
            for chromosome in autosomes:
                cpg_f = f'{cpg_dir}/SBS1_{ctype}.{chromosome}.mutation_status.gz'
                output_f = f'{output_dir}/SBS1_{ctype}.{bin_size}.{chromosome}.counts_methyl_muts.tsv'
                ofd.write(
                    f'python {code_f} {epigenome_flags} '
                    f'--bins_trinuc_f {bins_trinuc_f} --chromosome {chromosome} '
                    f'--bins_f {bins_f} --cpg_f {cpg_f} --output_file {output_f}\n'
                )
            print(bin_size, ctype)

1000kb COADREAD
1000kb ESOPHA_STOMACH
1000kb NSCLC
500kb COADREAD
500kb ESOPHA_STOMACH
500kb NSCLC
250kb COADREAD
250kb ESOPHA_STOMACH
250kb NSCLC
100kb COADREAD
100kb ESOPHA_STOMACH
100kb NSCLC
50kb COADREAD
50kb ESOPHA_STOMACH
50kb NSCLC
25kb COADREAD
25kb ESOPHA_STOMACH
25kb NSCLC
10kb COADREAD
10kb ESOPHA_STOMACH
10kb NSCLC


## Count tissue-matched methylation & HOTSPOTS

In [13]:
# Cancer type -> epigenome identifiers used for the hotspot counts
# (each list is passed to the counting script through `-e` flags).
epigenomes_to_run = dict(
    COADREAD=['E084', 'E106'],
    ESOPHA_STOMACH=['E079', 'E094'],
    NSCLC=['E017', 'E096'],
)

In [14]:
# A single bin size is enough here: this is only used to calculate
# genome-wide OR.
binsizes = [
    '1000kb',
]

In [15]:
# Map file that will list the hotspot-count jobs.
map_f = (
    f'{main_dir}/methylation/code/count_methy_hotspot_status.map'
)

In [16]:
# Write the map file for the hotspot counts: first the shared header in
# `info`, then one `python <code_f> ...` command line per
# (bin size, cancer type, autosome) combination, reading the
# `hotspot_status` tables. A progress line is printed once all chromosomes
# of a (bin size, cancer type) pair have been written.
with open(map_f, 'w') as ofd:
    ofd.writelines(f'{line}\n' for line in info)
    for bin_size in binsizes:
        # Bin files depend only on the bin size, so build their paths once.
        bins_f = f'{bins_dir}/hg38_{bin_size}_bin.nodrivers.filtered.mappable_positions.autosomes.binids.txt'
        bins_trinuc_f = f'{bins_dir}/hg38_{bin_size}_bin.nodrivers.filtered.mappable_positions.autosomes.trinuc_per_bin.json'
        for ctype, epigenomes in epigenomes_to_run.items():
            if not epigenomes:
                raise ValueError(f'No epigenomes configured for {ctype}')
            # One `-e` flag per epigenome, so the whole list is forwarded
            # instead of only its first two entries.
            # NOTE(review): assumes the script accepts any number of `-e`
            # flags (it is already called with two) — confirm.
            epigenome_flags = ' '.join(f'-e {epigenome}' for epigenome in epigenomes)
            for chromosome in autosomes:
                cpg_f = f'{cpg_dir}/SBS1_{ctype}.{chromosome}.hotspot_status.gz'
                output_f = f'{output_dir}/SBS1_{ctype}.{bin_size}.{chromosome}.counts_methyl_hotspots.tsv'
                ofd.write(
                    f'python {code_f} {epigenome_flags} '
                    f'--bins_trinuc_f {bins_trinuc_f} --chromosome {chromosome} '
                    f'--bins_f {bins_f} --cpg_f {cpg_f} --output_file {output_f}\n'
                )
            print(bin_size, ctype)

1000kb COADREAD
1000kb ESOPHA_STOMACH
1000kb NSCLC
