# Run filtering of mutations

Filter mutations to those within autosomal 1 Mbp mappable bins of the genome (drivers excluded). Note that the mutations included in the analysis have already been filtered to mappable genome positions. Here we subset them to those falling within 1 Mbp mappable bins on the autosomes.

In [1]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd

In [2]:
# Root directory that every path in this notebook is built from.
# NOTE(review): it is empty here, so the derived paths start with '/' —
# presumably a placeholder to fill in before running; confirm.
main_dir = ''

In [3]:
# Directory containing the per-cancer-type input files
# (`<cancertype>_SBS96_total_maxprob.originalref.tsv`) that are passed to the
# filter script through its -i option.
input_path_mutations = f'{main_dir}/signatures/assign_muts_to_sigs/data'

In [4]:
# Bins file (1 Mbp autosomal mappable bins, drivers excluded) that is passed to
# the filter script through its -b option.
bins_f = f'{main_dir}/genomic_bins/data/hg38_1000kb_bin.nodrivers.filtered.mappable_positions.autosomes.bed.gz'

In [5]:
# Script invoked once per cancer type by the generated job commands.
code_f = f'{main_dir}/propensity/code/filter_muts_mappable_bins.py'
# Path of the job map file that this notebook writes.
map_file = f'{main_dir}/propensity/code/1_filter_muts_mappable_bins.map'

In [6]:
# Base output directory; the filtered files are written to the
# `filtered_mutations` subdirectory created below.
output_path = f'{main_dir}/propensity/data'

In [7]:
# Header lines of the job map file, grouped by section:
#   [params] - resource settings (cores, memory)
#   [pre]    - shell commands that source conda and activate the environment
#   [jobs]   - marker after which the job commands are appended
# The trailing '\n' on the last entry of a section leaves a blank line after
# it, because every entry is written followed by its own newline.
info = (
    ['[params]', 'cores=1', 'memory=24G\n']
    + [
        '[pre]',
        '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
        'conda activate hotspots_framework\n',
    ]
    + ['[jobs]']
)

In [8]:
# Cancer types to process, each mapped to a list of SBS signature names.
# Only the keys are used when generating the job commands in this notebook.
data_to_run = {
    'BLADDER_URI': ['SBS2', 'SBS13', 'SBS5', 'SBS1'],
    'BRCA': ['SBS5', 'SBS1', 'SBS2', 'SBS13', 'SBS3'],
    'COADREAD': [
        'SBS1', 'SBS40', 'SBS17b', 'SBS5', 'SBS18', 'SBS93', 'SBS17a',
    ],
    'ESOPHA_STOMACH': ['SBS17b', 'SBS17a', 'SBS5', 'SBS1'],
    'NSCLC': ['SBS4', 'SBS5', 'SBS40', 'SBS2', 'SBS13', 'SBS1'],
    'PROSTATE': ['SBS5', 'SBS1', 'SBS8'],
    'SKCM': ['SBS7a', 'SBS7b'],
}

In [9]:
# Write the job map file: the header lines from `info`, followed by one
# filtering command per cancer type in `data_to_run`.

# Create the output directory before touching the map file, so a failure here
# does not leave a truncated map behind. exist_ok=True already tolerates an
# existing directory, so no separate os.path.exists() check is needed.
output_dir = f'{output_path}/filtered_mutations'
os.makedirs(output_dir, exist_ok=True)

with open(map_file, 'w') as ofd:
    # Header: each entry on its own line.
    for line in info:
        ofd.write(f'{line}\n')

    # One job per cancer type; the signature lists in `data_to_run` are not
    # read here, only the keys.
    for cancertype in data_to_run:
        input_f = os.path.join(input_path_mutations, f'{cancertype}_SBS96_total_maxprob.originalref.tsv')
        output_f = f'{output_dir}/{cancertype}_SBS96_total_maxprob.originalref.hg38_1000kb_autosomes_bin.tsv'
        ofd.write(f'python {code_f} -i {input_f} -o {output_f} -b {bins_f}\n')