# Compute mappable positions for filtered bins

For each bin size between 1 Mbp and 10 Kbp: 

- Read the file of filtered, highly mappable bins
- Intersect it with the mappable genome 
- Save coordinates per bin overlapping the mappable genome

The output consists of highly mappable genomic bins with filtered start and end coordinates (bin sizes are no longer equal among bins in the same file).

In [1]:
import gzip 

import pandas as pd
from bgreference import hg38

In [2]:
# Base directory of the project; the data and code paths below are built from it
main_dir = ''

In [3]:
# Bin files are read from the data directory; results are written to the same place
input_dir = '{}/data'.format(main_dir)
output_dir = input_dir

In [4]:
# Mappable genome file passed to every job via --mappable_genome_f
mappable_genome_file = '{}/data/hg38_mappable_genome.nodrivers.tsv.gz'.format(main_dir)

In [5]:
# Job map file written by this notebook
map_file = '{}/code/mappable_pos_per_bin_nodrivers.map'.format(main_dir)
# Script invoked once per bin size by the commands in the map file
code_file = '{}/code/mappable_pos_per_bin.py'.format(main_dir)

In [6]:
# Header lines of the job map file, written before the per-bin commands.
# A trailing '\n' on an entry leaves a blank line after its section.
info = (
    # Resources section
    ['[params]', 'cores=1', 'memory=32G\n']
    # Environment set-up section
    + [
        '[pre]',
        '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
        'conda activate hotspots_framework\n',
    ]
    # Section marker under which the commands are appended
    + ['[jobs]']
)

In [7]:
# Genome build label; used as the prefix of the bin file names
genome = 'hg38'

#### Run 1 Mb and below

In [8]:
# Bin lengths (in bp) for which a job is generated, from 1 Mb down to 10 kb
bin_sizes = [1_000_000, 500_000, 250_000, 100_000, 50_000, 25_000, 10_000]

In [9]:
# Write the job map file: first the header lines from `info`, then one
# `python <code_file> ...` command per bin size.
with open(map_file, 'w') as ofd:
    for line in info:
        ofd.write(f'{line}\n')

    for bin_size in bin_sizes:
        # Bin size label in kb as it appears in the file names (1000000 -> 1000)
        size_kb = round(bin_size / 1e3)
        prefix = f'{genome}_{size_kb}kb_bin.nodrivers.filtered'

        bins_f = f'{input_dir}/{prefix}.all_positions.autosomes.bed.gz'
        # Build the output path from `output_dir` (not `input_dir`) so that
        # changing the output location actually takes effect.
        output_f = f'{output_dir}/{prefix}.mappable_positions.autosomes.bed.gz'

        ofd.write(
            f'python {code_file} --bins_f {bins_f} '
            f'--mappable_genome_f {mappable_genome_file} --output_f {output_f}\n'
        )
        # Progress trace: one line per bin size queued
        print(bin_size)

1000000
500000
250000
100000
50000
25000
10000
