# Compute the overlap of a genomic bin with the mappable genome

Get genomic bins of a given length (1 Mbp) and compute their overlap with the mappable genome.

The script runs per chromosome to speed up the analysis. 

For a given bin size (here 1 Mbp), the output is 24 files (one per chromosome), each containing the bin coordinates and the size of the overlap for every bin in that chromosome.

In [1]:
import pandas as pd

In [2]:
# Base directory prefixed to every path built below. With the empty string the
# resulting paths start at '/', so set this before running the notebook.
main_dir = ''

In [3]:
# The bin files are read from, and the per-chromosome results written to,
# the same directory.
input_dir = f'{main_dir}/genomic_bins/data'
output_dir = input_dir

In [4]:
# Mappable-genome file handed to every job through --mappable_genome_f.
mappable_genome_file = f'{main_dir}/data/hg38_mappable_genome.nodrivers.tsv.gz'

In [5]:
# map_file: job list written by this notebook (header + one command per line).
# code_file: Python script that each job command invokes.
map_file = f'{main_dir}/code/compute_mappablegenome_overlap_nodrivers.map'
code_file = f'{main_dir}/code/compute_mappablegenome_overlap_as_bed.py'

In [6]:
# Header lines of the job map file, grouped by section. Every entry is later
# written followed by a newline, so an entry that itself ends in '\n' leaves a
# blank line after its section.
info = (
    # Resources requested for the jobs.
    ['[params]', 'cores=1', 'memory=16G\n']
    # Commands that load conda and activate the environment.
    + [
        '[pre]',
        '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
        'conda activate hotspots_framework\n',
    ]
    # Section marker after which the job command lines are appended.
    + ['[jobs]']
)

In [7]:
# Genome build label; used as the prefix of the bin and output file names.
genome = 'hg38'

In [8]:
# Bin lengths to process, in bp. One job per chromosome is generated for
# every length listed here.
bin_sizes = [1_000_000]

In [9]:
# Chromosome names chr1..chr22 followed by chrX and chrY (24 in total).
chromosomes = [f'chr{name}' for name in [*range(1, 23), 'X', 'Y']]

In [10]:
# Write the job map file: the header lines from `info`, then one command line
# per (chromosome, bin size) pair.
with open(map_file, 'w') as ofd:
    # Each header entry goes on its own line.
    ofd.writelines(f'{entry}\n' for entry in info)

    for chromosome in chromosomes:
        for bin_size in bin_sizes:
            # File names carry the bin size in kb (1000000 bp -> '1000kb').
            prefix = f'{genome}_{round(bin_size/1e3)}kb_bin'
            bins_f = f'{input_dir}/{prefix}.bed.gz'
            output_f = f'{output_dir}/{prefix}.nodrivers.{chromosome}.bed.stats'
            command = (
                f'python {code_file} --chromosome {chromosome}'
                f' --bins_f {bins_f}'
                f' --mappable_genome_f {mappable_genome_file}'
                f' --output_f {output_f}\n'
            )
            ofd.write(command)
        # Progress trace: printed once all jobs of this chromosome are written.
        print(chromosome)

chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
