# Generate sub-megabase bin annotations

Take the filtered, mappable 1 Mb bins located on autosomes and split each of them into smaller bins of 500, 250, 100, 50, 25 and 10 kb.

In [1]:
from collections import defaultdict
import gzip
import json

import numpy as np
import pandas as pd

In [2]:
# Chromosome names 'chr1' ... 'chr22'
autosomes = [f'chr{number}' for number in range(1, 23)]

In [5]:
# Base directory prepended to the '/data/...' input and output paths built below.
# NOTE(review): it is empty here, so those paths resolve from the filesystem
# root ('/data/...') -- set it to the project directory before running; confirm.
main_dir = ''

In [6]:
# Input table of 1 Mb bins; it is read further down with pandas as a
# tab-separated file whose first row is the header.
bins_file = f'{main_dir}/data/hg38_1000kb_bin.nodrivers.filtered.all_positions.bed.gz'

In [7]:
# Read the table of 1 Mb bins (tab-separated, first row holds the column names)
bins_df = pd.read_csv(
    bins_file,
    sep='\t',
    header=0,
)
# Number of distinct bin identifiers (a missing value, if any, counts as one)
bins_df['BINID'].nunique(dropna=False)

2308

In [8]:
# Keep only the rows whose chromosome is listed in `autosomes`
bins_df = bins_df[bins_df['CHR'].isin(autosomes)]
# Number of distinct bin identifiers left after the filter
bins_df['BINID'].nunique(dropna=False)

2196

In [9]:
# Distinct 1 Mb bin identifiers, in order of first appearance in the table
megabases = bins_df['BINID'].unique().tolist()
# Peek at the first identifier to show the 'chrom:start-end' format
megabases[0]

'chr1:4000000-5000000'

In [10]:
# Sub-megabase bin sizes, in bp (500 kb down to 10 kb)
submegabases = [500_000, 250_000, 100_000, 50_000, 25_000, 10_000]

In [11]:
# Build a mapping {bin size in bp, as a string -> list of bin identifiers}.
# The 1 Mb identifiers are stored as they are; every smaller size is obtained
# by cutting each 1 Mb bin into consecutive pieces of that size.
subm_annotations_d = {str(1000000): megabases}

for subm in submegabases:
    sub_bins = []
    subm_annotations_d[str(subm)] = sub_bins
    for parent_bin in megabases:
        # Identifiers look like 'chr1:4000000-5000000'
        chrom, coords = parent_bin.split(':')
        first, last = coords.split('-')
        first, last = int(first), int(last)

        # Start coordinate of every piece inside the parent bin
        piece_starts = range(first, last, subm)
        # All pieces but the last one span exactly `subm` bp ...
        sub_bins.extend(f'{chrom}:{s}-{s + subm}' for s in piece_starts[:-1])
        # ... while the last one is closed at the end of the parent bin
        sub_bins.append(f'{chrom}:{piece_starts[-1]}-{last}')

In [12]:
# For each bin size, print the size in kb and then the number of bins
for subm in [1000000, *submegabases]:
    print(subm / 1000, len(subm_annotations_d[str(subm)]), sep='\n')

1000.0
2196
500.0
4392
250.0
8784
100.0
21960
50.0
43920
25.0
87840
10.0
219600


In [13]:
# Expected number of bins per sub-megabase size: every 1 Mb bin contributes
# 2, 4, 10, 20, 40 and 100 pieces for 500, 250, 100, 50, 25 and 10 kb
tuple(len(megabases) * pieces for pieces in (2, 4, 10, 20, 40, 100))

(4392, 8784, 21960, 43920, 87840, 219600)

In [14]:
# Store the {bin size -> identifiers} mapping as JSON for quick access
output_file = f'{main_dir}/data/hg38_1000kb_submegabases.nodrivers.filtered.autosomes.json'
with open(output_file, mode='wt') as ofd:
    ofd.write(json.dumps(subm_annotations_d))

In [17]:
# Write one gzipped, tab-separated table per bin size (1 Mb included), with
# columns CHR, START, END and BINID; the size label is printed as progress
for subm in [1000000, *submegabases]:
    subm_name = f'{int(subm/1000)}kb'
    output_file = f'{main_dir}/data/hg38_{subm_name}_bin.nodrivers.filtered.all_positions.autosomes.bed.gz'
    print(subm_name)
    with gzip.open(output_file, 'wt') as ofd:
        print('CHR', 'START', 'END', 'BINID', sep='\t', file=ofd)
        for binid in subm_annotations_d[str(subm)]:
            # Recover the coordinates from the 'chrom:start-end' identifier
            chrom, start_end = binid.split(':')
            start, end = start_end.split('-')
            print(chrom, start, end, binid, sep='\t', file=ofd)

1000kb
500kb
250kb
100kb
50kb
25kb
10kb
