# Filter bins based on mappable genome overlap

Filter out bins with low overlap with the mappable genome

- Read the overlap of each bin with the mappable genome
- Compute, for each bin, the fraction of the bin that overlaps the mappable genome, and obtain the distribution of these fractions
- Filter out low-mappability bins, keeping only those whose overlap fraction is strictly above the first quartile (25th percentile) of the overlap-fraction distribution

The output consists of the highly mappable 1 Mbp bins with their original start and end coordinates (original size).

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Chromosome labels: chr1..chr22 followed by the sex chromosomes chrX and chrY.
chromosomes = [f'chr{label}' for label in [*range(1, 23), 'X', 'Y']]
# Autosomes are every chromosome except the two sex chromosomes.
autosomes = [chrom for chrom in chromosomes if chrom not in ('chrX', 'chrY')]

In [3]:
# Root directory of the project; the `data` folder used for input and output
# is resolved from this path in the cells below.
# NOTE(review): left as an empty string here — presumably a placeholder that
# must be set to the real project root before running; confirm.
main_dir = ''

In [4]:
# Input and output share the same `data` folder under the project root.
# os.path.join is used instead of f'{main_dir}/data' so that an empty
# main_dir yields the relative path 'data' rather than '/data' at the
# filesystem root, matching how the output file path is built further down.
input_dir = output_dir = os.path.join(main_dir, 'data')

In [5]:
# Chromosomes to process: chr1..chr22 followed by chrX and chrY.
# This rebuilds the same list that was already defined in an earlier cell.
chromosomes = list(map('chr{}'.format, [*range(1, 23), 'X', 'Y']))

In [6]:
# For each bin size: load the per-chromosome overlap statistics, compute the
# fraction of every bin covered by the mappable genome, keep the bins whose
# fraction is strictly above the 25th percentile, and write them to a gzipped
# tab-separated file.
for bin_size in [1000000]:

    bin_length_str = f'{round(bin_size/1e3)}kb'
    output_f = os.path.join(main_dir, 'data', f'hg38_{bin_length_str}_bin.nodrivers.filtered.all_positions.bed.gz')
    print(bin_length_str)

    # Read overlap and merge all data in a single table
    lines = []
    for chrom in chromosomes:
        file = f'{input_dir}/hg38_{bin_length_str}_bin.nodrivers.{chrom}.bed.stats'
        df = pd.read_csv(file, sep='\t', header=0)
        # Overlap is expressed relative to the nominal bin size, not END - START.
        # Vectorised division gives the same values as a row-wise apply and
        # also works when a chromosome table has no rows.
        df['FRACTION'] = df['BP_OVERLAP'] / bin_size
        lines.append(df)
    results = pd.concat(lines)
    print(f'Bins before filtering: {len(results)}')

    # Filter: keep bins strictly above the first quartile of the overlap fraction
    threshold = np.percentile(results['FRACTION'], 25)
    print(f'Fraction overlap threshold: {threshold}')
    filtered_results = results.loc[results['FRACTION'] > threshold].copy()
    print(f'Bins after filtering: {len(filtered_results)}')
    bins_in_autosomes = len(filtered_results.loc[~filtered_results['CHR'].isin(['chrY', 'chrX'])])
    print(f'Bins after filtering (only autosomes): {bins_in_autosomes}')

    # Sort and save
    # The previous standalone sort_values(by='START') discarded its result and
    # had no effect, so only the (CHR, START) sort is kept.
    # NOTE(review): CHR is sorted with the column's natural ordering; if it
    # holds labels such as 'chr1', 'chr10', 'chr2' the order is lexicographic —
    # confirm that downstream consumers expect this.
    filtered_results = (
        filtered_results[['CHR', 'START', 'END', 'BINID']]
        .sort_values(by=['CHR', 'START'])
    )
    filtered_results.to_csv(output_f, sep='\t', header=True, index=False, compression='gzip')

1000kb
Bins before filtering: 3078
Fraction overlap threshold: 0.8036232499999999
Bins after filtering: 2308
Bins after filtering (only autosomes): 2196
