# Filter and reformat CTCF binding sites annotations

For a file containing CTCF binding sites from ChIP-seq data, filter those sites that: 
- Overlap autosomes
- Have a length of 200-600 bp
- 90% or more of their sequence overlaps mappable genome (mappable, non-blacklisted, no SNPs, no drivers)

In [1]:
import os, sys
import glob
os.environ["PATH"] = os.path.dirname(sys.executable) + os.pathsep + os.environ["PATH"]
import time

from intervaltree import IntervalTree
import pandas as pd
import pybedtools

In [2]:
autosomes = list(map(lambda x: f'chr{x}', range(1, 23)))

In [3]:
main_dir = ''

### Load data

In [4]:
mappable_genome_f = f'{main_dir}/mappable_genome/data/hg38_mappable_genome.nodrivers.tsv.gz'

In [5]:
%%time

mappable_genome_df = pd.read_csv(mappable_genome_f, sep='\t', header=0, low_memory=False)
mappable_genome_df.loc[mappable_genome_df['CHR'].isin(autosomes)]
mappable_genome_df['CHR'] = mappable_genome_df.apply(lambda x: x['CHR'][3:], axis=1)
mappable_genome_df['START'] = mappable_genome_df.apply(lambda x: x['START'] - 1, axis=1)    # BED format
mappable_genome_bed = pybedtools.BedTool.from_dataframe(mappable_genome_df)

CPU times: user 15min 20s, sys: 4.6 s, total: 15min 24s
Wall time: 15min 44s


In [6]:
input_files = glob.glob(f'{main_dir}/ctcf/data/inputs/*.bed.gz')

### Intersect

In [8]:
%%time

for input_f in input_files: 
    
    # Read
    df = pd.read_csv(input_f, sep='\t', header=None)
    df = df[[0,1,2]]
    df.columns = ['CHR', 'START', 'END']
    df['ID'] = df.apply(lambda x: x['CHR']+':'+str(x['START'])+'-'+str(x['END']), axis=1)

    # Filter autosomes
    df = df.loc[df['CHR'].isin(autosomes)].copy()

    # Filter length
    df['LENGTH'] = df.apply(lambda x: x['END']- x['START'] + 1, axis=1)
    df = df.loc[df['LENGTH'] <= 600].copy()

    # Reformat to BED
    df['CHR'] = df.apply(lambda x: x['CHR'][3:], axis=1)
    ctcf_bed = pybedtools.BedTool.from_dataframe(df)

    intersect_bed = ctcf_bed.intersect(mappable_genome_bed, wao=True)
    intersect_df = pd.read_csv(intersect_bed.fn, sep='\t', header=None)

    length_d = dict(list(zip(intersect_df[3], intersect_df[4])))

    # Compute the total number of bp overlapping each binding site
    overlap = dict()
    for element, data in intersect_df.groupby(3): 
        total_bp = data[8].sum()
        overlap[element] = total_bp

    # Filter based on overlap
    overlap_filter = dict([(k, v) for k, v in overlap.items() if v/length_d[k] > 0.90])
    results = df.loc[df['ID'].isin(overlap_filter.keys())]

    # Save
    output_f = input_f.split('.gz')[0] + '.reformat.txt'
    results.to_csv(output_f, sep='\t', index=False)    

CPU times: user 53.3 s, sys: 212 ms, total: 53.5 s
Wall time: 1min 32s
