# This creates bigbed files from encode peak files
- This assumes that ENCODE peak files have the log10 p-value and log2 fold change as the 4th and 5th columns. The script overwrites these columns with a placeholder name string and a score of 0, because bedToBigBed does not accept arbitrary numbers in these fields.
- This also assumes that peak files are unfiltered, so fold-change and p-value cutoffs are applied here.

In [1]:
import pandas as pd
import os
import json
import yaml
import glob
# import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from encode import manifest_helpers as m

from tqdm import tnrange, tqdm_notebook
pd.set_option("display.max_colwidth", 10000)

In [1]:
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py'
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se_renamed/'

# Lookup table of file locations, keyed by peak type ('peak' = individual
# peaks, 'idr' = IDR peaks). Each entry holds the bigbed output directory,
# the CLIP manifest, a short prefix, and the directory holding the input
# peak files.
params = dict(
    peak=dict(
        output_dir='/projects/ps-yeolab3/bay001/maps/current_annotations/se_peak_bigbeds/',
        clip_manifest='/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
        prefix='peak',
        peak_dir='/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_FINALforpapers_20170325',
    ),
    idr=dict(
        output_dir='/projects/ps-yeolab3/bay001/maps/current_annotations/se_idr_peak_bigbeds2/',
        clip_manifest='/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_FINALforpapers_20180205/hg19/ENCODE_FINAL_ANNOTATIONS.uidsonly.txt.manifest.txt',
        prefix='idr',
        peak_dir='/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_FINALforpapers_20180205/hg19/IDR/',
    ),
)

# Create a directory that contains all of the qsub bash scripts 

In [5]:
current_date = '2-26-2018'
bash_scripts_dir = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}'.format(current_date)
! mkdir $bash_scripts_dir

mkdir: cannot create directory `/projects/ps-yeolab3/bay001/maps/bash_scripts/2-26-2018': File exists


In [6]:
# Minimum -log10(p-value) and log2(fold change) a peak must reach to be kept.
# 0 / 0 keeps every peak whose l10p and l2fc are both non-negative.
p = 0
fc = 0
# Column layout of the input-normalized peak files read below.
inputnormed_names = ['chrom', 'start', 'end', 'l10p', 'l2fc', 'strand']


def filter_and_change_namefield(fn, out_file, p, fc):
    """
    Filter a peak file and rewrite it as a BED6 file that bedToBigBed accepts.

    The input is read as a headerless tab-separated file whose columns are
    ``inputnormed_names`` (chrom, start, end, l10p, l2fc, strand). Rows are
    kept when ``l10p >= p`` and ``l2fc >= fc``. In the output, field 4 (name)
    is replaced with the string 'peak' and field 5 (score) with 0, so the
    original l10p / l2fc values are NOT preserved in ``out_file``.

    :param fn: path of the input peak file.
    :param out_file: path the filtered, tab-separated, headerless file is
        written to (overwritten if it exists).
    :param p: minimum value (inclusive) of the l10p column.
    :param fc: minimum value (inclusive) of the l2fc column.
    :return: None
    """
    df = pd.read_table(fn, names=inputnormed_names)
    passes_cutoffs = (df['l10p'] >= p) & (df['l2fc'] >= fc)
    # Take an explicit copy of the filtered rows: assigning columns on a
    # boolean-indexed slice triggers pandas' SettingWithCopyWarning.
    df = df.loc[passes_cutoffs].copy()
    df['l10p'] = 'peak'
    df['l2fc'] = 0
    df.to_csv(out_file, sep='\t', index=False, header=False)


224


# Get all peak files

In [None]:
# Collect the IDR peak files (merged, blacklist-removed BEDs) to convert.
# To convert the individual peak files instead, glob for '*.compressed.bed'
# under params['peak']['peak_dir'].
all_peaks = glob.glob(
    os.path.join(
        params['idr']['peak_dir'],
        '*.out.0102merged.bed.blacklist_removed.bed'
    )
)
# Report how many peak files were found.
print(len(all_peaks))

# For every peak file:
1. Filter on l2fc and l10p, then replace the name field with the string 'peak' and the score field with 0
2. Sort by chromosome and position
3. Call bedToBigBed 

In [7]:
chrom_sizes = '/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes'
progress = tnrange(len(all_peaks))

for peak in all_peaks:
    out_file = os.path.join(params['idr']['output_dir'], os.path.basename(peak) + '.p{}f{}.bed'.format(p, fc))
    sorted_bed_file = out_file + '.sorted.bed'
    bb_file = sorted_bed_file + '.bb'
    
    filter_and_change_namefield(peak, out_file, p, fc)
    ! sort -k1,1 -k2,2n $out_file > $sorted_bed_file
    ! bedToBigBed $sorted_bed_file $chrom_sizes $bb_file
    progress.update(1)

pass1 - making usageList (23 chroms): 29 millis
pass2 - checking and writing primary data (1140 records, 6 fields): 8 millis
pass1 - making usageList (22 chroms): 1 millis
pass2 - checking and writing primary data (438 records, 6 fields): 5 millis
pass1 - making usageList (25 chroms): 5 millis
pass2 - checking and writing primary data (8703 records, 6 fields): 29 millis
pass1 - making usageList (24 chroms): 2 millis
pass2 - checking and writing primary data (1257 records, 6 fields): 8 millis
pass1 - making usageList (24 chroms): 2 millis
pass2 - checking and writing primary data (2397 records, 6 fields): 11 millis
pass1 - making usageList (24 chroms): 1 millis
pass2 - checking and writing primary data (491 records, 6 fields): 5 millis
pass1 - making usageList (23 chroms): 4 millis
pass2 - checking and writing primary data (6406 records, 6 fields): 22 millis
pass1 - making usageList (23 chroms): 1 millis
pass2 - checking and writing primary data (134 records, 6 fields): 4 millis
pass1 -

pass1 - making usageList (21 chroms): 1 millis
pass2 - checking and writing primary data (78 records, 6 fields): 3 millis
pass1 - making usageList (23 chroms): 7 millis
pass2 - checking and writing primary data (10684 records, 6 fields): 34 millis
pass1 - making usageList (25 chroms): 2 millis
pass2 - checking and writing primary data (1083 records, 6 fields): 7 millis
pass1 - making usageList (24 chroms): 1 millis
pass2 - checking and writing primary data (1002 records, 6 fields): 8 millis
pass1 - making usageList (24 chroms): 2 millis
pass2 - checking and writing primary data (2359 records, 6 fields): 11 millis
pass1 - making usageList (24 chroms): 3 millis
pass2 - checking and writing primary data (4455 records, 6 fields): 16 millis
pass1 - making usageList (10 chroms): 1 millis
pass2 - checking and writing primary data (29 records, 6 fields): 2 millis
pass1 - making usageList (25 chroms): 4 millis
pass2 - checking and writing primary data (5755 records, 6 fields): 20 millis
pass1 -

pass1 - making usageList (23 chroms): 2 millis
pass2 - checking and writing primary data (1781 records, 6 fields): 9 millis
pass1 - making usageList (23 chroms): 1 millis
pass2 - checking and writing primary data (644 records, 6 fields): 6 millis
pass1 - making usageList (23 chroms): 2 millis
pass2 - checking and writing primary data (2105 records, 6 fields): 10 millis
pass1 - making usageList (24 chroms): 3 millis
pass2 - checking and writing primary data (4135 records, 6 fields): 16 millis
pass1 - making usageList (23 chroms): 5 millis
pass2 - checking and writing primary data (4591 records, 6 fields): 15 millis
pass1 - making usageList (23 chroms): 11 millis
pass2 - checking and writing primary data (22728 records, 6 fields): 66 millis
pass1 - making usageList (25 chroms): 5 millis
pass2 - checking and writing primary data (6549 records, 6 fields): 21 millis
pass1 - making usageList (24 chroms): 3 millis
pass2 - checking and writing primary data (3936 records, 6 fields): 15 millis
p

pass1 - making usageList (23 chroms): 11 millis
pass2 - checking and writing primary data (20334 records, 6 fields): 60 millis
pass1 - making usageList (24 chroms): 4 millis
pass2 - checking and writing primary data (5812 records, 6 fields): 21 millis
pass1 - making usageList (24 chroms): 5 millis
pass2 - checking and writing primary data (7852 records, 6 fields): 26 millis
pass1 - making usageList (24 chroms): 1 millis
pass2 - checking and writing primary data (455 records, 6 fields): 6 millis
pass1 - making usageList (24 chroms): 2 millis
pass2 - checking and writing primary data (862 records, 6 fields): 6 millis
pass1 - making usageList (24 chroms): 5 millis
pass2 - checking and writing primary data (9242 records, 6 fields): 17 millis
pass1 - making usageList (23 chroms): 1 millis
pass2 - checking and writing primary data (774 records, 6 fields): 6 millis
pass1 - making usageList (13 chroms): 1 millis
pass2 - checking and writing primary data (53 records, 6 fields): 3 millis
pass1 -