# Writes metagene map commands into array-job scripts for TSCC (scripts are written, not submitted)

In [1]:
import pandas as pd
import os
import json
import yaml
import glob
# import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from encode import manifest_helpers as m

from tqdm import tnrange, tqdm_notebook
# Raise pandas' "display.max_colwidth" option to 10000 so that long cell
# values are not truncated when a DataFrame is displayed.
pd.set_option("display.max_colwidth", 10000)

In [2]:
# Path of the plotting script; the job cells prepend "python " to it when
# they assemble each command line.
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_map.py'

# Settings keyed by read type. Every entry carries the folder that is globbed
# for *.bb peak files, the folder the maps are written to, the CLIP manifest
# table and the prefix used in job and script names.
params = {}

params['peak'] = dict(
    peak_dir='/projects/ps-yeolab3/bay001/maps/current_annotations/se_peak_bigbeds/',
    output_dir='/projects/ps-yeolab3/bay001/maps/current/metagene/',
    clip_manifest='/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
    prefix='peak_bb',
)

params['idr-peak'] = dict(
    peak_dir='/projects/ps-yeolab3/bay001/maps/current_annotations/se_idr_peak_bigbeds/',
    output_dir='/projects/ps-yeolab3/bay001/maps/current/idr_metagene/',
    clip_manifest='/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
    prefix='idr_peak_bb',
)

In [6]:
# Date tag of this run; it names the folder the bash scripts are written to.
current_date = '1-28-2017'
bash_scripts_dir = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}'.format(current_date)

# Folder holding the merged, per-cell-line annotation BED files.
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/'

# Each region is assigned as a (HepG2, K562) pair of BED paths.
utr5_hepg2, utr5_k562 = (
    os.path.join(annotation_dir, 'hg19_v19_five_prime_utrs.transcripts.bed.HepG2_merged.bed'),
    os.path.join(annotation_dir, 'hg19_v19_five_prime_utrs.transcripts.bed.K562_merged.bed'),
)

cds_hepg2, cds_k562 = (
    os.path.join(annotation_dir, 'hg19_v19_cds.transcripts.bed.HepG2_merged.bed'),
    os.path.join(annotation_dir, 'hg19_v19_cds.transcripts.bed.K562_merged.bed'),
)

utr3_hepg2, utr3_k562 = (
    os.path.join(annotation_dir, 'hg19_v19_three_prime_utrs.transcripts.bed.HepG2_merged.bed'),
    os.path.join(annotation_dir, 'hg19_v19_three_prime_utrs.transcripts.bed.K562_merged.bed'),
)

# Make sure the per-date script folder exists before any script is written.
# os.makedirs also creates missing parent folders and, unlike the `! mkdir`
# shell escape, does not hand the path to a shell. If the path exists but is
# not a directory, os.makedirs raises instead of silently carrying on.
if not os.path.isdir(bash_scripts_dir):
    os.makedirs(bash_scripts_dir)

In [7]:
# Build one plot_map.py metagene command per bigBed of the 'peak' read type
# and pass them to qtools' Submitter as an array job. submit=False is passed,
# so nothing is submitted from here.
img_extension = ['png']

read_type = 'peak'
all_peaks = glob.glob(os.path.join(params[read_type]['peak_dir'],'*.bb'))

### Force override of maps: when True a command is queued even if its output already exists.
force = True

clip_df = pd.read_table(params[read_type]['clip_manifest'])

# (cds, 3'UTR, 5'UTR) annotation files for each supported cell line, in the
# order expected by the --annotations / --annotation_type arguments below.
annotations_by_cell = {
    'K562': (cds_k562, utr3_k562, utr5_k562),
    'HepG2': (cds_hepg2, utr3_hepg2, utr5_hepg2),
}

for ext in img_extension:
    # Commands are collected per extension and written right after the inner
    # loop, so every extension gets its own job name and script and `ext` is
    # never read after the loop has finished.
    cmds = []

    for peak in all_peaks:
        peak_basename = os.path.basename(peak)

        # The text before the first '.' of the file name is passed to
        # split_uid_and_rep; only the uid it returns is used for the lookup.
        # NOTE(review): exact meaning of `rep`/`other` is defined in
        # encode.manifest_helpers - confirm there.
        uid, rep, other = m.split_uid_and_rep(peak_basename.split('.')[0])
        _, _, _, rbp, cell = m.get_clip_file_from_uid(
            clip_df, uid
        )

        if cell not in annotations_by_cell:
            # Without annotations for this cell line no valid command can be
            # built; skip the peak rather than reuse another peak's annotations.
            print("Skipping {}: no annotations for cell line {!r}".format(peak, cell))
            continue
        cds_annotations, utr3_annotations, utr5_annotations = annotations_by_cell[cell]

        ### Build the command used to call the python script.
        # The glob above guarantees the name ends in '.bb', so appending the
        # extension gives '<name>.bb.<ext>' without touching any other '.bb'
        # that may occur inside the file name.
        output_filename = os.path.join(
            params[read_type]['output_dir'],
            peak_basename + '.' + ext
        )

        cmd = ' '.join([
            "python " + density_runner,
            "--event {}".format('metagene'),
            "--peak {}".format(peak),
            "--output {}".format(output_filename),
            "--annotations {} {} {}".format(
                cds_annotations, utr3_annotations, utr5_annotations
            ),
            "--annotation_type {} {} {}".format('cds', '3utr', '5utr'),
            "--normalization_level {}".format(0),
        ])

        if force or not os.path.exists(output_filename):
            cmds.append(cmd)

    job_name = "{}-metagene_{}".format(params[read_type]['prefix'], ext)

    # Write the script into the per-date folder prepared earlier in the notebook.
    bash_script_sh = os.path.join(
        bash_scripts_dir,
        '{}.{}.sh'.format(job_name, 0)
    )

    Submitter(
        cmds,
        job_name,
        sh=bash_script_sh,
        submit=False,
        array=True,
        walltime='8:00:00',
        queue='home-yeo'
    )

Writing 362 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/1-28-2017/peak_bb-metagene_png.0.sh.


<qtools.submitter.Submitter at 0x2b00c1ff36d0>

In [8]:
# Build one plot_map.py metagene command per bigBed of the 'idr-peak' read
# type and pass them to qtools' Submitter as an array job. submit=False is
# passed, so nothing is submitted from here.
img_extension = ['png']

read_type = 'idr-peak'
all_peaks = glob.glob(os.path.join(params[read_type]['peak_dir'],'*.bb'))

### Force override of maps: when True a command is queued even if its output already exists.
force = True

clip_df = pd.read_table(params[read_type]['clip_manifest'])

# (cds, 3'UTR, 5'UTR) annotation files for each supported cell line, in the
# order expected by the --annotations / --annotation_type arguments below.
annotations_by_cell = {
    'K562': (cds_k562, utr3_k562, utr5_k562),
    'HepG2': (cds_hepg2, utr3_hepg2, utr5_hepg2),
}

for ext in img_extension:
    # Commands are collected per extension and written right after the inner
    # loop, so every extension gets its own job name and script and `ext` is
    # never read after the loop has finished.
    cmds = []

    for peak in all_peaks:
        peak_basename = os.path.basename(peak)

        # For this read type the text before the first '.' of the file name
        # is used directly as the uid for the manifest lookup.
        uid = peak_basename.split('.')[0]
        _, _, _, rbp, cell = m.get_clip_file_from_uid(
            clip_df, uid
        )

        if cell not in annotations_by_cell:
            # Without annotations for this cell line no valid command can be
            # built; skip the peak rather than reuse another peak's annotations.
            print("Skipping {}: no annotations for cell line {!r}".format(peak, cell))
            continue
        cds_annotations, utr3_annotations, utr5_annotations = annotations_by_cell[cell]

        ### Build the command used to call the python script.
        # The glob above guarantees the name ends in '.bb', so appending the
        # extension gives '<name>.bb.<ext>' without touching any other '.bb'
        # that may occur inside the file name.
        output_filename = os.path.join(
            params[read_type]['output_dir'],
            peak_basename + '.' + ext
        )

        cmd = ' '.join([
            "python " + density_runner,
            "--event {}".format('metagene'),
            "--peak {}".format(peak),
            "--output {}".format(output_filename),
            "--annotations {} {} {}".format(
                cds_annotations, utr3_annotations, utr5_annotations
            ),
            "--annotation_type {} {} {}".format('cds', '3utr', '5utr'),
            "--normalization_level {}".format(0),
        ])

        if force or not os.path.exists(output_filename):
            cmds.append(cmd)

    job_name = "{}-metagene_{}".format(params[read_type]['prefix'], ext)

    # Write the script into the per-date folder prepared earlier in the notebook.
    bash_script_sh = os.path.join(
        bash_scripts_dir,
        '{}.{}.sh'.format(job_name, 0)
    )

    Submitter(
        cmds,
        job_name,
        sh=bash_script_sh,
        submit=False,
        array=True,
        walltime='8:00:00',
        queue='home-yeo'
    )

Writing 181 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/1-28-2017/idr_peak_bb-metagene_png.0.sh.


<qtools.submitter.Submitter at 0x2b00c220c850>