# Wrapper notebook for submitting the RBP maps script to TSCC

In [1]:
import pandas as pd
import os
import json
import yaml
import glob
# import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from encode import manifest_helpers as m

from tqdm import tnrange, tqdm_notebook
# Raise pandas' displayed column-width limit to 10000 characters.
pd.options.display.max_colwidth = 10000


# Define programs

In [2]:
# Plotting script that every generated command line runs with `python`.
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py'

# Settings for each kind of map, keyed by read type:
#   output_dir    - directory joined with the image filename to form --output
#   clip_manifest - manifest table loaded with pd.read_table
#   prefix        - label used in the job name and the bash script filename
#   peak_dir      - (peak maps only) directory globbed for *.bb files
params = dict(
    whole_read=dict(
        output_dir='/home/bay001/projects/maps_20160420/analysis/cds_start_stop_txstart_crg_eu/',
        clip_manifest='/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
        prefix='whole_read',
    ),
    peak_cds=dict(
        peak_dir='/projects/ps-yeolab3/bay001/maps/current_annotations/se_peak_bigbeds/',
        output_dir='/projects/ps-yeolab3/bay001/maps/current/peak_cds_start_stop_hg19v19/',
        clip_manifest='/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
        prefix='peak_bb_cds',
    ),
    peak_tx=dict(
        peak_dir='/projects/ps-yeolab3/bay001/maps/current_annotations/se_peak_bigbeds/',
        output_dir='/projects/ps-yeolab3/bay001/maps/current/peak_tx_start_stop_hg19v19/',
        clip_manifest='/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
        prefix='peak_bb_tx',
    ),
)



# Define static annotations

In [3]:
# Date stamp used to name this run's bash-script directory.
current_date = '12-5-2017'

# Directory that holds the annotation BED files referenced below.
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/'


def _annotation_path(filename):
    """Return the path of ``filename`` inside ``annotation_dir``."""
    return os.path.join(annotation_dir, filename)


# HepG2: coding-sequence boundaries, then transcript boundaries.
hepg2_start_codons = _annotation_path('hg19_v19_start_codons.HepG2_tpm1.bed')
hepg2_stop_codons = _annotation_path('hg19_v19_stop_codons.HepG2_tpm1.bed')
hepg2_txstart = _annotation_path('hg19_v19_transcription_start_sites.HepG2_tpm1.bed')
hepg2_txstop = _annotation_path('hg19_v19_poly_a_sites.HepG2_tpm1.bed')

# K562: same four annotation files for the other cell line.
k562_start_codons = _annotation_path('hg19_v19_start_codons.K562_tpm1.bed')
k562_stop_codons = _annotation_path('hg19_v19_stop_codons.K562_tpm1.bed')
k562_txstart = _annotation_path('hg19_v19_transcription_start_sites.K562_tpm1.bed')
k562_txstop = _annotation_path('hg19_v19_poly_a_sites.K562_tpm1.bed')


# Directory that receives the bash scripts generated for this run date.
bash_scripts_dir = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}'.format(current_date)

# Create the directory from Python instead of the `! mkdir` shell escape:
# os.makedirs also creates missing parent directories and works outside
# IPython. The isdir guard keeps this compatible with Python 2 (no exist_ok).
if not os.path.isdir(bash_scripts_dir):
    os.makedirs(bash_scripts_dir)

# Plot Density Maps
- We are most likely not running this part; the plan is to use only the peak-based maps instead.

In [4]:
def make_density_maps():
    """Write the array-job script that plots read-density maps.

    Loads the 'whole_read' CLIP manifest and, for every uID in it, builds one
    ``plot_density.py`` command per IP replicate (both replicates are paired
    with the same input BAM). The maps are drawn around the start-codon,
    transcription-start and poly(A)-site BED annotations of the uID's cell
    line. All commands are passed to ``Submitter`` with ``submit=False``,
    targeting a bash script inside ``bash_scripts_dir``.

    uIDs whose cell line is neither HepG2 nor K562 are reported and skipped.

    NOTE(review): the stop-codon annotations are not used here even though the
    output directory name mentions "start_stop" -- confirm that is intended.
    """
    img_extension = ['png']
    read_type = 'whole_read'
    normalization_levels = [3]

    # Rebuild maps even when the output image already exists.
    force = True

    # Cell line -> (start codons, transcription starts, poly(A) sites).
    annotations_by_cell = {
        'HepG2': (hepg2_start_codons, hepg2_txstart, hepg2_txstop),
        'K562': (k562_start_codons, k562_txstart, k562_txstop),
    }

    clip_df = pd.read_table(params[read_type]['clip_manifest'])
    cmds = []

    for normalization_level in normalization_levels:
        for ext in img_extension:
            for uid in clip_df['uID']:
                ip_bam_rep1, ip_bam_rep2, input_bam, rbp, cell = m.get_clip_file_from_uid(clip_df, uid)

                if cell not in annotations_by_cell:
                    # Skip instead of falling through: continuing would reuse
                    # the previous uID's annotations (or raise NameError on
                    # the first row).
                    print('Skipping uID {}: no annotations for cell line {!r}'.format(uid, cell))
                    continue
                annotations = annotations_by_cell[cell]

                # For each IP replicate, build the command that calls the python script.
                for ip_bam in [ip_bam_rep1, ip_bam_rep2]:
                    name = os.path.basename(ip_bam).replace(
                        '.bam', '.{}.{}'.format(normalization_level, ext)
                    )
                    output_filename = os.path.join(params[read_type]['output_dir'], name)

                    # Leave existing maps alone unless a rebuild is forced.
                    if not force and os.path.exists(output_filename):
                        continue

                    cmd_parts = [
                        'python ' + density_runner,
                        '--event bed',
                        '--ipbam {}'.format(ip_bam),
                        '--inputbam {}'.format(input_bam),
                        '--output {}'.format(output_filename),
                        '--annotations {} {} {}'.format(*annotations),
                        '--annotation_type bed bed bed',
                        '--scale',
                        '--exon_offset 0',
                        '--intron_offset 0',
                        '--normalization_level {}'.format(normalization_level),
                    ]
                    cmds.append(' '.join(cmd_parts))

    # All commands go into a single script. It is named after the last
    # extension and normalization level, which is what the leaked loop
    # variables held at this point in the original code.
    script_ext = img_extension[-1]
    script_level = normalization_levels[-1]
    bash_script_sh = os.path.join(
        bash_scripts_dir,
        '{}-cdstss_{}.{}.sh'.format(params[read_type]['prefix'], script_ext, script_level)
    )

    Submitter(
        cmds,
        "{}-cdstss_{}".format(params[read_type]['prefix'], script_ext),
        sh=bash_script_sh,
        submit=False,
        array=True,
        walltime='2:00:00',
        queue='home-yeo'
    )

In [7]:
img_extension = ['png']

# Which peak map to build: 'peak_cds' plots around start/stop codons,
# 'peak_tx' around transcription start sites and poly(A) sites.
read_type = 'peak_tx'
all_peaks = glob.glob(os.path.join(params[read_type]['peak_dir'], '*.bb'))

### Force override of maps
force = True

# Passed to --normalization_level and embedded in the bash script name.
normalization_level = 0

# (cell line, read type) -> the two BED annotations handed to --annotations.
peak_annotations = {
    ('HepG2', 'peak_cds'): (hepg2_start_codons, hepg2_stop_codons),
    ('HepG2', 'peak_tx'): (hepg2_txstart, hepg2_txstop),
    ('K562', 'peak_cds'): (k562_start_codons, k562_stop_codons),
    ('K562', 'peak_tx'): (k562_txstart, k562_txstop),
}

clip_df = pd.read_table(params[read_type]['clip_manifest'])
cmds = []

for ext in img_extension:
    for peak in all_peaks:

        # The part of the peak filename before the first '.' encodes the uID
        # and replicate; only the uID is needed to look up the cell line.
        uid, rep, other = m.split_uid_and_rep(os.path.basename(peak).split('.')[0])
        _, _, _, rbp, cell = m.get_clip_file_from_uid(
            clip_df, uid
        )

        if (cell, read_type) not in peak_annotations:
            # Skip instead of falling through: continuing would silently reuse
            # the previous peak file's annotations (or raise NameError on the
            # first file).
            print('Skipping {}: no annotations for cell line {!r} / read type {!r}'.format(
                peak, cell, read_type
            ))
            continue
        annotation1, annotation2 = peak_annotations[(cell, read_type)]

        ### build the command used to call the python script.
        output_filename = os.path.join(
            params[read_type]['output_dir'],
            os.path.basename(peak).replace('.bb', '.bb.{}'.format(ext))
        )

        # Build the cmd line
        cmd = ' '.join([
            'python ' + density_runner,
            '--event bed',
            '--peak {}'.format(peak),
            '--output {}'.format(output_filename),
            '--annotations {} {}'.format(annotation1, annotation2),
            '--annotation_type bed bed',
            '--scale',
            '--exon_offset 300',    ## note: the crg-eu are already padded
            '--intron_offset 300',  ## note: the crg-eu are already padded
            '--normalization_level {}'.format(normalization_level),
        ])
        if force or not os.path.exists(output_filename):
            cmds.append(cmd)


# All commands go into a single script, named after the last extension (the
# value the leaked loop variable held at this point in the original code).
script_ext = img_extension[-1]
bash_script_sh = os.path.join(
    bash_scripts_dir,
    '{}-cdstss_{}.{}.sh'.format(params[read_type]['prefix'], script_ext, normalization_level)
)

Submitter(
    cmds,
    "{}-{}".format(params[read_type]['prefix'], script_ext),
    sh=bash_script_sh,
    submit=False,
    array=True,
    walltime='8:00:00',
    queue='home-yeo'
)

Writing 362 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/12-5-2017/peak_bb_tx-cdstss_png.0.sh.


<qtools.submitter.Submitter at 0x2ad6a988a710>