# Wrapper notebook for submitting the RBP maps script to TSCC

In [1]:
import pandas as pd
import os
import json
import yaml
import glob
# import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from encode import manifest_helpers as m

from tqdm import tnrange, tqdm_notebook
pd.set_option("display.max_colwidth", 10000)


# Define programs

In [2]:
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py'
peak_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_peak.py'
miso_annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/as_miso_renamed'
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se_renamed/'

params = {
    'whole_read':{
        'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/se',
        'clip_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
        'prefix' : 'whole_read',
        'website_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.website.wholeread.txt'
    },
    '5p':{
        'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/se_5p/',
        'clip_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.5p.txt',
        'prefix' : '5p',
        'website_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.website.5p.txt'
    },
    'peak_bb':{
        'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/se_peak/',
        'clip_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
        'prefix' : 'peak',
        'peak_dir' : '/projects/ps-yeolab3/bay001/maps/current_annotations/se_peak_bigbeds/'
    },
    'idr_bb':{
        'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/idr_peaks/',
        'clip_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
        'prefix' : 'peak',
        'peak_dir' : '/projects/ps-yeolab3/bay001/maps/current_annotations/se_idr_peak_bigbeds/'
    },
}

# Define manifests, directories, etc.
- SBDS-BGKLV24-K562 and PPIL4-BGKLV24-K562 were changed in the K562.csv list from SBDS-BGKLV24_2-K562 and PPIL4-BGKLV24_2-K562 per an email discussion from xintao.
- RNASEN50-BGKLV28-K562 was changed to DROSHA-BGKLV28-K562 per Eric's suggestion

In [3]:
current_date = '12-5-2017'
hepg2_rnaseq_manifest = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_HepG2.csv'
k562_rnaseq_manifest = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_K562.csv'
rnaseq_manifests = {'HepG2':hepg2_rnaseq_manifest, 'K562':k562_rnaseq_manifest}

bash_scripts_dir = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}'.format(current_date)
! mkdir $bash_scripts_dir

mkdir: cannot create directory `/projects/ps-yeolab3/bay001/maps/bash_scripts/12-5-2017': File exists


# Plot SE Density

In [4]:
img_extension = ['png']
pos_splicing_suffix = '-included-upon-knockdown'
neg_splicing_suffix = '-excluded-upon-knockdown'

if not os.path.exists(bash_scripts_dir):
    ! mkdir $bash_scripts_dir

read_type = '5p'
normalization_levels = [1]

### Force override of maps
force = True

### DEFINE BACKGROUNDS (THESE ARE STATIC AND DON'T CHANGE) ###
k562_background_ce = os.path.join(annotation_dir, 'K562_constitutive_exons')
k562_background_nse_all = os.path.join(annotation_dir, 'K562_native_cassette_exons')
k562_background_nse_inc = os.path.join(annotation_dir, 'K562_native_included_cassette_exons')
k562_background_nse_exc = os.path.join(annotation_dir, 'K562_native_excluded_cassette_exons')

hepg2_background_ce = os.path.join(annotation_dir, 'HepG2_constitutive_exons')
hepg2_background_nse_all = os.path.join(annotation_dir, 'HepG2_native_cassette_exons')
hepg2_background_nse_inc = os.path.join(annotation_dir, 'HepG2_native_included_cassette_exons')
hepg2_background_nse_exc = os.path.join(annotation_dir, 'HepG2_native_excluded_cassette_exons')

clip_df = pd.read_table(params[read_type]['clip_manifest'])

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet (or there were no significant splice events)
cmds = []

for normalization_level in normalization_levels:
    for ext in img_extension:
        for uid in clip_df['uID']:

            r1, r2, i, rbp, cell = m.get_clip_file_from_uid(clip_df, uid)

            if cell == 'K562':
                background1 = k562_background_ce
                background2 = k562_background_nse_all
                background3 = k562_background_nse_inc
                background4 = k562_background_nse_exc
            elif cell == 'HepG2':
                background1 = hepg2_background_ce
                background2 = hepg2_background_nse_all
                background3 = hepg2_background_nse_inc
                background4 = hepg2_background_nse_exc
            else:
                print(cell)

            ##### Given RBP name, cell line, return the Graveley lab ID (ie. RBFOX2-BGHLV19-HepG2) #####
            splicing_prefix = m.get_rnaseq_splicing_prefix_from_rbpname(
                rnaseq_manifests, rbp, cell
            )
            if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
                no_rnaseq.append(uid)
            else:
                ##### get the positive and negative associated annotations using this prefix #####
                positive, negative = m.get_annotations_from_splicing_prefix(
                    annotation_dir, splicing_prefix, 
                    pos_splicing_suffix, neg_splicing_suffix
                )
                ### we HAVE to have at both positive and negative annotations to plot ###
                if(positive == None or negative == None):
                    no_rnaseq_yet.append(uid)
                else:
                    ### uses RBP name to ensure positive and negative annotations are being pulled ###
                    pos_prefix = os.path.basename(positive).split('-')[0]
                    neg_prefix = os.path.basename(negative).split('-')[0]
                    if not (pos_prefix in rbp and neg_prefix in rbp):
                        print(
                            'warning, these dont match: {}, {}, {}'.format(
                                rbp, 
                                os.path.basename(positive),
                                os.path.basename(negative)
                            )
                        )
                    ### Foreach replicate, build teh command used to call the python script.
                    for r in [r1, r2]:
                        name = os.path.basename(r).replace('.bam','.{}.{}'.format(normalization_level, ext))
                        output_filename = os.path.join(
                            params[read_type]['output_dir'],
                            name
                        )

                        # Build the cmd line
                        cmd = "python " + density_runner
                        cmd = cmd + " --event {}".format('se')
                        cmd = cmd + " --ipbam {}".format(r)
                        cmd = cmd + " --inputbam {}".format(i)
                        cmd = cmd + " --output {}".format(output_filename)
                        cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                            positive, negative, background1, background2, background3, background4
                        )
                        cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                            'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric' 
                        )
                        # cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                        cmd = cmd + " --testnum {} {}".format(0, 1)
                        cmd = cmd + " --bgnum {}".format(3) # test against native SE
                        cmd = cmd + " --normalization_level {}".format(normalization_level)
                        if not os.path.exists(output_filename) or force == True:
                            cmds.append(cmd)

bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/{}-SE_NR_{}.{}.sh'.format(
    current_date, 
    params[read_type]['prefix'], 
    ext,
    normalization_level
)

Submitter(
    cmds, 
    "{}-SE_NR_{}".format(params[read_type]['prefix'], ext), 
    sh=bash_script_sh,
    submit=True,
    array=True,
    walltime='3:00:00',
    queue='home-yeo'
)

### Print any missing/unavailable annotations to check over ###
print("uIDs for which we don't have splicing data for: {}".format(
        len(no_rnaseq))
     )
print("uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(
        len(no_rnaseq_yet))
     )



Writing 306 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/12-5-2017/5p-SE_NR_png.1.sh.


uIDs for which we don't have splicing data for: 21
uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: 7


Submitted script to queue home-yeo.
 Job ID: 10326979


In [5]:
with open(bash_script_sh.replace('.sh','.missing.txt'), 'w') as o:
    for no in no_rnaseq:
        o.write(
            '{}\t{}\n'.format(
                m.get_clip_file_from_uid(clip_df, no)[3],
                m.get_clip_file_from_uid(clip_df, no)[4],
            )
        )

    print("\n\nNO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:")
    for no in no_rnaseq_yet:
        print(m.get_clip_file_from_uid(clip_df, no)[3:]),



NO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:
(u'AUH', u'K562') (u'SLTM', u'K562') (u'CSTF2T', u'K562') (u'FAM120A', u'K562') (u'LSM11', u'K562') (u'FASTKD2', u'K562') (u'RPL23A', u'HepG2')


# Plot SE Peaks

In [6]:
peak_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py' # this is the main driver now.

key = 'peak_bb'

img_extensions = ['png'] #,'svg']
all_peaks = glob.glob(os.path.join(params[key]['peak_dir'],'*p3f3.bed.sorted.bed.bb'))
clip_df = pd.read_table(params[key]['clip_manifest'])

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
cmds = []
force = True

### DEFINE BACKGROUNDS (THESE ARE STATIC AND DON'T CHANGE) ###
k562_background_ce = os.path.join(annotation_dir, 'K562_constitutive_exons')
k562_background_nse_all = os.path.join(annotation_dir, 'K562_native_cassette_exons')
k562_background_nse_inc = os.path.join(annotation_dir, 'K562_native_included_cassette_exons')
k562_background_nse_exc = os.path.join(annotation_dir, 'K562_native_excluded_cassette_exons')

hepg2_background_ce = os.path.join(annotation_dir, 'HepG2_constitutive_exons')
hepg2_background_nse_all = os.path.join(annotation_dir, 'HepG2_native_cassette_exons')
hepg2_background_nse_inc = os.path.join(annotation_dir, 'HepG2_native_included_cassette_exons')
hepg2_background_nse_exc = os.path.join(annotation_dir, 'HepG2_native_excluded_cassette_exons')

for ext in img_extensions:
    for peak in all_peaks:
        output_filename = os.path.join(
            params[key]['output_dir'],
            os.path.basename(peak).replace('.bb','.bb.{}'.format(ext))
        )
        uid, rep, other = m.split_uid_and_rep(os.path.basename(peak).split('.')[0])
        _, _, _, rbp, cell = m.get_clip_file_from_uid(
            clip_df, uid
        )
        splicing_prefix = m.get_rnaseq_splicing_prefix_from_rbpname(
            rnaseq_manifests, rbp, cell
        )

        if cell == 'HepG2':
            ce_background = hepg2_background_ce
            nc_background = hepg2_background_nse_all
            ni_background = hepg2_background_nse_inc
            ne_background = hepg2_background_nse_exc
        elif cell == 'K562':
            ce_background = k562_background_ce
            nc_background = k562_background_nse_all
            ni_background = k562_background_nse_inc
            ne_background = k562_background_nse_exc
        else:
            print('error')

        if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
            no_rnaseq.append(uid)
        else:
            positive, negative = m.get_annotations_from_splicing_prefix(
                annotation_dir, splicing_prefix, 

            )
            if(positive == None or negative == None):
                no_rnaseq_yet.append(uid)
            else:
                # Build the cmd line
                cmd = "python " + peak_runner
                cmd = cmd + " --event {}".format('se')
                cmd = cmd + " --peak {}".format(peak)
                cmd = cmd + " --output {}".format(output_filename)
                cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                    positive, negative, ce_background, nc_background, ni_background, ne_background
                )
                cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                    'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric' 
                )
                # cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                cmd = cmd + " --testnum {} {}".format(0, 1)
                cmd = cmd + " --bgnum {}".format(3) # test against native SE
                cmd = cmd + " --normalization_level {}".format(0)
                cmd = cmd + " --sigtest {}".format("fisher")
                if not os.path.exists(output_filename) or force == True:
                    cmds.append(cmd)

bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/SE_PEAK_PNGS.sh'.format(current_date)

Submitter(
    cmds, 
    "SE_PEAK_PNGS", 
    sh=bash_script_sh,
    submit=True,
    array=True,
    walltime='0:20:00',
    queue='home-yeo'
)

with open(bash_script_sh.replace('.sh','.missing.txt'), 'w') as o:
    for no in no_rnaseq:
        o.write(
            '{}\t{}\n'.format(
                m.get_clip_file_from_uid(clip_df, no)[3],
                m.get_clip_file_from_uid(clip_df, no)[4],
            )
        )
    print("\n\nNO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:")
    for no in no_rnaseq_yet:
        print(m.get_clip_file_from_uid(clip_df, no)[3:]),

Writing 302 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/12-5-2017/SE_PEAK_PNGS.sh.
Submitted script to queue home-yeo.
 Job ID: 10326980




NO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:
(u'CSTF2T', u'K562') (u'FASTKD2', u'K562') (u'RPL23A', u'HepG2') (u'SLTM', u'K562') (u'AUH', u'K562') (u'LSM11', u'K562') (u'FAM120A', u'K562') (u'SLTM', u'K562') (u'AUH', u'K562') (u'LSM11', u'K562') (u'FAM120A', u'K562') (u'CSTF2T', u'K562') (u'FASTKD2', u'K562') (u'RPL23A', u'HepG2')


In [7]:
peak_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py' # this is the main driver now.

key = 'idr_bb'

img_extensions = ['png'] #,'svg']
all_peaks = glob.glob(os.path.join(params[key]['peak_dir'],'*bed.sorted.bed.bb'))
clip_df = pd.read_table(params[key]['clip_manifest'])

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
cmds = []
force = True

### DEFINE BACKGROUNDS (THESE ARE STATIC AND DON'T CHANGE) ###
k562_background_ce = os.path.join(annotation_dir, 'K562_constitutive_exons')
k562_background_nse_all = os.path.join(annotation_dir, 'K562_native_cassette_exons')
k562_background_nse_inc = os.path.join(annotation_dir, 'K562_native_included_cassette_exons')
k562_background_nse_exc = os.path.join(annotation_dir, 'K562_native_excluded_cassette_exons')

hepg2_background_ce = os.path.join(annotation_dir, 'HepG2_constitutive_exons')
hepg2_background_nse_all = os.path.join(annotation_dir, 'HepG2_native_cassette_exons')
hepg2_background_nse_inc = os.path.join(annotation_dir, 'HepG2_native_included_cassette_exons')
hepg2_background_nse_exc = os.path.join(annotation_dir, 'HepG2_native_excluded_cassette_exons')

for ext in img_extensions:
    for peak in all_peaks:
        output_filename = os.path.join(
            params[key]['output_dir'],
            os.path.basename(peak).replace('.bb','.bb.{}'.format(ext))
        )
        
        uid = os.path.basename(peak).split('.')[0]
        _, _, _, rbp, cell = m.get_clip_file_from_uid(clip_df, uid)
        splicing_prefix = m.get_rnaseq_splicing_prefix_from_rbpname(rnaseq_manifests, rbp, cell)

        if cell == 'HepG2':
            ce_background = hepg2_background_ce
            nc_background = hepg2_background_nse_all
            ni_background = hepg2_background_nse_inc
            ne_background = hepg2_background_nse_exc
        elif cell == 'K562':
            ce_background = k562_background_ce
            nc_background = k562_background_nse_all
            ni_background = k562_background_nse_inc
            ne_background = k562_background_nse_exc
        else:
            print('error')

        if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
            no_rnaseq.append(uid)
        else:
            positive, negative = m.get_annotations_from_splicing_prefix(
                annotation_dir, splicing_prefix, 

            )
            if(positive == None or negative == None):
                no_rnaseq_yet.append(uid)
            else:
                # Build the cmd line
                cmd = "python " + peak_runner
                cmd = cmd + " --event {}".format('se')
                cmd = cmd + " --peak {}".format(peak)
                cmd = cmd + " --output {}".format(output_filename)
                cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                    positive, negative, ce_background, nc_background, ni_background, ne_background
                )
                cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                    'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric' 
                )
                # cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                cmd = cmd + " --testnum {} {}".format(0, 1)
                cmd = cmd + " --bgnum {}".format(3) # test against native SE
                cmd = cmd + " --normalization_level {}".format(0)
                cmd = cmd + " --sigtest {}".format("fisher")
                if not os.path.exists(output_filename) or force == True:
                    cmds.append(cmd)

bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/IDR_PEAK_PNGS.sh'.format(current_date)

Submitter(
    cmds, 
    "IDR_PEAK_PNGS", 
    sh=bash_script_sh,
    submit=True,
    array=True,
    walltime='0:20:00',
    queue='home-yeo'
)

with open(bash_script_sh.replace('.sh','.missing.txt'), 'w') as o:
    for no in no_rnaseq:
        o.write(
            '{}\t{}\n'.format(
                m.get_clip_file_from_uid(clip_df, no)[3],
                m.get_clip_file_from_uid(clip_df, no)[4],
            )
        )
    print("\n\nNO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:")
    for no in no_rnaseq_yet:
        print(m.get_clip_file_from_uid(clip_df, no)[3:]),

Writing 151 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/12-5-2017/IDR_PEAK_PNGS.sh.
Submitted script to queue home-yeo.
 Job ID: 10326981




NO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:
(u'SLTM', u'K562') (u'FASTKD2', u'K562') (u'FAM120A', u'K562') (u'AUH', u'K562') (u'RPL23A', u'HepG2') (u'CSTF2T', u'K562') (u'LSM11', u'K562')
