# Wrapper notebook for submitting the RBP maps script to TSCC

In [10]:
import pandas as pd
import os
import json
import yaml
import glob
# import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from encode import manifest_helpers as m

from tqdm import tnrange, tqdm_notebook
pd.set_option("display.max_colwidth", 10000)


# Define programs

In [6]:
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py'
peak_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_peak.py'
miso_annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/as_miso_renamed'
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se_renamed/'

params = {
    'whole_read':{
        'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/se',
        'clip_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
        'prefix' : 'whole_read',
    },
    '5p':{
        'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/se_5p/',
        'clip_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.5p.txt',
        'prefix' : '5p',
    },
    'peak':{
        'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/se_peak/',
        'clip_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
        'prefix' : 'peak',
        'peak_dir' : '/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_CLIPperv2_20161120/'
    },
    'idr':{
        'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/idr',
        'clip_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
        'prefix' : 'idr',
        'peak_dir' : '/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_FINALforpapers_20170325/IDR/'
    },
}



# Define manifests, directories, etc.
- SBDS-BGKLV24-K562 and PPIL4-BGKLV24-K562 were changed in the K562.csv list from SBDS-BGKLV24_2-K562 and PPIL4-BGKLV24_2-K562 per an email discussion from xintao.

In [7]:
current_date = '7-11-2017'
hepg2_rnaseq_manifest = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_HepG2.csv'
k562_rnaseq_manifest = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_K562.csv'
rnaseq_manifests = {'HepG2':hepg2_rnaseq_manifest, 'K562':k562_rnaseq_manifest}
chrom_sizes = '/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes'

bash_scripts_dir = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}'.format(current_date)
! mkdir $bash_scripts_dir

mkdir: cannot create directory `/projects/ps-yeolab3/bay001/maps/bash_scripts/7-11-2017': File exists


# Plot SE Density

In [12]:
img_extension = ['svg']
pos_splicing_suffix = '-included-upon-knockdown'
neg_splicing_suffix = '-excluded-upon-knockdown'

if not os.path.exists(bash_scripts_dir):
    ! mkdir $bash_scripts_dir

read_type = 'whole_read'
normalization_levels = [1]

### DEFINE BACKGROUNDS (THESE ARE STATIC AND DON'T CHANGE) ###
k562_background_ce = os.path.join(annotation_dir, 'K562_constitutive_exons')
k562_background_nse_all = os.path.join(annotation_dir, 'K562_native_cassette_exons')
k562_background_nse_inc = os.path.join(annotation_dir, 'K562_natively_included_exons')
k562_background_nse_exc = os.path.join(annotation_dir, 'K562_natively_excluded_exons')

hepg2_background_ce = os.path.join(annotation_dir, 'HepG2_constitutive_exons')
hepg2_background_nse_all = os.path.join(annotation_dir, 'HepG2_native_cassette_exons')
hepg2_background_nse_inc = os.path.join(annotation_dir, 'HepG2_natively_included_exons')
hepg2_background_nse_exc = os.path.join(annotation_dir, 'HepG2_natively_excluded_exons')

clip_df = pd.read_table(params[read_type]['clip_manifest'])

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet (or there were no significant splice events)
cmds = []

for normalization_level in normalization_levels:
    for ext in img_extension:
        for uid in clip_df['uID']:

            r1, r2, i, rbp, cell = m.get_clip_file_from_uid(clip_df, uid)

            if cell == 'K562':
                background1 = k562_background_ce
                background2 = k562_background_nse_all
                background3 = k562_background_nse_inc
                background4 = k562_background_nse_exc
            elif cell == 'HepG2':
                background1 = hepg2_background_ce
                background2 = hepg2_background_nse_all
                background3 = hepg2_background_nse_inc
                background4 = hepg2_background_nse_exc
            else:
                print(cell)

            ##### Given RBP name, cell line, return the Graveley lab ID (ie. RBFOX2-BGHLV19-HepG2) #####
            splicing_prefix = m.get_rnaseq_splicing_prefix_from_rbpname(
                rnaseq_manifests, rbp, cell
            )
            if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
                no_rnaseq.append(uid)
            else:
                ##### get the positive and negative associated annotations using this prefix #####
                positive, negative = m.get_annotations_from_splicing_prefix(
                    annotation_dir, splicing_prefix, 
                    pos_splicing_suffix, neg_splicing_suffix
                )
                ### we HAVE to have at both positive and negative annotations to plot ###
                if(positive == None or negative == None):
                    no_rnaseq_yet.append(uid)
                else:
                    ### uses RBP name to ensure positive and negative annotations are being pulled ###
                    pos_prefix = os.path.basename(positive).split('-')[0]
                    neg_prefix = os.path.basename(negative).split('-')[0]
                    if not (pos_prefix in rbp and neg_prefix in rbp):
                        print(
                            'warning, these dont match: {}, {}, {}'.format(
                                rbp, 
                                os.path.basename(positive),
                                os.path.basename(negative)
                            )
                        )
                    ### Foreach replicate, build teh command used to call the python script.
                    for r in [r1, r2]:
                        name = os.path.basename(r).replace('.bam','.{}.{}'.format(normalization_level, ext))
                        output_filename = os.path.join(
                            params[read_type]['output_dir'],
                            name
                        )

                        # Build the cmd line
                        cmd = "python " + density_runner
                        cmd = cmd + " --event {}".format('se')
                        cmd = cmd + " --ipbam {}".format(r)
                        cmd = cmd + " --inputbam {}".format(i)
                        cmd = cmd + " --output {}".format(output_filename)
                        cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                            positive, negative, background1, background2, background3, background4
                        )
                        cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                            'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric' 
                        )
                        cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                        cmd = cmd + " --to_test {} {}".format(positive, negative)
                        cmd = cmd + " --bgnum {}".format(3) # test against native SE
                        cmd = cmd + " --normalization_level {}".format(normalization_level)
                        cmds.append(cmd)
bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/{}-SE_NR_{}.{}.sh'.format(
    current_date, 
    params[read_type]['prefix'], 
    ext,
    normalization_level
)

Submitter(
    cmds, 
    "{}-SE_NR_{}".format(params[read_type]['prefix'], ext), 
    sh=bash_script_sh,
    submit=False,
    array=True,
    walltime='3:00:00',
    queue='home-yeo'
)

### Print any missing/unavailable annotations to check over ###
print("uIDs for which we don't have splicing data for: {}".format(
        len(no_rnaseq))
     )
print("uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(
        len(no_rnaseq_yet))
     )

uIDs for which we don't have splicing data for: 23
uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: 7


Writing 302 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/7-11-2017/whole_read-SE_NR_svg.1.sh.


In [None]:
with open(bash_script_sh.replace('.sh','.missing.txt'), 'w') as o:
    for no in no_rnaseq:
        o.write(
            '{}\t{}\n'.format(
                m.get_clip_file_from_uid(clip_df, no)[3],
                m.get_clip_file_from_uid(clip_df, no)[4],
            )
        )

    print("\n\nNO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:")
    for no in no_rnaseq_yet:
        print(m.get_clip_file_from_uid(clip_df, no)[3:]),

# Plot SE Peaks

In [12]:
p = 3
fc = 3

img_extensions = ['png'] #,'svg']
all_peaks = glob.glob(os.path.join(params['peak']['peak_dir'],'*.compressed.bed'))
clip_df = pd.read_table(params['idr']['clip_manifest'])

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
cmds = []

for ext in img_extensions:
    for peak in all_peaks:
        outfile = os.path.join(
            params['peak']['output_dir'],
            os.path.basename(peak).replace('.compressed.bed','.compressed.{}'.format(ext))
        )
        uid, rep, other = m.split_uid_and_rep(os.path.basename(peak).split('.')[0])
        _, _, _, rbp, cell = m.get_clip_file_from_uid(
            clip_df, uid
        )
        splicing_prefix = m.get_rnaseq_splicing_prefix_from_rbpname(
            rnaseq_manifests, rbp, cell
        )

        if cell == 'HepG2':
            ce_background = os.path.join(miso_annotation_dir,'HepG2-constitutive-exons.miso')
            nc_background = os.path.join(miso_annotation_dir,'HepG2-native-cassette-exons.miso')
            ni_background = os.path.join(miso_annotation_dir,'HepG2-native-included-exons.miso')
            ne_background = os.path.join(miso_annotation_dir,'HepG2-native-excluded-exons.miso')
        elif cell == 'K562':
            ce_background = os.path.join(miso_annotation_dir,'K562-constitutive-exons.miso')
            nc_background = os.path.join(miso_annotation_dir,'K562-native-cassette-exons.miso')
            ni_background = os.path.join(miso_annotation_dir,'K562-native-included-exons.miso')
            ne_background = os.path.join(miso_annotation_dir,'K562-native-excluded-exons.miso')
        else:
            print('error')

        if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
            no_rnaseq.append(uid)
        else:
            positive, negative = m.get_annotations_from_splicing_prefix(
                miso_annotation_dir, splicing_prefix, 

            )
        if(positive == None or negative == None):
            no_rnaseq_yet.append(uid)
        # elif not os.path.exists(outfile):
        cmd = "python {} -i {} -o {} -m {} {} {} {} {} {} -bgnum 4 -p {} -f {}".format(
            peak_runner,
            peak,
            outfile,
            positive,
            negative,
            ce_background,
            nc_background,
            ni_background,
            ne_background,
            p,
            fc
        )
        cmds.append(cmd)

bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/SE_PEAK_PNGS.sh'.format(current_date)

Submitter(
    cmds, 
    "SE_PEAK_PNGS", 
    sh=bash_script_sh,
    submit=True,
    array=True,
    walltime='0:20:00',
    queue='home-yeo'
)

with open(bash_script_sh.replace('.sh','.missing.txt'), 'w') as o:
    for no in no_rnaseq:
        o.write(
            '{}\t{}\n'.format(
                m.get_clip_file_from_uid(clip_df, no)[3],
                m.get_clip_file_from_uid(clip_df, no)[4],
            )
        )
    print("\n\nNO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:")
    for no in no_rnaseq_yet:
        print(m.get_clip_file_from_uid(clip_df, no)[3:]),

Writing 362 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/6-19-2017/SE_PEAK_PNGS.sh.
Submitted script to queue home-yeo.
 Job ID: 8786348




NO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:
(u'FAM120A', u'K562') (u'RPL23A', u'HepG2') (u'AUH', u'K562') (u'FASTKD2', u'K562') (u'LSM11', u'K562') (u'SLTM', u'K562') (u'CSTF2T', u'K562') (u'LSM11', u'K562') (u'FASTKD2', u'K562') (u'AUH', u'K562') (u'RPL23A', u'HepG2') (u'FAM120A', u'K562') (u'CSTF2T', u'K562') (u'SLTM', u'K562')


# Plot the IDR peaks

In [25]:
all_peaks = glob.glob(os.path.join(params['idr']['peak_dir'],'*0102merged.bed'))
clip_df = pd.read_table(params['idr']['clip_manifest'])

progress = tnrange(len(all_peaks))

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
cmds = []

for peak in all_peaks:
    outfile = os.path.join(
        params['idr']['output_dir'],
        os.path.basename(peak).replace('merged.bed','merged.png')
    )
    uid = os.path.basename(peak).split('.')[0]
    _, _, _, rbp, cell = m.get_clip_file_from_uid(clip_df, uid)
    splicing_prefix = m.get_rnaseq_splicing_prefix_from_rbpname(rnaseq_manifests, rbp, cell)
    
    if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
        no_rnaseq.append(uid)
    else:    
        positive, negative = m.get_annotations_from_splicing_prefix(miso_annotation_dir, splicing_prefix)
    
        if positive is not None and negative is not None:
            if cell == 'HepG2':
                ce_background = os.path.join(miso_annotation_dir,'HepG2-constitutive-exons.miso')
                nc_background = os.path.join(miso_annotation_dir,'HepG2-native-cassette-exons.miso')
                ni_background = os.path.join(miso_annotation_dir,'HepG2-native-included-exons.miso')
                ne_background = os.path.join(miso_annotation_dir,'HepG2-native-excluded-exons.miso')
            elif cell == 'K562':
                ce_background = os.path.join(miso_annotation_dir,'K562-constitutive-exons.miso')
                nc_background = os.path.join(miso_annotation_dir,'K562-native-cassette-exons.miso')
                ni_background = os.path.join(miso_annotation_dir,'K562-native-included-exons.miso')
                ne_background = os.path.join(miso_annotation_dir,'K562-native-excluded-exons.miso')
            else:
                print('error')
            cmd = "python {} -i {} -o {} -m {} {} {} {} {} {} -bgnum 4".format(
                peak_runner,
                peak,
                outfile,
                positive,
                negative,
                ce_background,
                nc_background,
                ni_background,
                ne_background
            )
            cmds.append(cmd)
        else:
            no_rnaseq_yet.append(uid)

    progress.update(1)

bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/IDR_PNGS.sh'.format(current_date)
Submitter(
    cmds, 
    'idr_bg_peaks', 
    sh=bash_script_sh,
    submit=True,
    array=True,
    walltime='0:20:00',
    queue='home-yeo'
)

with open(bash_script_sh.replace('.sh','.missing.txt'), 'w') as o:
    for no in no_rnaseq:
        o.write(
            '{}\t{}\n'.format(
                m.get_clip_file_from_uid(clip_df, no)[3],
                m.get_clip_file_from_uid(clip_df, no)[4],
            )
        )

    print("\n\nNO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:")
    for no in no_rnaseq_yet:
        print(m.get_clip_file_from_uid(clip_df, no)[3:]),

Writing 151 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/6-9-2017/IDR_PNGS.sh.
Submitted script to queue home-yeo.
 Job ID: 8647606




NO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:
(u'AUH', u'K562') (u'SLTM', u'K562') (u'LSM11', u'K562') (u'FASTKD2', u'K562') (u'CSTF2T', u'K562') (u'RPL23A', u'HepG2') (u'FAM120A', u'K562')
181/|/100%|| 181/181 [00:13<00:00, 82.61it/s]

In [None]:
clip_df = pd.read_table(clip_manifest)

hepg2_rnaseq_df = pd.read_table(rnaseq_manifests['HepG2'])
k562_rnaseq_df = pd.read_table(rnaseq_manifests['K562'])

hepg2_clip_df = clip_df[clip_df['Cell line']=='HepG2']
k562_clip_df = clip_df[clip_df['Cell line']=='K562']

In [None]:
hepg2_rbps = pd.merge(hepg2_clip_df, hepg2_rnaseq_df, how='outer', left_on='RBP', right_on='Official_RBP').dropna()
k562_rbps = pd.merge(k562_clip_df, k562_rnaseq_df, how='outer', left_on='RBP', right_on='Official_RBP').dropna()

In [None]:
pos_annotations = glob.glob('/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.positive.nr.txt')
neg_annotations = glob.glob('/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.negative.nr.txt')

def get_rbp(f):
    lst = os.path.basename(f).split('-')
    return '{}-{}'.format(lst[0], lst[2])

In [None]:
pos_rbp = set([get_rbp(f) for f in pos_annotations])
neg_rbp = set([get_rbp(f) for f in neg_annotations])

In [None]:
allrbp = (pos_rbp.intersection(neg_rbp))
print("We have annotations for: {} rbps".format(len(allrbp)))