In [1]:
import pandas as pd
import os
import json
import yaml
import glob
# import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from encode import manifest_helpers as m

from tqdm import tnrange, tqdm_notebook
pd.set_option("display.max_colwidth", 10000)

In [2]:
current_date = '7-3-2017'
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py'
peak_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_peak.py'
out_dir = '/home/bay001/projects/gabe_qc_20170612/data/'
miso_annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/as_miso_renamed'
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se_renamed/'
# annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/'

In [8]:
params = {
    'whole_read':{
        # 'output_dir' : '/home/bay001/projects/gabe_qc_20170612/data/maps_w_redundant_annotations/',
        'output_dir' : '/home/bay001/projects/gabe_qc_20170612/data/',
        'clip_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt',
        # 'clip_manifest' : '/home/bay001/projects/gabe_qc_20170612/permanent_data/gabe_manifest_examples.txt',
        'prefix' : 'whole_read',
        'confidence' : 1
    },
}

In [9]:
hepg2_rnaseq_manifest = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_HepG2.csv'
k562_rnaseq_manifest = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_K562.csv'
rnaseq_manifests = {'HepG2':hepg2_rnaseq_manifest, 'K562':k562_rnaseq_manifest}
chrom_sizes = '/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes'

In [10]:
bash_scripts_dir = '/projects/ps-yeolab3/bay001/gabe_qc_20170612/bash_scripts/{}'.format(current_date)
! mkdir $bash_scripts_dir

mkdir: cannot create directory `/projects/ps-yeolab3/bay001/gabe_qc_20170612/bash_scripts/7-3-2017': File exists


In [11]:
img_extension = ['png']
pos_splicing_suffix = '-included-upon-knockdown'
neg_splicing_suffix = '-excluded-upon-knockdown'
# pos_splicing_suffix = '-SE.MATS.JunctionCountOnly.positive.txt'
# neg_splicing_suffix = '-SE.MATS.JunctionCountOnly.negative.txt'

read_type = 'whole_read'

### DEFINE BACKGROUNDS (THESE ARE STATIC AND DON'T CHANGE) ###
k562_background_ce = os.path.join(annotation_dir, 'K562_constitutive_exons')
k562_background_nse_all = os.path.join(annotation_dir, 'K562_native_cassette_exons')
k562_background_nse_inc = os.path.join(annotation_dir, 'K562_natively_included_exons')
k562_background_nse_exc = os.path.join(annotation_dir, 'K562_natively_excluded_exons')

hepg2_background_ce = os.path.join(annotation_dir, 'HepG2_constitutive_exons')
hepg2_background_nse_all = os.path.join(annotation_dir, 'HepG2_native_cassette_exons')
hepg2_background_nse_inc = os.path.join(annotation_dir, 'HepG2_natively_included_exons')
hepg2_background_nse_exc = os.path.join(annotation_dir, 'HepG2_natively_excluded_exons')


In [12]:
clip_df = pd.read_table(params[read_type]['clip_manifest'])
confidence = params[read_type]['confidence']
no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet (or there were no significant splice events)
cmds = []

normalization_levels = [1,2]

for level in normalization_levels:
    for ext in img_extension:
        for uid in clip_df['uID']:

            r1, r2, i, rbp, cell = m.get_clip_file_from_uid(clip_df, uid)

            if cell == 'K562':
                background1 = k562_background_ce
                background2 = k562_background_nse_all
                background3 = k562_background_nse_inc
                background4 = k562_background_nse_exc
            elif cell == 'HepG2':
                background1 = hepg2_background_ce
                background2 = hepg2_background_nse_all
                background3 = hepg2_background_nse_inc
                background4 = hepg2_background_nse_exc
            else:
                print(cell)

            ##### Given RBP name, cell line, return the Graveley lab ID (ie. RBFOX2-BGHLV19-HepG2) #####
            splicing_prefix = m.get_rnaseq_splicing_prefix_from_rbpname(
                rnaseq_manifests, rbp, cell
            )
            if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
                no_rnaseq.append(uid)
            else:
                ##### get the positive and negative associated annotations using this prefix #####
                positive, negative = m.get_annotations_from_splicing_prefix(
                    annotation_dir, splicing_prefix, 
                    pos_splicing_suffix, neg_splicing_suffix
                )
                ### we HAVE to have at both positive and negative annotations to plot ###
                if(positive == None or negative == None):
                    no_rnaseq_yet.append(uid)
                else:
                    ### uses RBP name to ensure positive and negative annotations are being pulled ###
                    pos_prefix = os.path.basename(positive).split('-')[0]
                    neg_prefix = os.path.basename(negative).split('-')[0]
                    if not (pos_prefix in rbp and neg_prefix in rbp):
                        print(
                            'warning, these dont match: {}, {}, {}'.format(
                                rbp, 
                                os.path.basename(positive),
                                os.path.basename(negative)
                            )
                        )
                    ### Foreach replicate, build teh command used to call the python script.
                    for r in [r1, r2]:
                        name = os.path.basename(r).replace('.bam','.{}.conf{}.{}'.format(level, confidence, ext))
                        output_filename = os.path.join(
                            params[read_type]['output_dir'],
                            name
                        )

                        # Build the cmd line
                        cmd = "python " + density_runner
                        cmd = cmd + " --event {}".format('se')
                        cmd = cmd + " --ipbam {}".format(r)
                        cmd = cmd + " --inputbam {}".format(i)
                        cmd = cmd + " --output {}".format(output_filename)
                        cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                            positive, negative, background1, background2, background3, background4
                        )
                        cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                            'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric' 
                        )
                        cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                        cmd = cmd + " --to_test {} {}".format(positive, negative)
                        cmd = cmd + " --bgnum {}".format(3) # test against native SE
                        cmd = cmd + " --normalization_level {}".format(level)
                        cmd = cmd + " --confidence {}".format(confidence)
                        cmds.append(cmd)

bash_script_sh = '/projects/ps-yeolab3/bay001/gabe_qc_20170612/bash_scripts/{}/{}.conf{}.-SE_NR_{}.NR.sh'.format(
    current_date, 
    params[read_type]['prefix'], 
    confidence,
    ext
)
with open(bash_script_sh,'w') as o:
    o.write("#!/bin/bash\n")
    for cmd in cmds:
        o.write(cmd + '\n')

Submitter(
    cmds, 
    "{}-SE_NR_{}".format(params[read_type]['prefix'], ext), 
    sh=bash_script_sh,
    submit=False,
    array=True,
    walltime='3:00:00',
    queue='home-scrm'
)

### Print any missing/unavailable annotations to check over ###
print("uIDs for which we don't have splicing data for: {}".format(
        len(no_rnaseq))
     )
print("uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(
        len(no_rnaseq_yet))
     )

uIDs for which we don't have splicing data for: 46
uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: 14


Writing 500 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/gabe_qc_20170612/bash_scripts/7-3-2017/whole_read.conf1.-SE_NR_png.NR1.sh.
Submitted script to queue home-scrm.
 Job ID: 8912391
Writing 104 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/gabe_qc_20170612/bash_scripts/7-3-2017/whole_read.conf1.-SE_NR_png.NR2.sh.
Submitted script to queue home-scrm.
 Job ID: 8912392


In [60]:
no_rnaseq

[332, 332, 332, 332]

# 