# Wrapper notebook for submitting the RBP maps script to TSCC

In [2]:
import pandas as pd
import os
import json
import yaml
import glob
# import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from encode import manifest_helpers as m

from tqdm import tnrange, tqdm_notebook
pd.set_option("display.max_colwidth", 10000)


# Define manifests, directories, etc.
- SBDS-BGKLV24-K562 and PPIL4-BGKLV24-K562 were renamed in the K562.csv list (previously SBDS-BGKLV24_2-K562 and PPIL4-BGKLV24_2-K562), per an email discussion with Xintao.

In [3]:
# Run configuration: date stamp, eCLIP manifest, the plotting script that gets
# submitted, and the directories holding RI annotations / controls.
current_date = '4-13-2018'
clip_manifest = '/projects/ps-yeolab3/bay001/reference_data/misc_ENCODE/ENCODE_FINAL_ANNOTATIONS.uidsonly.txt.manifest.txt'
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_map.py'
ri_annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/ri_renamed'
control_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/nonredundant_renamed/ri/'

# eCLIP manifest; its 'uID' column is the left-hand join key below.
clip_df = pd.read_table(clip_manifest)

# Combined eCLIP/RNA-seq status table. The antibody lot column is dropped
# straight away: per the original author's note it contains a NaN that caused
# problems downstream, and the antibody information is not needed here.
master_table = pd.read_table(
    '/projects/ps-yeolab3/encode/rnaseq/eCLIP_finalstatus_20180406_ENCODE_combined_RNASEQ.tsv'
).drop('eCLIP_Antibody_Lot_#', axis=1)

# Right join keeps every row of the status table; rows lacking either the
# eCLIP accession or a junction-count file name are then discarded.
merged = clip_df.merge(
    master_table,
    how='right',
    left_on=['uID'],
    right_on=['eCLIP_Final_internal_accession']
).dropna(subset=['eCLIP_Final_internal_accession', 'SE_jxc_file'])
print(merged.shape)


(203, 43)


In [8]:
def get_clip_file_from_uid(uid, df=merged):
    """
    Fetch the CLIP/RNA-seq fields recorded for one uID.

    Selects the rows of ``df`` whose 'uID' column equals ``uid`` and, from the
    first matching row, returns the tuple
    (CLIP_rep1, CLIP_rep2, INPUT, eCLIP_Official_Gene_Symbol,
    eCLIP_Cell_Line, SE_jxc_file).

    Raises IndexError when no row matches ``uid``.
    """
    wanted_columns = (
        'CLIP_rep1',
        'CLIP_rep2',
        'INPUT',
        'eCLIP_Official_Gene_Symbol',
        'eCLIP_Cell_Line',
        'SE_jxc_file',
    )
    matches = df[df['uID'] == uid]
    return tuple(matches[column].values[0] for column in wanted_columns)

# Example lookup: unpack the six values returned for uID '204'.
r1, r2, i, rbp, cell, jxc_se  = get_clip_file_from_uid('204')

def get_ri_annotations_from_jxc_se(jxc, jxc_dir=ri_annotation_dir, event='ri'):
    """
    Derive the knockdown annotation paths that correspond to a junction-count file.

    jxc contains the basename of the junction counts file. The
    '.SE.MATS.JunctionCountOnly.txt' suffix of ``jxc_dir/jxc`` is swapped for
    '-included-upon-knockdown' (positive) and '-excluded-upon-knockdown'
    (negative).

    :param jxc: basename of the junction counts file
    :param jxc_dir: directory the annotation files are expected in
    :param event: accepted for backward compatibility; it does not influence
        the result (the previous ``.format(event.upper())`` calls were applied
        to strings without placeholders and therefore had no effect)
    :return: tuple (positive, negative); each entry is the derived path, or
        None when that path does not exist on disk
    """
    jxc_suffix = '.SE.MATS.JunctionCountOnly.txt'
    orig_file = os.path.join(jxc_dir, jxc)
    positive = orig_file.replace(jxc_suffix, '-included-upon-knockdown')
    negative = orig_file.replace(jxc_suffix, '-excluded-upon-knockdown')
    # Report a missing annotation as None rather than returning a dangling path.
    if not os.path.exists(positive):
        positive = None
    if not os.path.exists(negative):
        negative = None
    return positive, negative

# Resolve the annotation paths for the junction-count file looked up above;
# as the last expression in the cell, the returned tuple is displayed.
get_ri_annotations_from_jxc_se(jxc_se)

('/projects/ps-yeolab3/bay001/maps/current_annotations/ri_renamed/RBFOX2-BGHLV26-HepG2.set26-included-upon-knockdown',
 '/projects/ps-yeolab3/bay001/maps/current_annotations/ri_renamed/RBFOX2-BGHLV26-HepG2.set26-excluded-upon-knockdown')

# Plot RI

In [3]:
# Build one plot_map.py command per CLIP replicate and submit them as an
# array job, one submission per (image extension, event) pair.
clip_df = pd.read_table(clip_manifest)

events_to_annotation_dict = {
    'ri':'/projects/ps-yeolab3/bay001/maps/current_annotations/ri_renamed/'
}

img_extensions = ['png']
output_dir = '/projects/ps-yeolab3/bay001/maps/current/ri/'
pos_splicing_suffix = '-included-upon-knockdown' # positive RMATS -> more intron inclusion (upon knockdown)
neg_splicing_suffix = '-excluded-upon-knockdown' # negative RMATS -> more intron splicing (upon knockdown)

# Background annotation files are named '<cell line>-<suffix>' inside the
# event's annotation directory; only these cell lines have such files here.
known_cell_lines = ('HepG2', 'K562')
background_suffixes = [
    'constitutive-introns',
    'all-retained-introns',
    'greater-than-50-percent-spliced',
    'greater-than-50-percent-retained',
]

for img_extension in img_extensions:
    # .items() instead of the Python-2-only .iteritems(); works on both.
    for event, annotation_dir in events_to_annotation_dict.items():
        no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
        # NOTE(review): nothing below ever appends to no_rnaseq_yet, so the
        # second summary line always reports 0 — confirm whether uIDs with
        # missing annotation files were meant to be collected here.
        no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
        cmds = []
        for uid in clip_df['uID']:
            r1, r2, i, rbp, cell = m.get_clip_file_from_uid(clip_df, uid)
            if cell in known_cell_lines:
                backgrounds = [
                    os.path.join(annotation_dir, '{}-{}'.format(cell, suffix))
                    for suffix in background_suffixes
                ]
            else:
                # Unknown cell line: report it and remember that there are no
                # backgrounds, so no command is built from undefined or stale
                # (previous-iteration) background paths.
                print(cell)
                backgrounds = None

            # NOTE(review): rnaseq_manifests is not defined in any cell visible
            # in this notebook — it must already exist in the kernel.
            splicing_prefix = m.get_rnaseq_splicing_prefix_from_rbpname(rnaseq_manifests, rbp, cell)
            if splicing_prefix == "NO_RNASEQ": # we don't have an rna seq expt for this clip
                no_rnaseq.append(uid)
                continue

            positive, negative = m.get_annotations_from_splicing_prefix(
                annotation_dir,
                splicing_prefix,
                pos_splicing_suffix=pos_splicing_suffix,
                neg_splicing_suffix=neg_splicing_suffix
            )
            # Both knockdown annotations and the backgrounds are required.
            if positive is None or negative is None or backgrounds is None:
                continue

            # The annotation basenames start with the RBP name; warn when the
            # annotation picked up does not look like it belongs to this RBP.
            pos_prefix = os.path.basename(positive).split('-')[0]
            neg_prefix = os.path.basename(negative).split('-')[0]
            if not (pos_prefix in rbp and neg_prefix in rbp):
                print(
                    'warning, these dont match: {}, {}, {}'.format(
                        rbp,
                        os.path.basename(positive),
                        os.path.basename(negative)
                    )
                )

            annotations = [positive, negative] + backgrounds
            annotation_types = ['rmats', 'rmats'] + ['eric'] * len(backgrounds)
            for ip_bam in [r1, r2]:
                name = os.path.basename(ip_bam).replace('.bam', '.{}'.format(img_extension))
                output_filename = os.path.join(output_dir, name)
                cmd_parts = [
                    "python " + density_runner,
                    "--event {}".format(event),
                    "--ipbam {}".format(ip_bam),
                    "--inputbam {}".format(i),
                    "--output {}".format(output_filename),
                    "--annotations {}".format(' '.join(annotations)),
                    "--annotation_type {}".format(' '.join(annotation_types)),
                    "--testnum {} {}".format(0, 1),
                    "--bgnum {}".format(3),
                    "--sigtest mannwhitneyu",
                ]
                cmds.append(' '.join(cmd_parts))

        # Submit inside the event loop: cmds is rebuilt for every event, so
        # submitting after the loop would only send the last event's commands.
        Submitter(
            cmds,
            "{}_NR_{}".format(event, img_extension),
            sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}_NR_{}.sh'.format(
                event, img_extension
            ),
            submit=True,
            array=True,
            queue='home-yeo',
            walltime='2:00:00'
        )
        print("{} uIDs for which we don't have splicing data for: {}".format(event, len(no_rnaseq)))
        print("{} uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(event, len(no_rnaseq_yet)))



Writing 280 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/ri_NR_png.sh.


ri uIDs for which we don't have splicing data for: 23
ri uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: 0


Submitted script to queue home.
 Job ID: 10326987
