# Wrapper notebook for submitting the RBP maps script to TSCC

In [1]:
import pandas as pd
import os
import json
import yaml
import glob
# import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from encode import manifest_helpers as m

from tqdm import tnrange, tqdm_notebook
pd.set_option("display.max_colwidth", 10000)


# Define manifests, directories, etc.
- In the K562.csv list, SBDS-BGKLV24_2-K562 and PPIL4-BGKLV24_2-K562 were renamed to SBDS-BGKLV24-K562 and PPIL4-BGKLV24-K562, per an email discussion with Xintao.

In [2]:
# Date stamp for this run (not referenced by the other cells shown in this notebook).
current_date = '6-2-2017'

# Reference data and the plotting script that every generated command invokes.
chrom_sizes = '/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes'
density_runner = '/home/bay001/projects/codebase/stable/rbp-maps/maps/plot_density.py'

# eCLIP manifest plus one RNA-seq manifest per cell line.
clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt'
k562_rnaseq_manifest = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_K562.csv'
hepg2_rnaseq_manifest = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_HepG2.csv'

# Lookup of RNA-seq manifest path keyed by cell-line name.
rnaseq_manifests = dict(HepG2=hepg2_rnaseq_manifest, K562=k562_rnaseq_manifest)

# Plot RI

In [5]:
# Build one plot_density.py command per eCLIP IP replicate for every event type
# in `events_to_annotation_dict`, then hand the commands to qtools' Submitter as
# one array-job script per (event, image format).
clip_df = pd.read_table(clip_manifest)

# event name -> directory holding that event's annotation files
events_to_annotation_dict = {
    'ri':'/projects/ps-yeolab3/bay001/maps/current_annotations/ri_renamed/'
}

img_extensions = ['png','svg']
# out_base = '/projects/ps-yeolab3/bay001/maps/current/'
out_base = '/home/bay001/projects/encode/analysis/tests/rbp_map_tests/ri'
pos_splicing_suffix = '-included-upon-knockdown' # positive RMATS -> more intron inclusion (upon knockdown)
neg_splicing_suffix = '-excluded-upon-knockdown' # negative RMATS -> more intron splicing (upon knockdown)

# Background annotation file names (relative to the event's annotation dir),
# keyed by the cell line they belong to. Order matters: it must line up with the
# four trailing 'eric' entries passed to --annotation_type below.
background_names = {
    'HepG2': [
        'HepG2-constitutive-introns',
        'HepG2-all-retained-introns',
        'HepG2-greater-than-50-percent-spliced',
        'HepG2-greater-than-50-percent-retained',
    ],
    'K562': [
        'K562-constitutive-introns',
        'K562-all-retained-introns',
        'K562-greater-than-50-percent-spliced',
        'K562-greater-than-50-percent-retained',
    ],
}

# .items() (rather than the Python-2-only .iteritems()) runs on both Python 2 and 3.
for event, annotation_dir in events_to_annotation_dict.items():
    no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
    # uIDs for which we have an expt id, but haven't downloaded the data yet.
    # NOTE(review): nothing in this cell ever appends to this list, so the
    # second summary line below always reports 0.
    no_rnaseq_yet = []
    output_dir = os.path.join(out_base, '{}_nr'.format(event))
    # One command list per image format, filled in manifest order.
    cmds_by_extension = dict((ext, []) for ext in img_extensions)

    for uid in clip_df['uID']:
        # r1/r2 are passed as --ipbam and i as --inputbam further down.
        r1, r2, i, rbp, cell = m.get_clip_file_from_uid(clip_df, uid)

        cell_backgrounds = background_names.get(cell)
        if cell_backgrounds is None:
            # Previously this case only printed the cell line and carried on,
            # which either raised NameError (first uID) or silently reused the
            # previous uID's background files. Skip the uID instead; it is not
            # counted in `no_rnaseq`.
            print('skipping uID {}: unrecognized cell line {}'.format(uid, cell))
            continue
        backgrounds = [os.path.join(annotation_dir, fn) for fn in cell_backgrounds]

        splicing_prefix = m.get_rnaseq_splicing_prefix_from_rbpname(rnaseq_manifests, rbp, cell)
        if splicing_prefix == "NO_RNASEQ": # we don't have an rna seq expt for this clip
            no_rnaseq.append(uid)
            continue

        positive, negative = m.get_annotations_from_splicing_prefix(
            annotation_dir,
            splicing_prefix,
            pos_splicing_suffix=pos_splicing_suffix,
            neg_splicing_suffix=neg_splicing_suffix
        )
        if positive is None or negative is None:
            # Both annotation files are required to build a command.
            continue

        # Sanity check: the first dash-separated token of each annotation file
        # name should occur inside the RBP name from the CLIP manifest.
        pos_prefix = os.path.basename(positive).split('-')[0]
        neg_prefix = os.path.basename(negative).split('-')[0]
        if not (pos_prefix in rbp and neg_prefix in rbp):
            print(
                'warning, these dont match: {}, {}, {}'.format(
                    rbp,
                    os.path.basename(positive),
                    os.path.basename(negative)
                )
            )

        annotations = [positive, negative] + backgrounds
        for img_extension in img_extensions:
            for ip_bam in [r1, r2]:
                # Output image is named after the IP bam, with the extension swapped.
                name = os.path.basename(ip_bam).replace('.bam','.{}'.format(img_extension))
                output_filename = os.path.join(output_dir, name)
                cmd = "python " + density_runner
                cmd = cmd + " --event {}".format(event)
                cmd = cmd + " --ipbam {}".format(ip_bam)
                cmd = cmd + " --inputbam {}".format(i)
                cmd = cmd + " --output {}".format(output_filename)
                cmd = cmd + " --annotations {}".format(" ".join(annotations))
                cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                    'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric',
                )
                cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                cmds_by_extension[img_extension].append(cmd)

    # Submit inside the event loop so every event gets its own scripts (the
    # call used to sit outside it, so only the last event's commands were used).
    for img_extension in img_extensions:
        cmds = cmds_by_extension[img_extension]
        # NOTE(review): submit=False presumably writes the script without
        # queueing it -- confirm against qtools.
        Submitter(
            cmds,
            "{}_NR_{}".format(event, img_extension),
            sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}_NR_{}.sh'.format(
                event, img_extension
            ),
            submit=False,
            array=True,
            walltime='2:00:00'
        )

    print("{} uIDs for which we don't have splicing data for: {}".format(event, len(no_rnaseq)))
    print("{} uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(event, len(no_rnaseq_yet)))



Writing 280 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/ri_NR_png.sh.


ri uIDs for which we don't have splicing data for: 23
ri uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: 0


Writing 280 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/ri_NR_svg.sh.


# Verify RBP SE Maps
- Check whether we have all the maps, i.e. both a positive and a negative SE annotation file for each RBP.

In [None]:
# Re-read the eCLIP manifest and carve out one sub-frame per cell line, using
# the manifest's 'Cell line' column.
clip_df = pd.read_table(clip_manifest)
is_hepg2 = clip_df['Cell line'] == 'HepG2'
is_k562 = clip_df['Cell line'] == 'K562'
hepg2_clip_df = clip_df.loc[is_hepg2]
k562_clip_df = clip_df.loc[is_k562]

# RNA-seq manifests, one per cell line.
# NOTE(review): these paths end in .csv but are read with read_table's default
# separator -- confirm the files are tab-delimited.
k562_rnaseq_df = pd.read_table(rnaseq_manifests['K562'])
hepg2_rnaseq_df = pd.read_table(rnaseq_manifests['HepG2'])

In [None]:
# Join each cell line's CLIP rows to its RNA-seq rows on the RBP name
# ('RBP' on the CLIP side, 'Official_RBP' on the RNA-seq side). The join is
# outer, but the trailing dropna() then discards every row that has a missing
# value in any column.
merge_options = dict(how='outer', left_on='RBP', right_on='Official_RBP')
hepg2_rbps = hepg2_clip_df.merge(hepg2_rnaseq_df, **merge_options).dropna()
k562_rbps = k562_clip_df.merge(k562_rnaseq_df, **merge_options).dropna()

In [None]:
# Collect the non-redundant ('.nr.txt') positive and negative SE annotation
# files currently on disk.
pos_pattern = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.positive.nr.txt'
neg_pattern = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.negative.nr.txt'
pos_annotations = glob.glob(pos_pattern)
neg_annotations = glob.glob(neg_pattern)

def get_rbp(f):
    """Return a '<first>-<third>' key built from a file's basename.

    The basename of ``f`` is split on '-' and its first and third fields are
    joined with a dash (presumably RBP name and cell line -- confirm against
    the annotation naming scheme). Raises IndexError when the basename has
    fewer than three dash-separated fields.
    """
    fields = os.path.basename(f).split('-')
    first_field, third_field = fields[0], fields[2]
    return '-'.join([first_field, third_field])

In [None]:
# Unique get_rbp() keys seen among the positive and the negative annotation files.
pos_rbp = {get_rbp(path) for path in pos_annotations}
neg_rbp = {get_rbp(path) for path in neg_annotations}

In [None]:
# Keys that have both a positive and a negative annotation file.
allrbp = pos_rbp & neg_rbp
n_annotated = len(allrbp)
print("We have annotations for: {} rbps".format(n_annotated))