# Wrapper notebook for submitting the RBP maps script to TSCC

In [1]:
import pandas as pd
import os
import json
import yaml
import glob
# import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from encode import manifest_helpers as m

from tqdm import tnrange, tqdm_notebook
pd.set_option("display.max_colwidth", 10000)

# Define manifests, directories, etc.
- SBDS-BGKLV24-K562 and PPIL4-BGKLV24-K562 were renamed in the K562.csv list (previously SBDS-BGKLV24_2-K562 and PPIL4-BGKLV24_2-K562), per an email discussion with Xintao.

In [2]:
# Run configuration: date stamp used in the bash-script path, the CLIP
# manifest, the plotting script, and the A3SS/A5SS annotation folders.
current_date = '4-27-2018'
clip_manifest = '/projects/ps-yeolab3/bay001/reference_data/misc_ENCODE/ENCODE_FINAL_ANNOTATIONS.uidsonly.txt.manifest.txt'
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_map.py'
a3ss_annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss_renamed'
a5ss_annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/a5ss_renamed'

# Load both tables up front.
clip_df = pd.read_table(clip_manifest)
master_table = pd.read_table(
    '/projects/ps-yeolab3/encode/rnaseq/eCLIP_finalstatus_20180406_ENCODE_combined_RNASEQ.tsv'
)

# The antibody lot column contains a NaN that causes trouble downstream and
# the antibody is not needed here, so the column is removed.
del master_table['eCLIP_Antibody_Lot_#']

# Right-join the CLIP manifest onto the master table by accession, then keep
# only rows that have both an accession and an SE junction-count file.
merged = clip_df.merge(
    master_table,
    how='right',
    left_on='uID',
    right_on='eCLIP_Final_internal_accession',
).dropna(subset=['eCLIP_Final_internal_accession', 'SE_jxc_file'])
print(merged.shape)


(203, 43)


# Plot the A3SS/A5SS splice events (positive and negative and controls) all together

In [3]:
# Folders holding the non-redundant control event sets for each event type.
a3ss_control_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/nonredundant_renamed/a3ss/'
a5ss_control_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/nonredundant_renamed/a5ss/'

# Control file names. Only the "all native events" name differs between A3SS
# and A5SS; the shorter/mixed/longer names are the same string for both
# event types.

# --- K562 ---
a3ss_k562_all = 'K562-all-native-a3ss-events'
a5ss_k562_all = 'K562-all-native-a5ss-events'
a3ss_k562_basic = a5ss_k562_basic = 'K562-shorter-isoform-in-majority-of-controls'
a3ss_k562_center = a5ss_k562_center = 'K562-mixed-psi-isoform-in-majority-of-controls'
a3ss_k562_extension = a5ss_k562_extension = 'K562-longer-isoform-in-majority-of-controls'

# --- HepG2 ---
a3ss_hepg2_all = 'HepG2-all-native-a3ss-events'
a5ss_hepg2_all = 'HepG2-all-native-a5ss-events'
a3ss_hepg2_basic = a5ss_hepg2_basic = 'HepG2-shorter-isoform-in-majority-of-controls'
a3ss_hepg2_center = a5ss_hepg2_center = 'HepG2-mixed-psi-isoform-in-majority-of-controls'
a3ss_hepg2_extension = a5ss_hepg2_extension = 'HepG2-longer-isoform-in-majority-of-controls'

In [4]:
def get_clip_file_from_uid(uid, df=merged):
    """
    Return the fields of the first row of ``df`` whose 'uID' equals ``uid``.

    The result is a 6-tuple, in this order: CLIP_rep1, CLIP_rep2, INPUT,
    eCLIP_Official_Gene_Symbol, eCLIP_Cell_Line, SE_jxc_file.

    If no row matches, indexing the empty result with ``[0]`` raises
    IndexError.
    """
    wanted_columns = (
        'CLIP_rep1',
        'CLIP_rep2',
        'INPUT',
        'eCLIP_Official_Gene_Symbol',
        'eCLIP_Cell_Line',
        'SE_jxc_file',
    )
    matches = df[df['uID'] == uid]
    return tuple(matches[column].values[0] for column in wanted_columns)

# Example lookup: fetch the record for uID '204' from the default table
# (`merged`). jxc_se from this call feeds the example call further down.
r1, r2, i, rbp, cell, jxc_se  = get_clip_file_from_uid('204')

def get_altss_annotations_from_jxc_se(jxc, jxc_dir=a3ss_annotation_dir, event='a3ss'):
    """
    Derive the positive/negative annotation paths from an SE junction-count
    file name.

    jxc is the basename of the junction counts file. It is joined onto
    jxc_dir, and its 'SE.MATS.JunctionCountOnly.txt' part is replaced by the
    upper-cased event followed by the longer-isoform (positive) or
    shorter-isoform (negative) suffix.

    Returns (positive, negative); each entry is None when that path does not
    exist on disk.
    """
    se_suffix = 'SE.MATS.JunctionCountOnly.txt'
    base_path = os.path.join(jxc_dir, jxc)
    event_tag = event.upper()

    resolved = []
    for suffix_template in (
        '{}longer-isoform-included-upon-knockdown',
        '{}shorter-isoform-included-upon-knockdown',
    ):
        candidate = base_path.replace(se_suffix, suffix_template.format(event_tag))
        resolved.append(candidate if os.path.exists(candidate) else None)

    return resolved[0], resolved[1]

# Example call, left as a bare expression so the notebook displays the
# (positive, negative) paths; uses the default A3SS directory and event.
get_altss_annotations_from_jxc_se(jxc_se)

('/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss_renamed/RBFOX2-BGHLV26-HepG2.set26.A3SSlonger-isoform-included-upon-knockdown',
 '/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss_renamed/RBFOX2-BGHLV26-HepG2.set26.A3SSshorter-isoform-included-upon-knockdown')

In [5]:

# Annotation folder for each event type to plot.
events = {
    'a3ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss_renamed/',
    'a5ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a5ss_renamed/',
}

img_extensions = ['svg']
out_base = '/projects/ps-yeolab3/bay001/maps/current/'

# (event, cell line) -> (control directory, control file names). The names
# are listed in the order they are passed to --annotations:
# all, basic, center, extension.
control_sets = {
    ('a3ss', 'K562'): (
        a3ss_control_dir,
        (a3ss_k562_all, a3ss_k562_basic, a3ss_k562_center, a3ss_k562_extension),
    ),
    ('a5ss', 'K562'): (
        a5ss_control_dir,
        (a5ss_k562_all, a5ss_k562_basic, a5ss_k562_center, a5ss_k562_extension),
    ),
    ('a3ss', 'HepG2'): (
        a3ss_control_dir,
        (a3ss_hepg2_all, a3ss_hepg2_basic, a3ss_hepg2_center, a3ss_hepg2_extension),
    ),
    ('a5ss', 'HepG2'): (
        a5ss_control_dir,
        (a5ss_hepg2_all, a5ss_hepg2_basic, a5ss_hepg2_center, a5ss_hepg2_extension),
    ),
}

# .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
for event, annotation_dir in events.items(): # for each annotation
    # The output folder depends only on the event, so compute it once here.
    output_dir = os.path.join(out_base, '{}'.format(event))
    for img_extension in img_extensions: # for each image extension
        # uIDs for which we don't have rna seq expt ids for. Nothing in this
        # cell appends to it, so the .missing.txt file below is written empty.
        no_rnaseq = []
        # uIDs whose positive or negative annotation file was not found
        no_rnaseq_yet = []
        cmds = []
        for uid in merged['uID']:
            r1, r2, i, rbp, cell, jxc_se = get_clip_file_from_uid(uid, merged)

            positive, negative = get_altss_annotations_from_jxc_se(
                jxc_se, annotation_dir, event
            )
            if positive is None or negative is None:
                no_rnaseq_yet.append(uid)
                continue

            # Without a control set for this event/cell line there is nothing
            # valid to pass as background. Skip the sample explicitly rather
            # than carrying over the previous sample's background paths.
            if (event, cell) not in control_sets:
                print(
                    'skipping uID {}: no control annotations for event {} in cell line {}'.format(
                        uid, event, cell
                    )
                )
                continue
            control_dir, control_names = control_sets[(event, cell)]
            background_all, background_basic, background_center, background_extension = [
                os.path.join(control_dir, control_name) for control_name in control_names
            ]

            # Sanity check that the annotation files belong to this RBP: the
            # gene symbol must occur in both paths, and the first '-'-separated
            # token of each file name must occur in the gene symbol.
            pos_prefix = os.path.basename(positive).split('-')[0]
            neg_prefix = os.path.basename(negative).split('-')[0]
            names_match = (
                rbp in positive and rbp in negative
                and pos_prefix in rbp and neg_prefix in rbp
            )
            if not names_match:
                print(
                    'warning, these dont match: {}, {}, {}'.format(
                        rbp,
                        os.path.basename(positive),
                        os.path.basename(negative)
                    )
                )

            # One plotting command per CLIP replicate. The loop variable is
            # deliberately left as `r`.
            for r in [r1, r2]:
                name = os.path.basename(r).replace('.bam','.{}'.format(img_extension))
                output_filename = os.path.join(
                    output_dir,
                    name
                )
                # Maps that already exist are not resubmitted.
                if os.path.exists(output_filename):
                    continue
                cmd = " ".join([
                    "python " + density_runner,
                    "--event {}".format(event),
                    "--ipbam {}".format(r),
                    "--inputbam {}".format(i),
                    "--output {}".format(output_filename),
                    "--annotations {} {} {} {} {} {}".format(
                        positive, negative, background_all, background_basic, background_center, background_extension
                    ),
                    "--annotation_type {} {} {} {} {} {}".format(
                        'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric'
                    ),
                    # "--chrom_sizes {}".format(chrom_sizes),
                    "--bgnum {}".format(2),
                    "--testnum {} {}".format(0, 1),
                ])
                cmds.append(cmd)

        bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/{}_NR_{}.sh'.format(
            current_date, event, img_extension
        )
        # submit=False: Submitter is asked not to submit the job here.
        Submitter(
            cmds,
            "{}_NR_{}".format(event, img_extension),
            sh=bash_script_sh,
            submit=False,
            array=True,
            walltime='2:00:00',
            queue='home-yeo'
        )
        with open(bash_script_sh.replace('.sh','.missing.txt'), 'w') as o:
            for no in no_rnaseq:
                clip_info = m.get_clip_file_from_uid(clip_df, no)
                o.write('{}\t{}\n'.format(clip_info[3], clip_info[4]))
        print("\n\nNO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:")
        for no in no_rnaseq_yet:
            # Trailing comma: under the Python 2 print statement this keeps
            # the entries on one line, as in the recorded output.
            print(m.get_clip_file_from_uid(clip_df, no)[3:]),



Writing 274 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/4-27-2018/a5ss_NR_svg.sh.




NO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:


NO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:
(u'IGF2BP1', u'HepG2') (u'IGF2BP1', u'K562') (u'FKBP4', u'HepG2') (u'XRN2', u'K562') 

Writing 312 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/4-27-2018/a3ss_NR_svg.sh.


(u'SLTM', u'K562') (u'CSTF2T', u'HepG2') (u'FAM120A', u'HepG2') (u'SND1', u'HepG2') (u'XRCC6', u'HepG2') (u'SND1', u'K562') (u'ILF3', u'K562') (u'GTF2F1', u'K562') (u'GTF2F1', u'HepG2') (u'LIN28B', u'HepG2') (u'PABPC4', u'K562') (u'WRN', u'K562') (u'KHSRP', u'K562') (u'EWSR1', u'K562') (u'LSM11', u'K562') (u'TROVE2', u'K562') (u'FASTKD2', u'K562') (u'QKI', u'K562') (u'DDX24', u'K562') (u'EXOSC5', u'K562') (u'BUD13', u'HepG2') (u'TROVE2', u'HepG2') (u'EIF3H', u'HepG2') (u'ZRANB2', u'K562') (u'YBX3', u'K562') (u'CSTF2', u'HepG2') (u'DDX6', u'K562') (u'TBRG4', u'K562') (u'DDX51', u'K562') (u'UTP18', u'HepG2') (u'GRWD1', u'HepG2') (u'DDX52', u'HepG2') (u'GRWD1', u'K562') (u'FASTKD2', u'HepG2') (u'RBFOX2', u'K562') (u'DDX52', u'K562') (u'FUS', u'K562') (u'AKAP1', u'K562') (u'CPEB4', u'K562') (u'EXOSC5', u'HepG2') (u'SDAD1', u'HepG2') (u'DDX21', u'K562') (u'SUB1', u'HepG2')


# Ensure we have all of the maps for the integrated paper.

In [19]:
# Completeness check for the A3SS maps: for every sample whose positive and
# negative event lists both have at least 25 rows, report any replicate that
# is missing its image or does not have exactly six *.means.txt files.
annotation_dir = a3ss_annotation_dir
event = 'a3ss'
output_dir = os.path.join(out_base, '{}'.format(event))
ext = 'png'
for uid in merged['uID']:
    r1, r2, i, rbp, cell, jxc_se = get_clip_file_from_uid(uid)
    positive, negative = get_altss_annotations_from_jxc_se(
        jxc_se, annotation_dir, event
    )
    if positive is None or negative is None:
        continue

    pdf = pd.read_table(positive)
    ndf = pd.read_table(negative)
    if pdf.shape[0] < 25 or ndf.shape[0] < 25:
        continue

    for r in [r1, r2]:
        bam_name = os.path.basename(r)
        # Glob the means files for the replicate being checked. This lookup
        # has to sit inside the loop: done before it, `r` would still refer
        # to a replicate from a previous iteration.
        means = glob.glob(
            os.path.join(
                output_dir,
                bam_name.replace('.bam','*.means.txt')
            )
        )
        name = bam_name.replace('.bam','.{}'.format(ext))
        output_filename = os.path.join(
            output_dir,
            name
        )
        if not os.path.exists(output_filename):
            print("{} {} doesnt exist".format(output_filename, jxc_se))
        # Six files expected, presumably one per annotation passed to the
        # plotting script; TODO confirm.
        if len(means) != 6:
            print("missing means (found {})".format(means))

In [20]:
# Completeness check for the A5SS maps: for every sample whose positive and
# negative event lists both have at least 25 rows, report any replicate that
# is missing its image or does not have exactly six *.means.txt files.
annotation_dir = a5ss_annotation_dir
event = 'a5ss'
output_dir = os.path.join(out_base, '{}'.format(event))
ext = 'png'
for uid in merged['uID']:
    r1, r2, i, rbp, cell, jxc_se = get_clip_file_from_uid(uid)
    positive, negative = get_altss_annotations_from_jxc_se(
        jxc_se, annotation_dir, event
    )
    if positive is None or negative is None:
        continue

    pdf = pd.read_table(positive)
    ndf = pd.read_table(negative)
    if pdf.shape[0] < 25 or ndf.shape[0] < 25:
        continue

    for r in [r1, r2]:
        bam_name = os.path.basename(r)
        # Glob the means files for the replicate being checked. This lookup
        # has to sit inside the loop: done before it, `r` would still refer
        # to a replicate from a previous iteration.
        means = glob.glob(
            os.path.join(
                output_dir,
                bam_name.replace('.bam','*.means.txt')
            )
        )
        name = bam_name.replace('.bam','.{}'.format(ext))
        output_filename = os.path.join(
            output_dir,
            name
        )
        if not os.path.exists(output_filename):
            print("{} {} doesnt exist".format(output_filename, jxc_se))
        # Six files expected, presumably one per annotation passed to the
        # plotting script; TODO confirm.
        if len(means) != 6:
            print("missing means (found {})".format(means))