# Wrapper notebook for submitting the RBP maps script to TSCC

In [1]:
import pandas as pd
import os
import json
import yaml
import glob
# import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from encode import manifest_helpers as m

from tqdm import tnrange, tqdm_notebook
pd.set_option("display.max_colwidth", 10000)

# Define manifests, directories, etc.
- SBDS-BGKLV24-K562 and PPIL4-BGKLV24-K562 were changed in the K562.csv list from SBDS-BGKLV24_2-K562 and PPIL4-BGKLV24_2-K562 per an email discussion from xintao.

In [16]:
current_date = '11-07-2018'
clip_manifest = '/projects/ps-yeolab3/bay001/reference_data/ENCODE/ENCODE_FINAL_ANNOTATIONS.uidsonly.txt.manifest.txt'
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_map.py'
a3ss_annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss_renamed'
a5ss_annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/a5ss_renamed'

clip_df = pd.read_table(clip_manifest)

master_table = pd.read_table(
    '/projects/ps-yeolab3/encode/rnaseq/eCLIP_finalstatus_20180406_ENCODE_combined_RNASEQ.tsv'
)
del master_table['eCLIP_Antibody_Lot_#']  # there is a NaN value in here that's screwing things up. Don't care about antibody anyway, so delete
merged = pd.merge(
    clip_df,
    master_table,
    how='right',
    left_on=['uID'],
    right_on=['eCLIP_Final_internal_accession']
)
merged.dropna(subset=['eCLIP_Final_internal_accession','SE_jxc_file'], inplace=True)
print(merged.shape)
print(clip_df.columns)
merged[merged['uID']=='241']

(203, 43)
Index([u'uID', u'RBP', u'Cellline', u'CLIP_rep1', u'CLIP_rep2', u'INPUT'], dtype='object')


Unnamed: 0,uID,RBP,Cellline,CLIP_rep1,CLIP_rep2,INPUT,eCLIP_uID,eCLIP_Official_Gene_Symbol,eCLIP_Final_internal_accession,eCLIP_Submission_status,...,RNASEQ_BAM_rep2,RNASEQ_TSV_rep2,Rep_rep2,CONTROL_Replicate_rep2,CONTROL_FASTQ_R1_rep2,CONTROL_FASTQ_R2_rep2,CONTROL_BAM_rep2,CONTROL_TSV_rep2,SE_jxc_file,RNASEQ_DESeq2


# Plot the A3SS/A5SS splice events (positive and negative and controls) all together

In [3]:
a3ss_control_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/nonredundant_renamed/a3ss/'
a5ss_control_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/nonredundant_renamed/a5ss/'

a5ss_k562_all = 'K562-all-native-a5ss-events'
a5ss_k562_basic = 'K562-shorter-isoform-in-majority-of-controls'
a5ss_k562_center = 'K562-mixed-psi-isoform-in-majority-of-controls'
a5ss_k562_extension = 'K562-longer-isoform-in-majority-of-controls'

a3ss_k562_all = 'K562-all-native-a3ss-events'
a3ss_k562_basic = 'K562-shorter-isoform-in-majority-of-controls'
a3ss_k562_center = 'K562-mixed-psi-isoform-in-majority-of-controls'
a3ss_k562_extension = 'K562-longer-isoform-in-majority-of-controls'

a5ss_hepg2_all = 'HepG2-all-native-a5ss-events'
a5ss_hepg2_basic = 'HepG2-shorter-isoform-in-majority-of-controls'
a5ss_hepg2_center = 'HepG2-mixed-psi-isoform-in-majority-of-controls'
a5ss_hepg2_extension = 'HepG2-longer-isoform-in-majority-of-controls'

a3ss_hepg2_all = 'HepG2-all-native-a3ss-events'
a3ss_hepg2_basic = 'HepG2-shorter-isoform-in-majority-of-controls'
a3ss_hepg2_center = 'HepG2-mixed-psi-isoform-in-majority-of-controls'
a3ss_hepg2_extension = 'HepG2-longer-isoform-in-majority-of-controls'

In [18]:
def get_clip_file_from_uid(uid, df=merged):
    df = df[df['uID']==uid]
    return df['CLIP_rep1'].values[0], \
            df['CLIP_rep2'].values[0], \
            df['INPUT'].values[0], \
            df['eCLIP_Official_Gene_Symbol'].values[0], \
            df['eCLIP_Cell_Line'].values[0], \
            df['SE_jxc_file'].values[0]

r1, r2, i, rbp, cell, jxc_se  = get_clip_file_from_uid('204')

def get_altss_annotations_from_jxc_se(jxc, jxc_dir=a3ss_annotation_dir, event='a3ss'):
    """ jxc contains the basename of the junction counts file """
    orig_file = os.path.join(jxc_dir, jxc)
    positive = orig_file.replace('SE.MATS.JunctionCountOnly.txt','{}longer-isoform-included-upon-knockdown'.format(event.upper()))
    negative = orig_file.replace('SE.MATS.JunctionCountOnly.txt','{}shorter-isoform-included-upon-knockdown'.format(event.upper()))
    if not os.path.exists(positive):
        positive = None
    if not os.path.exists(negative):
        negative = None
    return positive, negative

get_altss_annotations_from_jxc_se(jxc_se)

('/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss_renamed/RBFOX2-BGHLV26-HepG2.set26.A3SSlonger-isoform-included-upon-knockdown',
 '/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss_renamed/RBFOX2-BGHLV26-HepG2.set26.A3SSshorter-isoform-included-upon-knockdown')

In [19]:

events = {
    'a3ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss_renamed/',
    'a5ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a5ss_renamed/',
}

img_extensions = ['svg']
out_base = '/projects/ps-yeolab3/bay001/maps/current/'
norm_method = 1

for event, annotation_dir in events.iteritems(): # for each annotation
    for img_extension in img_extensions: # for each image extension
        no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
        no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
        cmds = []
        sf_cmds = []
        output_dir = os.path.join(out_base, '{}'.format(event))
        for uid in merged['uID']:
            
            r1, r2, i, rbp, cell, jxc_se = get_clip_file_from_uid(uid, merged)
            
            if cell == 'K562':
                if event == 'a3ss':
                    background_all = os.path.join(a3ss_control_dir, a3ss_k562_all)
                    background_basic = os.path.join(a3ss_control_dir, a3ss_k562_basic)
                    background_center = os.path.join(a3ss_control_dir, a3ss_k562_center)
                    background_extension = os.path.join(a3ss_control_dir, a3ss_k562_extension)
                elif event == 'a5ss':
                    background_all = os.path.join(a5ss_control_dir, a5ss_k562_all)
                    background_basic = os.path.join(a5ss_control_dir, a5ss_k562_basic)
                    background_center = os.path.join(a5ss_control_dir, a5ss_k562_center)
                    background_extension = os.path.join(a5ss_control_dir, a5ss_k562_extension)
                else:
                    print(event)
            elif cell == 'HepG2':
                if event == 'a3ss':
                    background_all = os.path.join(a3ss_control_dir, a3ss_hepg2_all)
                    background_basic = os.path.join(a3ss_control_dir, a3ss_hepg2_basic)
                    background_center = os.path.join(a3ss_control_dir, a3ss_hepg2_center)
                    background_extension = os.path.join(a3ss_control_dir, a3ss_hepg2_extension)
                elif event == 'a5ss':
                    background_all = os.path.join(a5ss_control_dir, a5ss_hepg2_all)
                    background_basic = os.path.join(a5ss_control_dir, a5ss_hepg2_basic)
                    background_center = os.path.join(a5ss_control_dir, a5ss_hepg2_center)
                    background_extension = os.path.join(a5ss_control_dir, a5ss_hepg2_extension)
                else:
                    print(event)
            else:
                print(cell)


            positive, negative = get_altss_annotations_from_jxc_se(
                jxc_se, annotation_dir, event
            )
            if(positive == None or negative == None):
                no_rnaseq_yet.append(uid)
            else:
                if not (rbp in positive and rbp in negative):
                    print(
                        'warning, these dont match: {}, {}, {}'.format(
                            rbp, 
                            os.path.basename(positive),
                            os.path.basename(negative)
                        )
                    )
                pos_prefix = os.path.basename(positive).split('-')[0]
                neg_prefix = os.path.basename(negative).split('-')[0]
                if not (pos_prefix in rbp and neg_prefix in rbp):
                    print(
                        'warning, these dont match: {}, {}, {}'.format(
                            rbp, 
                            os.path.basename(positive),
                            os.path.basename(negative)
                        )
                    )
                for r in [r1, r2]:
                    name = os.path.basename(r).replace('.bam','.{}'.format(img_extension))
                    output_filename = os.path.join(
                        output_dir,
                        name
                    )
                    cmd = "python " + density_runner
                    cmd = cmd + " --event {}".format(event)
                    cmd = cmd + " --ipbam {}".format(r)
                    cmd = cmd + " --inputbam {}".format(i)
                    cmd = cmd + " --output {}".format(output_filename)
                    if positive is not None and negative is not None:
                        cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                            positive, negative, background_all, background_basic, background_center, background_extension
                        )
                        cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                            'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric'
                        )
                    # cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                    cmd = cmd + " --bgnum {}".format(2)
                    cmd = cmd + " --testnum {} {}".format(0, 1)
                    cmd = cmd + " --normalization {}".format(norm_method)
                    cmd = cmd + " --sigtest {}".format('permutation')
                    if not os.path.exists(output_filename):
                        cmds.append(cmd)
        bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/{}_NR_{}.sh'.format(
            current_date, event, img_extension
        )
        Submitter(
            cmds, 
            "{}_NR_{}".format(event, img_extension), 
            sh=bash_script_sh,
            submit=False,
            array=True,
            walltime='2:00:00',
            queue='home-yeo'
        )
        with open(bash_script_sh.replace('.sh','.missing.txt'), 'w') as o:
            for no in no_rnaseq:
                
                o.write(
                    '{}\t{}\n'.format(
                        get_clip_file_from_uid(uid=no, df=merged)[3],
                        get_clip_file_from_uid(uid=no, df=merged)[4],
                    )
                )
            print("\n\nNO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:")
            for no in no_rnaseq_yet:
                print(get_clip_file_from_uid(uid=no, df=merged)[3]),

In [6]:
with open('/projects/ps-yeolab3/bay001/maps/bash_scripts/8-15-2018/sf_cmds.sh', 'w') as f:
    for cmd in sf_cmds:
        f.write(
            cmd.replace(
                '/projects/ps-yeolab3/bay001/maps/current/a3ss/',
                '/home/bay001/projects/codebase/rbp-maps/examples/a3ss/outputs_subfirst/'
            ) + '\n'
        )

# Ensure we have all of the maps for integrated paper.

In [7]:
annotation_dir = a3ss_annotation_dir
event = 'a3ss'
output_dir = os.path.join(out_base, '{}'.format(event))
ext = 'png'
for uid in merged['uID']:
    r1, r2, i, rbp, cell, jxc_se = get_clip_file_from_uid(uid)
    positive, negative = get_altss_annotations_from_jxc_se(
        jxc_se, annotation_dir, event
    )
    if(positive == None or negative == None):
        pass
    else:
        pdf = pd.read_table(positive)
        ndf = pd.read_table(negative)

        if(pdf.shape[0] >= 25 and ndf.shape[0] >= 25):
            means = glob.glob(
                os.path.join(
                    output_dir,
                    os.path.basename(r).replace('.bam','*.means.txt')
                )
            )
            
            for r in [r1, r2]:
                name = os.path.basename(r).replace('.bam','.{}'.format(ext))
                output_filename = os.path.join(
                    output_dir,
                    name
                )
                if not os.path.exists(output_filename):
                    print("{} {} doesnt exist".format(output_filename, jxc_se))
                if len(means) != 6:
                    print("missing means (found {})".format(means))

/projects/ps-yeolab3/bay001/maps/current/a3ss/206_01_HNRNPK.merged.r2.png HNRNPK-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a3ss/206_02_HNRNPK.merged.r2.png HNRNPK-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a3ss/228_01_SF3B4.merged.r2.png SF3B4-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a3ss/228_02_SF3B4.merged.r2.png SF3B4-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a3ss/242_01_U2AF2.merged.r2.png U2AF2-LV08-K562.set10.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a3ss/242_02_U2AF2.merged.r2.png U2AF2-LV08-K562.set10.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a3ss/244_01_U2AF1.merged.r2.png U2AF1-LV08-K562.set10.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-y

In [8]:
annotation_dir = a5ss_annotation_dir
event = 'a5ss'
output_dir = os.path.join(out_base, '{}'.format(event))
ext = 'png'
for uid in merged['uID']:
    r1, r2, i, rbp, cell, jxc_se = get_clip_file_from_uid(uid)
    positive, negative = get_altss_annotations_from_jxc_se(
        jxc_se, annotation_dir, event
    )
    if(positive == None or negative == None):
        pass
    else:
        pdf = pd.read_table(positive)
        ndf = pd.read_table(negative)

        if(pdf.shape[0] >= 25 and ndf.shape[0] >= 25):
            means = glob.glob(
                os.path.join(
                    output_dir,
                    os.path.basename(r).replace('.bam','*.means.txt')
                )
            )
            for r in [r1, r2]:
                name = os.path.basename(r).replace('.bam','.{}'.format(ext))
                output_filename = os.path.join(
                    output_dir,
                    name
                )
                if not os.path.exists(output_filename):
                    print("{} {} doesnt exist".format(output_filename, jxc_se))
                if len(means) != 6:
                    print("missing means (found {})".format(means))

/projects/ps-yeolab3/bay001/maps/current/a5ss/206_01_HNRNPK.merged.r2.png HNRNPK-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a5ss/206_02_HNRNPK.merged.r2.png HNRNPK-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a5ss/216_01_SRSF9.merged.r2.png SRSF9-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a5ss/216_02_SRSF9.merged.r2.png SRSF9-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a5ss/228_01_SF3B4.merged.r2.png SF3B4-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a5ss/228_02_SF3B4.merged.r2.png SF3B4-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.txt doesnt exist
/projects/ps-yeolab3/bay001/maps/current/a5ss/242_01_U2AF2.merged.r2.png U2AF2-LV08-K562.set10.SE.MATS.JunctionCountOnly.txt doesnt exist
/proje

# Run the peak overlaps

In [15]:
peak_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se_peak_idr_bigbeds'
events = {
    'a3ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss_renamed/',
    'a5ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a5ss_renamed/',
}

img_extensions = ['svg']
out_base = '/projects/ps-yeolab3/bay001/maps/current/'
norm_method = 0

for event, annotation_dir in events.iteritems(): # for each annotation
    for img_extension in img_extensions: # for each image extension
        no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
        no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
        cmds = []
        sf_cmds = []
        output_dir = os.path.join(out_base, '{}_IDR'.format(event))
        for uid in merged['uID']:
            r1, r2, i, rbp, cell, jxc_se = get_clip_file_from_uid(uid, merged)

            if cell == 'K562':
                if event == 'a3ss':
                    background_all = os.path.join(a3ss_control_dir, a3ss_k562_all)
                    background_basic = os.path.join(a3ss_control_dir, a3ss_k562_basic)
                    background_center = os.path.join(a3ss_control_dir, a3ss_k562_center)
                    background_extension = os.path.join(a3ss_control_dir, a3ss_k562_extension)
                elif event == 'a5ss':
                    background_all = os.path.join(a5ss_control_dir, a5ss_k562_all)
                    background_basic = os.path.join(a5ss_control_dir, a5ss_k562_basic)
                    background_center = os.path.join(a5ss_control_dir, a5ss_k562_center)
                    background_extension = os.path.join(a5ss_control_dir, a5ss_k562_extension)
                else:
                    print(event)
            elif cell == 'HepG2':
                if event == 'a3ss':
                    background_all = os.path.join(a3ss_control_dir, a3ss_hepg2_all)
                    background_basic = os.path.join(a3ss_control_dir, a3ss_hepg2_basic)
                    background_center = os.path.join(a3ss_control_dir, a3ss_hepg2_center)
                    background_extension = os.path.join(a3ss_control_dir, a3ss_hepg2_extension)
                elif event == 'a5ss':
                    background_all = os.path.join(a5ss_control_dir, a5ss_hepg2_all)
                    background_basic = os.path.join(a5ss_control_dir, a5ss_hepg2_basic)
                    background_center = os.path.join(a5ss_control_dir, a5ss_hepg2_center)
                    background_extension = os.path.join(a5ss_control_dir, a5ss_hepg2_extension)
                else:
                    print(event)
            else:
                print(cell)


            positive, negative = get_altss_annotations_from_jxc_se(
                jxc_se, annotation_dir, event
            )
            if(positive == None or negative == None):
                no_rnaseq_yet.append(uid)
            else:
                if not (rbp in positive and rbp in negative):
                    print(
                        'warning, these dont match: {}, {}, {}'.format(
                            rbp, 
                            os.path.basename(positive),
                            os.path.basename(negative)
                        )
                    )
                pos_prefix = os.path.basename(positive).split('-')[0]
                neg_prefix = os.path.basename(negative).split('-')[0]
                if not (pos_prefix in rbp and neg_prefix in rbp):
                    print(
                        'warning, these dont match: {}, {}, {}'.format(
                            rbp, 
                            os.path.basename(positive),
                            os.path.basename(negative)
                        )
                    )
                peak_files = glob.glob(os.path.join(
                    peak_dir, 
                    '{}.01v02.IDR.out.0102merged.bed.blacklist_removed.bed.p0f0.bed.sorted.bed.bb'.format(
                        uid
                    )
                ))

                assert len(peak_files) == 1
                for peak in peak_files:
                    output_filename = os.path.join(
                        output_dir, 
                        os.path.basename(peak).replace(
                            '.p0f0.bed.sorted.bed.bb', '.{}'.format(img_extension)
                        )
                    )

                    cmd = "module load rbpmaps;python " + density_runner
                    cmd = cmd + " --event {}".format(event)
                    cmd = cmd + " --peak {}".format(peak)
                    cmd = cmd + " --output {}".format(output_filename)
                    if positive is not None and negative is not None:
                        cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                            positive, negative, background_all, background_basic, background_center, background_extension
                        )
                        cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                            'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric'
                        )
                    cmd = cmd + " --bgnum {}".format(2)
                    cmd = cmd + " --testnum {} {}".format(0, 1)
                    cmd = cmd + " --normalization {}".format(norm_method)
                    cmd = cmd + " --sigtest {}".format('fisher')
                    if not os.path.exists(output_filename):
                        cmds.append(cmd)
        bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/{}_NR_{}.sh'.format(
            current_date, event, img_extension
        )
        Submitter(
            cmds, 
            "{}_NR_{}".format(event, img_extension), 
            sh=bash_script_sh,
            submit=False,
            array=True,
            walltime='2:00:00',
            queue='home-yeo'
        )
        with open(bash_script_sh.replace('.sh','.missing.txt'), 'w') as o:
            for no in no_rnaseq:
                
                o.write(
                    '{}\t{}\n'.format(
                        get_clip_file_from_uid(uid=no, df=merged)[3],
                        get_clip_file_from_uid(uid=no, df=merged)[4],
                    )
                )
            print("\n\nNO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:")
            for no in no_rnaseq_yet:
                print(get_clip_file_from_uid(uid=no, df=merged)[3]),



Writing 135 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/11-07-2018/a5ss_NR_svg.sh.




NO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:


NO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:
IGF2BP1 IGF2BP1 FKBP4 XRN2 SLTM CSTF2T FAM120A SND1 XRCC6 SND1 ILF3 GTF2F1 GTF2F1 LIN28B PABPC4 WRN KHSRP EWSR1 LSM11 TROVE2 FASTKD2 QKI DDX24 EXOSC5 BUD13 TROVE2 EIF3H ZRANB2 YBX3 CSTF2 DDX6 TBRG4 DDX51 UTP18 GRWD1 DDX52 GRWD1 FASTKD2 RBFOX2 DDX52 FUS AKAP1 CPEB4 EXOSC5 SDAD1 DDX21 SUB1


Writing 156 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/11-07-2018/a3ss_NR_svg.sh.
