# Wrapper notebook for submitting the RBP maps script to TSCC

In [13]:
import pandas as pd
import os
import json
import yaml
import numpy as np
import glob
# import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from encode import manifest_helpers as m

from tqdm import tnrange, tqdm_notebook
pd.set_option("display.max_colwidth", 10000)
pd.set_option('display.max_columns', 500)

# Define programs

In [14]:
# Scripts invoked by the generated commands. Both names currently point at the
# same plot_map.py; the density cells call it with --ipbam/--inputbam and the
# peak cells call it with --peak.
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_map.py'
peak_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_map.py'

# annotation_dir: default directory searched by get_annotations_from_jxc_se().
# control_annotation_dir: directory joined with the per-cell-line background
# annotation names (e.g. 'K562_constitutive_exons') in the cells below.
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se'
control_annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/nonredundant_renamed/se/'

# Per-map-type settings, keyed by the `read_type` / `key` chosen in each cell.
#   output_dir       : directory the map output filenames are built under
#   clip_manifest    : tab-separated eCLIP manifest read with pd.read_table
#   prefix           : label used when naming the generated bash script / job
#   website_manifest : only present on the read-density entries
#   peak_dir         : only present on the peak entries; globbed for *.bb files
params = {
    'whole_read':{
        'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/se',
        'clip_manifest' : '/projects/ps-yeolab3/bay001/reference_data/ENCODE/ENCODE_FINAL_ANNOTATIONS.uidsonly.txt.manifest.txt',
        'prefix' : 'whole_read',
        'website_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.website.wholeread.txt'
    },
    '5p':{
        'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/se_5p/',
        'clip_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.5p.txt',
        'prefix' : '5p',
        'website_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.website.5p.txt'
    },
    'peak_bb':{
        'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/se_peak/',
        'clip_manifest' : '/projects/ps-yeolab3/bay001/reference_data/ENCODE/ENCODE_FINAL_ANNOTATIONS.uidsonly.txt.manifest.txt',
        'prefix' : 'peak',
        'peak_dir' : '/projects/ps-yeolab3/bay001/maps/current_annotations/se_peak_bigbeds/'
    },
    'idr_bb':{
        'output_dir':'/home/bay001/scratch/maps/se/idr/',
        # 'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/se_idr_peak/',
        'clip_manifest' : '/projects/ps-yeolab3/bay001/reference_data/ENCODE/ENCODE_FINAL_ANNOTATIONS.uidsonly.txt.manifest.txt',
        'prefix' : 'peak',
        'peak_dir' : '/projects/ps-yeolab3/bay001/maps/current_annotations/se_peak_idr_bigbeds'
    },
    'whole_read_xcompare':{
        'output_dir' : '/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/',
        # 'output_dir' : '/projects/ps-yeolab3/bay001/maps/current/se_xcompare',
        'clip_manifest' : '/projects/ps-yeolab3/bay001/reference_data/ENCODE/ENCODE_FINAL_ANNOTATIONS.uidsonly.txt.manifest.txt',
        'prefix' : 'whole_read_xcompare',
        'website_manifest' : '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.website.wholeread.txt'
    },
}

# Define manifests, directories, etc.

In [15]:
# Date label used to group the bash scripts generated by this notebook run.
current_date = '12-3-2018'
bash_scripts_dir = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}'.format(current_date)

# Create the directory only when it is missing, so re-running this cell does
# not fail with "File exists"; makedirs also creates any missing parents.
if not os.path.exists(bash_scripts_dir):
    os.makedirs(bash_scripts_dir)

mkdir: cannot create directory `/projects/ps-yeolab3/bay001/maps/bash_scripts/12-3-2018': File exists


# Define helper scripts for accessing/joining RNASEQ and eCLIP annotations

In [16]:
read_type = 'whole_read'

# eCLIP manifest for the selected read type.
clip_df = pd.read_table(params[read_type]['clip_manifest'], sep='\t')

# Combined eCLIP / RNA-seq status table.
master_table = pd.read_table(
    '/projects/ps-yeolab3/encode/rnaseq/eCLIP_finalstatus_20180406_ENCODE_combined_RNASEQ.tsv'
)
# There is a NaN value in the antibody lot column that causes problems
# downstream; the antibody is not needed here, so drop the column.
master_table = master_table.drop('eCLIP_Antibody_Lot_#', axis=1)

# Right join: keep every row of the status table, attaching the manifest
# columns wherever the manifest uID matches the internal accession.
merged = clip_df.merge(
    master_table,
    how='right',
    left_on=['uID'],
    right_on=['eCLIP_Final_internal_accession'],
)
# Keep only rows that have both an accession and an SE junction-count file.
merged = merged.dropna(subset=['eCLIP_Final_internal_accession','SE_jxc_file'])
print(merged.shape)

(203, 43)


In [17]:
# Spot-check the merged table by displaying the row(s) whose uID is '204'.
merged[merged['uID']=='204']

Unnamed: 0,uID,RBP,Cellline,CLIP_rep1,CLIP_rep2,INPUT,eCLIP_uID,eCLIP_Official_Gene_Symbol,eCLIP_Final_internal_accession,eCLIP_Submission_status,eCLIP_Antibody_Cat_#,eCLIP_Cell_Line,RNASEQ_Experiment_ID,RNASEQ_RBP,RNASEQ_Official_RBP,RNASEQ_Cell_line,RNASEQ_Control_Experiment_ID,CONTROL_RBP,CONTROL_Cell_line,RNASEQ_Replicate_rep1,RNASEQ_FASTQ_R1_rep1,RNASEQ_FASTQ_R2_rep1,RNASEQ_BAM_rep1,RNASEQ_TSV_rep1,Rep_rep1,CONTROL_Replicate_rep1,CONTROL_FASTQ_R1_rep1,CONTROL_FASTQ_R2_rep1,CONTROL_BAM_rep1,CONTROL_TSV_rep1,RNASEQ_Replicate_rep2,RNASEQ_FASTQ_R1_rep2,RNASEQ_FASTQ_R2_rep2,RNASEQ_BAM_rep2,RNASEQ_TSV_rep2,Rep_rep2,CONTROL_Replicate_rep2,CONTROL_FASTQ_R1_rep2,CONTROL_FASTQ_R2_rep2,CONTROL_BAM_rep2,CONTROL_TSV_rep2,SE_jxc_file,RNASEQ_DESeq2
1,204,RBFOX2,HepG2,/projects/ps-yeolab3/encode/analysis/encode_master/204_01_RBFOX2.merged.r2.bam,/projects/ps-yeolab3/encode/analysis/encode_master/204_02_RBFOX2.merged.r2.bam,/projects/ps-yeolab3/encode/analysis/encode_master/RBFOX2-204-INPUT_S2_R1.unassigned.adapterTrim.round2.rmRep.rmDup.sorted.r2.bam,204.0,RBFOX2,204,DONE - 2 reps submitted 6/28/15,A300-864A,HepG2,ENCSR767LLP,RBFOX2,RBFOX2,HepG2,ENCSR104ABF,non-target,HepG2,RBFOX2_BGHLV26_35,ENCFF201PEI,ENCFF508QNI,ENCFF347ERZ,ENCFF222JWL,1.0,non-target_BGHLV26-1,ENCFF291QQH,ENCFF602GIQ,ENCFF988VWE,ENCFF653XRX,RBFOX2_BGHLV26_36,ENCFF040CTS,ENCFF435HLB,ENCFF946VPZ,ENCFF042VXF,2.0,non-target_BGHLV26-2,ENCFF503VRZ,ENCFF105YHI,ENCFF893QHC,ENCFF401ECA,RBFOX2-BGHLV26-HepG2.set26.SE.MATS.JunctionCountOnly.txt,HepG2_RBFOX2_BGHLV26_DESeq_output.txt


# Plot SE Density

In [18]:
def get_clip_file_from_uid(uid, df=merged):
    """
    Look up the record for one uID in the merged eCLIP / RNA-seq table.

    Returns a 6-tuple taken from the first row whose 'uID' equals `uid`:
    (CLIP_rep1, CLIP_rep2, INPUT, eCLIP_Official_Gene_Symbol,
    eCLIP_Cell_Line, SE_jxc_file).

    Raises IndexError when no row matches.
    """
    wanted_columns = (
        'CLIP_rep1',
        'CLIP_rep2',
        'INPUT',
        'eCLIP_Official_Gene_Symbol',
        'eCLIP_Cell_Line',
        'SE_jxc_file',
    )
    matching_rows = df[df['uID']==uid]
    return tuple(matching_rows[column].values[0] for column in wanted_columns)

# Quick check of the helper on a single experiment.
r1, r2, i, rbp, cell, jxc_se  = get_clip_file_from_uid('204')

def get_annotations_from_jxc_se(jxc, jxc_dir=annotation_dir):
    """
    Derive the two splicing annotation paths that belong to a junction counts file.

    jxc contains the basename of the junction counts file; it must exist
    inside jxc_dir (asserted). Returns a (positive, negative) tuple holding the
    '-included-upon-knockdown' and '-excluded-upon-knockdown' paths, with None
    in place of any path that does not exist on disk.
    """
    orig_file = os.path.join(jxc_dir, jxc)
    assert os.path.exists(orig_file)

    # Strip the final extension once; both annotation names are built from it.
    stem = os.path.splitext(orig_file)[0]
    included = (stem + ".positive.nr.txt").replace(
        '.SE.MATS.JunctionCountOnly.positive.nr.txt','-included-upon-knockdown'
    )
    excluded = (stem + ".negative.nr.txt").replace(
        '.SE.MATS.JunctionCountOnly.negative.nr.txt','-excluded-upon-knockdown'
    )
    return (
        included if os.path.exists(included) else None,
        excluded if os.path.exists(excluded) else None,
    )

# Quick check of the helper using the junction counts file looked up above.
get_annotations_from_jxc_se(jxc_se)

('/projects/ps-yeolab3/bay001/maps/current_annotations/se/RBFOX2-BGHLV26-HepG2.set26-included-upon-knockdown',
 '/projects/ps-yeolab3/bay001/maps/current_annotations/se/RBFOX2-BGHLV26-HepG2.set26-excluded-upon-knockdown')

In [19]:
# Build one plot_map.py command per eCLIP replicate (read-density SE maps) and
# write them to an array-job bash script via qtools.Submitter (submit=False, so
# nothing is submitted from here).
img_extension = ['svg']
pos_splicing_suffix = '-included-upon-knockdown'
neg_splicing_suffix = '-excluded-upon-knockdown'

# Make sure the directory that receives the generated bash script exists.
if not os.path.exists(bash_scripts_dir):
    os.makedirs(bash_scripts_dir)

read_type = 'whole_read'
normalization_levels = [4]

### Force override of maps
force = True

### DEFINE BACKGROUNDS (THESE ARE STATIC AND DON'T CHANGE) ###
k562_background_ce = os.path.join(control_annotation_dir, 'K562_constitutive_exons')
k562_background_nse_all = os.path.join(control_annotation_dir, 'K562_native_cassette_exons_all')
k562_background_nse_inc = os.path.join(control_annotation_dir, 'K562_natively_included_cassette_exons')
k562_background_nse_exc = os.path.join(control_annotation_dir, 'K562_natively_excluded_cassette_exons')
k562_background_nse_avg = os.path.join(control_annotation_dir, 'K562_native_cassette_exons_avg')

hepg2_background_ce = os.path.join(control_annotation_dir, 'HepG2_constitutive_exons')
hepg2_background_nse_all = os.path.join(control_annotation_dir, 'HepG2_native_cassette_exons_all')
hepg2_background_nse_inc = os.path.join(control_annotation_dir, 'HepG2_natively_included_cassette_exons')
hepg2_background_nse_exc = os.path.join(control_annotation_dir, 'HepG2_natively_excluded_cassette_exons')
hepg2_background_nse_avg = os.path.join(control_annotation_dir, 'HepG2_native_cassette_exons_avg')

# Background annotations per cell line, in the order they follow the
# positive/negative files on the --annotations argument.
backgrounds_by_cell = {
    'K562': (
        k562_background_ce,
        k562_background_nse_all,
        k562_background_nse_inc,
        k562_background_nse_exc,
        k562_background_nse_avg,
    ),
    'HepG2': (
        hepg2_background_ce,
        hepg2_background_nse_all,
        hepg2_background_nse_inc,
        hepg2_background_nse_exc,
        hepg2_background_nse_avg,
    ),
}

# no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet (or there were no significant splice events)
cmds = []

for normalization_level in normalization_levels:
    for ext in img_extension:
        for uid in merged['uID']:
            r1, r2, i, rbp, cell, jxc_se = get_clip_file_from_uid(uid)

            # An unrecognised cell line used to fall through and silently reuse
            # the previous uID's backgrounds (or hit a NameError on the first
            # iteration); skip it explicitly instead.
            if cell not in backgrounds_by_cell:
                print('skipping uID {}: no backgrounds defined for cell line {}'.format(uid, cell))
                continue
            background1, background2, background3, background4, background5 = backgrounds_by_cell[cell]

            ##### get the positive and negative associated annotations using this prefix #####
            positive, negative = get_annotations_from_jxc_se(
                jxc_se
            )
            ### we HAVE to have both positive and negative annotations to plot ###
            if positive is None or negative is None:
                no_rnaseq_yet.append(uid)
                continue

            ### uses RBP name to ensure positive and negative annotations are being pulled ###
            pos_prefix = os.path.basename(positive).split('-')[0]
            neg_prefix = os.path.basename(negative).split('-')[0]
            if not (pos_prefix in rbp and neg_prefix in rbp):
                print(
                    'warning, these dont match: {}, {}, {}'.format(
                        rbp,
                        os.path.basename(positive),
                        os.path.basename(negative)
                    )
                )
            ### For each replicate, build the command used to call the python script.
            for r in [r1, r2]:
                name = os.path.basename(r).replace('.bam','.{}.{}'.format(normalization_level, ext))
                output_filename = os.path.join(
                    params[read_type]['output_dir'],
                    name
                )

                # Build the cmd line
                cmd = "python " + density_runner
                cmd = cmd + " --event {}".format('se')
                cmd = cmd + " --ipbam {}".format(r)
                cmd = cmd + " --inputbam {}".format(i)
                cmd = cmd + " --output {}".format(output_filename)
                cmd = cmd + " --annotations {} {} {} {} {} {} {}".format(
                    positive, negative, background1, background2, background3, background4, background5
                )
                cmd = cmd + " --annotation_type {} {} {} {} {} {} {}".format(
                    'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric', 'eric'
                )
                # cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                cmd = cmd + " --testnum {} {}".format(0, 1)
                cmd = cmd + " --bgnum {}".format(3) # test against native SE
                cmd = cmd + " --normalization_level {}".format(normalization_level)
                cmd = cmd + " --sigtest permutation"
                # Only (re)queue maps that are missing, unless force is set.
                if force or not os.path.exists(output_filename):
                    cmds.append(cmd)

# NOTE: `ext` and `normalization_level` are the values left over from the last
# loop iteration, so every command lands in a single script named after them.
bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/{}-SE_NR_{}.{}.sh'.format(
    current_date,
    params[read_type]['prefix'],
    ext,
    normalization_level
)

Submitter(
    cmds,
    "{}-SE_NR_{}".format(params[read_type]['prefix'], ext),
    sh=bash_script_sh,
    submit=False,
    array=True,
    walltime='3:00:00',
    queue='home-yeo'
)

### Print any missing/unavailable annotations to check over ###
print("uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(
        len(no_rnaseq_yet))
     )

uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: 10


Writing 386 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/12-3-2018/whole_read-SE_NR_svg.4.sh.


# Ensure we have made all of the maps we need for the integrated paper.

In [10]:
# Report every expected density map (and its *.means.txt files) that is missing
# from the output directory. This cell reuses `read_type` and
# `normalization_level` as left behind by an earlier cell.
ext = 'svg'
for uid in merged['uID']:
    r1, r2, i, rbp, cell, jxc_se = get_clip_file_from_uid(uid)
    positive, negative = get_annotations_from_jxc_se(jxc_se)
    # Nothing to verify unless both annotation files exist.
    if positive is None or negative is None:
        continue

    pdf = pd.read_table(positive)
    ndf = pd.read_table(negative)
    # Only experiments with at least 100 rows in both event tables are checked.
    if len(pdf) < 100 or len(ndf) < 100:
        continue

    for r in (r1, r2):
        name = os.path.basename(r).replace('.bam','.{}.{}'.format(normalization_level, ext))
        means = glob.glob(
            os.path.join(
                params[read_type]['output_dir'],
                os.path.basename(r).replace('.bam','*.means.txt')
            )
        )
        output_filename = os.path.join(params[read_type]['output_dir'], name)

        if not os.path.exists(output_filename):
            print("{} {} doesnt exist".format(output_filename, jxc_se))
        # One means file is expected per annotation passed to the map (7).
        if len(means) != 7:
            print("missing means (found {})".format(means))

# Plot SE Peaks

In [None]:
# Build one plot_map.py command per peak bigBed file (two expected per uID) and
# write them to an array-job bash script via qtools.Submitter (submit=False).
key = 'peak_bb'

img_extensions = ['svg'] #,'svg']
clip_df = pd.read_table(params[key]['clip_manifest'])

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
cmds = []
force = True

### DEFINE BACKGROUNDS (THESE ARE STATIC AND DON'T CHANGE) ###
k562_background_ce = os.path.join(control_annotation_dir, 'K562_constitutive_exons')
k562_background_nse_all = os.path.join(control_annotation_dir, 'K562_native_cassette_exons_all')
k562_background_nse_inc = os.path.join(control_annotation_dir, 'K562_natively_included_cassette_exons')
k562_background_nse_exc = os.path.join(control_annotation_dir, 'K562_natively_excluded_cassette_exons')
k562_background_nse_avg = os.path.join(control_annotation_dir, 'K562_native_cassette_exons_avg')

hepg2_background_ce = os.path.join(control_annotation_dir, 'HepG2_constitutive_exons')
hepg2_background_nse_all = os.path.join(control_annotation_dir, 'HepG2_native_cassette_exons_all')
hepg2_background_nse_inc = os.path.join(control_annotation_dir, 'HepG2_natively_included_cassette_exons')
hepg2_background_nse_exc = os.path.join(control_annotation_dir, 'HepG2_natively_excluded_cassette_exons')
hepg2_background_nse_avg = os.path.join(control_annotation_dir, 'HepG2_native_cassette_exons_avg')

# Background annotations per cell line, in the order they follow the
# positive/negative files on the --annotations argument.
backgrounds_by_cell = {
    'K562': (
        k562_background_ce,
        k562_background_nse_all,
        k562_background_nse_inc,
        k562_background_nse_exc,
        k562_background_nse_avg,
    ),
    'HepG2': (
        hepg2_background_ce,
        hepg2_background_nse_all,
        hepg2_background_nse_inc,
        hepg2_background_nse_exc,
        hepg2_background_nse_avg,
    ),
}


for ext in img_extensions:
    for uid in merged['uID']:
        peak_files = glob.glob(os.path.join(
            params[key]['peak_dir'],
            '{}_0*.basedon_{}_0*.peaks.l2inputnormnew.bed.compressed.bed.p3f3.bed.sorted.bed.bb'.format(
                uid, uid
            )
        ))
        assert len(peak_files) == 2, 'expected 2 peak files for uID {}, found {}'.format(
            uid, len(peak_files)
        )

        # Everything below depends only on the uID, so resolve it once here
        # rather than once per peak file (which recorded missing uIDs twice).
        r1, r2, i, rbp, cell, jxc_se = get_clip_file_from_uid(uid)

        # An unrecognised cell line used to fall through and silently reuse the
        # previous uID's backgrounds (or hit a NameError on the first
        # iteration); skip it explicitly instead.
        if cell not in backgrounds_by_cell:
            print('skipping uID {}: no backgrounds defined for cell line {}'.format(uid, cell))
            continue
        background1, background2, background3, background4, background5 = backgrounds_by_cell[cell]

        ##### get the positive and negative associated annotations using this prefix #####
        positive, negative = get_annotations_from_jxc_se(
            jxc_se
        )
        if positive is None or negative is None:
            no_rnaseq_yet.append(uid)
            continue

        for peak in peak_files:
            output_filename = os.path.join(
                params[key]['output_dir'],
                os.path.basename(peak).replace('.bb','.bb.{}'.format(ext))
            )

            # Build the cmd line
            cmd = "python " + peak_runner
            cmd = cmd + " --event {}".format('se')
            cmd = cmd + " --peak {}".format(peak)
            cmd = cmd + " --output {}".format(output_filename)
            cmd = cmd + " --annotations {} {} {} {} {} {} {}".format(
                positive, negative, background1, background2, background3, background4, background5
            )
            cmd = cmd + " --annotation_type {} {} {} {} {} {} {}".format(
                'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric', 'eric'
            )
            # cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
            cmd = cmd + " --testnum {} {}".format(0, 1)
            cmd = cmd + " --bgnum {}".format(3) # test against native SE
            cmd = cmd + " --normalization_level {}".format(0)
            cmd = cmd + " --sigtest {}".format("fisher")
            # Only (re)queue maps that are missing, unless force is set.
            if force or not os.path.exists(output_filename):
                cmds.append(cmd)

bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/SE_PEAK_PNGS.sh'.format(current_date)

Submitter(
    cmds,
    "SE_PEAK_PNGS",
    sh=bash_script_sh,
    submit=False,
    array=True,
    walltime='0:20:00',
    queue='home-yeo'
)

# NOTE(review): nothing in this cell appends to `no_rnaseq`, so this file is
# currently written empty; confirm whether `no_rnaseq_yet` was intended here.
with open(bash_script_sh.replace('.sh','.missing.txt'), 'w') as o:
    for no in no_rnaseq:
        clip_record = m.get_clip_file_from_uid(clip_df, no)
        o.write(
            '{}\t{}\n'.format(
                clip_record[3],
                clip_record[4],
            )
        )

print("\n\nNO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:")
for no in no_rnaseq_yet:
    print(no)

# IDR peak splice maps

In [None]:
# Build one plot_map.py command per IDR peak bigBed file (one expected per uID)
# and write them to an array-job bash script via qtools.Submitter (submit=False).
key = 'idr_bb'

img_extensions = ['svg'] #,'svg']
clip_df = pd.read_table(params[key]['clip_manifest'])

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
cmds = []
force = False

### DEFINE BACKGROUNDS (THESE ARE STATIC AND DON'T CHANGE) ###
k562_background_ce = os.path.join(control_annotation_dir, 'K562_constitutive_exons')
k562_background_nse_all = os.path.join(control_annotation_dir, 'K562_native_cassette_exons_all')
k562_background_nse_inc = os.path.join(control_annotation_dir, 'K562_natively_included_cassette_exons')
k562_background_nse_exc = os.path.join(control_annotation_dir, 'K562_natively_excluded_cassette_exons')
k562_background_nse_avg = os.path.join(control_annotation_dir, 'K562_native_cassette_exons_avg')

hepg2_background_ce = os.path.join(control_annotation_dir, 'HepG2_constitutive_exons')
hepg2_background_nse_all = os.path.join(control_annotation_dir, 'HepG2_native_cassette_exons_all')
hepg2_background_nse_inc = os.path.join(control_annotation_dir, 'HepG2_natively_included_cassette_exons')
hepg2_background_nse_exc = os.path.join(control_annotation_dir, 'HepG2_natively_excluded_cassette_exons')
hepg2_background_nse_avg = os.path.join(control_annotation_dir, 'HepG2_native_cassette_exons_avg')

# Background annotations per cell line, in the order they follow the
# positive/negative files on the --annotations argument.
backgrounds_by_cell = {
    'K562': (
        k562_background_ce,
        k562_background_nse_all,
        k562_background_nse_inc,
        k562_background_nse_exc,
        k562_background_nse_avg,
    ),
    'HepG2': (
        hepg2_background_ce,
        hepg2_background_nse_all,
        hepg2_background_nse_inc,
        hepg2_background_nse_exc,
        hepg2_background_nse_avg,
    ),
}


for ext in img_extensions:
    for uid in merged['uID']:
        peak_files = glob.glob(os.path.join(
            params[key]['peak_dir'],
            '{}.01v02.IDR.out.0102merged.bed.blacklist_removed.bed.p0f0.bed.sorted.bed.bb'.format(
                uid
            )
        ))

        assert len(peak_files) == 1, 'expected 1 IDR peak file for uID {}, found {}'.format(
            uid, len(peak_files)
        )

        # Everything below depends only on the uID, so resolve it once here
        # rather than inside the per-peak loop.
        r1, r2, i, rbp, cell, jxc_se = get_clip_file_from_uid(uid)

        # An unrecognised cell line used to fall through and silently reuse the
        # previous uID's backgrounds (or hit a NameError on the first
        # iteration); skip it explicitly instead.
        if cell not in backgrounds_by_cell:
            print('skipping uID {}: no backgrounds defined for cell line {}'.format(uid, cell))
            continue
        background1, background2, background3, background4, background5 = backgrounds_by_cell[cell]

        ##### get the positive and negative associated annotations using this prefix #####
        positive, negative = get_annotations_from_jxc_se(
            jxc_se
        )
        if positive is None or negative is None:
            no_rnaseq_yet.append(uid)
            continue

        for peak in peak_files:
            output_filename = os.path.join(
                params[key]['output_dir'],
                os.path.basename(peak).replace('.bb','.bb.{}'.format(ext))
            )

            # Build the cmd line
            cmd = "python " + peak_runner
            cmd = cmd + " --event {}".format('se')
            cmd = cmd + " --peak {}".format(peak)
            cmd = cmd + " --output {}".format(output_filename)
            cmd = cmd + " --annotations {} {} {} {} {} {} {}".format(
                positive, negative, background1, background2, background3, background4, background5
            )
            cmd = cmd + " --annotation_type {} {} {} {} {} {} {}".format(
                'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric', 'eric'
            )
            # cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
            cmd = cmd + " --testnum {} {}".format(0, 1)
            cmd = cmd + " --bgnum {}".format(3) # test against native SE
            cmd = cmd + " --normalization_level {}".format(0)
            cmd = cmd + " --sigtest {}".format("fisher")
            cmd = cmd + " --confidence 1"
            # Only (re)queue maps that are missing, unless force is set.
            if force or not os.path.exists(output_filename):
                cmds.append(cmd)

bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/SE_IDR_PNGS.sh'.format(current_date)

Submitter(
    cmds,
    "SE_IDR_SVGS",
    sh=bash_script_sh,
    submit=False,
    array=True,
    walltime='0:20:00',
    queue='home-yeo'
)

# NOTE(review): nothing in this cell appends to `no_rnaseq`, so this file is
# currently written empty; confirm whether `no_rnaseq_yet` was intended here.
with open(bash_script_sh.replace('.sh','.missing.txt'), 'w') as o:
    for no in no_rnaseq:
        clip_record = m.get_clip_file_from_uid(clip_df, no)
        o.write(
            '{}\t{}\n'.format(
                clip_record[3],
                clip_record[4],
            )
        )

print("\n\nNO SUFFICIENT POSITIVE OR NEGATIVE SIGNIFICANT ANNOTATIONS:")
for no in no_rnaseq_yet:
    print(no)

# RBPclasses
- This file contains all RBPs, not just the ones we clipped, and not just the ones with RNA SEQ. Be aware that some of these RBPs have not been clipped, and thus will not be compared.

In [28]:
# Load the RBP class table; the first rows are displayed below as a preview.
classes = pd.read_table(
    "/projects/ps-yeolab3/bay001/reference_data/ENCODE/RBPclasses_20180401.txt"
)
classes.head()

Unnamed: 0.1,Unnamed: 0,Class#,Class ID,SubmittedDueToFamily
0,SND1-HepG2,1,CDS,1
1,FXR1-K562,1,CDS,1
2,SND1-K562,1,CDS,1
3,FXR2-HepG2,1,CDS,1
4,G3BP1-HepG2,1,CDS,1


In [29]:
def get_rbp(row):
    """ Return the part of the row's 'Unnamed: 0' label before the first '-'. """
    label = row['Unnamed: 0']
    # partition() yields the text before the first '-' (or the whole label).
    rbp_name, _, _ = label.partition('-')
    return rbp_name
def get_cell(row):
    """ Return the second '-'-separated field of the row's 'Unnamed: 0' label. """
    fields = row['Unnamed: 0'].split('-')
    # Raises IndexError when the label contains no '-'.
    return fields[1]
# Split each 'Unnamed: 0' label on '-' into separate 'rbp' and 'cell' columns
# (row-wise apply), then preview the result.
classes['rbp'] = classes.apply(get_rbp, axis=1)
classes['cell'] = classes.apply(get_cell, axis=1)
classes.head()

Unnamed: 0.1,Unnamed: 0,Class#,Class ID,SubmittedDueToFamily,rbp,cell
0,SND1-HepG2,1,CDS,1,SND1,HepG2
1,FXR1-K562,1,CDS,1,FXR1,K562
2,SND1-K562,1,CDS,1,SND1,K562
3,FXR2-HepG2,1,CDS,1,FXR2,HepG2
4,G3BP1-HepG2,1,CDS,1,G3BP1,HepG2


In [30]:
def return_rbps_within_same_class(rbp, cell, df=classes):
    """
    List the 'Unnamed: 0' labels that share a binding class with the given rbp.

    The class is taken from the first row matching rbp and cell; the result
    holds the unique labels with that 'Class#' in the same cell type (in no
    particular order). Raises IndexError when no row matches rbp and cell.
    """
    in_cell = df['cell']==cell
    class_num = df[in_cell & (df['rbp']==rbp)]['Class#'].values[0]
    class_members = df[(df['Class#']==class_num) & in_cell]['Unnamed: 0']
    return list(set(class_members))
          

def get_uid_from_rbp_and_cell(rbp, cell, merged=merged):
    """
    Return the uID of the first row of `merged` matching an rbp name and cell type.

    Raises IndexError when no row matches.
    """
    # NOTE(review): this reads a 'Cell line' column, while the table preview
    # earlier in the notebook shows a 'Cellline' header; confirm the real name.
    is_match = (merged['RBP']==rbp) & (merged['Cell line']==cell)
    matching_uids = merged[is_match]['uID']
    return matching_uids.values[0]


def get_groupid(rbp, cell, df=classes):
    """
    Return the 'Class#' value of the first row of `df` matching rbp and cell.

    Raises IndexError when no row matches.
    """
    is_match = (df['cell']==cell) & (df['rbp']==rbp)
    return df.loc[is_match, 'Class#'].values[0]

def compare_rbp_against_all_others_in_class(uid, df=classes, merged=merged):
    """
    Given an rbp uID, return a list of uIDs in its same group.

    uid    : uID of the experiment to start from (looked up in `merged`)
    df     : class table with 'rbp', 'cell', 'Class#' and 'Unnamed: 0' columns
    merged : table used both to resolve `uid` and to map each class member
             back to a uID

    Returns the uIDs of every class member found in `merged`, or None when
    none of them can be resolved. Raises IndexError if `uid` itself is not in
    `merged`, or if its rbp/cell pair is not in `df`.
    """
    all_uids_to_compare = []
    _, _, _, rbp, cell, _ = get_clip_file_from_uid(uid, df=merged)
    for protein in return_rbps_within_same_class(rbp, cell, df):
        rbp_name, cell_type = protein.split('-')
        try:
            # Forward the caller's table; previously this lookup ignored the
            # `merged` argument and always used the helper's own default.
            all_uids_to_compare.append(
                get_uid_from_rbp_and_cell(rbp_name, cell_type, merged=merged)
            )
        except IndexError:
            # Best effort: class members with no matching row in `merged`
            # are left out of the comparison.
            continue
    return all_uids_to_compare if all_uids_to_compare else None
    
# Exercise the helpers on known entries; the return values of the first two
# calls are discarded, and only the last expression is displayed.
get_groupid('POLR2G','HepG2')
compare_rbp_against_all_others_in_class('203')

merged.iloc[43]['uID']

'298'

In [32]:
ext = 'png'
pos_splicing_suffix = '-included-upon-knockdown'
neg_splicing_suffix = '-excluded-upon-knockdown'

if not os.path.exists(bash_scripts_dir):
    ! mkdir $bash_scripts_dir

read_type = 'whole_read_xcompare'
normalization_level = 4


### Force override of maps
force = False

### DEFINE BACKGROUNDS (THESE ARE STATIC AND DON'T CHANGE) ###
k562_background_ce = os.path.join(control_annotation_dir, 'K562_constitutive_exons')
k562_background_nse_all = os.path.join(control_annotation_dir, 'K562_native_cassette_exons_all')
k562_background_nse_inc = os.path.join(control_annotation_dir, 'K562_natively_included_cassette_exons')
k562_background_nse_exc = os.path.join(control_annotation_dir, 'K562_natively_excluded_cassette_exons')
k562_background_nse_avg = os.path.join(control_annotation_dir, 'K562_native_cassette_exons_avg')

hepg2_background_ce = os.path.join(control_annotation_dir, 'HepG2_constitutive_exons')
hepg2_background_nse_all = os.path.join(control_annotation_dir, 'HepG2_native_cassette_exons_all')
hepg2_background_nse_inc = os.path.join(control_annotation_dir, 'HepG2_natively_included_cassette_exons')
hepg2_background_nse_exc = os.path.join(control_annotation_dir, 'HepG2_natively_excluded_cassette_exons')
hepg2_background_nse_avg = os.path.join(control_annotation_dir, 'HepG2_native_cassette_exons_avg')

# manifest for eric (clip id / rnaseq id / group # / pos / neg)
# manifest_file = '/projects/ps-yeolab3/bay001/maps/current/se_xcompare_manifest.txt'
manifest_file = '/projects/ps-yeolab3/bay001/reference_data/ENCODE/se_xcompare_manifest.txt'
o = open(manifest_file, 'w')
o.write("{}\t{}\t{}\t{}\t{}\n".format(
    "clip_id", "uID", "group_id", "pos_file_name", "neg_file_name"
))
print("number of entries: {}".format(len(set(merged['uID']))))

TOTAL_GRP_COMPARISONS = 0
TOTAL_EXPECT_MEANS = 0  # let's add up the total (expected) number of mean values we're supposed to have.
FILTERED_EXPECT_MEANS = 0  # let's add up the filtered (expected) number of mean values we're supposed to have (n >=100).
REDUNDANT_MEANS = 0 # i use a placeholder (erics data) for maps that have less than 2 annotations. Let's keep track of these.

# no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
cmds = []
print("Number of UIDs: {}".format(len(merged['uID'])))
for uid in set(sorted(merged['uID'])):
    local_cmds = "module load rbpmaps"
    local_cmds_ct = 0 # count how many local commands we have per uID
    r1, r2, i, rbp, cell, _ = get_clip_file_from_uid(uid)
    if cell == 'K562':
        background2 = k562_background_nse_all
        background5 = k562_background_nse_avg
    elif cell == 'HepG2':
        background2 = hepg2_background_nse_all
        background5 = hepg2_background_nse_avg
    else:
        print(cell)
        
    other_uids_to_compare = compare_rbp_against_all_others_in_class(uid)
    if other_uids_to_compare is not None:
        # Just a check to make sure we're making proper comparisons...
        other_rbps = []
        for x in range(min(5, len(other_uids_to_compare))):  # preview the kinds of RBPs we're comparing against
            try:
                _, _, _, other_rbp, other_cell, _ = get_clip_file_from_uid(other_uids_to_compare[x])
                other_rbps.append("{}-{}".format(other_rbp, other_cell))
            except Exception as e:
                print(len(other_uids_to_compare))
        # print("comparing {}-{} to {}..".format(rbp, cell, other_rbps))

        # end check
        
        # for each RBP to compare:
        for other_uid in other_uids_to_compare:
            TOTAL_GRP_COMPARISONS += 1
            ##### get the positive and negative associated annotations using this prefix #####
            _, _, _, _, _, jxc_se = get_clip_file_from_uid(other_uid)
            positive, negative = get_annotations_from_jxc_se(
                jxc_se
            )
            
            ### both positive and negative annotations to plot ###
            if(positive == None and negative == None):  # we don't have any significant events to plot
                # manifest for eric (clip id / rnaseq id / group # / pos / neg)
                o.write("{}\t{}\t{}\t{}\t{}\n".format(
                    os.path.basename(r),
                    os.path.basename(jxc_se),
                    get_groupid(rbp, cell),
                    "-",
                    "-",
                ))
            elif positive is None:  # we have negative events to plot
                
                # checks the shape (number of events)
                neg_prefix = os.path.basename(negative).split('-')[0]
                negdf = pd.read_table(negative)
                
                # Build the cmd line
                for r in [r1, r2]:
                    name = os.path.basename(r).replace('.bam','.compare.{}.{}.{}'.format(
                        jxc_se.split('.')[0], normalization_level, ext
                    ))
                    output_filename = os.path.join(
                        params[read_type]['output_dir'],
                        name
                    )
                    
                    cmd = "python " + density_runner
                    cmd = cmd + " --event {}".format('se')
                    cmd = cmd + " --ipbam {}".format(r)
                    cmd = cmd + " --inputbam {}".format(i)
                    cmd = cmd + " --output {}".format(output_filename)
                    cmd = cmd + " --annotations {} {}".format(
                        negative, background5
                    )
                    cmd = cmd + " --annotation_type {} {}".format(
                        'rmats', 'eric'
                    )
                    cmd = cmd + " --normalization_level {}".format(normalization_level)
                    if not os.path.exists(output_filename) or force == True:
                        # cmds.append(cmd)
                        local_cmds = local_cmds + ';{}'.format(cmd)
                        local_cmds_ct += 1
                    if (negdf.shape[0] < 100):
                            # manifest for eric (clip id / rnaseq id / group # / pos / neg)
                            o.write("{}\t{}\t{}\t{}\t{}\n".format(
                                os.path.basename(r),
                                os.path.basename(jxc_se),
                                get_groupid(rbp, cell),
                                "-",
                                "-",
                            ))
                    # Else if we have enough events, write build the commandline and write to file.
                    else:
                        # manifest for eric (clip id / rnaseq id / group # / pos / neg)
                        o.write("{}\t{}\t{}\t{}\t{}\n".format(
                            os.path.basename(r),
                            os.path.basename(jxc_se),
                            get_groupid(rbp, cell),
                            "-",
                            output_filename.replace('.{}'.format(ext), '.{}.means.txt'.format(os.path.basename(negative))),
                        ))
                        FILTERED_EXPECT_MEANS += 1
                REDUNDANT_MEANS += 2
                TOTAL_EXPECT_MEANS += 2 # negative, negative for each replicate
                
            elif negative is None:  # we have positive events to plot
                # checks the shape (number of events)
                pos_prefix = os.path.basename(positive).split('-')[0]
                posdf = pd.read_table(positive)
                # Build the cmd line
                for r in [r1, r2]:
                    name = os.path.basename(r).replace('.bam','.compare.{}.{}.{}'.format(
                        jxc_se.split('.')[0], normalization_level, ext
                    ))
                    output_filename = os.path.join(
                        params[read_type]['output_dir'],
                        name
                    )
                    cmd = "python " + density_runner
                    cmd = cmd + " --event {}".format('se')
                    cmd = cmd + " --ipbam {}".format(r)
                    cmd = cmd + " --inputbam {}".format(i)
                    cmd = cmd + " --output {}".format(output_filename)
                    cmd = cmd + " --annotations {} {}".format(
                        positive, background5
                    )
                    cmd = cmd + " --annotation_type {} {}".format(
                        'rmats', 'eric'
                    )
                    cmd = cmd + " --normalization_level {}".format(normalization_level)
                    if not os.path.exists(output_filename) or force == True:
                        # cmds.append(cmd)
                        local_cmds = local_cmds + ';{}'.format(cmd)
                        local_cmds_ct += 1
                    if (posdf.shape[0] < 100):
                            # manifest for eric (clip id / rnaseq id / group # / pos / neg)
                            o.write("{}\t{}\t{}\t{}\t{}\n".format(
                                os.path.basename(r),
                                os.path.basename(jxc_se),
                                get_groupid(rbp, cell),
                                "-",
                                "-",
                            ))
                    # Else if we have enough events, write build the commandline and write to file.
                    else:
                        # manifest for eric (clip id / rnaseq id / group # / pos / neg)
                        o.write("{}\t{}\t{}\t{}\t{}\n".format(
                            os.path.basename(r),
                            os.path.basename(jxc_se),
                            get_groupid(rbp, cell),
                            output_filename.replace('.{}'.format(ext), '.{}.means.txt'.format(os.path.basename(positive))),
                            "-"
                        ))
                        FILTERED_EXPECT_MEANS += 1
                REDUNDANT_MEANS += 2
                TOTAL_EXPECT_MEANS += 2 # negative, negative for each replicate
            else:
                ### uses RBP name to ensure positive and negative annotations are being pulled ###
                pos_prefix = os.path.basename(positive).split('-')[0]
                neg_prefix = os.path.basename(negative).split('-')[0]
                posdf = pd.read_table(positive)
                negdf = pd.read_table(negative)


                ### Foreach replicate, build teh command used to call the python script.
                # Build the cmd line
                for r in [r1, r2]:
                    name = os.path.basename(r).replace('.bam','.compare.{}.{}.{}'.format(
                        jxc_se.split('.')[0], normalization_level, ext
                    ))
                    output_filename = os.path.join(
                        params[read_type]['output_dir'],
                        name
                    )
                    # Build the cmd line
                    cmd = "python " + density_runner
                    cmd = cmd + " --event {}".format('se')
                    cmd = cmd + " --ipbam {}".format(r)
                    cmd = cmd + " --inputbam {}".format(i)
                    cmd = cmd + " --output {}".format(output_filename)
                    cmd = cmd + " --annotations {} {}".format(
                        positive, negative
                    )
                    cmd = cmd + " --annotation_type {} {}".format(
                        'rmats', 'rmats'
                    )
                    cmd = cmd + " --normalization_level {}".format(normalization_level)
                    if not os.path.exists(output_filename) or force == True:
                        # cmds.append(cmd)
                        local_cmds = local_cmds + ';{}'.format(cmd)
                        local_cmds_ct += 1
                    ### WRITE MANIFEST ###

                    # If there are not enough positive events and not enough negative events: write to file 
                    if ((posdf.shape[0] < 100) & (negdf.shape[0] < 100)):
                        # manifest for eric (clip id / rnaseq id / group # / pos / neg)
                        o.write("{}\t{}\t{}\t{}\t{}\n".format(
                            os.path.basename(r),
                            os.path.basename(jxc_se),
                            get_groupid(rbp, cell),
                            "-",
                            "-",
                        ))
                    # Else if we have enough positive events, write build the commandline and write positive to file.
                    elif ((posdf.shape[0] >= 100) & (negdf.shape[0] < 100)):
                        # manifest for eric (clip id / rnaseq id / group # / pos / neg)
                        o.write("{}\t{}\t{}\t{}\t{}\n".format(
                            os.path.basename(r),
                            os.path.basename(jxc_se),
                            get_groupid(rbp, cell),
                            output_filename.replace('.{}'.format(ext), '.{}.means.txt'.format(os.path.basename(positive))),
                            "-"
                        ))
                        FILTERED_EXPECT_MEANS += 1
                    # Else if we have enough negative events but not positive, build commandline with just negative
                    elif ((negdf.shape[0] >= 100) & (posdf.shape[0] < 100)):
                        # manifest for eric (clip id / rnaseq id / group # / pos / neg)
                        o.write("{}\t{}\t{}\t{}\t{}\n".format(
                            os.path.basename(r),
                            os.path.basename(jxc_se),
                            get_groupid(rbp, cell),
                            "-",
                            output_filename.replace('.{}'.format(ext), '.{}.means.txt'.format(os.path.basename(negative))),
                        ))
                        FILTERED_EXPECT_MEANS += 1
                    # Else if we have enough events for both, generate commandline using both
                    elif ((negdf.shape[0] >= 100) & (posdf.shape[0] >= 100)):
                        # manifest for eric (clip id / rnaseq id / group # / pos / neg)
                        o.write("{}\t{}\t{}\t{}\t{}\n".format(
                            os.path.basename(r),
                            os.path.basename(jxc_se),
                            get_groupid(rbp, cell),
                            output_filename.replace('.{}'.format(ext), '.{}.means.txt'.format(os.path.basename(positive))),
                            output_filename.replace('.{}'.format(ext), '.{}.means.txt'.format(os.path.basename(negative))),
                        ))
                        FILTERED_EXPECT_MEANS += 2
                TOTAL_EXPECT_MEANS += 4 # negative, positive for each replicate
    if local_cmds != "module load rbpmaps":
        if local_cmds.startswith(';'):
            local_cmds = local_cmds[1:]
        print("number of cmds written for {}: {}".format(uid, local_cmds_ct))
        cmds.append(local_cmds)
o.close()
# Path of the array-job script for this read type / image extension / normalization level.
bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/{}-SE_XCOMPARE2_{}.{}.sh'.format(
    current_date, 
    params[read_type]['prefix'], 
    ext,
    normalization_level
)

# Hand the per-uID command strings to qtools as an array job.
# NOTE(review): submit=False presumably only writes the script without queueing it
# (the recorded output says "Wrote commands to ...") - confirm against qtools.
Submitter(
    cmds, 
    "{}-SE_XCOMPARE2_{}".format(params[read_type]['prefix'], ext), 
    sh=bash_script_sh,
    submit=False,
    array=True,
    walltime='72:00:00',
    queue='home-yeo'
)

# Also write a plain bash script with one entry of `cmds` per line, next to the array-job script.
with open(bash_script_sh + ".slow.sh", 'w') as o:
    o.write("#!/usr/bin/env bash\n")
    for cmd in cmds:
        o.write(cmd + "\n")

### Print the bookkeeping counters accumulated while building the commands ###
print("total_grp: {}, total comparisons: {}, total (n >=100): {}, total redundant: {}".format(
        TOTAL_GRP_COMPARISONS,
        TOTAL_EXPECT_MEANS,
        FILTERED_EXPECT_MEANS,
        REDUNDANT_MEANS
        )
     )


number of entries: 203
Number of UIDs: 203
total_grp: 5223, total comparisons: 19732, total (n >=100): 7866, total redundant: 288


Writing 0 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/8-22-2018/whole_read_xcompare-SE_XCOMPARE2_png.4.sh.


In [33]:
# Regardless of how many pos/neg events there are, generate the maps for each RBP with
# just the (nSE-All and nSE-Avg) annotations, and record in a manifest where each
# replicate's means files are expected to be written.
cmds = []
manifest_file = '/projects/ps-yeolab3/bay001/reference_data/ENCODE/se_xcompare_manifest.nSE.txt'

# The context manager closes the manifest even if a lookup below raises.
with open(manifest_file, 'w') as o:
    o.write("{}\t{}\t{}\t{}\t{}\n".format(
        "clip_id", "uID", "group_id", "nSE_all_file_name", "nSE_avg_file_name"
    ))

    for uid in set(merged['uID']):
        # r1/r2 are passed as --ipbam (one command each), i as --inputbam.
        r1, r2, i, rbp, cell, _ = get_clip_file_from_uid(uid)
        if cell == 'K562':
            background2 = k562_background_nse_all
            background5 = k562_background_nse_avg
        elif cell == 'HepG2':
            background2 = hepg2_background_nse_all
            background5 = hepg2_background_nse_avg
        else:
            # Without this skip the loop would silently reuse the previous uID's
            # backgrounds (or hit a NameError on the very first iteration).
            print("skipping {}: no nSE background annotations for cell line {}".format(uid, cell))
            continue

        for r in [r1, r2]:
            name = os.path.basename(r).replace('.bam','.compare.nSE.{}.{}'.format(
                normalization_level, ext
            ))
            output_filename = os.path.join(
                params[read_type]['output_dir'],
                name
            )

            # Build the cmd line
            cmd = "python " + density_runner
            cmd = cmd + " --event {}".format('se')
            cmd = cmd + " --ipbam {}".format(r)
            cmd = cmd + " --inputbam {}".format(i)
            cmd = cmd + " --output {}".format(output_filename)
            cmd = cmd + " --annotations {} {}".format(
                background2, background5,
            )
            cmd = cmd + " --annotation_type {} {}".format(
                'eric', 'eric',
            )
            cmd = cmd + " --normalization_level {}".format(normalization_level)

            # manifest row: clip id / '-' (no rnaseq id) / group # / nSE-All means / nSE-Avg means
            o.write("{}\t{}\t{}\t{}\t{}\n".format(
                os.path.basename(r),
                '-',
                get_groupid(rbp, cell),
                output_filename.replace('.{}'.format(ext), '.{}.means.txt'.format(os.path.basename(background2))),
                output_filename.replace('.{}'.format(ext), '.{}.means.txt'.format(os.path.basename(background5))),
            ))
            # Only queue the command when the map is missing or a rebuild is forced;
            # the manifest row is written either way.
            if not os.path.exists(output_filename) or force == True:
                cmds.append(cmd)

bash_script_sh = '/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/{}-SE_XCOMPARE_NSE_{}.{}.sh'.format(
    current_date, 
    params[read_type]['prefix'], 
    ext,
    normalization_level
)

Submitter(
    cmds, 
    "{}-SE_XCOMPARE_NSE_{}".format(params[read_type]['prefix'], ext), 
    sh=bash_script_sh,
    submit=False,
    array=True,
    walltime='6:00:00',
    queue='home-yeo'
)

Writing 0 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/8-22-2018/whole_read_xcompare-SE_XCOMPARE_NSE_png.4.sh.


<qtools.submitter.Submitter at 0x2b9000b0ee90>

# Let's check to make sure we've made all se splice maps

In [12]:
# this is the total number of maps we've made. This includes both replicates, and only contains maps for expts which have at least 1 significant positive and negative event
num_maps = len(glob.glob(os.path.join(params['whole_read']['output_dir'], "*.svg")))

# Two maps (one per replicate) per RBP. Floor division keeps the RBP count an integer
# on Python 3 as well as Python 2 (plain `/` would print e.g. 193.0 on Python 3).
print("{} corresponds to {} RBPs with both eCLIP and RMATS significant events".format(num_maps, num_maps // 2))

386 corresponds to 193 RBPs with both eCLIP and RMATS significant events


In [13]:
# Ensure that we have the correct number of rmats files by re-filtering them independently:
# the functions defined in this cell redo the significance filtering from the raw
# downloads and write their outputs to a throw-away scratch directory.
orig_jxc_dir = '/projects/ps-yeolab3/encode/rnaseq/alt_splicing/graveley_rmats_current' # where we downloaded data from xintao
out_dir = '/home/bay001/scratch/encode/rmats_subset_test' # we don't need to keep any of these, just calculate the number
jxc_files = glob.glob(os.path.join(orig_jxc_dir, '*.SE.MATS.JunctionCountOnly.txt')) # all jxc SE files
# 473 is the expected download count; compare it by eye against the printed number.
print("Total number of downloaded files (should be 473): {}".format(len(jxc_files)))

def subset_jxc_only(in_file, out_file):
    """
    Calls my 'nonredundant' script to return nonoverlapping splice events.

    Builds and runs
    ``python <scripts_dir>/subset_rmats_junctioncountonly.py -i in_file -o out_file -e se``
    through the IPython shell escape, so this function only works inside
    IPython/Jupyter.

    :param in_file: path handed to the script's ``-i`` option.
    :param out_file: path handed to the script's ``-o`` option.
    :return: None; the script's exit status is not checked here.
    """
    scripts_dir = '/home/bay001/projects/codebase/bfx/pyscripts/rnaseq/'
    cmd = 'python {} '.format(
        os.path.join(scripts_dir, 'subset_rmats_junctioncountonly.py')
    )
    cmd += '-i {} '.format(in_file)
    cmd += '-o {} '.format(out_file)
    cmd += '-e {} '.format('se')
    # The paths are interpolated unquoted, so they must not contain spaces or shell metacharacters.
    ! $cmd
    
def filter_jxc_file(fn, fdr=0.1, pvalue=0.05, sep=0.05, out_dir=out_dir):
    """
    Filter one JunctionCountOnly table and count its significant SE events.

    Rows are significant when PValue < pvalue and FDR < fdr. Significant rows
    with IncLevelDifference > sep are "positive", those with
    IncLevelDifference < -sep are "negative". Every non-empty subset is saved
    under out_dir as <basename>.{significant,positive,negative}.txt; non-empty
    positive/negative subsets are additionally passed through
    subset_jxc_only() to produce <basename>.{positive,negative}.nr.txt.

    :param fn: path to a tab-separated JunctionCountOnly file.
    :param fdr: exclusive upper bound on the FDR column.
    :param pvalue: exclusive upper bound on the PValue column.
    :param sep: exclusive minimum absolute IncLevelDifference.
    :param out_dir: directory receiving the filtered tables.
    :return: tuple (n significant, n positive, n negative,
        n positive after subset_jxc_only, n negative after subset_jxc_only);
        the last two stay 0 when the corresponding subset is empty.
    """
    events = pd.read_table(fn)
    base = os.path.basename(fn)

    passes_stats = (events['PValue'] < pvalue) & (events['FDR'] < fdr)
    significant = events[passes_stats]
    if significant.shape[0] > 0:
        significant.to_csv(os.path.join(out_dir, base + '.significant.txt'), sep='\t', index=False)

    n_events = {}     # rows per direction before the nonredundant subsetting
    n_events_nr = {}  # rows per direction after the nonredundant subsetting
    # Positive is handled before negative, same as the file-writing order callers may rely on.
    for direction in ('positive', 'negative'):
        if direction == 'positive':
            in_direction = events['IncLevelDifference'] > sep
        else:
            in_direction = events['IncLevelDifference'] < -(sep)
        subset = events[passes_stats & in_direction]
        n_events[direction] = subset.shape[0]
        n_events_nr[direction] = 0

        if subset.shape[0] > 0:
            filtered_path = os.path.join(out_dir, base + '.{}.txt'.format(direction))
            nonredundant_path = os.path.join(out_dir, base + '.{}.nr.txt'.format(direction))
            subset.to_csv(filtered_path, sep='\t', index=False)
            subset_jxc_only(filtered_path, nonredundant_path)
            n_events_nr[direction] = pd.read_table(nonredundant_path).shape[0]

    return (
        significant.shape[0],
        n_events['positive'],
        n_events['negative'],
        n_events_nr['positive'],
        n_events_nr['negative'],
    )
    
def get_filtered_jxc_num(jxc_files):
    """
    Run filter_jxc_file() on every path in jxc_files.

    :param jxc_files: list of JunctionCountOnly file paths.
    :return: defaultdict keyed by file basename; each value is a dict holding
        the five counts from filter_jxc_file() under the keys
        'sig', 'pos', 'neg', 'pos_nr' and 'neg_nr'.
    """
    count_keys = ('sig', 'pos', 'neg', 'pos_nr', 'neg_nr')
    counts_by_file = defaultdict(dict)
    progress = tnrange(len(jxc_files))
    for path in jxc_files:
        # filter_jxc_file returns the counts in the same order as count_keys
        counts_by_file[os.path.basename(path)] = dict(zip(count_keys, filter_jxc_file(path)))
        progress.update(1)
    return counts_by_file

Total number of downloaded files (should be 473): 473


In [14]:
# Re-filter every downloaded JXC file; jxc_dict maps file basename -> event counts.
jxc_dict = get_filtered_jxc_num(jxc_files)

In [16]:
# make a copy of the merged manifest so we can delete stuff without worrying about modifying original dataframe
merged2 = merged.copy(deep=True)
# convert rbp+cell line into a single string that we can search/use as a unique key
merged2['code'] = merged2.apply(lambda x: '{}-{}'.format(x['RBP'], x['Cellline']), axis=1)

# JXC files that are actually needed (present in our "final" matrix); built once, not per iteration
needed_jxc_files = set(merged['SE_jxc_file'])

# check manually if these numbers match what is being printed in the figure:
# .items() works on both Python 2 and Python 3 (iteritems() is Python 2 only)
for fn, num_events in jxc_dict.items():
    if fn not in needed_jxc_files:
        continue

    # if there are 0 nonredundant events in either direction for this file, exclude this
    # rbp+cell from this study (remove the row); positive is checked first, so a file
    # lacking both directions is reported as lacking positive events.
    if num_events['pos_nr'] == 0:
        missing_direction = 'positive'
    elif num_events['neg_nr'] == 0:
        missing_direction = 'negative'
    else:
        continue

    # file names look like <RBP>-<batch>-<cell>.<set>.SE.MATS.JunctionCountOnly.txt
    rbp = fn.split('-')[0]
    cell = fn.split('-')[2].split('.')[0]
    code = '{}-{}'.format(rbp, cell)
    print('not enough {} events: {} {}'.format(missing_direction, code, fn))
    merged2 = merged2[merged2['code'] != code]

print("we have {} valid filtered+nonredundant JXC files.".format(merged2.shape[0]))

not enough negative events: GRWD1-K562 GRWD1-BGKLV21-K562.set21.SE.MATS.JunctionCountOnly.txt
not enough positive events: CSTF2T-K562 CSTF2T-BGKLV13-K562.set13.SE.MATS.JunctionCountOnly.txt
not enough positive events: AKAP1-K562 AKAP1-LV08-K562.set10.SE.MATS.JunctionCountOnly.txt
not enough positive events: EXOSC5-HepG2 EXOSC5-BGHcLV07-HepG2.set39_H.SE.MATS.JunctionCountOnly.txt
not enough positive events: LSM11-K562 LSM11-BGKLV21-K562.set21.SE.MATS.JunctionCountOnly.txt
not enough positive events: FASTKD2-K562 FASTKD2-BGKLV13-K562.set13.SE.MATS.JunctionCountOnly.txt
not enough positive events: SLTM-K562 SLTM-BGKLV13-K562.set13.SE.MATS.JunctionCountOnly.txt
not enough positive events: DDX52-K562 DDX52-BGKLV19-K562.set19.SE.MATS.JunctionCountOnly.txt
not enough positive events: FAM120A-K562 FAM120A-BGKLV19-K562.set19.SE.MATS.JunctionCountOnly.txt
not enough positive events: DDX51-K562 DDX51-BGKLV19-K562.set19.SE.MATS.JunctionCountOnly.txt
we have 193 valid filtered+nonredundant JXC file

# The numbers match up so far...
- 193 valid jxc files with eCLIP is 10 fewer than the 203 total with jxc + eCLIP.
- Double-check that those 10 samples truly don't have enough significant events to be mapped:

In [17]:
# JXC files present in the full manifest but dropped from the filtered copy.
list_of_insignificant_jxc_files = list(set(merged['SE_jxc_file']) - set(merged2['SE_jxc_file']))
print("number of invalid/insufficient samples: {}".format(len(list_of_insignificant_jxc_files)))

# Re-run the filter on each dropped file and print its counts for a manual check.
report = "fn: {} sig: {} pos: {} pos_nr: {} neg: {} neg_nr: {}"
for fn in list_of_insignificant_jxc_files:
    fn = os.path.join(orig_jxc_dir, fn)
    significant, positive, negative, positive_nr, negative_nr = filter_jxc_file(fn)
    print(report.format(os.path.basename(fn), significant, positive, positive_nr, negative, negative_nr))

number of invalid/insufficient samples: 10
fn: CSTF2T-BGKLV13-K562.set13.SE.MATS.JunctionCountOnly.txt sig: 0 pos: 0 pos_nr: 0 neg: 0 neg_nr: 0
fn: EXOSC5-BGHcLV07-HepG2.set39_H.SE.MATS.JunctionCountOnly.txt sig: 0 pos: 0 pos_nr: 0 neg: 0 neg_nr: 0
fn: DDX51-BGKLV19-K562.set19.SE.MATS.JunctionCountOnly.txt sig: 1 pos: 0 pos_nr: 0 neg: 1 neg_nr: 1
fn: DDX52-BGKLV19-K562.set19.SE.MATS.JunctionCountOnly.txt sig: 0 pos: 0 pos_nr: 0 neg: 0 neg_nr: 0
fn: AKAP1-LV08-K562.set10.SE.MATS.JunctionCountOnly.txt sig: 9 pos: 0 pos_nr: 0 neg: 7 neg_nr: 7
fn: FAM120A-BGKLV19-K562.set19.SE.MATS.JunctionCountOnly.txt sig: 1 pos: 0 pos_nr: 0 neg: 1 neg_nr: 1
fn: GRWD1-BGKLV21-K562.set21.SE.MATS.JunctionCountOnly.txt sig: 2 pos: 2 pos_nr: 2 neg: 0 neg_nr: 0
fn: LSM11-BGKLV21-K562.set21.SE.MATS.JunctionCountOnly.txt sig: 2 pos: 0 pos_nr: 0 neg: 0 neg_nr: 0
fn: FASTKD2-BGKLV13-K562.set13.SE.MATS.JunctionCountOnly.txt sig: 0 pos: 0 pos_nr: 0 neg: 0 neg_nr: 0
fn: SLTM-BGKLV13-K562.set13.SE.MATS.JunctionCountO

In [18]:
# Show the JXC files that are in `merged` but were dropped from `merged2`.
list_of_insignificant_jxc_files

['CSTF2T-BGKLV13-K562.set13.SE.MATS.JunctionCountOnly.txt',
 'EXOSC5-BGHcLV07-HepG2.set39_H.SE.MATS.JunctionCountOnly.txt',
 'DDX51-BGKLV19-K562.set19.SE.MATS.JunctionCountOnly.txt',
 'DDX52-BGKLV19-K562.set19.SE.MATS.JunctionCountOnly.txt',
 'AKAP1-LV08-K562.set10.SE.MATS.JunctionCountOnly.txt',
 'FAM120A-BGKLV19-K562.set19.SE.MATS.JunctionCountOnly.txt',
 'GRWD1-BGKLV21-K562.set21.SE.MATS.JunctionCountOnly.txt',
 'LSM11-BGKLV21-K562.set21.SE.MATS.JunctionCountOnly.txt',
 'FASTKD2-BGKLV13-K562.set13.SE.MATS.JunctionCountOnly.txt',
 'SLTM-BGKLV13-K562.set13.SE.MATS.JunctionCountOnly.txt']

# AKAP1 K562 has 9 significant events but only 7 significant negative events
- Make sure the IncLevelDifference cutoff accounts for the 2 significant events that are counted as neither positive nor negative.

In [None]:
# Recount the AKAP1 K562 events by hand, using the same cutoffs as filter_jxc_file's
# defaults (FDR < 0.1, PValue < 0.05, |IncLevelDifference| > 0.05).
AKAP1_K562 = pd.read_table(os.path.join(orig_jxc_dir, 'AKAP1-LV08-K562.set10.SE.MATS.JunctionCountOnly.txt'))

# Rows passing the statistical cutoffs, shared by all three counts below.
akap1_passes_stats = (AKAP1_K562['FDR'] < 0.1) & (AKAP1_K562['PValue'] < 0.05)

print(
    "Number passing pvalue/fdr: {}".format(
        AKAP1_K562[akap1_passes_stats].shape[0]
    )
)
print(
    "Number of sig. pos: {}".format(
        AKAP1_K562[akap1_passes_stats & (AKAP1_K562['IncLevelDifference'] > 0.05)].shape[0]
    )
)
# The original cell was missing the closing parenthesis of this print call (SyntaxError).
print(
    "Number of sig. neg: {}".format(
        AKAP1_K562[akap1_passes_stats & (AKAP1_K562['IncLevelDifference'] < -0.05)].shape[0]
    )
)

# Check splice map manifest

In [None]:
# Load the splice-map manifest and report its size and the number of distinct RNA-seq ids.
df = pd.read_table(
    '/projects/ps-yeolab3/bay001/maps/current/se_xcompare_manifest.txt'
)
print(df.shape)
# NOTE(review): the manifests shown elsewhere in this notebook name this column 'uID';
# confirm that this file really has an 'rnaseq_id' column, otherwise this raises KeyError.
print(len(set(df['rnaseq_id'])))
df

In [None]:
# Spot-check one annotation file: shapes of the positive and negative significant event tables.
df = pd.read_table(
    "/projects/ps-yeolab3/bay001/maps/current_annotations/se/HNRNPA1-BGKLV21-K562.set21.SE.MATS.JunctionCountOnly.txt"
)
# NOTE(review): these comparisons are inclusive (>=, <=) whereas filter_jxc_file uses
# strict ones (>, <); rows sitting exactly on a threshold are counted differently.
print(df[(df['IncLevelDifference']>=0.05) & (df['PValue']<=0.05) & (df['FDR']<=0.1)].shape)
print(df[(df['IncLevelDifference']<=-0.05) & (df['PValue']<=0.05) & (df['FDR']<=0.1)].shape)

# Create list of nSE control mean values

In [None]:
# Collect the nSE-All means files in se_xcompare, one list per cell line
# (the cell line is part of the file-name suffix being globbed for).
all_hepg2_files = glob.glob("/projects/ps-yeolab3/bay001/maps/current/se_xcompare/*.1.HepG2_native_cassette_exons_all.means.txt")
all_k562_files = glob.glob("/projects/ps-yeolab3/bay001/maps/current/se_xcompare/*.1.K562_native_cassette_exons_all.means.txt")
print(len(all_hepg2_files), len(all_k562_files))

In [None]:
# Keep only the means files whose path does not contain '.compare'.
all_kept_hepg2_files = [fn for fn in all_hepg2_files if '.compare' not in fn]
all_kept_k562_files = [fn for fn in all_k562_files if '.compare' not in fn]

print(len(all_kept_hepg2_files), len(all_kept_k562_files))

In [None]:
# Show the HepG2 means files that survived the '.compare' filter.
all_kept_hepg2_files

# Compare values in scratch vs permanent storage (yeolab3)

In [None]:
# Diff every excluded-upon-knockdown histogram under se_idr_peak against the
# same-named file in the scratch idr directory.
# NOTE(review): despite its name, `permanent` holds the match found under
# /home/bay001/scratch, while `excluded` is the file under /projects/ps-yeolab3.
all_excluded = glob.glob('/projects/ps-yeolab3/bay001/maps/current/se_idr_peak/*excluded-upon-knockdown.hist.txt')
print(len(all_excluded))
for excluded in all_excluded:
    permanent = glob.glob(os.path.join('/home/bay001/scratch/maps/se/idr/', os.path.basename(excluded)))
    if len(permanent) != 1:
        # no unique counterpart found: report the file instead of diffing it
        print(excluded)
    else:
        permanent = permanent[0]
        # print at most the first lines of any difference; identical files print nothing
        ! diff $permanent $excluded | head

In [None]:
all_included = glob.glob('/projects/ps-yeolab3/bay001/maps/current/se_idr_peak/*included-upon-knockdown.hist.txt')
print(len(all_excluded))
for included in all_included:
    permanent = glob.glob(os.path.join('/home/bay001/scratch/maps/se/idr/', os.path.basename(included)))
    if len(permanent) != 1:
        print(included)
    else:
        permanent = permanent[0]
        ! diff $permanent $included | head

In [None]:
# Diff every excluded-upon-knockdown histogram under se_peak against the
# same-named file in the temporary_data/peak_bb directory.
# NOTE(review): despite its name, `permanent` holds the match found under
# temporary_data, while `excluded` is the file under /projects/ps-yeolab3.
all_excluded = glob.glob('/projects/ps-yeolab3/bay001/maps/current/se_peak/*excluded-upon-knockdown.hist.txt')
print(len(all_excluded))
for excluded in all_excluded:
    permanent = glob.glob(os.path.join('/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/peak_bb/', os.path.basename(excluded)))
    if len(permanent) != 1:
        # no unique counterpart found: report the file instead of diffing it
        print(excluded)
    else:
        permanent = permanent[0]
        # print at most the first lines of any difference; identical files print nothing
        ! diff $permanent $excluded | head

In [None]:
all_excluded = glob.glob('/projects/ps-yeolab3/bay001/maps/current/se_peak/*excluded-upon-knockdown.hist.txt')
print(len(all_excluded))
for excluded in all_excluded:
    permanent = glob.glob(os.path.join('/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/peak_bb/', os.path.basename(excluded)))
    if len(permanent) != 1:
        print(excluded)
    else:
        permanent = permanent[0]
        ! diff $permanent $excluded | head

In [2]:
# Load the cross-comparison manifest; the line-count checks in the following cells read its
# pos_file_name / neg_file_name columns.
df = pd.read_table('/projects/ps-yeolab3/bay001/reference_data/ENCODE/se_xcompare_manifest.txt')
df.head()

Unnamed: 0,clip_id,uID,group_id,pos_file_name,neg_file_name
0,216_01_SRSF9.merged.r2.bam,U2AF1-BGHLV30-HepG2.set30.SE.MATS.JunctionCountOnly.txt,2,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_01_SRSF9.merged.r2.compare.U2AF1-BGHLV30-HepG2.4.U2AF1-BGHLV30-HepG2.set30-included-upon-knockdown.means.txt,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_01_SRSF9.merged.r2.compare.U2AF1-BGHLV30-HepG2.4.U2AF1-BGHLV30-HepG2.set30-excluded-upon-knockdown.means.txt
1,216_02_SRSF9.merged.r2.bam,U2AF1-BGHLV30-HepG2.set30.SE.MATS.JunctionCountOnly.txt,2,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_02_SRSF9.merged.r2.compare.U2AF1-BGHLV30-HepG2.4.U2AF1-BGHLV30-HepG2.set30-included-upon-knockdown.means.txt,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_02_SRSF9.merged.r2.compare.U2AF1-BGHLV30-HepG2.4.U2AF1-BGHLV30-HepG2.set30-excluded-upon-knockdown.means.txt
2,216_01_SRSF9.merged.r2.bam,SF3A3-BGHLV33-HepG2.set33.SE.MATS.JunctionCountOnly.txt,2,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_01_SRSF9.merged.r2.compare.SF3A3-BGHLV33-HepG2.4.SF3A3-BGHLV33-HepG2.set33-included-upon-knockdown.means.txt,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_01_SRSF9.merged.r2.compare.SF3A3-BGHLV33-HepG2.4.SF3A3-BGHLV33-HepG2.set33-excluded-upon-knockdown.means.txt
3,216_02_SRSF9.merged.r2.bam,SF3A3-BGHLV33-HepG2.set33.SE.MATS.JunctionCountOnly.txt,2,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_02_SRSF9.merged.r2.compare.SF3A3-BGHLV33-HepG2.4.SF3A3-BGHLV33-HepG2.set33-included-upon-knockdown.means.txt,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_02_SRSF9.merged.r2.compare.SF3A3-BGHLV33-HepG2.4.SF3A3-BGHLV33-HepG2.set33-excluded-upon-knockdown.means.txt
4,216_01_SRSF9.merged.r2.bam,U2AF2-BGHLV26-HepG2.set26.SE.MATS.JunctionCountOnly.txt,2,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_01_SRSF9.merged.r2.compare.U2AF2-BGHLV26-HepG2.4.U2AF2-BGHLV26-HepG2.set26-included-upon-knockdown.means.txt,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_01_SRSF9.merged.r2.compare.U2AF2-BGHLV26-HepG2.4.U2AF2-BGHLV26-HepG2.set26-excluded-upon-knockdown.means.txt


In [17]:
from tqdm import tnrange, tqdm_notebook

progress = tnrange(len(df['pos_file_name']))
for pos in df['pos_file_name']:
    if pos != '-':
        x = ! wc -l $pos
        assert 1400 == int(list(x)[0].split(' ')[0])
    progress.update(1)

^C


IndexError: list index out of range

In [16]:
# Every negative means file listed in the manifest is expected to hold exactly 1400 lines.
progress = tnrange(len(df['neg_file_name']))
for neg in df['neg_file_name']:
    if neg != '-':  # '-' is the manifest placeholder for "no means file"
        x = ! wc -l $neg
        try:
            # first space-separated token of wc's first output line is the line count
            assert 1400 == int(list(x)[0].split(' ')[0])
        except Exception as e:
            # any failure (wrong count, empty or unparsable wc output) is reported by
            # printing the raw wc output; the loop then continues with the next file
            print(x)
    progress.update(1)




In [34]:
# Load the nSE manifest; note that this rebinds `df`, replacing the manifest loaded earlier.
df = pd.read_table('/projects/ps-yeolab3/bay001/reference_data/ENCODE/se_xcompare_manifest.nSE.txt')
df.head()

Unnamed: 0,clip_id,uID,group_id,nSE_all_file_name,nSE_avg_file_name
0,216_01_SRSF9.merged.r2.bam,-,2,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_01_SRSF9.merged.r2.compare.nSE.4.HepG2_native_cassette_exons_all.means.txt,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_01_SRSF9.merged.r2.compare.nSE.4.HepG2_native_cassette_exons_avg.means.txt
1,216_02_SRSF9.merged.r2.bam,-,2,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_02_SRSF9.merged.r2.compare.nSE.4.HepG2_native_cassette_exons_all.means.txt,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/216_02_SRSF9.merged.r2.compare.nSE.4.HepG2_native_cassette_exons_avg.means.txt
2,215_01_TIA1.merged.r2.bam,-,6,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/215_01_TIA1.merged.r2.compare.nSE.4.HepG2_native_cassette_exons_all.means.txt,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/215_01_TIA1.merged.r2.compare.nSE.4.HepG2_native_cassette_exons_avg.means.txt
3,215_02_TIA1.merged.r2.bam,-,6,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/215_02_TIA1.merged.r2.compare.nSE.4.HepG2_native_cassette_exons_all.means.txt,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/215_02_TIA1.merged.r2.compare.nSE.4.HepG2_native_cassette_exons_avg.means.txt
4,211_01_IGF2BP3.merged.r2.bam,-,6,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/211_01_IGF2BP3.merged.r2.compare.nSE.4.HepG2_native_cassette_exons_all.means.txt,/home/bay001/projects/brian_rbpmaps_20180202/temporary_data/se_xcompare/211_01_IGF2BP3.merged.r2.compare.nSE.4.HepG2_native_cassette_exons_avg.means.txt


In [35]:
progress = tnrange(len(df['nSE_all_file_name']))
for a in df['nSE_all_file_name']:
    if pos != '-':
        x = ! wc -l $a
        assert 1400 == int(list(x)[0].split(' ')[0])
    progress.update(1)




In [36]:
progress = tnrange(len(df['nSE_avg_file_name']))
for a in df['nSE_avg_file_name']:
    if pos != '-':
        x = ! wc -l $a
        assert 1400 == int(list(x)[0].split(' ')[0])
    progress.update(1)