# Wrapper notebook for submitting the RBP maps script to TSCC
- split these up into their respective events notebooks (easier to read that way)


In [2]:
import pandas as pd
import os
import json
import yaml
import glob
import rethinkdb as r
from collections import defaultdict
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook
pd.set_option("display.max_colwidth", 10000)


In [3]:
# r.connect(host='metadata.yeolab.sdsc.edu',port=8080,user='rbpmaps',password='rbpmaps')

# Define manifests, directories, etc.
- SBDS-BGKLV24-K562 and PPIL4-BGKLV24-K562 were changed in the K562.csv list from SBDS-BGKLV24_2-K562 and PPIL4-BGKLV24_2-K562 per an email discussion from xintao.

In [6]:
current_date = '5-12-2017'
prefix = ''
hepg2_rnaseq_manifest = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_HepG2.csv'
k562_rnaseq_manifest = '/home/bay001/projects/maps_20160420/permanent_data/RNASeq_final_exp_list_K562.csv'
rnaseq_manifests = {'HepG2':hepg2_rnaseq_manifest, 'K562':k562_rnaseq_manifest}
chrom_sizes = '/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes'
density_runner = '/home/bay001/projects/codebase/stable/rbp-maps/maps/plot_density.py'
peak_runner = '/home/bay001/projects/codebase/stable/rbp-maps/maps/plot_peak.py'


def split_uid_and_rep(name):
    """
    Splits the name (ie. 204_01_RBFOX2) into: [204, 01, RBFOX2] 
    (all strings)
    """
    name = name.split('_')
    uid = name[0]
    rep = name[1]
    others = '_'.join(name[2:])
    return uid, rep, others

def get_clip_file_from_uid(clip, rbp_uid):
    """
    Returns attributes from the submitted CLIP manifest given an id
    """
    # clip = pd.read_table(manifest_file)
    rbp_df = clip[clip['uID']==rbp_uid]
    clip_rbp = rbp_df['RBP'].to_string(index=False, header=False)
    clip_celltype = rbp_df['Cell line'].to_string(index=False, header=False)
    clip_rep1 = rbp_df['CLIP_rep1'].to_string(index=False, header=False)
    clip_rep2 = rbp_df['CLIP_rep2'].to_string(index=False, header=False)
    input_rep = rbp_df['INPUT'].to_string(index=False, header=False)
    return clip_rep1, clip_rep2, input_rep, clip_rbp, clip_celltype

def get_rnaseq_splicing_prefix_from_rbpname(rnaseq_manifests, rbp_name, cell_type):
    """
    Returns the experiment ID (ie. RBFOX2-BGHLVsomething-K562) given an RBP name 
    and cell type.
    """
    manifest_filename = rnaseq_manifests[cell_type]
    df = pd.read_table(manifest_filename)
    exp = df[df['Official_RBP']==rbp_name]['EXP']
    if exp.shape[0] == 0:
        return "NO_RNASEQ"
    return exp.to_string(index=False, header=False)

def get_annotations_from_splicing_prefix(
    annotation_dir, splicing_prefix, 
    pos_splicing_suffix = '-included-upon-knockdown',
    neg_splicing_suffix = '-excluded-upon-knockdown'
):
    """
    For each RBP, we're looking specifically for positive and negative annotation.
    For PNGs, since we're putting this on a website, we wanted to name these annotations properly.
    
    Valid suffixes so far:
    
    PNG: -included-upon-knockdown / -excluded-upon-knockdown
    SVG: .positive.nr.txt / .negative.nr.txt
    PEAK: .positive.nr.miso / .negative.nr.miso
    
    """
    
    
    positive = glob.glob(
        os.path.join(annotation_dir, splicing_prefix) + "*" + pos_splicing_suffix
    )
    negative = glob.glob(
        os.path.join(annotation_dir, splicing_prefix) + "*" + neg_splicing_suffix
    )
    
    if (len(positive)==1 and len(negative)==1):
        return positive[0], negative[0]
    else:
        return None, None

def get_peak_annotations_from_splicing_prefix(annotation_dir, splicing_prefix):
    return get_annotations_from_splicing_prefix(
        annotation_dir,
        splicing_prefix,
        '.positive.nr.miso',
        '.negative.nr.miso'
    )

# Plot SE Density

In [None]:
"""pos_splicing_suffix = '.positive.nr.txt'
neg_splicing_suffix = '.negative.nr.txt'
clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt'
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/'
background_control_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/'
output_dir = '/projects/ps-yeolab3/bay001/maps/current/se_nr{}/'.format(prefix)
bash_scripts_dir = '/home/bay001/projects/maps_20160420/scripts/bash_scripts/'

k562_background_control_file = 'K562-constitutive-exons' # 'Brent_RNASEQlist_fromDCC.tsv.file_accessions_K562.hg19_V19.20161207.tsv.strict_CE_all'
hepg2_background_control_file = 'HepG2-constitutive-exons' # 'Brent_RNASEQlist_fromDCC.tsv.file_accessions_HepG2.hg19_V19.20161207.tsv.strict_CE_all'

k562_background_ce = os.path.join(background_control_dir, k562_background_control_file)
k562_background_ase = None
k562_background_nse = None
hepg2_background_ce = os.path.join(background_control_dir, hepg2_background_control_file)
hepg2_background_ase = None
hepg2_background_nse = None

img_extension = 'png'
event = 'se'

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
cmds = []
clip_df = pd.read_table(clip_manifest)

for uid in clip_df['uID']:
    
    metadata = defaultdict()
    metadata[uid] = defaultdict()
    metadata[uid]['rep1'] = defaultdict()
    metadata[uid]['rep2'] = defaultdict()
    
    
    r1, r2, i, rbp, cell = get_clip_file_from_uid(clip_df, uid)
    
    if cell == 'K562':
        background = k562_background_ce
    else:
        background = hepg2_background_ce
    splicing_prefix = get_rnaseq_splicing_prefix_from_rbpname(
        rnaseq_manifests, rbp, cell
    )
    if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
        no_rnaseq.append(uid)
    else:
        positive, negative = get_annotations_from_splicing_prefix(
            annotation_dir, splicing_prefix, 
            pos_splicing_suffix, neg_splicing_suffix
        )
    if(positive == None or negative == None):
        no_rnaseq_yet.append(uid)
    else:
        for r in [r1, r2]:
            name = os.path.basename(r).replace('.bam','.{}'.format(img_extension))
            output_filename = os.path.join(
                output_dir,
                name
            )

            # Build the cmd line
            cmd = "python " + density_runner
            cmd = cmd + " --event {}".format(event)
            cmd = cmd + " --ipbam {}".format(r)
            cmd = cmd + " --inputbam {}".format(i)
            cmd = cmd + " --output {}".format(output_filename)
            cmd = cmd + " --annotations {} {} {}".format(
                positive, negative, background
            )
            cmd = cmd + " --annotation_type {} {} {}".format(
                'rmats', 'rmats', 'eric',
            )
            cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
            # cmd = cmd + " --unflip"
            cmds.append(cmd)

            # Build the json dict
            rep = 'rep1' if r == r1 else 'rep2'

            metadata[uid][rep]['annotations'] = list()
            metadata[uid][rep]['annotation_type'] = list()

            metadata[uid][rep]['cell'] = cell
            metadata[uid][rep]['rbp'] = rbp
            metadata[uid][rep]['date'] = current_date
            metadata[uid][rep]['ip_bam'] = {'class':'File', 'path':r}
            # metadata[uid][rep]['ip_pos_bw'] = r.replace('.bam','.norm.pos.bw') # prog decides this for us.
            # metadata[uid][rep]['ip_neg_bw'] = r.replace('.bam','.norm.neg.bw')

            metadata[uid][rep]['input_bam'] = {'class':'File', 'path':i}
            # metadata[uid][rep]['input_pos_bw'] = i.replace('.bam','.norm.pos.bw')
            # metadata[uid][rep]['input_neg_bw'] = i.replace('.bam','.norm.neg.bw')

            metadata[uid][rep]['chrom_sizes'] = {'class':'File', 'path':chrom_sizes}
            metadata[uid][rep]['unflip'] = False
            metadata[uid][rep]['event'] = 'se'
            metadata[uid][rep]['exon_offset'] = 50
            metadata[uid][rep]['intron_offset'] = 300

            metadata[uid][rep]['annotations'].append({'class':'File', 'path':positive})
            metadata[uid][rep]['annotations'].append({'class':'File', 'path':negative})
            metadata[uid][rep]['annotations'].append({'class':'File', 'path':background})

            metadata[uid][rep]['annotation_type'].append('rmats')
            metadata[uid][rep]['annotation_type'].append('rmats')
            metadata[uid][rep]['annotation_type'].append('eric')

            with open(
                os.path.join(
                    bash_scripts_dir,"{}.{}.{}.{}.{}.{}.json".format(
                        uid, rep, rbp, cell, current_date, event
                    )
                ),
                'w'
            ) as f:
                f.write('#!/usr/bin/env make_rbp_map.cwl\n')
                json.dump(metadata[uid][rep], f)
Submitter(
    cmds, 
    "SE_NR_{}".format(img_extension), 
    sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/SE_NR_{}.sh'.format(img_extension),
    submit=True,
    array=True,
    walltime='2:00:00'
)
print("uIDs for which we don't have splicing data for: {}".format(
        len(no_rnaseq))
     )
print("uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(
        len(no_rnaseq_yet))
     )"""

# Plot SVGs (using all backgrounds this time)

In [None]:
is_5p = True
img_extension = 'svg'
pos_splicing_suffix = '.positive.nr.txt'
neg_splicing_suffix = '.negative.nr.txt'


annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/'
background_control_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/'
bash_scripts_dir = '/home/bay001/projects/maps_20160420/scripts/bash_scripts/'

if is_5p == True:
    output_dir = '/projects/ps-yeolab3/bay001/maps/current/se_nr_5p/'
    clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.5p.txt'
    prefix = '5p'
else:
    output_dir = '/projects/ps-yeolab3/bay001/maps/current/se_nr/'
    clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt'
    prefix = 'wholeread'

k562_background_ce = os.path.join(background_control_dir, 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.strict_CE_all_20170401')
k562_background_nse_all = os.path.join(background_control_dir, 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEall_0.5_20170401')
k562_background_nse_inc = os.path.join(background_control_dir, 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEincl_0.5_20170401')
k562_background_nse_exc = os.path.join(background_control_dir, 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEexcl_0.5_20170401')

hepg2_background_ce = os.path.join(background_control_dir, 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.strict_CE_all_20170401')
hepg2_background_nse_all = os.path.join(background_control_dir, 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEall_0.5_20170401')
hepg2_background_nse_inc = os.path.join(background_control_dir, 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEincl_0.5_20170401')
hepg2_background_nse_exc = os.path.join(background_control_dir, 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEexcl_0.5_20170401')

event = 'se'
clip_df = pd.read_table(clip_manifest)

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet (or there were no significant splice events)
cmds = []

for uid in clip_df['uID']:
    
    metadata = defaultdict()
    metadata[uid] = defaultdict()
    metadata[uid]['rep1'] = defaultdict()
    metadata[uid]['rep2'] = defaultdict()
    
    
    r1, r2, i, rbp, cell = get_clip_file_from_uid(clip_df, uid)
    
    
    if cell == 'K562':
        background1 = k562_background_ce
        background2 = k562_background_nse_all
        background3 = k562_background_nse_inc
        background4 = k562_background_nse_exc
    elif cell == 'HepG2':
        background1 = hepg2_background_ce
        background2 = hepg2_background_nse_all
        background3 = hepg2_background_nse_inc
        background4 = hepg2_background_nse_exc
    else:
        print(cell)
    
    splicing_prefix = get_rnaseq_splicing_prefix_from_rbpname(
        rnaseq_manifests, rbp, cell
    )
    if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
        no_rnaseq.append(uid)
    else:
        positive, negative = get_annotations_from_splicing_prefix(
            annotation_dir, splicing_prefix, 
            pos_splicing_suffix, neg_splicing_suffix
        )
        if(positive == None or negative == None):
            no_rnaseq_yet.append(uid)
        else:
            if not (rbp in positive and rbp in negative):
                print(
                    'warning, these dont match: {}, {}, {}'.format(
                        rbp, 
                        os.path.basename(positive),
                        os.path.basename(negative)
                    )
                )
            pos_prefix = os.path.basename(positive).split('-')[0]
            neg_prefix = os.path.basename(negative).split('-')[0]
            if not (pos_prefix in rbp and neg_prefix in rbp):
                print(
                    'warning, these dont match: {}, {}, {}'.format(
                        rbp, 
                        os.path.basename(positive),
                        os.path.basename(negative)
                    )
                )
            for r in [r1, r2]:
                name = os.path.basename(r).replace('.bam','.{}'.format(img_extension))
                output_filename = os.path.join(
                    output_dir,
                    name
                )

                # Build the cmd line
                cmd = "python " + density_runner
                cmd = cmd + " --event {}".format(event)
                cmd = cmd + " --ipbam {}".format(r)
                cmd = cmd + " --inputbam {}".format(i)
                cmd = cmd + " --output {}".format(output_filename)
                cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                    positive, negative, background1, background2, background3, background4
                )
                cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                    'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric' 
                )
                cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                # if is_5p == True:
                #     cmd = cmd + " --unflip"
                
                cmds.append(cmd)

                # Build the json dict
                rep = 'rep1' if r == r1 else 'rep2'

                metadata[uid][rep]['annotations'] = list()
                metadata[uid][rep]['annotation_type'] = list()

                metadata[uid][rep]['cell'] = cell
                metadata[uid][rep]['rbp'] = rbp
                metadata[uid][rep]['date'] = current_date
                metadata[uid][rep]['ip_bam'] = {'class':'File', 'path':r}
                # metadata[uid][rep]['ip_pos_bw'] = r.replace('.bam','.norm.pos.bw') # prog decides this for us.
                # metadata[uid][rep]['ip_neg_bw'] = r.replace('.bam','.norm.neg.bw')

                metadata[uid][rep]['input_bam'] = {'class':'File', 'path':i}
                # metadata[uid][rep]['input_pos_bw'] = i.replace('.bam','.norm.pos.bw')
                # metadata[uid][rep]['input_neg_bw'] = i.replace('.bam','.norm.neg.bw')

                metadata[uid][rep]['chrom_sizes'] = {'class':'File', 'path':chrom_sizes}
                metadata[uid][rep]['unflip'] = False
                metadata[uid][rep]['event'] = 'se'
                metadata[uid][rep]['exon_offset'] = 50
                metadata[uid][rep]['intron_offset'] = 300

                metadata[uid][rep]['annotations'].append({'class':'File', 'path':background1})
                metadata[uid][rep]['annotations'].append({'class':'File', 'path':background2})
                metadata[uid][rep]['annotations'].append({'class':'File', 'path':background3})
                metadata[uid][rep]['annotations'].append({'class':'File', 'path':positive})
                metadata[uid][rep]['annotations'].append({'class':'File', 'path':negative})

                metadata[uid][rep]['annotation_type'].append('eric')
                metadata[uid][rep]['annotation_type'].append('eric')
                metadata[uid][rep]['annotation_type'].append('eric')
                metadata[uid][rep]['annotation_type'].append('rmats')
                metadata[uid][rep]['annotation_type'].append('rmats')


                with open(
                    os.path.join(
                        bash_scripts_dir,"{}.{}.{}.{}.{}.{}.json".format(
                            uid, rep, rbp, cell, current_date, event
                        )
                    ),
                    'w'
                ) as f:
                    f.write('#!/usr/bin/env make_rbp_map.cwl\n')
                    json.dump(metadata[uid][rep], f)
Submitter(
    cmds, 
    "{}-SE_NR_{}".format(prefix, img_extension), 
    sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}-SE_NR_{}.sh'.format(prefix, img_extension),
    submit=True,
    array=True,
    walltime='3:00:00'
)
print("uIDs for which we don't have splicing data for: {}".format(
        len(no_rnaseq))
     )
print("uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(
        len(no_rnaseq_yet))
     )

In [None]:
is_5p = False
img_extension = 'png'
pos_splicing_suffix = '-included-upon-knockdown'
neg_splicing_suffix = '-excluded-upon-knockdown'


annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se_renamed/'
background_control_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se_renamed/'

bash_scripts_dir = '/home/bay001/projects/maps_20160420/scripts/bash_scripts/{}'.format(current_date)
! mkdir $bash_scripts_dir
if is_5p == True:
    output_dir = '/projects/ps-yeolab3/bay001/maps/current/se_nr_5p/'
    clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.5p.txt'
    prefix = '5p'
else:
    output_dir = '/projects/ps-yeolab3/bay001/maps/current/se_nr/'
    clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt'
    prefix = 'wholeread'

k562_background_ce = os.path.join(background_control_dir, 'K562_constitutive_exons')
k562_background_nse_all = os.path.join(background_control_dir, 'K562_native_cassette_exons')
k562_background_nse_inc = os.path.join(background_control_dir, 'K562_natively_included_exons')
k562_background_nse_exc = os.path.join(background_control_dir, 'K562_natively_excluded_exons')

hepg2_background_ce = os.path.join(background_control_dir, 'HepG2_constitutive_exons')
hepg2_background_nse_all = os.path.join(background_control_dir, 'HepG2_native_cassette_exons')
hepg2_background_nse_inc = os.path.join(background_control_dir, 'HepG2_natively_included_exons')
hepg2_background_nse_exc = os.path.join(background_control_dir, 'HepG2_natively_excluded_exons')

event = 'se'
clip_df = pd.read_table(clip_manifest)

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet (or there were no significant splice events)
cmds = []

for uid in clip_df['uID']:
    
    metadata = defaultdict()
    metadata[uid] = defaultdict()
    metadata[uid]['rep1'] = defaultdict()
    metadata[uid]['rep2'] = defaultdict()
    
    
    r1, r2, i, rbp, cell = get_clip_file_from_uid(clip_df, uid)
    
    
    if cell == 'K562':
        background1 = k562_background_ce
        background2 = k562_background_nse_all
        background3 = k562_background_nse_inc
        background4 = k562_background_nse_exc
    elif cell == 'HepG2':
        background1 = hepg2_background_ce
        background2 = hepg2_background_nse_all
        background3 = hepg2_background_nse_inc
        background4 = hepg2_background_nse_exc
    else:
        print(cell)
    
    splicing_prefix = get_rnaseq_splicing_prefix_from_rbpname(
        rnaseq_manifests, rbp, cell
    )
    if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
        no_rnaseq.append(uid)
    else:
        positive, negative = get_annotations_from_splicing_prefix(
            annotation_dir, splicing_prefix, 
            pos_splicing_suffix, neg_splicing_suffix
        )
        if(positive == None or negative == None):
            no_rnaseq_yet.append(uid)
        else:
            if not (rbp in positive and rbp in negative):
                print(
                    'warning, these dont match: {}, {}, {}'.format(
                        rbp, 
                        os.path.basename(positive),
                        os.path.basename(negative)
                    )
                )
            pos_prefix = os.path.basename(positive).split('-')[0]
            neg_prefix = os.path.basename(negative).split('-')[0]
            if not (pos_prefix in rbp and neg_prefix in rbp):
                print(
                    'warning, these dont match: {}, {}, {}'.format(
                        rbp, 
                        os.path.basename(positive),
                        os.path.basename(negative)
                    )
                )
            for r in [r1, r2]:
                name = os.path.basename(r).replace('.bam','.{}'.format(img_extension))
                output_filename = os.path.join(
                    output_dir,
                    name
                )

                # Build the cmd line
                cmd = "python " + density_runner
                cmd = cmd + " --event {}".format(event)
                cmd = cmd + " --ipbam {}".format(r)
                cmd = cmd + " --inputbam {}".format(i)
                cmd = cmd + " --output {}".format(output_filename)
                cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                    positive, negative, background1, background2, background3, background4
                )
                cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                    'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric' 
                )
                cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                # if is_5p == True:
                #     cmd = cmd + " --unflip"
                
                cmds.append(cmd)

                # Build the json dict
                rep = 'rep1' if r == r1 else 'rep2'

                metadata[uid][rep]['annotations'] = list()
                metadata[uid][rep]['annotation_type'] = list()

                metadata[uid][rep]['cell'] = cell
                metadata[uid][rep]['rbp'] = rbp
                metadata[uid][rep]['date'] = current_date
                metadata[uid][rep]['ip_bam'] = {'class':'File', 'path':r}
                # metadata[uid][rep]['ip_pos_bw'] = r.replace('.bam','.norm.pos.bw') # prog decides this for us.
                # metadata[uid][rep]['ip_neg_bw'] = r.replace('.bam','.norm.neg.bw')

                metadata[uid][rep]['input_bam'] = {'class':'File', 'path':i}
                # metadata[uid][rep]['input_pos_bw'] = i.replace('.bam','.norm.pos.bw')
                # metadata[uid][rep]['input_neg_bw'] = i.replace('.bam','.norm.neg.bw')

                metadata[uid][rep]['chrom_sizes'] = {'class':'File', 'path':chrom_sizes}
                metadata[uid][rep]['unflip'] = False
                metadata[uid][rep]['event'] = 'se'
                metadata[uid][rep]['exon_offset'] = 50
                metadata[uid][rep]['intron_offset'] = 300

                metadata[uid][rep]['annotations'].append({'class':'File', 'path':background1})
                metadata[uid][rep]['annotations'].append({'class':'File', 'path':background2})
                metadata[uid][rep]['annotations'].append({'class':'File', 'path':background3})
                metadata[uid][rep]['annotations'].append({'class':'File', 'path':positive})
                metadata[uid][rep]['annotations'].append({'class':'File', 'path':negative})

                metadata[uid][rep]['annotation_type'].append('eric')
                metadata[uid][rep]['annotation_type'].append('eric')
                metadata[uid][rep]['annotation_type'].append('eric')
                metadata[uid][rep]['annotation_type'].append('rmats')
                metadata[uid][rep]['annotation_type'].append('rmats')


                with open(
                    os.path.join(
                        bash_scripts_dir,"{}.{}.{}.{}.{}.{}.json".format(
                            uid, rep, rbp, cell, current_date, event
                        )
                    ),
                    'w'
                ) as f:
                    f.write('#!/usr/bin/env make_rbp_map.cwl\n')
                    json.dump(metadata[uid][rep], f)
Submitter(
    cmds, 
    "{}-SE_NR_{}".format(prefix, img_extension), 
    sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}/{}-SE_NR_{}.sh'.format(current_date, prefix, img_extension),
    submit=False,
    array=True,
    walltime='3:00:00'
)
print("uIDs for which we don't have splicing data for: {}".format(
        len(no_rnaseq))
     )
print("uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(
        len(no_rnaseq_yet))
     )

In [None]:
# output the csv to 
pd.Series(no_rnaseq_yet).to_csv('/home/bay001/projects/encode/analysis/tests/missing_rnaseq/se-missing.txt',index=None)
pd.Series(no_rnaseq).to_csv('/home/bay001/projects/encode/analysis/tests/missing_rnaseq/se-nonexistant.txt',index=None)
no_rnaseq_yet

# Plot the 5p SE densities

In [None]:
"""pos_splicing_suffix = '.positive.nr.txt'
neg_splicing_suffix = '.negative.nr.txt'

clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.5p.txt'
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/'
background_control_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/'
output_dir = '/projects/ps-yeolab3/bay001/maps/current/se_nr_5p/'
bash_scripts_dir = '/home/bay001/projects/maps_20160420/scripts/bash_scripts/'

k562_background_ce = os.path.join(background_control_dir, 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.strict_CE_all_20170401')
k562_background_nse_all = os.path.join(background_control_dir, 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEall_0.5_20170401')
k562_background_nse_inc = os.path.join(background_control_dir, 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEincl_0.5_20170401')
k562_background_nse_exc = os.path.join(background_control_dir, 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEexcl_0.5_20170401')

hepg2_background_ce = os.path.join(background_control_dir, 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.strict_CE_all_20170401')
hepg2_background_nse_all = os.path.join(background_control_dir, 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEall_0.5_20170401')
hepg2_background_nse_inc = os.path.join(background_control_dir, 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEincl_0.5_20170401')
hepg2_background_nse_exc = os.path.join(background_control_dir, 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nSEexcl_0.5_20170401')

img_ext = 'png'
event = 'se'

no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
cmds = []

clip_df = pd.read_table(clip_manifest)

for uid in clip_df['uID']:
    
    metadata = defaultdict()
    metadata[uid] = defaultdict()
    metadata[uid]['rep1'] = defaultdict()
    metadata[uid]['rep2'] = defaultdict()
    
    
    r1, r2, i, rbp, cell = get_clip_file_from_uid(clip_df, uid)
    
    if cell == 'K562':
        background = k562_background_ce
    elif cell == 'HepG2':
        background = hepg2_background_ce
    else:
        print(cell)
    splicing_prefix = get_rnaseq_splicing_prefix_from_rbpname(
        rnaseq_manifests, rbp, cell
    )
    if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
        no_rnaseq.append(uid)
    else:
        positive, negative = get_annotations_from_splicing_prefix(
            annotation_dir, splicing_prefix, 
            pos_splicing_suffix, neg_splicing_suffix
        )
        if(positive == None or negative == None):
            no_rnaseq_yet.append(uid)
        else:
            for r in [r1, r2]:
                name = os.path.basename(r).replace('.bam','.{}'.format(img_ext))
                output_filename = os.path.join(
                    output_dir,
                    name
                )

                # Build the cmd line
                cmd = "python " + density_runner
                cmd = cmd + " --event {}".format(event)
                cmd = cmd + " --ipbam {}".format(r)
                cmd = cmd + " --inputbam {}".format(i)
                cmd = cmd + " --output {}".format(output_filename)
                cmd = cmd + " --annotations {} {} {}".format(
                    positive, negative, background
                )
                cmd = cmd + " --annotation_type {} {} {}".format(
                    'rmats', 'rmats', 'eric',
                )
                cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                cmd = cmd + " --unflip"
                cmds.append(cmd)

                # Build the json dict
                rep = 'rep1' if r == r1 else 'rep2'

                metadata[uid][rep]['annotations'] = list()
                metadata[uid][rep]['annotation_type'] = list()

                metadata[uid][rep]['cell'] = cell
                metadata[uid][rep]['rbp'] = rbp
                metadata[uid][rep]['date'] = current_date
                metadata[uid][rep]['ip_bam'] = {'class':'File', 'path':r}

                metadata[uid][rep]['input_bam'] = {'class':'File', 'path':i}

                metadata[uid][rep]['chrom_sizes'] = {'class':'File', 'path':chrom_sizes}
                metadata[uid][rep]['unflip'] = False
                metadata[uid][rep]['event'] = 'se'
                metadata[uid][rep]['exon_offset'] = 50
                metadata[uid][rep]['intron_offset'] = 300

                metadata[uid][rep]['annotations'].append({'class':'File', 'path':positive})
                metadata[uid][rep]['annotations'].append({'class':'File', 'path':negative})
                metadata[uid][rep]['annotations'].append({'class':'File', 'path':background})

                metadata[uid][rep]['annotation_type'].append('rmats')
                metadata[uid][rep]['annotation_type'].append('rmats')
                metadata[uid][rep]['annotation_type'].append('eric')

                with open(
                    os.path.join(
                        bash_scripts_dir,"{}.{}.{}.{}.{}.{}.json".format(
                            uid, rep, rbp, cell, current_date, event
                        )
                    ),
                    'w'
                ) as f:
                    f.write('#!/usr/bin/env make_rbp_map.cwl\n')
                    json.dump(metadata[uid][rep], f)
Submitter(
    cmds, 
    "SE_NR_5p_{}".format(img_extension), 
    sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/SE_NR_5p_{}.sh'.format(img_extension),
    submit=False,
    array=True,
    walltime='2:00:00'
)
print("uIDs for which we don't have splicing data for: {}".format(
        len(no_rnaseq))
     )
print("uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(
        len(no_rnaseq_yet))
     )"""

# Plot Peaks

In [5]:
peak_dir = '/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_CLIPperv2_20161120/'
miso_annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/as_miso_renamed'
program_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_peak.py'
clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt'

outpath = '/projects/ps-yeolab3/bay001/maps/current/se_nr_peak/'
img_ext = 'png'

In [6]:
event = 'se'
clip_df = pd.read_table(clip_manifest)


In [7]:
all_peaks = glob.glob(os.path.join(peak_dir,'*.compressed.bed'))
no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
cmds = []

for peak in all_peaks:
    outfile = os.path.join(
        outpath,
        os.path.basename(peak).replace('.compressed.bed','.compressed.{}'.format(img_ext))
    )
    uid, rep, other = split_uid_and_rep(os.path.basename(peak).split('.')[0])
    _, _, _, rbp, cell = get_clip_file_from_uid(
        clip_df, uid
    )
    splicing_prefix = get_rnaseq_splicing_prefix_from_rbpname(
        rnaseq_manifests, rbp, cell
    )
    
    if cell == 'HepG2':
        ce_background = os.path.join(miso_annotation_dir,'HepG2-constitutive-exons.miso')
        nc_background = os.path.join(miso_annotation_dir,'HepG2-native-cassette-exons.miso')
        ni_background = os.path.join(miso_annotation_dir,'HepG2-native-included-exons.miso')
        ne_background = os.path.join(miso_annotation_dir,'HepG2-native-excluded-exons.miso')
    elif cell == 'K562':
        ce_background = os.path.join(miso_annotation_dir,'K562-constitutive-exons.miso')
        nc_background = os.path.join(miso_annotation_dir,'K562-native-cassette-exons.miso')
        ni_background = os.path.join(miso_annotation_dir,'K562-native-included-exons.miso')
        ne_background = os.path.join(miso_annotation_dir,'K562-native-excluded-exons.miso')
    else:
        print('error')
        
    if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
        no_rnaseq.append(uid)
    else:
        positive, negative = get_annotations_from_splicing_prefix(
            miso_annotation_dir, splicing_prefix, 
            
        )
    if(positive == None or negative == None):
        no_rnaseq_yet.append(uid)
    elif not os.path.exists(outfile):
        cmd = "python {} -i {} -o {} -m {} {} {} {} {} {}".format(
            program_runner,
            peak,
            outfile,
            positive,
            negative,
            ce_background,
            nc_background,
            ni_background,
            ne_background
        )
        cmds.append(cmd)

Submitter(
    cmds, 
    "SE_PEAK_PNGS", 
    sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/SE_PEAK_PNGS2.sh',
    submit=True,
    array=True,
    walltime='2:00:00'
)
print("uIDs for which we don't have splicing data for: {}".format(
        len(no_rnaseq))
     )
print("uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(
        len(no_rnaseq_yet))
     )

8355403[].tscc-mgr.local
uIDs for which we don't have splicing data for: 46
uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: 14


running 46 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/SE_PEAK_PNGS2.sh.
Submitted script to queue home.
 Job ID: 8355403


# Plot the rest of the splice events
- The first cell plots all the background events + positive (included) and negative (excluded) events separately.
- The second cell plots a single background event + positive (included) + negative (excluded) events

In [None]:
"""density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py'
clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt'
clip_df = pd.read_table(clip_manifest)

events = {
    # 'mxe':'/projects/ps-yeolab3/bay001/maps/current_annotations/mxe/',
    'a3ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss/',
    'a5ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a5ss/',
    # 'ri':'/projects/ps-yeolab3/bay001/maps/current_annotations/ri/'
}


img_extension = 'png'
out_base = '/projects/ps-yeolab3/bay001/maps/current/'
pos_splicing_suffix = 'positive.nr.txt'
neg_splicing_suffix = 'negative.nr.txt'

for event, annotation_dir in events.iteritems():
    no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
    no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
    cmds = []
    output_dir = os.path.join(out_base, '{}_nr'.format(event))
    
    for uid in clip_df['uID']:
        r1, r2, i, rbp, cell = get_clip_file_from_uid(clip_df, uid)

        if cell == 'K562':
            background_basic = os.path.join(annotation_dir, 'K562-background-basic.txt')
            background_center = os.path.join(annotation_dir, 'K562-background-center.txt')
            background_extension = os.path.join(annotation_dir, 'K562-background-extension.txt')
        elif cell == 'HepG2':
            background_basic = os.path.join(annotation_dir, 'HepG2-background-basic.txt')
            background_center = os.path.join(annotation_dir, 'HepG2-background-center.txt')
            background_extension = os.path.join(annotation_dir, 'HepG2-background-extension.txt')
        else:
            print(cell)
            
        splicing_prefix = get_rnaseq_splicing_prefix_from_rbpname(rnaseq_manifests, rbp, cell)
        if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
            no_rnaseq.append(uid)
        else:
            positive, negative = get_annotations_from_splicing_prefix(
                annotation_dir, 
                splicing_prefix,
                pos_splicing_suffix=pos_splicing_suffix,
                neg_splicing_suffix=neg_splicing_suffix
            )

        for r in [r1, r2]:
            name = os.path.basename(r).replace('.bam','.{}'.format(img_extension))
            output_filename = os.path.join(
                output_dir,
                name
            )
            cmd = "python " + density_runner
            cmd = cmd + " --event {}".format(event)
            cmd = cmd + " --ipbam {}".format(r)
            cmd = cmd + " --inputbam {}".format(i)
            cmd = cmd + " --output {}".format(output_filename)
            if positive is not None and negative is not None:
                cmd = cmd + " --annotations {} {} {}".format(
                    positive, negative, background_basic
                )
                cmd = cmd + " --annotation_type {} {} {}".format(
                    'rmats', 'rmats', 'eric'
                )
            cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
            cmds.append(cmd)
    Submitter(
        cmds, 
        "{}_NR_{}".format(event, img_extension), 
        sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}_NR_{}.sh'.format(
            event, img_extension
        ),
        submit=True,
        array=True,
        walltime='4:00:00'
    )
# print("{} {} uIDs for which we don't have splicing data for: {}".format(splice_condition, event, len(no_rnaseq)))
# print("{} {} uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(splice_condition, event, len(no_rnaseq_yet)))"""

# Plot the A3SS/A5SS splice events (positive and negative and controls) all together

In [None]:
a5ss_k562_all = 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA5SSall_0.5_20170401'
a5ss_k562_basic = 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA5SSbasic_0.5_20170401'
a5ss_k562_center = 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA5SScenter_0.5_20170401'
a5ss_k562_extension = 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA5SSextension_0.5_20170401'

a3ss_k562_all = 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA3SSall_0.5_20170401'
a3ss_k562_basic = 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA3SSbasic_0.5_20170401'
a3ss_k562_center = 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA3SScenter_0.5_20170401'
a3ss_k562_extension = 'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA3SSextension_0.5_20170401'

a5ss_hepg2_all = 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA5SSall_0.5_20170401'
a5ss_hepg2_basic = 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA5SSbasic_0.5_20170401'
a5ss_hepg2_center = 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA5SScenter_0.5_20170401'
a5ss_hepg2_extension = 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA5SSextension_0.5_20170401'

a3ss_hepg2_all = 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA3SSall_0.5_20170401'
a3ss_hepg2_basic = 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA3SSbasic_0.5_20170401'
a3ss_hepg2_center = 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA3SScenter_0.5_20170401'
a3ss_hepg2_extension = 'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.nA3SSextension_0.5_20170401'


In [None]:
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py'
clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt'
clip_df = pd.read_table(clip_manifest)

events = {
    'a3ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss/',
    'a5ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a5ss/',
}


img_extension = 'svg'
out_base = '/projects/ps-yeolab3/bay001/maps/current/'
pos_splicing_suffix = 'positive.nr.txt'
neg_splicing_suffix = 'negative.nr.txt'

for event, annotation_dir in events.iteritems():
    no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
    no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
    cmds = []
    output_dir = os.path.join(out_base, '{}_nr'.format(event))
    for uid in clip_df['uID']:
        r1, r2, i, rbp, cell = get_clip_file_from_uid(clip_df, uid)
        
        if cell == 'K562':
            if event == 'a3ss':
                background_all = os.path.join(annotation_dir, a3ss_k562_all)
                background_basic = os.path.join(annotation_dir, a3ss_k562_basic)
                background_center = os.path.join(annotation_dir, a3ss_k562_center)
                background_extension = os.path.join(annotation_dir, a3ss_k562_extension)
            elif event == 'a5ss':
                background_all = os.path.join(annotation_dir, a5ss_k562_all)
                background_basic = os.path.join(annotation_dir, a5ss_k562_basic)
                background_center = os.path.join(annotation_dir, a5ss_k562_center)
                background_extension = os.path.join(annotation_dir, a5ss_k562_extension)
            else:
                print(event)
        elif cell == 'HepG2':
            if event == 'a3ss':
                background_all = os.path.join(annotation_dir, a3ss_hepg2_all)
                background_basic = os.path.join(annotation_dir, a3ss_hepg2_basic)
                background_center = os.path.join(annotation_dir, a3ss_hepg2_center)
                background_extension = os.path.join(annotation_dir, a3ss_hepg2_extension)
            elif event == 'a5ss':
                background_all = os.path.join(annotation_dir, a5ss_hepg2_all)
                background_basic = os.path.join(annotation_dir, a5ss_hepg2_basic)
                background_center = os.path.join(annotation_dir, a5ss_hepg2_center)
                background_extension = os.path.join(annotation_dir, a5ss_hepg2_extension)
            else:
                print(event)
        else:
            print(cell)
            
        splicing_prefix = get_rnaseq_splicing_prefix_from_rbpname(rnaseq_manifests, rbp, cell)
        if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
            no_rnaseq.append(uid)
        else:
            positive, negative = get_annotations_from_splicing_prefix(
                annotation_dir, 
                splicing_prefix,
                pos_splicing_suffix=pos_splicing_suffix,
                neg_splicing_suffix=neg_splicing_suffix
            )
            if(positive == None or negative == None):
                no_rnaseq_yet.append(uid)
            else:
                if not (rbp in positive and rbp in negative):
                    print(
                        'warning, these dont match: {}, {}, {}'.format(
                            rbp, 
                            os.path.basename(positive),
                            os.path.basename(negative)
                        )
                    )
                pos_prefix = os.path.basename(positive).split('-')[0]
                neg_prefix = os.path.basename(negative).split('-')[0]
                if not (pos_prefix in rbp and neg_prefix in rbp):
                    print(
                        'warning, these dont match: {}, {}, {}'.format(
                            rbp, 
                            os.path.basename(positive),
                            os.path.basename(negative)
                        )
                    )
                for r in [r1, r2]:
                    name = os.path.basename(r).replace('.bam','.{}'.format(img_extension))
                    output_filename = os.path.join(
                        output_dir,
                        name
                    )
                    cmd = "python " + density_runner
                    cmd = cmd + " --event {}".format(event)
                    cmd = cmd + " --ipbam {}".format(r)
                    cmd = cmd + " --inputbam {}".format(i)
                    cmd = cmd + " --output {}".format(output_filename)
                    if positive is not None and negative is not None:
                        cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                            positive, negative, background_all, background_basic, background_center, background_extension
                        )
                        cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                            'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric'
                        )
                    cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                    cmds.append(cmd)
            # if(uid == '228'):
            #     print(r1, r2, i, rbp, cell, annotation_dir, splicing_prefix, pos_splicing_suffix, neg_splicing_suffix)
    
    Submitter(
        cmds, 
        "{}_NR_{}".format(event, img_extension), 
        sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}_NR_{}.sh'.format(
            event, img_extension
        ),
        submit=True,
        array=True,
        walltime='4:00:00'
    )
    # output the csv to missing folder 
    pd.Series(no_rnaseq_yet).to_csv('/home/bay001/projects/encode/analysis/tests/missing_rnaseq/{}-missing.txt'.format(event),index=None)
    pd.Series(no_rnaseq).to_csv('/home/bay001/projects/encode/analysis/tests/missing_rnaseq/{}-nonexistant.txt'.format(event),index=None)

# print("{} {} uIDs for which we don't have splicing data for: {}".format(splice_condition, event, len(no_rnaseq)))
# print("{} {} uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(splice_condition, event, len(no_rnaseq_yet)))

In [None]:
a5ss_k562_all = 'K562-all-native-a5ss-events'
a5ss_k562_basic = 'K562-shorter-isoform-in-majority-of-controls'
a5ss_k562_center = 'K562-mixed-psi-isoform-in-majority-of-controls'
a5ss_k562_extension = 'K562-longer-isoform-in-majority-of-controls'

a3ss_k562_all = 'K562-all-native-a3ss-events'
a3ss_k562_basic = 'K562-shorter-isoform-in-majority-of-controls'
a3ss_k562_center = 'K562-mixed-psi-isoform-in-majority-of-controls'
a3ss_k562_extension = 'K562-longer-isoform-in-majority-of-controls'

a5ss_hepg2_all = 'HepG2-all-native-a5ss-events'
a5ss_hepg2_basic = 'HepG2-shorter-isoform-in-majority-of-controls'
a5ss_hepg2_center = 'HepG2-mixed-psi-isoform-in-majority-of-controls'
a5ss_hepg2_extension = 'HepG2-longer-isoform-in-majority-of-controls'

a3ss_hepg2_all = 'HepG2-all-native-a3ss-events'
a3ss_hepg2_basic = 'HepG2-shorter-isoform-in-majority-of-controls'
a3ss_hepg2_center = 'HepG2-mixed-psi-isoform-in-majority-of-controls'
a3ss_hepg2_extension = 'HepG2-longer-isoform-in-majority-of-controls'

In [None]:
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py'
clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt'
clip_df = pd.read_table(clip_manifest)

events = {
    'a3ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss_renamed/',
    'a5ss':'/projects/ps-yeolab3/bay001/maps/current_annotations/a5ss_renamed/',
}


img_extension = 'png'
out_base = '/projects/ps-yeolab3/bay001/maps/current/'
pos_splicing_suffix = '-longer-isoform-included-upon-knockdown'
neg_splicing_suffix = '-shorter-isoform-included-upon-knockdown'

for event, annotation_dir in events.iteritems():
    no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
    no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
    cmds = []
    output_dir = os.path.join(out_base, '{}_nr'.format(event))
    for uid in clip_df['uID']:
        r1, r2, i, rbp, cell = get_clip_file_from_uid(clip_df, uid)
        
        if cell == 'K562':
            if event == 'a3ss':
                background_all = os.path.join(annotation_dir, a3ss_k562_all)
                background_basic = os.path.join(annotation_dir, a3ss_k562_basic)
                background_center = os.path.join(annotation_dir, a3ss_k562_center)
                background_extension = os.path.join(annotation_dir, a3ss_k562_extension)
            elif event == 'a5ss':
                background_all = os.path.join(annotation_dir, a5ss_k562_all)
                background_basic = os.path.join(annotation_dir, a5ss_k562_basic)
                background_center = os.path.join(annotation_dir, a5ss_k562_center)
                background_extension = os.path.join(annotation_dir, a5ss_k562_extension)
            else:
                print(event)
        elif cell == 'HepG2':
            if event == 'a3ss':
                background_all = os.path.join(annotation_dir, a3ss_hepg2_all)
                background_basic = os.path.join(annotation_dir, a3ss_hepg2_basic)
                background_center = os.path.join(annotation_dir, a3ss_hepg2_center)
                background_extension = os.path.join(annotation_dir, a3ss_hepg2_extension)
            elif event == 'a5ss':
                background_all = os.path.join(annotation_dir, a5ss_hepg2_all)
                background_basic = os.path.join(annotation_dir, a5ss_hepg2_basic)
                background_center = os.path.join(annotation_dir, a5ss_hepg2_center)
                background_extension = os.path.join(annotation_dir, a5ss_hepg2_extension)
            else:
                print(event)
        else:
            print(cell)
            
        splicing_prefix = get_rnaseq_splicing_prefix_from_rbpname(rnaseq_manifests, rbp, cell)
        if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
            no_rnaseq.append(uid)
        else:
            positive, negative = get_annotations_from_splicing_prefix(
                annotation_dir, 
                splicing_prefix,
                pos_splicing_suffix=pos_splicing_suffix,
                neg_splicing_suffix=neg_splicing_suffix
            )
            if(positive == None or negative == None):
                no_rnaseq_yet.append(uid)
            else:
                if not (rbp in positive and rbp in negative):
                    print(
                        'warning, these dont match: {}, {}, {}'.format(
                            rbp, 
                            os.path.basename(positive),
                            os.path.basename(negative)
                        )
                    )
                pos_prefix = os.path.basename(positive).split('-')[0]
                neg_prefix = os.path.basename(negative).split('-')[0]
                if not (pos_prefix in rbp and neg_prefix in rbp):
                    print(
                        'warning, these dont match: {}, {}, {}'.format(
                            rbp, 
                            os.path.basename(positive),
                            os.path.basename(negative)
                        )
                    )
                for r in [r1, r2]:
                    name = os.path.basename(r).replace('.bam','.{}'.format(img_extension))
                    output_filename = os.path.join(
                        output_dir,
                        name
                    )
                    cmd = "python " + density_runner
                    cmd = cmd + " --event {}".format(event)
                    cmd = cmd + " --ipbam {}".format(r)
                    cmd = cmd + " --inputbam {}".format(i)
                    cmd = cmd + " --output {}".format(output_filename)
                    if positive is not None and negative is not None:
                        cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                            positive, negative, background_all, background_basic, background_center, background_extension
                        )
                        cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                            'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric'
                        )
                    cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                    cmds.append(cmd)
            # if(uid == '228'):
            #     print(r1, r2, i, rbp, cell, annotation_dir, splicing_prefix, pos_splicing_suffix, neg_splicing_suffix)
    
    Submitter(
        cmds, 
        "{}_NR_{}".format(event, img_extension), 
        sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}_NR_{}.sh'.format(
            event, img_extension
        ),
        submit=False,
        array=True,
        walltime='4:00:00'
    )
    # output the csv to missing folder 
    pd.Series(no_rnaseq_yet).to_csv('/home/bay001/projects/encode/analysis/tests/missing_rnaseq/{}-missing.txt'.format(event),index=None)
    pd.Series(no_rnaseq).to_csv('/home/bay001/projects/encode/analysis/tests/missing_rnaseq/{}-nonexistant.txt'.format(event),index=None)

# print("{} {} uIDs for which we don't have splicing data for: {}".format(splice_condition, event, len(no_rnaseq)))
# print("{} {} uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(splice_condition, event, len(no_rnaseq_yet)))

# Plot RI

In [None]:
density_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py'

events = {
    'ri':'/projects/ps-yeolab3/bay001/maps/current_annotations/ri_renamed/'
}


img_extension = 'png'
out_base = '/projects/ps-yeolab3/bay001/maps/current/'
pos_splicing_suffix = '-included-upon-knockdown'
neg_splicing_suffix = '-excluded-upon-knockdown'

for event, annotation_dir in events.iteritems():
    no_rnaseq = [] # uIDs for which we don't have rna seq expt ids for
    no_rnaseq_yet = [] # uIDs for which we have an expt id, but haven't downloaded the data yet
    cmds = []
    output_dir = os.path.join(out_base, '{}_nr'.format(event))
    for uid in clip_df['uID']:
        r1, r2, i, rbp, cell = get_clip_file_from_uid(clip_df, uid)
        if cell == 'HepG2':
            background1 = os.path.join(annotation_dir, 'HepG2-constitutive-introns')
            background2 = os.path.join(annotation_dir, 'HepG2-all-retained-introns')
            background3 = os.path.join(annotation_dir, 'HepG2-greater-than-50-percent-spliced')
            background4 = os.path.join(annotation_dir, 'HepG2-greater-than-50-percent-retained')
        elif cell == 'K562':
            background1 = os.path.join(annotation_dir, 'K562-constitutive-introns')
            background2 = os.path.join(annotation_dir, 'K562-all-retained-introns')
            background3 = os.path.join(annotation_dir, 'K562-greater-than-50-percent-spliced')
            background4 = os.path.join(annotation_dir, 'K562-greater-than-50-percent-retained')
        else:
            print(cell)
        splicing_prefix = get_rnaseq_splicing_prefix_from_rbpname(rnaseq_manifests, rbp, cell)
        if(splicing_prefix == "NO_RNASEQ"): # we don't have an rna seq expt for this clip:
            no_rnaseq.append(uid)
        else:
            positive, negative = get_annotations_from_splicing_prefix(
                annotation_dir, 
                splicing_prefix,
                pos_splicing_suffix=pos_splicing_suffix,
                neg_splicing_suffix=neg_splicing_suffix
            )
            if positive is not None and negative is not None:
                if not (rbp in positive and rbp in negative):
                    print(
                        'warning, these dont match: {}, {}, {}'.format(
                            rbp, 
                            os.path.basename(positive),
                            os.path.basename(negative)
                        )
                    )
                pos_prefix = os.path.basename(positive).split('-')[0]
                neg_prefix = os.path.basename(negative).split('-')[0]
                if not (pos_prefix in rbp and neg_prefix in rbp):
                    print(
                        'warning, these dont match: {}, {}, {}'.format(
                            rbp, 
                            os.path.basename(positive),
                            os.path.basename(negative)
                        )
                    )
                for r in [r1, r2]:
                    name = os.path.basename(r).replace('.bam','.{}'.format(img_extension))
                    output_filename = os.path.join(
                        output_dir,
                        name
                    )
                    cmd = "python " + density_runner
                    cmd = cmd + " --event {}".format(event)
                    cmd = cmd + " --ipbam {}".format(r)
                    cmd = cmd + " --inputbam {}".format(i)
                    cmd = cmd + " --output {}".format(output_filename)
                    if positive is not None and negative is not None:
                        cmd = cmd + " --annotations {} {} {} {} {} {}".format(
                            positive, negative, background1, background2, background3, background4
                        )
                        cmd = cmd + " --annotation_type {} {} {} {} {} {}".format(
                            'rmats', 'rmats', 'eric', 'eric', 'eric', 'eric', 
                        )
                    cmd = cmd + " --chrom_sizes {}".format(chrom_sizes)
                    cmds.append(cmd)
    Submitter(
        cmds, 
        "{}_NR_{}".format(event, img_extension), 
        sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}_NR_{}.sh'.format(
            event, img_extension
        ),
        submit=False,
        array=True,
        walltime='4:00:00'
    )
print("{} uIDs for which we don't have splicing data for: {}".format(event, len(no_rnaseq)))
print("{} uIDs for which we have an RNASEQ ID, but we don't have the splicing data yet: {}".format(event, len(no_rnaseq_yet)))

In [None]:
o = open('/home/bay001/projects/encode/analysis/run_ri_quickly.sh','w')
for cmd in cmds:
    o.write(cmd + '\n')
    
o.close()

# Plot the IDR peaks

In [8]:
clip_manifest = '/home/bay001/projects/maps_20160420/permanent_data/ALLDATASETS_submittedonly.txt'
clip_df = pd.read_table(clip_manifest)
miso_annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/as_miso_renamed'
idr_peak_dir = '/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_FINALforpapers_20170325/IDR/'
idr_peak_outdir = '/projects/ps-yeolab3/bay001/maps/current/idr_peaks_bg'
program_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_peak.py'
all_peaks = glob.glob(os.path.join(idr_peak_dir,'*0102merged.bed'))

progress = tnrange(len(all_peaks))
cmds = []
for peak in all_peaks:
    outfile = os.path.join(
        idr_peak_outdir,
        os.path.basename(peak).replace('merged.bed','merged.png')
    )
    uid = os.path.basename(peak).split('.')[0]
    _, _, _, rbp, cell = get_clip_file_from_uid(clip_df, uid)
    splicing_prefix = get_rnaseq_splicing_prefix_from_rbpname(rnaseq_manifests, rbp, cell)
    # print(splicing_prefix)
    positive, negative = get_annotations_from_splicing_prefix(miso_annotation_dir, splicing_prefix)
    
    if positive is not None and negative is not None:
        if cell == 'HepG2':
            ce_background = os.path.join(miso_annotation_dir,'HepG2-constitutive-exons.miso')
            nc_background = os.path.join(miso_annotation_dir,'HepG2-native-cassette-exons.miso')
            ni_background = os.path.join(miso_annotation_dir,'HepG2-native-included-exons.miso')
            ne_background = os.path.join(miso_annotation_dir,'HepG2-native-excluded-exons.miso')
        elif cell == 'K562':
            ce_background = os.path.join(miso_annotation_dir,'K562-constitutive-exons.miso')
            nc_background = os.path.join(miso_annotation_dir,'K562-native-cassette-exons.miso')
            ni_background = os.path.join(miso_annotation_dir,'K562-native-included-exons.miso')
            ne_background = os.path.join(miso_annotation_dir,'K562-native-excluded-exons.miso')
        else:
            print('error')
        cmd = "python {} -i {} -o {} -m {} {} {} {} {} {}".format(
            program_runner,
            peak,
            outfile,
            positive,
            negative,
            ce_background,
            nc_background,
            ni_background,
            ne_background
        )
        # ! $cmd ####################################
        cmds.append(cmd)
    else:
        print(uid, rbp, cell)
    
    progress.update(1)
    

('246', u'AUH', u'K562')
('543', u'AGGF1', u'HepG2')
('531', u'RPS5', u'K562')
('417', u'POLR2G', u'K562')
('358', u'GNL3', u'K562')
('256', u'SLTM', u'K562')
('540', u'RPS3', u'HepG2')
('516', u'NOLC1', u'K562')
('553', u'NOLC1', u'HepG2')
('491', u'EXOSC5', u'K562')
('684', u'ZC3H11A', u'K562')
('415', u'TNRC6A', u'HepG2')
('332', u'RBM27', u'K562')
('444', u'LSM11', u'K562')
('570', u'ZNF622', u'K562')
('529', u'RPS24', u'K562')
('614', u'RPS11', u'K562')
('341', u'DGCR8', u'K562')
('484', u'POLR2G', u'HepG2')
('460', u'FASTKD2', u'K562')
('275', u'CSTF2T', u'K562')
('367', u'YWHAG', u'K562')
('383', u'DGCR8', u'HepG2')
('545', u'EIF3H', u'HepG2')
('241', u'DDX42', u'K562')
('390x4000', u'DDX20', u'HepG2')
('641', u'ZC3H11A', u'HepG2')
('679', u'RPL23A', u'HepG2')
('366', u'TNRC6A', u'K562')
('279', u'FAM120A', u'K562')


In [9]:
Submitter(
    cmds, 
    'idr_bg_peaks', 
    sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/IDR_PNGS.sh',
    submit=True,
    array=True,
    walltime='2:00:00',
    queue='home-scrm'
)

running 151 tasks as an array-job.
Wrote commands to /projects/ps-yeolab3/bay001/maps/bash_scripts/IDR_PNGS.sh.
Submitted script to queue home-scrm.
 Job ID: 8400286


8400286[].tscc-mgr.local


<qtools.submitter.Submitter at 0x2abc7e63db10>

          181/|/100%|| 181/181 [00:13<00:00, 51.20it/s]

# Plot CDS

In [None]:
peak_dir = '/home/elvannostrand/data/clip/CLIPseq_analysis/ENCODE_CLIPperv2_20161120/'
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/'
program_runner = '/home/bay001/projects/codebase/rbp-maps/maps/plot_density.py'
outpath = '/projects/ps-yeolab3/bay001/maps/current/'
chrom_sizes = '/projects/ps-yeolab/genomes/hg19/hg19.chrom.sizes'

In [None]:
events = ['all_txStart','all_txStop','all_cdsStart']

for event in events:
    subtract_cmds = []
    entropy_cmds = []
    rawip_cmds = []
    input_cmds = []
    
    output_dir = os.path.join(outpath, event)
    for uid in clip_df['uID']:
        r1, r2, i, rbp, cell = get_clip_file_from_uid(clip_df, uid)
        if cell == 'k562':
            annotation = os.path.join(annotation_dir, '{}_k562.bed'.format(event))
        else:
            annotation = os.path.join(annotation_dir, '{}_hepg2.bed'.format(event))

        for r in [r1, r2]:
            subtract_name = os.path.basename(r).replace('.bam','.norm_subtract.png')
            entropy_name = os.path.basename(r).replace('.bam','.norm_entropy.png')
            ip_name = os.path.basename(r).replace('.bam','.no_norm.png')
            inp_name = os.path.basename(r).replace('.bam','.input.png')
            
            output_filename_subtract = os.path.join(output_dir, subtract_name)
            output_filename_entropy = os.path.join(output_dir, entropy_name)
            output_filename_ip = os.path.join(output_dir, ip_name)
            output_filename_inp = os.path.join(output_dir, inp_name)
            
            cmd = "python " + program_runner
            cmd = cmd + " --event bed --ipbam {} --inputbam {} --annotations {} --annotation_type {} --intron_offset {} --exon_offset {} --scale --chrom_sizes {} ".format(
                r, i, annotation, 'bed', 0, 0, chrom_sizes
            )
            
            subtract_cmd = cmd + '--normalization_level 1 ' + '--output {}'.format(output_filename_subtract)
            entropy_cmd = cmd + '--normalization_level 2 ' + '--output {}'.format(output_filename_entropy)
            rawip_cmd = cmd + '--normalization_level 3 ' + '--output {}'.format(output_filename_ip)
            input_cmd = cmd + '--normalization_level 4 ' + '--output {}'.format(output_filename_inp)
            
            subtract_cmds.append(subtract_cmd)
            entropy_cmds.append(entropy_cmd)
            rawip_cmds.append(rawip_cmd)
            input_cmds.append(input_cmd)
            
    Submitter(
        subtract_cmds, 
        "{}_{}_PNGS".format(event, 'subtract'), 
        sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}_{}_PNGS.sh'.format(event, 'subtract'),
        submit=False,
        array=True,
        walltime='2:00:00',
        queue='condo'
    )
    Submitter(
        entropy_cmds, 
        "{}_{}_PNGS".format(event, 'entropy'), 
        sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}_{}_PNGS.sh'.format(event, 'entropy'),
        submit=False,
        array=True,
        walltime='2:00:00',
        queue='home-scrm'
    )
    Submitter(
        rawip_cmds, 
        "{}_{}_PNGS".format(event, 'ip'), 
        sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}_{}_PNGS.sh'.format(event, 'rawip'),
        submit=False,
        array=True,
        walltime='2:00:00',
        queue='home-scrm'
    )
    Submitter(
        input_cmds, 
        "{}_{}_PNGS".format(event, 'input'), 
        sh='/projects/ps-yeolab3/bay001/maps/bash_scripts/{}_{}_PNGS.sh'.format(event, 'input'),
        submit=True,
        array=True,
        walltime='2:00:00',
        queue='home-scrm'
    )
    

### Will need to qsub these individually since there are tons of jobs
- all_cdsStart_entropy_PNGS.sh  
- all_cdsStart_rawip_PNGS.sh  
- all_cdsStart_subtract_PNGS.sh  
- all_txStart_entropy_PNGS.sh  
- all_txStart_rawip_PNGS.sh  
- all_txStart_subtract_PNGS.sh  
- all_txStop_entropy_PNGS.sh  
- all_txStop_rawip_PNGS.sh  
- all_txStop_subtract_PNGS.sh

# Verify RBP SE Maps
- see if we have all the maps

In [None]:
clip_df = pd.read_table(clip_manifest)

hepg2_rnaseq_df = pd.read_table(rnaseq_manifests['HepG2'])
k562_rnaseq_df = pd.read_table(rnaseq_manifests['K562'])

hepg2_clip_df = clip_df[clip_df['Cell line']=='HepG2']
k562_clip_df = clip_df[clip_df['Cell line']=='K562']

In [None]:
hepg2_rbps = pd.merge(hepg2_clip_df, hepg2_rnaseq_df, how='outer', left_on='RBP', right_on='Official_RBP').dropna()
k562_rbps = pd.merge(k562_clip_df, k562_rnaseq_df, how='outer', left_on='RBP', right_on='Official_RBP').dropna()

In [None]:
pos_annotations = glob.glob('/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.positive.nr.txt')
neg_annotations = glob.glob('/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.negative.nr.txt')

def get_rbp(f):
    lst = os.path.basename(f).split('-')
    return '{}-{}'.format(lst[0], lst[2])

In [None]:
pos_rbp = set([get_rbp(f) for f in pos_annotations])
neg_rbp = set([get_rbp(f) for f in neg_annotations])

In [None]:
allrbp = (pos_rbp.intersection(neg_rbp))
print("We have annotations for: {} rbps".format(len(allrbp)))