In [1]:
import pandas as pd
import numpy as np
import os
import glob
from collections import defaultdict
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

In [2]:
# Significance cutoffs handed to subset_rmats() for every rMATS table.
pvalue = 0.05
fdr = 0.1
sep = 0.05  # This is absolute; subset_rmats() automatically handles for negative IncLevelDifference

# One subdirectory per splicing event type is kept under the annotation root.
annotation_subdirectories = ['se','a3ss','a5ss','ri']
annotation_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/'

# Create the annotation root itself. The previous shell call lacked the '$'
# and therefore made a directory literally named "annotation_directory" in
# the current working directory instead of the configured path.
if not os.path.exists(annotation_directory):
    os.makedirs(annotation_directory)

# Create any per-event subdirectory that is still missing.
for annotation_subdirectory in annotation_subdirectories:
    subdir = os.path.join(annotation_directory, annotation_subdirectory)
    if not os.path.exists(subdir):
        os.makedirs(subdir)

In [3]:
graveley_rmats_dir = '/projects/ps-yeolab3/encode/rnaseq/alt_splicing/graveley_rmats_current/'

def move_files(mats_dir, dest_parent_dir):
    """
    Symlink every ``*.JunctionCountOnly.txt`` file found directly inside
    mats_dir into the matching per-event subdirectory of dest_parent_dir.

    Despite the name, nothing is moved: the source files stay where they are
    and only symbolic links are created.

    :param mats_dir: directory that is searched (non-recursively) for
        ``*.JunctionCountOnly.txt`` files.
    :param dest_parent_dir: directory that holds (or will hold) one
        subdirectory per event type: se, a3ss, a5ss, mxe, ri.
    :return: None
    """
    events = ['se','a3ss','a5ss','mxe','ri']
    all_files = glob.glob(os.path.join(mats_dir, '*.JunctionCountOnly.txt'))
    print("number of files found: {}".format(len(all_files)))
    progress = tnrange(len(all_files))
    for src in all_files:
        for e in events:
            # A file belongs to event e when its path carries the upper-cased
            # event tag directly in front of the common rMATS suffix.
            if "{}.MATS.JunctionCountOnly.txt".format(e.upper()) not in src:
                continue
            dest = os.path.join(dest_parent_dir, e)
            # Make the event directory on demand so that linking cannot fail
            # just because the directory was never set up beforehand.
            if not os.path.isdir(dest):
                os.makedirs(dest)
            dest_fullpath = os.path.join(dest, os.path.basename(src))
            # lexists() is also True for a dangling link, which exists() would
            # miss and which would make the link creation fail.
            if not os.path.lexists(dest_fullpath):
                os.symlink(src, dest_fullpath)
            progress.update(1)

### UNCOMMENT if we're making completely new annotations - I'm just renaming them here.
# move_files(graveley_rmats_dir, annotation_directory)

In [4]:
# Gather the raw rMATS tables for each event type from its own subdirectory.
all_se_mats = glob.glob(
    os.path.join(annotation_directory, 'se/*.SE.MATS.JunctionCountOnly.txt')
)
all_a3ss_mats = glob.glob(
    os.path.join(annotation_directory, 'a3ss/*.A3SS.MATS.JunctionCountOnly.txt')
)
all_a5ss_mats = glob.glob(
    os.path.join(annotation_directory, 'a5ss/*.A5SS.MATS.JunctionCountOnly.txt')
)
all_mxe_mats = glob.glob(
    os.path.join(annotation_directory, 'mxe/*.MXE.MATS.JunctionCountOnly.txt')
)
all_ri_mats = glob.glob(
    os.path.join(annotation_directory, 'ri/*.RI.MATS.JunctionCountOnly.txt')
)

# Report how many tables were found per event type, in a fixed order.
mats_report = [
    ("number of se rmats calls found: {}", all_se_mats),
    ("number of a3ss rmats calls found: {}", all_a3ss_mats),
    ("number of a5ss rmats calls found: {}", all_a5ss_mats),
    ("number of mxe rmats calls found: {}", all_mxe_mats),
    ("number of ri rmats calls found: {}", all_ri_mats),
]
for message, found in mats_report:
    print(message.format(len(found)))

number of se rmats calls found: 473
number of a3ss rmats calls found: 473
number of a5ss rmats calls found: 473
number of mxe rmats calls found: 473
number of ri rmats calls found: 473


# Filter all files for significantly included/excluded events
- also checks the number of missing or insignificant rmats events (RBPs with no significant events)

In [5]:
def subset_rmats(f, pvalue, fdr, sep):
    """
    Filter one raw rMATS output table and save its significant events.

    A row counts as significant when PValue < pvalue, FDR < fdr and
    |IncLevelDifference| > sep. Up to three tab-separated files are written
    next to f, each named by inserting a tag in front of f's extension:

    *.significant.<ext> (all significant rows, positive rows first)
    *.positive.<ext>    (significant rows with IncLevelDifference > sep)
    *.negative.<ext>    (significant rows with IncLevelDifference < -sep)

    A file is only written when its subset is non-empty, and an output file
    that already exists is left untouched rather than overwritten.

    :param f: path to the rMATS table; it must provide the columns
        PValue, FDR and IncLevelDifference.
    :param pvalue: exclusive upper bound for the PValue column.
    :param fdr: exclusive upper bound for the FDR column.
    :param sep: absolute IncLevelDifference cutoff (applied as > sep and < -sep).
    :return: tuple ``([missing_positive, missing_negative], num_missing)``.
        Each list entry is f when that direction has no significant rows and
        '' otherwise; num_missing counts the empty directions (0, 1 or 2).
    """
    df = pd.read_csv(f, sep='\t')

    # The p-value / FDR part of the filter is shared by both directions.
    significant = (df['PValue'] < pvalue) & (df['FDR'] < fdr)
    dfp = df[significant & (df['IncLevelDifference'] > sep)]
    dfn = df[significant & (df['IncLevelDifference'] < -sep)]
    dfs = pd.concat([dfp, dfn])

    # Split off only the final extension. A plain str.replace('.txt', ...)
    # would also rewrite a '.txt' occurring in a directory name.
    base, ext = os.path.splitext(f)

    def _write_subset(subset, tag):
        """Write subset to '<base>.<tag><ext>' if non-empty; return True if it had rows."""
        if subset.shape[0] == 0:
            return False
        dest = '{}.{}{}'.format(base, tag, ext)
        if not os.path.exists(dest):
            subset.to_csv(dest, sep='\t', index=False)
        return True

    _write_subset(dfs, 'significant')
    has_positive = _write_subset(dfp, 'positive')
    has_negative = _write_subset(dfn, 'negative')

    insignificant_positive_rbps = '' if has_positive else f
    insignificant_negative_rbps = '' if has_negative else f
    num_missing = [has_positive, has_negative].count(False)

    return [insignificant_positive_rbps, insignificant_negative_rbps], num_missing

In [6]:
# Raw rMATS tables to filter, keyed by event type (mxe is deliberately absent
# from this mapping, matching annotation_subdirectories).
events = {'se':all_se_mats, 'a3ss':all_a3ss_mats, 'a5ss':all_a5ss_mats, 'ri':all_ri_mats}

overall_progress = tnrange(len(events))
# event label -> list of [missing_positive, missing_negative] pairs returned by
# subset_rmats() for tables lacking significant events in at least one direction.
insufficient = defaultdict(list)
total_missing = 0
# .items() works on both Python 2 and Python 3, unlike .iteritems().
for label, event in events.items():
    event_progress = tnrange(len(event))
    event_progress.set_description("{}".format(label))
    for mat in event:
        no_samples, num_missing = subset_rmats(mat, pvalue, fdr, sep)
        # A non-empty entry means that direction had no significant events.
        if no_samples[0] or no_samples[1]:
            insufficient[label].append(no_samples)
        total_missing += num_missing
        event_progress.update(1)
    # Advance the outer bar once per event type; it was previously created
    # but never updated and so stayed at zero.
    overall_progress.update(1)

In [6]:
# Count of SE tables recorded in `insufficient`, i.e. tables for which
# subset_rmats() reported no significant positive and/or negative events.
# Requires the filtering cell above to have been run in this kernel.
len(insufficient['se']) # 25

NameError: name 'insufficient' is not defined

# Subset the nonredundant ones.

In [8]:
subset_prog = '/home/bay001/projects/codebase/bfx/pyscripts/rnaseq/subset_rmats_junctioncountonly.py'
overall_progress = tnrange(len(annotation_subdirectories))
for subdirectory in annotation_subdirectories:
    all_positive = glob.glob(os.path.join(annotation_directory,'{}/*.positive.txt'.format(subdirectory)))
    all_negative = glob.glob(os.path.join(annotation_directory,'{}/*.negative.txt'.format(subdirectory)))
    all_significant = glob.glob(os.path.join(annotation_directory,'{}/*.significant.txt'.format(subdirectory)))
    print("number of positive and negative: {}".format(subdirectory), len(all_positive), len(all_negative))
    
    for cond in [all_positive, all_negative]:
        progress = tnrange(len(cond))
        progress.set_description(subdirectory)
        for sub in cond:
            jobname = os.path.basename(sub).split('.')[0]
            cmd = "python {} -i {} -o {} -e {}".format(
                subset_prog,
                sub,
                sub.replace('.txt','.nr.txt'),
                subdirectory,
            )
            if not os.path.exists(sub.replace('.txt','.nr.txt')):
                ! $cmd
            progress.update(1)
    overall_progress.update(1)


('number of positive and negative: se', 457, 455)



('number of positive and negative: a3ss', 415, 392)



('number of positive and negative: a5ss', 362, 364)



('number of positive and negative: ri', 441, 438)





# Rename the final annotations to something less ugly
- the files are usually something like "MAK16-BGKLV32-K562-SE.MATS.JunctionCountOnly.negative.nr.txt"
- to avoid that being used in the final maps, I'm going to rename them to something nicer like "Excluded upon KD"

In [24]:
annotation_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/'
# Nonredundant SE tables, one list per direction of change.
all_positive, all_negative = [
    glob.glob(os.path.join(annotation_directory, pattern))
    for pattern in ('se/*.positive.nr.txt', 'se/*.negative.nr.txt')
]
print("number of positive and negative: ", len(all_positive), len(all_negative))

('number of positive and negative: ', 457, 455)


In [25]:
t = tnrange(len(all_positive) + len(all_negative))
renamed_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/se_renamed'

for pos in all_positive:
    renamed = pos.replace('.positive.nr.txt','included-upon-knockdown')
    renamed = renamed.replace('.SE.MATS.JunctionCountOnly','-')
    if not os.path.exists(renamed):
        ! ln -s $pos $renamed
    else:
        print(renamed)
        break
    t.update(1)
    
for neg in all_negative:
    renamed = neg.replace('.negative.nr.txt','excluded-upon-knockdown')
    renamed = renamed.replace('.SE.MATS.JunctionCountOnly','-')
    if not os.path.exists(renamed):
        ! ln -s $neg $renamed
    t.update(1)

In [17]:
# Display the most recently computed link name (inspection only; the value
# depends on which renaming cell was executed last in this kernel).
renamed

'/projects/ps-yeolab3/bay001/maps/current_annotations/se/DDX52-BGHLV16-HepG2.set16-included-upon-knockdown'

In [11]:
annotation_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/'
events = ['a3ss','a5ss']
for event in events:
    all_positive = glob.glob(os.path.join(annotation_directory,'{}/*.positive.nr.txt'.format(event)))
    all_negative = glob.glob(os.path.join(annotation_directory,'{}/*.negative.nr.txt'.format(event)))
    print("number of positive and negative: ", len(all_positive), len(all_negative), event)
    t = tnrange(len(all_positive) + len(all_negative))
    renamed_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/{}_renamed'.format(event)

    for pos in all_positive:
        renamed = os.path.basename(pos).replace('.positive.nr.txt','longer-isoform-included-upon-knockdown')
        renamed = renamed.replace('.MATS.JunctionCountOnly','').replace('-{}'.format(event.upper()),'-')
        renamed = os.path.join(renamed_directory, renamed)
        if not os.path.exists(renamed):
            ! ln -s $pos $renamed
        t.update(1)

    for neg in all_negative:
        renamed = os.path.basename(neg).replace('.negative.nr.txt','shorter-isoform-included-upon-knockdown')
        renamed = renamed.replace('.MATS.JunctionCountOnly','').replace('-{}'.format(event.upper()),'-')
        renamed = os.path.join(renamed_directory, renamed)
        if not os.path.exists(renamed):
            ! ln -s $neg $renamed
        t.update(1)

('number of positive and negative: ', 415, 392, 'a3ss')



('number of positive and negative: ', 362, 364, 'a5ss')


In [28]:
event = 'ri'

all_positive = glob.glob(os.path.join(annotation_directory,'{}/*.positive.nr.txt'.format(event)))
all_negative = glob.glob(os.path.join(annotation_directory,'{}/*.negative.nr.txt'.format(event)))
    
t = tnrange(len(all_positive) + len(all_negative))
renamed_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/{}_renamed'.format(event)

for pos in all_positive:
    renamed = os.path.basename(pos).replace('.positive.nr.txt','included-upon-knockdown')
    renamed = renamed.replace('.RI.MATS.JunctionCountOnly','-')
    renamed = os.path.join(renamed_directory, renamed)
    if not os.path.exists(renamed):
        ! ln -s $pos $renamed
    t.update(1)
    
for neg in all_negative:
    renamed = os.path.basename(neg).replace('.negative.nr.txt','excluded-upon-knockdown')
    renamed = renamed.replace('.RI.MATS.JunctionCountOnly','-')
    renamed = os.path.join(renamed_directory, renamed)
    if not os.path.exists(renamed):
        ! ln -s $neg $renamed
    t.update(1)

# new control datasets

# Subset nonredundant control events

In [19]:
input_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/from_eric/'
output_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/nonredundant/'


nSE_annotations = glob.glob(os.path.join(input_dir, '*nSE*20180413'))
aSE_annotations = glob.glob(os.path.join(input_dir, '*aSE*20180413'))
CE_annotations = glob.glob(os.path.join(input_dir, '*CE*20180413'))
se_annotations = nSE_annotations + aSE_annotations + CE_annotations

progress = tnrange(len(se_annotations))

for se_annotation in se_annotations:
    if '.NR' not in se_annotation:
        se_nr_annotation = os.path.join(output_dir, os.path.basename(se_annotation) + '.NR')
        cmd = 'python {} '.format(subset_prog)
        cmd = cmd + '-i {} '.format(se_annotation)
        cmd = cmd + '-o {} '.format(se_nr_annotation)
        cmd = cmd + '-e {} '.format('se')
        cmd = cmd + '-f {} '.format('eric')
        ! $cmd
    progress.update(1)

In [22]:
input_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/from_eric/'
output_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/nonredundant/se'


a3ss_center = glob.glob(os.path.join(input_dir, '*aA3SScenter*20180413'))
a3ss_all = glob.glob(os.path.join(input_dir, '*nA3SSall*20180413'))
a3ss_basic = glob.glob(os.path.join(input_dir, '*nA3SSbasic*20180413'))
a3ss_extension = glob.glob(os.path.join(input_dir, '*nA3SSextension*20180413'))
a3ss_annotations = a3ss_center + a3ss_all + a3ss_basic + a3ss_extension

progress = tnrange(len(a3ss_annotations))

for annotation in a3ss_annotations:
    if '.NR' not in annotation:
        nr_annotation = os.path.join(output_dir, os.path.basename(annotation) + '.NR')
        cmd = 'python {} '.format(subset_prog)
        cmd = cmd + '-i {} '.format(annotation)
        cmd = cmd + '-o {} '.format(nr_annotation)
        cmd = cmd + '-e {} '.format('a3ss')
        cmd = cmd + '-f {} '.format('eric')
        ! $cmd
    progress.update(1)




In [23]:
input_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/from_eric/'
output_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/nonredundant/a3ss'


a5ss_center = glob.glob(os.path.join(input_dir, '*aA5SScenter*20180413'))
a5ss_all = glob.glob(os.path.join(input_dir, '*nA5SSall*20180413'))
a5ss_basic = glob.glob(os.path.join(input_dir, '*nA5SSbasic*20180413'))
a5ss_extension = glob.glob(os.path.join(input_dir, '*nA5SSextension*20180413'))
a5ss_annotations = a5ss_center + a5ss_all + a5ss_basic + a5ss_extension

progress = tnrange(len(a5ss_annotations))

for annotation in a5ss_annotations:
    if '.NR' not in annotation:
        nr_annotation = os.path.join(output_dir, os.path.basename(annotation) + '.NR')
        cmd = 'python {} '.format(subset_prog)
        cmd = cmd + '-i {} '.format(annotation)
        cmd = cmd + '-o {} '.format(nr_annotation)
        cmd = cmd + '-e {} '.format('a5ss')
        cmd = cmd + '-f {} '.format('eric')
        ! $cmd
    progress.update(1)

In [None]:
input_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/from_eric/'
output_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/nonredundant/a5ss'


a5ss_center = glob.glob(os.path.join(input_dir, '*aA5SScenter*20180413'))
a5ss_all = glob.glob(os.path.join(input_dir, '*nA5SSall*20180413'))
a5ss_basic = glob.glob(os.path.join(input_dir, '*nA5SSbasic*20180413'))
a5ss_extension = glob.glob(os.path.join(input_dir, '*nA5SSextension*20180413'))
a5ss_annotations = a5ss_center + a5ss_all + a5ss_basic + a5ss_extension

progress = tnrange(len(a5ss_annotations))

for annotation in a5ss_annotations:
    if '.NR' not in annotation:
        nr_annotation = os.path.join(output_dir, os.path.basename(annotation) + '.NR')
        cmd = 'python {} '.format(subset_prog)
        cmd = cmd + '-i {} '.format(annotation)
        cmd = cmd + '-o {} '.format(nr_annotation)
        cmd = cmd + '-e {} '.format('a5ss')
        cmd = cmd + '-f {} '.format('eric')
        ! $cmd
    progress.update(1)

In [None]:
input_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/from_eric/'
output_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/erics_controls/nonredundant/ri'


ri = glob.glob(os.path.join(input_dir, '**20180413'))
ri = glob.glob(os.path.join(input_dir, '**20180413'))
ri = glob.glob(os.path.join(input_dir, '**20180413'))
ri = glob.glob(os.path.join(input_dir, '**20180413'))
ri_annotations = ri + ri + ri + ri

progress = tnrange(len(a5ss_annotations))

for annotation in a5ss_annotations:
    if '.NR' not in annotation:
        nr_annotation = os.path.join(output_dir, os.path.basename(annotation) + '.NR')
        cmd = 'python {} '.format(subset_prog)
        cmd = cmd + '-i {} '.format(annotation)
        cmd = cmd + '-o {} '.format(nr_annotation)
        cmd = cmd + '-e {} '.format('ri')
        cmd = cmd + '-f {} '.format('eric')
        ! $cmd
    progress.update(1)

# I forget what the rest does so not including it for now.

# concatenate the 50% CI and 50% RI for KS test calculation background:
- RI_background = nRI_0.5_CI    +    nRI_0.5_RI      -- control for ks-tests / normalization


# all SEs that change in either RBFOX2 or QKI knockdowns (hepg2), but with the duplicates removed

In [11]:
# get both rmats files:
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/'
output_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/combined_nr_rbfox2_qki_hepg2/'

# Paths of the filtered SE tables for the two knockdowns, per direction.
rbfox2_pos_path = os.path.join(
    annotation_dir, 'RBFOX2-BGHLV26-HepG2.set26.SE.MATS.JunctionCountOnly.positive.txt'
)
rbfox2_neg_path = os.path.join(
    annotation_dir, 'RBFOX2-BGHLV26-HepG2.set26.SE.MATS.JunctionCountOnly.negative.txt'
)
qki_pos_path = os.path.join(
    annotation_dir, 'QKI-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.positive.txt'
)
qki_neg_path = os.path.join(
    annotation_dir, 'QKI-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.negative.txt'
)

# Load all four tables, then show their (rows, columns) shapes.
rbfox2_pos = pd.read_table(rbfox2_pos_path)
rbfox2_neg = pd.read_table(rbfox2_neg_path)
qki_pos = pd.read_table(qki_pos_path)
qki_neg = pd.read_table(qki_neg_path)
print(rbfox2_pos.shape, rbfox2_neg.shape, qki_pos.shape, qki_neg.shape)

((129, 23), (167, 23), (392, 23), (130, 23))


In [13]:
# concatenate the positive/negative files for each:

# Stack the RBFOX2 and QKI tables per direction and write them out.
# index=False keeps the files in the same layout as the tables written by
# subset_rmats(); without it the concatenated (and repeating) row index is
# written as an extra unnamed first column.
pos = pd.concat([rbfox2_pos, qki_pos])
pos_fn = os.path.join(output_dir, 'RBFOX2-QKI-SE.MATS.JunctionCountOnly.positive.txt')
pos.to_csv(pos_fn, sep='\t', index=False)
neg = pd.concat([rbfox2_neg, qki_neg])
neg_fn = os.path.join(output_dir, 'RBFOX2-QKI-SE.MATS.JunctionCountOnly.negative.txt')
neg.to_csv(neg_fn, sep='\t', index=False)
print(pos.shape, neg.shape)

((521, 23), (297, 23))


In [15]:
subset_prog = '/home/bay001/projects/codebase/bfx/pyscripts/rnaseq/subset_rmats_junctioncountonly.py'

for annotation in [pos_fn, neg_fn]:
    nr_annotation = os.path.join(output_dir, os.path.basename(annotation) + '.NR')
    cmd = 'python {} '.format(subset_prog)
    cmd = cmd + '-i {} '.format(annotation)
    cmd = cmd + '-o {} '.format(nr_annotation)
    cmd = cmd + '-e {} '.format('se')
    cmd = cmd + '-f {} '.format('rmats')
    ! $cmd


In [20]:
# Re-apply the significance cutoffs to the raw RBFOX2 table by hand.
raw_rbfox2_path = os.path.join(output_dir, 'RBFOX2-BGHLV26-HepG2.set26.SE.MATS.JunctionCountOnly.txt')
df = pd.read_table(raw_rbfox2_path)
pvalue = 0.05
fdr = 0.1
sep = 0.05

# Rows passing both the p-value and the FDR cutoff.
passes_cutoffs = (df['PValue'] < pvalue) & (df['FDR'] < fdr)
# Split those rows by direction of the inclusion-level change.
dfp = df[passes_cutoffs & (df['IncLevelDifference'] > sep)]
dfn = df[passes_cutoffs & (df['IncLevelDifference'] < -sep)]
print(dfp.shape, dfn.shape)

((129, 23), (167, 23))


# Try again but without separating incl/excl

In [25]:
# get both rmats files:
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/'
output_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/combined_nr_rbfox2_qki_hepg2/'

# Tables holding every significant SE event (both directions) per knockdown.
rbfox2_significant_path = os.path.join(
    annotation_dir, 'RBFOX2-BGHLV26-HepG2.set26.SE.MATS.JunctionCountOnly.significant.txt'
)
qki_significant_path = os.path.join(
    annotation_dir, 'QKI-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.significant.txt'
)
rbfox2 = pd.read_table(rbfox2_significant_path)
qki = pd.read_table(qki_significant_path)
print(rbfox2.shape, qki.shape)

((296, 23), (522, 23))


In [23]:
# Stack the RBFOX2 and QKI significant tables and write the result.
combined = pd.concat([rbfox2, qki])
combined_fn = os.path.join(output_dir, 'RBFOX2-QKI-SE.MATS.JunctionCountOnly.significant.txt')
print(combined.shape)
# index=False keeps the file in the same layout as the tables written by
# subset_rmats(); without it the concatenated (and repeating) row index is
# written as an extra unnamed first column.
combined.to_csv(combined_fn, sep='\t', index=False)

(818, 23)


In [24]:
subset_prog = '/home/bay001/projects/codebase/bfx/pyscripts/rnaseq/subset_rmats_junctioncountonly.py'

for annotation in [combined_fn]:
    nr_annotation = os.path.join(output_dir, os.path.basename(annotation) + '.NR')
    cmd = 'python {} '.format(subset_prog)
    cmd = cmd + '-i {} '.format(annotation)
    cmd = cmd + '-o {} '.format(nr_annotation)
    cmd = cmd + '-e {} '.format('se')
    cmd = cmd + '-f {} '.format('rmats')
    ! $cmd

# TIA1/TIAL1 now

In [27]:
# get both rmats files:
annotation_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/'
output_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/combined_nr_tia1_tial1_hepg2/'

# Tables holding every significant SE event (both directions) per knockdown.
tia1_significant_path = os.path.join(
    annotation_dir, 'TIA1-BGHLV12-HepG2.set12.SE.MATS.JunctionCountOnly.significant.txt'
)
tial1_significant_path = os.path.join(
    annotation_dir, 'TIAL1-BGHLV20-HepG2.set20.SE.MATS.JunctionCountOnly.significant.txt'
)
tia1 = pd.read_table(tia1_significant_path)
tial1 = pd.read_table(tial1_significant_path)
print(tia1.shape, tial1.shape)

((243, 23), (596, 23))


In [28]:
# Stack the TIA1 and TIAL1 significant tables and write the result.
combined = pd.concat([tia1, tial1])
combined_fn = os.path.join(output_dir, 'TIA1-TIAL1-SE.MATS.JunctionCountOnly.significant.txt')
print(combined.shape)
# index=False keeps the file in the same layout as the tables written by
# subset_rmats(); without it the concatenated (and repeating) row index is
# written as an extra unnamed first column.
combined.to_csv(combined_fn, sep='\t', index=False)

(839, 23)


In [29]:
subset_prog = '/home/bay001/projects/codebase/bfx/pyscripts/rnaseq/subset_rmats_junctioncountonly.py'

for annotation in [combined_fn]:
    nr_annotation = os.path.join(output_dir, os.path.basename(annotation) + '.NR')
    cmd = 'python {} '.format(subset_prog)
    cmd = cmd + '-i {} '.format(annotation)
    cmd = cmd + '-o {} '.format(nr_annotation)
    cmd = cmd + '-e {} '.format('se')
    cmd = cmd + '-f {} '.format('rmats')
    ! $cmd

# re-subset RBFOX2
- accidentally deleted the .positive.nr.txt

In [8]:
# Re-run the significance filter on the raw RBFOX2 SE table.
# subset_rmats() leaves output files that already exist untouched, so this
# call only writes filtered tables that are missing.
# Note: `rbfox2` is a path string here, whereas earlier cells bind the same
# name to a DataFrame.
rbfox2 = '/projects/ps-yeolab3/bay001/maps/current_annotations/se/RBFOX2-BGHLV26-HepG2.set26.SE.MATS.JunctionCountOnly.txt'
subset_rmats(f=rbfox2, pvalue=0.05, fdr=0.1, sep=0.05)

(['', ''], 0)