In [2]:
import pandas as pd
import numpy as np
import os
import glob
from collections import defaultdict
from qtools import Submitter
from tqdm import tnrange, tqdm_notebook

In [14]:
# Significance thresholds applied to every rMATS table by subset_rmats().
pvalue = 0.05
fdr = 0.1
sep = 0.05  # minimum absolute IncLevelDifference

# One sub-directory per rMATS event type.
annotation_subdirectories = ['se','a3ss','a5ss','mxe','ri']
annotation_directory = '/projects/ps-yeolab3/bay001/maps/current_normed_annotations/'

# Create the annotation directory and one sub-directory per event type.
# The previous `! mkdir annotation_directory` lacked the `$` interpolation and
# therefore created a directory literally called "annotation_directory" in the
# working directory; os.makedirs avoids the shell and also creates parents.
if not os.path.isdir(annotation_directory):
    os.makedirs(annotation_directory)

for annotation_subdirectory in annotation_subdirectories:
    subdir = os.path.join(annotation_directory, annotation_subdirectory)
    if not os.path.isdir(subdir):
        os.makedirs(subdir)


In [None]:
graveley_rmats_dir = '/projects/ps-yeolab3/encode/rnaseq/alt_splicing/graveley_rmats_3-30-2017/normed/'

def move_files(mats_dir, dest_parent_dir, events=('se','a3ss','a5ss','mxe','ri')):
    """
    Symlink every rMATS JunctionCountOnly table into its per-event directory.

    Despite the name, nothing is moved or copied: a symbolic link is created
    at <dest_parent_dir>/<event>/<file name> for every file in mats_dir whose
    name ends with '<EVENT>.MATS.JunctionCountOnly.txt'.

    Parameters
    ----------
    mats_dir : str
        Directory searched for '*.JunctionCountOnly.txt' files.
    dest_parent_dir : str
        Directory holding one sub-directory per event type (must exist).
    events : iterable of str
        Lower-case event types, also used as sub-directory names.
    """
    all_files = glob.glob(os.path.join(mats_dir, '*.JunctionCountOnly.txt'))
    print("number of files found: {}".format(len(all_files)))
    progress = tnrange(len(all_files))
    for src in all_files:
        filename = os.path.basename(src)
        for e in events:
            # Match on the file name only, so a directory component can never
            # be mistaken for the event tag.
            if not filename.endswith("{}.MATS.JunctionCountOnly.txt".format(e.upper())):
                continue
            dest_fullpath = os.path.join(dest_parent_dir, e, filename)
            # lexists (unlike exists) is also True for a dangling symlink, so
            # a re-run never tries to create a link that is already there.
            if not os.path.lexists(dest_fullpath):
                os.symlink(src, dest_fullpath)
            progress.update(1)

move_files(graveley_rmats_dir, annotation_directory)

In [15]:
# Collect the symlinked rMATS tables of each event type from its sub-directory.
all_se_mats, all_a3ss_mats, all_a5ss_mats, all_mxe_mats, all_ri_mats = (
    glob.glob(
        os.path.join(
            annotation_directory,
            '{0}/*-{1}.MATS.JunctionCountOnly.txt'.format(event_type, event_type.upper())
        )
    )
    for event_type in ('se', 'a3ss', 'a5ss', 'mxe', 'ri')
)

# Report how many tables were found per event type.
for event_type, found in (
    ('se', all_se_mats),
    ('a3ss', all_a3ss_mats),
    ('a5ss', all_a5ss_mats),
    ('mxe', all_mxe_mats),
    ('ri', all_ri_mats),
):
    print("number of {} rmats calls found: {}".format(event_type, len(found)))

number of se rmats calls found: 452
number of a3ss rmats calls found: 452
number of a5ss rmats calls found: 452
number of mxe rmats calls found: 452
number of ri rmats calls found: 452


In [16]:
def subset_rmats(f, pvalue, fdr, sep):
    """
    Split one rMATS table into significant / positive / negative subsets.

    Rows are kept when PValue < pvalue and FDR < fdr; they count as positive
    when IncLevelDifference > sep and as negative when it is < -sep. Each
    non-empty subset is written next to the input as '.significant.txt',
    '.positive.txt' or '.negative.txt' (tab-separated, no index) unless that
    file already exists.

    Returns
    -------
    ([positive_missing, negative_missing], num_missing)
        Each list entry is `f` when that direction had no rows and '' otherwise;
        num_missing counts the empty directions (0, 1 or 2).
    """
    table = pd.read_table(f)
    passes_stats = (table['PValue'] < pvalue) & (table['FDR'] < fdr)
    positive = table[passes_stats & (table['IncLevelDifference'] > sep)]
    negative = table[passes_stats & (table['IncLevelDifference'] < -sep)]

    def _write_if_absent(frame, tag):
        # Never overwrite a subset file produced by an earlier run.
        out = f.replace('.txt', '.{}.txt'.format(tag))
        if not os.path.exists(out):
            frame.to_csv(out, sep='\t', index=None)

    combined = pd.concat([positive, negative])
    if combined.shape[0] > 0:
        _write_if_absent(combined, 'significant')

    missing = []
    for frame, tag in ((positive, 'positive'), (negative, 'negative')):
        if frame.shape[0] > 0:
            _write_if_absent(frame, tag)
            missing.append('')
        else:
            missing.append(f)

    empty_directions = sum(1 for entry in missing if entry != '')
    return missing, empty_directions

In [17]:
# Map each event type to the list of rMATS tables found for it.
events = {'se':all_se_mats, 'a3ss':all_a3ss_mats, 'a5ss':all_a5ss_mats, 'mxe':all_mxe_mats, 'ri':all_ri_mats}

overall_progress = tnrange(len(events))
# event type -> list of [positive_missing, negative_missing] pairs for files
# that had no significant events in at least one direction.
insufficient = defaultdict(list)
total_missing = 0
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
for label, event in events.items():
    event_progress = tnrange(len(event))
    for mat in event:
        no_samples, num_missing = subset_rmats(mat, pvalue, fdr, sep)
        if no_samples[0] or no_samples[1]:
            insufficient[label].append(no_samples)
        total_missing += num_missing
        event_progress.update(1)
    # Advance the outer bar once per event type (it was previously never updated).
    overall_progress.update(1)
        

In [None]:
# Show the [positive, negative] entries recorded for MXE tables that had no
# events passing the thresholds in at least one direction.
insufficient['mxe']

# Subset the nonredundant ones.

In [19]:
subset_prog = '/home/bay001/projects/codebase/bfx/pyscripts/rnaseq/subset_rmats_junctioncountonly.py'
overall_progress = tnrange(len(annotation_subdirectories))
for subdirectory in annotation_subdirectories:
    all_positive = glob.glob(os.path.join(annotation_directory,'{}/*.positive.txt'.format(subdirectory)))
    all_negative = glob.glob(os.path.join(annotation_directory,'{}/*.negative.txt'.format(subdirectory)))
    all_significant = glob.glob(os.path.join(annotation_directory,'{}/*.significant.txt'.format(subdirectory)))
    print("number of positive and negative: {}".format(subdirectory), len(all_positive), len(all_negative))
    progress = tnrange(len(all_positive) + len(all_negative))

    for cond in [all_significant]:# , all_positive, all_negative]:

        for sub in cond:
            jobname = os.path.basename(sub).split('.')[0]
            cmd = "python {} -i {} -o {} -e {}".format(
                subset_prog,
                sub,
                sub.replace('.txt','.nr.txt'),
                subdirectory,
            )
            if not os.path.exists(sub.replace('.txt','.nr.txt')):
                ! $cmd
            progress.update(1)
    overall_progress.update(1)

('number of positive and negative: se', 448, 446)


('number of positive and negative: a3ss', 407, 407)


('number of positive and negative: a5ss', 361, 389)


('number of positive and negative: mxe', 452, 452)


('number of positive and negative: ri', 428, 420)


# Rename the final annotations to something less ugly
- the files are usually something like "MAK16-BGKLV32-K562-SE.MATS.JunctionCountOnly.negative.nr.txt"
- to avoid that being used in the final maps, I'm going to rename them to something nicer like "Excluded upon KD"

In [None]:
# From here on the notebook works on the final annotation directory.
annotation_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/'
# Non-redundant skipped-exon tables, one list per direction.
all_positive, all_negative = (
    glob.glob(os.path.join(annotation_directory, 'se', '*.{}.nr.txt'.format(direction)))
    for direction in ('positive', 'negative')
)
print("number of positive and negative: ", len(all_positive), len(all_negative))

In [None]:
t = tnrange(len(all_positive) + len(all_negative))
renamed_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/se_renamed'
if not os.path.isdir(renamed_directory):
    os.makedirs(renamed_directory)

# Link every non-redundant SE table under a friendlier name inside
# renamed_directory. Previously renamed_directory was never used, so the
# links were created next to the originals in se/.
for originals, suffix, label in (
    (all_positive, '.positive.nr.txt', 'included-upon-knockdown'),
    (all_negative, '.negative.nr.txt', 'excluded-upon-knockdown'),
):
    for original in originals:
        renamed = os.path.basename(original).replace(suffix, label)
        renamed = renamed.replace('-SE.MATS.JunctionCountOnly','-')
        renamed = os.path.join(renamed_directory, renamed)
        # Guard so that re-running the cell does not fail on existing links.
        if not os.path.lexists(renamed):
            os.symlink(original, renamed)
        t.update(1)

In [6]:
annotation_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/'
events = ['a3ss','a5ss']
for event in events:
    all_positive = glob.glob(os.path.join(annotation_directory,'{}/*.positive.nr.txt'.format(event)))
    all_negative = glob.glob(os.path.join(annotation_directory,'{}/*.negative.nr.txt'.format(event)))
    print("number of positive and negative: ", len(all_positive), len(all_negative), event)
    t = tnrange(len(all_positive) + len(all_negative))
    renamed_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/{}_renamed'.format(event)
    # Make the cell runnable on a fresh checkout: the target directory may not exist yet.
    if not os.path.isdir(renamed_directory):
        os.makedirs(renamed_directory)

    event_tag = '-{}'.format(event.upper())
    for originals, suffix, label in (
        (all_positive, '.positive.nr.txt', 'longer-isoform-included-upon-knockdown'),
        (all_negative, '.negative.nr.txt', 'shorter-isoform-included-upon-knockdown'),
    ):
        for original in originals:
            # Strip the rMATS suffixes and the event tag, then append the readable label.
            renamed = os.path.basename(original).replace(suffix, label)
            renamed = renamed.replace('.MATS.JunctionCountOnly','').replace(event_tag,'-')
            renamed = os.path.join(renamed_directory, renamed)
            # Guard so that re-running the cell does not fail on existing links,
            # consistent with the existence checks used elsewhere in this notebook.
            if not os.path.lexists(renamed):
                os.symlink(original, renamed)
            t.update(1)

('number of positive and negative: ', 400, 376, 'a3ss')
('number of positive and negative: ', 350, 348, 'a5ss')


# Reformat the final annotations as miso files for peak maps

In [None]:
# Conversion script and output directory for the miso-formatted annotations.
rmats2miso_runner = '/home/bay001/projects/codebase/bfx/pyscripts/rnaseq/rmats2miso.py'
miso_output_dir = '/projects/ps-yeolab3/bay001/maps/current_annotations/as_miso/'

# Non-redundant skipped-exon tables to convert, one list per direction.
all_positive, all_negative = (
    glob.glob(os.path.join(annotation_directory, 'se/*.{}.nr.txt'.format(direction)))
    for direction in ('positive', 'negative')
)
print("number of positive and negative: ", len(all_positive), len(all_negative))

In [None]:
t = tnrange(len(all_positive) + len(all_negative))

for p in all_positive:
    dest = os.path.basename(p).replace('.txt','.miso')
    cmd = 'python {} -i {} -o {}'.format(
        rmats2miso_runner, 
        p, 
        os.path.join(miso_output_dir, dest),
    )
    if not os.path.exists(dest):
        ! $cmd
    t.update(1)
    
for n in all_negative:
    dest = os.path.basename(n).replace('.txt','.miso')
    cmd = 'python {} -i {} -o {}'.format(
        rmats2miso_runner, 
        n, 
        os.path.join(miso_output_dir, dest),
    )
    if not os.path.exists(dest):
        ! $cmd
    t.update(1)

# Rename the miso annotations 

In [5]:
annotation_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/'
# Event type of the miso files. Set explicitly: this cell used to rely on
# whatever value `event` happened to hold from a previously executed cell.
event = 'se'
all_positive = glob.glob(os.path.join(annotation_directory,'as_miso/*.positive.nr.miso'))
all_negative = glob.glob(os.path.join(annotation_directory,'as_miso/*.negative.nr.miso'))
print("number of positive and negative: ", len(all_positive), len(all_negative), event)
t = tnrange(len(all_positive) + len(all_negative))
renamed_directory = '/projects/ps-yeolab3/bay001/maps/current_annotations/as_miso_renamed'
if not os.path.isdir(renamed_directory):
    os.makedirs(renamed_directory)

event_tag = '-{}'.format(event.upper())
for originals, suffix, label in (
    (all_positive, '.positive.nr.miso', 'included-upon-knockdown'),
    (all_negative, '.negative.nr.miso', 'excluded-upon-knockdown'),
):
    for original in originals:
        # Strip the rMATS suffixes and the event tag, then append the readable label.
        renamed = os.path.basename(original).replace(suffix, label)
        renamed = renamed.replace('.MATS.JunctionCountOnly','').replace(event_tag,'-')
        renamed = os.path.join(renamed_directory, renamed)
        # Guard so that re-running the cell does not fail on existing links.
        if not os.path.lexists(renamed):
            os.symlink(original, renamed)
        t.update(1)

('number of positive and negative: ', 437, 435, 'se')


# Reformat Eric's annotations to miso format

In [23]:
# Source directory of the background annotation tables and the directory
# that receives their miso-formatted versions.
bg_dir, new_bg_dir = (
    '/projects/ps-yeolab3/bay001/maps/current_annotations/se/',
    '/projects/ps-yeolab3/bay001/maps/current_annotations/as_miso_renamed/',
)

# Column names assigned to the (header-less) background tables when read.
header = [
    'annotation',
    'lowpos',
    'cassette',
    'hipos',
]


In [10]:
def as_miso(row):
    """
    Build a miso-style event identifier from one background annotation row.

    `row` must provide 'annotation' (five '|'-separated fields, of which the
    first is the chromosome and the second the strand) and 'lowpos',
    'cassette', 'hipos' (each 'start-end'). Every coordinate is incremented by
    one and the three exons are emitted as 'chrom:start:end:strand' joined by
    '@', ordered low -> cassette -> high on '+' and high -> cassette -> low on
    '-'. Any other strand prints "BAD" and returns 1.
    """
    chrom, strand, _, _, _ = row['annotation'].split('|')

    # Split each exon into exactly (start, end); stays as strings until the
    # strand has been validated.
    exons = []
    for column in ('lowpos', 'cassette', 'hipos'):
        start, end = row[column].split('-')
        exons.append((start, end))

    if strand == '-':
        exons.reverse()
    elif strand != '+':
        print("BAD")
        return 1

    return '@'.join(
        '{}:{}:{}:{}'.format(chrom, int(start) + 1, int(end) + 1, strand)
        for start, end in exons
    )

In [46]:

# The eight background conversions differ only in cell line and exon
# category, so they are driven from two small tables instead of eight
# copy-pasted cells.
background_cell_lines = [
    # (source file prefix, output file prefix)
    ('k562', 'K562'),
    ('hepg2', 'HepG2'),
]
background_categories = [
    # (source file suffix, output file label)
    ('strict_CE_all_20170401', 'constitutive-exons'),
    ('nSEall_0.5_20170401', 'native-cassette-exons'),
    ('nSEincl_0.5_20170401', 'native-included-exons'),
    ('nSEexcl_0.5_20170401', 'native-excluded-exons'),
]


def write_background_as_miso(source_path, dest_path):
    """
    Read one background annotation table and write it as a miso file.

    The table is read with the columns named in `header`, a 'miso' column is
    computed per row with as_miso(), and the ('miso', 'annotation') columns
    are written tab-separated without index or header. The annotated table is
    returned.
    """
    table = pd.read_table(source_path, names=header)
    table['miso'] = table.apply(as_miso, axis=1)
    table[['miso','annotation']].to_csv(
        dest_path, sep='\t', index=False, header=False
    )
    return table


for source_prefix, dest_prefix in background_cell_lines:
    for source_suffix, dest_label in background_categories:
        # `original_ce` keeps holding the most recently converted table.
        original_ce = write_background_as_miso(
            os.path.join(
                bg_dir,
                '{}_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt.{}'.format(
                    source_prefix, source_suffix
                ),
            ),
            os.path.join(new_bg_dir, '{}-{}.miso'.format(dest_prefix, dest_label)),
        )

In [44]:
# Scratch check: add a pseudocount of 1 to every entry, then scale the
# result so that it sums to 1.
some_list = [1] * 5
some_list_ps = list(map(lambda count: count + 1, some_list))
normed_list = list(map(lambda count: float(count) / sum(some_list_ps), some_list_ps))

In [45]:
# Display the pseudocounted, sum-normalised values.
normed_list

[0.2, 0.2, 0.2, 0.2, 0.2]

# new control datasets

In [11]:
import glob
import os
import pandas as pd

# Column names assigned to the (header-less) control tables when read.
header = ['annotation','lowpos','cassette','hipos']

new_control_dir = '/home/bay001/projects/maps_20160420/analysis/tests/eric_new_ctrls/'
original_bgs = glob.glob(os.path.join(new_control_dir,'*.tpm1'))

# Write a '.tpm1.miso' companion for every '.tpm1' control table, holding the
# as_miso() identifier followed by the original annotation string.
for bg in original_bgs:
    control = pd.read_table(bg, names=header)
    control['miso'] = control.apply(as_miso, axis=1)
    control[['miso', 'annotation']].to_csv(
        bg.replace('.tpm1','.tpm1.miso'),
        sep='\t',
        index=False,
        header=False,
    )

# concatenate the 50% CI and 50% RI for KS test calculation background:
- RI_background = nRI_0.5_CI    +    nRI_0.5_RI      -- control for ks-tests / normalization


In [12]:
# Directory holding the per-sample retained-intron outputs.
ri_dir = '/projects/ps-yeolab3/bay001/maps/current/ri_nr'
# All "greater than 50 percent retained" normalised tables in that directory.
retained_intron_bgs = glob.glob(
    os.path.join(
        ri_dir,
        '*-greater-than-50-percent-retained.normalize_and_per_region_subtract.csv',
    )
)

In [14]:
# For every ">50% retained" table, find the ">50% spliced" table that shares
# its key (the text before the first '-' in the file name), stack the two and
# write the result next to them with a '-retained-and-spliced-combined' name.
keys = []
progress = tnrange(len(retained_intron_bgs))
for retained_file in retained_intron_bgs:
    retained_name = os.path.basename(retained_file)
    key = retained_name.split('-')[0]
    keys.append(key)
    # Sorted so that the choice is deterministic if several files match.
    spliced_matches = sorted(
        glob.glob(
            os.path.join(
                ri_dir,'{}*-greater-than-50-percent-spliced.normalize_and_per_region_subtract.csv'.format(
                    key
                )
            )
        )
    )
    if not spliced_matches:
        # Report the missing counterpart instead of failing with a bare
        # IndexError; no combined file is written for this key.
        print("no spliced counterpart found for {}; skipping".format(retained_file))
        progress.update(1)
        continue
    spliced_file = spliced_matches[0]

    retained = pd.read_csv(retained_file, index_col=0)
    spliced = pd.read_csv(spliced_file, index_col=0)
    combined = pd.concat([retained, spliced])
    # Join ri_dir with the file *name*: joining it with the full path only
    # worked because an absolute second argument makes os.path.join drop ri_dir.
    combined.to_csv(
        os.path.join(
            ri_dir, retained_name.replace(
                '-greater-than-50-percent-retained',
                '-greater-than-50-percent-retained-and-spliced-combined'
            )
        )
    )
    progress.update(1)


