In [1]:
# 

In [22]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from rnaseq import subset_rmats_junctioncountonly as subset
import pandas as pd
import pybedtools as bt
import glob
import os
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook

In [3]:
# Gather the per-experiment positive and negative event files, then pool
# them into a single list (positives first, negatives after).
all_positive, all_negative = (
    glob.glob(pattern)
    for pattern in (
        '/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.positive.txt',
        '/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.negative.txt',
    )
)

all_jxc = all_positive + all_negative

In [17]:
def get_avg_number_removed(samples, o, e='se'):
    """
    Gets the average number of events removed from a splice event list
    by calculating the number of events before and after duplicate removal.

    Parameters
    ----------
    samples : list of str
        Paths to tab-separated event files. Each is read with
        ``pd.read_table`` to obtain the "before" row count.
    o : str
        Passed through unchanged as the second argument of
        ``subset.run_subset_rmats_junctioncountonly``
        (presumably an output path -- confirm against that module).
    e : str
        Passed through unchanged as the third argument of the same call
        (presumably the event type -- confirm against that module).

    Returns
    -------
    float
        Mean of (rows before - rows after) over all samples, or ``0.0``
        when ``samples`` is empty.
    int
        The sentinel ``1`` if any sample ends up with MORE rows after
        subsetting than before; the offending path is printed first.
        NOTE(review): this sentinel is indistinguishable from a genuine
        average of 1 -- kept only for backward compatibility.
    """
    # Nothing to average; avoid dividing by zero below.
    if len(samples) == 0:
        return 0.0

    nums_removed = []
    progress = tnrange(len(samples))
    try:
        for i in samples:
            # read in original jxc only dataframe (list of events)
            num_events_before = pd.read_table(i).shape[0]

            # do the subsetting
            df_after = subset.run_subset_rmats_junctioncountonly(i, o, e)
            num_events_after = df_after.shape[0]

            num_events_removed = num_events_before - num_events_after

            # Subsetting should never add rows; bail out if it did.
            if num_events_removed < 0:
                print("problem", i)
                return 1

            nums_removed.append(num_events_removed)
            progress.update(1)
    finally:
        # Always release the progress bar, including on the early return.
        progress.close()

    return sum(nums_removed) / float(len(nums_removed))

# Scratch path forwarded as the `o` argument of get_avg_number_removed.
o = '/projects/ps-yeolab3/bay001/tmp/test2.jxc'
# Last expression of the cell: the average number of events removed across
# all positive + negative files is shown as the cell output.
get_avg_number_removed(all_jxc, o)

61.987385321100916

# Calculate:
- total number of events in each original (all-events) file
- total number of significant events
- total number of significant events, split into positive and negative
- total number of significant positive and negative events after collapsing

In [18]:
# File lists for each category of SE annotation: significant-positive,
# significant-negative, all significant, and the original JunctionCountOnly
# tables.
all_sig_positive, all_sig_negative, all_sig, all_original = (
    glob.glob(pattern)
    for pattern in (
        '/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.positive.txt',
        '/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.negative.txt',
        '/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.significant.txt',
        '/projects/ps-yeolab3/bay001/maps/current_annotations/se/*.JunctionCountOnly.txt',
    )
)

In [21]:
def get_prefix(fn):
    """
    Returns the part of the file's base name that precedes the first
    '-SE.MATS' marker (the whole base name if the marker is absent).
    """
    base_name = os.path.split(fn)[1]
    head, _marker, _rest = base_name.partition('-SE.MATS')
    return head

In [54]:

def _count_derived_events(fn, tag):
    """
    Counts the rows of the file derived from ``fn`` by inserting ``.tag``
    before the extension (e.g. ``x.txt`` + ``'positive'`` -> ``x.positive.txt``).

    Returns None when that derived file does not exist.
    """
    # Only the trailing extension is rewritten, so a '.txt' occurring
    # elsewhere in the path is left untouched.
    root, ext = os.path.splitext(fn)
    derived = '{}.{}{}'.format(root, tag, ext)
    if not os.path.exists(derived):
        return None
    return pd.read_table(derived).shape[0]


def build_dictionary_of_files(all_original):
    """
    Tabulates, for every original event file, the number of rows in it and
    in each of its derived files (significant / positive / negative and the
    collapsed ``.nr`` variants).

    Parameters
    ----------
    all_original : list of str
        Paths to the original event tables. Derived files are looked up
        next to each one by inserting a tag before the file extension.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The DataFrame is built from a dict keyed by ``get_prefix(fn)``, so
        each experiment prefix is a column and each metric is a row.
        The list holds every original path that had no ``.significant``
        file; those are reported on stdout and counted as 0.
        Missing positive/negative/collapsed files are counted as 0 silently.
    """
    records = {}   # prefix -> metrics for that file
    to_check = []  # list of files that we have no events for
    progress = tnrange(len(all_original))
    try:
        for fn in all_original:
            original = pd.read_table(fn).shape[0]

            significant = _count_derived_events(fn, 'significant')
            if significant is None:
                print("{} has no significant events".format(fn))
                to_check.append(fn)
                significant = 0

            # `or 0` maps a missing file (None) to a zero count.
            records[get_prefix(fn)] = {
                'original_file': fn,
                'original_num': original,
                'significant': significant,
                'significant_positive': _count_derived_events(fn, 'positive') or 0,
                'significant_negative': _count_derived_events(fn, 'negative') or 0,
                'significant_positive_collapsed': _count_derived_events(fn, 'positive.nr') or 0,
                'significant_negative_collapsed': _count_derived_events(fn, 'negative.nr') or 0,
            }
            progress.update(1)
    finally:
        # Release the progress bar even if a read fails part-way through.
        progress.close()

    return pd.DataFrame(records), to_check

# df: per-experiment event counts (one column per prefix);
# to_check: original files for which no *.significant.txt was found.
df, to_check = build_dictionary_of_files(all_original)

/projects/ps-yeolab3/bay001/maps/current_annotations/se/CSTF2T-BGKLV13-K562-SE.MATS.JunctionCountOnly.txt has no significant events
/projects/ps-yeolab3/bay001/maps/current_annotations/se/RPL23A-BGHLV18-HepG2-SE.MATS.JunctionCountOnly.txt has no significant events
/projects/ps-yeolab3/bay001/maps/current_annotations/se/FASTKD2-BGKLV13-K562-SE.MATS.JunctionCountOnly.txt has no significant events
/projects/ps-yeolab3/bay001/maps/current_annotations/se/DDX28-BGKLV19-K562-SE.MATS.JunctionCountOnly.txt has no significant events
/projects/ps-yeolab3/bay001/maps/current_annotations/se/NUFIP2-BGKLV34-K562-SE.MATS.JunctionCountOnly.txt has no significant events
/projects/ps-yeolab3/bay001/maps/current_annotations/se/DDX52-BGKLV19-K562-SE.MATS.JunctionCountOnly.txt has no significant events
/projects/ps-yeolab3/bay001/maps/current_annotations/se/LSM11-BGKLV21-K562-SE.MATS.JunctionCountOnly.txt has no significant events
/projects/ps-yeolab3/bay001/maps/current_annotations/se/PA2G4-BGKLV19-K562-SE

In [55]:
# check to make sure the files with missing *.significant.txt actually have zero significant events.

In [56]:
def check_missing_sigevents(to_check, min_abs_dpsi=0.05, max_pvalue=0.05, max_fdr=0.1):
    """
    Verifies that every file in ``to_check`` contains zero significant events.

    A row counts as significant when its 'IncLevelDifference' is strictly
    above ``min_abs_dpsi`` or strictly below ``-min_abs_dpsi``, its 'PValue'
    is strictly below ``max_pvalue`` and its 'FDR' is strictly below
    ``max_fdr``.

    Parameters
    ----------
    to_check : list of str
        Paths to tab-separated tables with 'IncLevelDifference', 'PValue'
        and 'FDR' columns.
    min_abs_dpsi, max_pvalue, max_fdr : float
        Significance thresholds; the defaults are the values that were
        previously hard-coded.

    Raises
    ------
    AssertionError
        If any file has at least one significant row. The message names the
        file and the number of offending rows. Raised explicitly so the
        check still runs under ``python -O``.
    """
    for fn in to_check:
        events = pd.read_table(fn)
        large_change = (
            (events['IncLevelDifference'] > min_abs_dpsi)
            | (events['IncLevelDifference'] < -min_abs_dpsi)
        )
        confident = (events['PValue'] < max_pvalue) & (events['FDR'] < max_fdr)
        num_significant = int((large_change & confident).sum())
        if num_significant != 0:
            raise AssertionError(
                "{} has {} significant events but no significant-events file".format(
                    fn, num_significant
                )
            )
# Raises AssertionError if any file lacking *.significant.txt does in fact
# contain rows passing the significance filters.
check_missing_sigevents(to_check)

In [57]:
# Persist the event-count table as a tab-separated file.
df.to_csv('/projects/ps-yeolab3/bay001/gabe_qc_20170612/permanent_data/event_metrics.tsv', sep='\t')

In [58]:
# Display the event-count table (metrics as rows, experiment prefixes as columns).
df

Unnamed: 0,AARS-BGHLV17-HepG2,AARS-BGKLV21-K562,AATF-BGHLV14-HepG2,AATF-BGKLV13-K562,ABCF1-BGHLV30-HepG2,ABCF1-BGKLV29-K562,ACO1-BGHLV31-HepG2,ADAR-BGHLV20-HepG2,ADAR-BGKLV24-K562,AGGF1-BGKLV32-K562,...,XRCC5-BGKLV13-K562,XRCC6-BGHLV14-HepG2,XRCC6-BGKLV13-K562,XRN2-BGHLV17-HepG2,XRN2-BGKLV19-K562,YTHDC2-BGKLV28-K562,ZC3H8-BGKLV38-K562,ZNF622-BGHLV33-HepG2,ZRANB2-BGHLV14-HepG2,ZRANB2-BGKLV13-K562
original_file,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...,/projects/ps-yeolab3/bay001/maps/current_annot...
original_num,46311,39811,36460,44607,51669,57292,44126,35319,34620,56767,...,48579,42306,50290,44671,45838,49554,68868,48997,39549,43819
significant,17,78,11,5,151,104,243,32,33,2514,...,58,192,585,143,108,1416,446,275,36,46
significant_negative,8,32,3,2,74,38,121,8,16,2151,...,10,96,302,82,49,980,207,80,19,21
significant_negative_collapsed,7,30,3,2,60,33,90,7,12,1547,...,10,73,232,59,37,770,164,64,15,20
significant_positive,9,46,8,3,77,66,122,24,17,363,...,48,96,283,61,59,436,239,195,17,25
significant_positive_collapsed,6,40,6,3,65,51,99,21,11,288,...,35,76,209,48,46,350,189,139,12,18


In [59]:
# Spot check: load one original table so its significant-event count can be
# recomputed by hand and compared with the 'significant' row of df.
dx = pd.read_table('/projects/ps-yeolab3/bay001/maps/current_annotations/se/AARS-BGHLV17-HepG2-SE.MATS.JunctionCountOnly.txt')

In [62]:
# Keep only rows passing every significance filter at once:
# IncLevelDifference outside [-0.05, 0.05], PValue < 0.05 and FDR < 0.1.
dx = dx[
    ((dx['IncLevelDifference'] > 0.05) | (dx['IncLevelDifference'] < -.05))
    & (dx['PValue'] < 0.05)
    & (dx['FDR'] < 0.1)
]
# Number of surviving rows is the cell output.
len(dx)

17