### This script tests whether or not we are getting the right splicing calls from an rMATS directory. For each input line (uID + rep1/2), the map software searches a predefined rMATS directory named [knockdown]_vs_[control] for the file 'MATS_output/SE.MATS.JunctionCountOnly.txt' and returns the events (in MISO format) for plotting.

In [10]:
import pandas as pd
import numpy as np

In [11]:
# Event tables written by the rbp map software (all / excluded / included).
# Each file is read with its first two lines skipped and the two columns
# labelled 'miso' and 'gene'.

names = ['miso', 'gene']
event_files = (
    '/projects/ps-yeolab3/bay001/maps/se/all_data/7-18-2016/204_01_RBFOX2.allRMATS_all_genes.temp',
    '/projects/ps-yeolab3/bay001/maps/se/all_data/7-18-2016/204_01_RBFOX2.excluded_all_genes.temp',
    '/projects/ps-yeolab3/bay001/maps/se/all_data/7-18-2016/204_01_RBFOX2.included_all_genes.temp',
)
all_events, excluded_events, included_events = [
    pd.read_table(event_file, skiprows=2, names=names)
    for event_file in event_files
]


In [29]:
# Starting over from the manifest: check that the uID resolves to the
# expected ENCODE accessions for the control and knockdown RNA-seq datasets.

uid = 204
manifest = pd.read_table(
    '/home/gpratt/Dropbox/encode_integration/20160408_ENCODE_MASTER_ID_LIST_AllDatasets.csv',
    sep='\t',
    dtype={'uID': str},  # uID is read as text, so compare against str(uid)
)
uid_rows = manifest[manifest['uID'] == str(uid)]
# First matching manifest row supplies both accessions.
control = list(uid_rows['RNASEQ_ControlENC'])[0]
rbp = list(uid_rows['RNASEQ_ENCODEAccID'])[0]
print("control: {}".format(control))
print("RBP: {}".format(rbp))
uid_rows


control: ENCSR104ABF
RBP: ENCSR767LLP


Unnamed: 0.1,Unnamed: 0,uID,RBP_gID,CellLine,CLIP_rep1,CLIP_rep2,INPUT,RNASEQ_ControlRep1Bam,RNASEQ_ControlRep2Bam,RNASEQ_ControlENC,RNASEQ_KDRep1Bam,RNASEQ_KDRep2Bam,RNASEQ_ENCODEAccID,selection_method,RBP_ENSG
2,2,204,RBFOX2,HepG2,/projects/ps-yeolab2/encode/analysis/encode_v1...,/projects/ps-yeolab2/encode/analysis/encode_v1...,/projects/ps-yeolab2/encode/analysis/encode_v1...,ENCFF893QHC.bam,ENCFF988VWE.bam,ENCSR104ABF,ENCFF946VPZ.bam,ENCFF347ERZ.bam,ENCSR767LLP,polyadenylated mRNA,ENSG00000100320.18


In [51]:
# Open splice file and check whether or not these events match what we got:
import os
def rmats_to_miso(row):
    """Build a MISO-style skipped-exon identifier from one rMATS SE row.

    The identifier is three ``chr:start:end:strand`` exon fields joined by
    ``@`` with the skipped exon in the middle.  For ``+`` strand rows the
    upstream exon is written first and the downstream exon last; for any
    other strand value the two flanking exons are swapped.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys ``chr``, ``strand``,
        ``exonStart_0base``, ``exonEnd``, ``upstreamES``, ``upstreamEE``,
        ``downstreamES`` and ``downstreamEE`` (e.g. a DataFrame row).

    Returns
    -------
    str
        ``exon@exon@exon`` identifier.
    """
    flanking = [('upstreamES', 'upstreamEE'), ('downstreamES', 'downstreamEE')]
    if row['strand'] != '+':
        # Non-plus strand: the downstream exon leads and the upstream exon trails.
        flanking.reverse()
    leading, trailing = flanking
    exon_columns = (leading, ('exonStart_0base', 'exonEnd'), trailing)
    return '@'.join(
        '{}:{}:{}:{}'.format(row['chr'], row[start], row[end], row['strand'])
        for start, end in exon_columns
    )
# Thresholds (inclusive on both sides: |IncLevelDifference| >= inc, FDR <= fdr).
fdr = 0.05  # default
inc = 0.0  # default

# Locate the junction-count-only SE table inside the [knockdown]_vs_[control] directory.
rmats_dir = '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/'
rbfox2 = "{}_vs_{}".format(rbp, control)
junctions_only_file = os.path.join(
    rmats_dir, rbfox2, 'MATS_output/SE.MATS.JunctionCountOnly.txt'
)

splice = pd.read_table(
    junctions_only_file,
    dtype={'IncLevelDifference': float, 'FDR': float},
)
passes_inc = splice['IncLevelDifference'].abs() >= inc
passes_fdr = splice['FDR'] <= fdr
splice = splice[passes_inc & passes_fdr]

# One MISO-style identifier per remaining event.
splice['miso'] = splice.apply(rmats_to_miso, axis=1)
print(splice.shape)
splice.head()

(262, 24)


Unnamed: 0,ID,GeneID,geneSymbol,chr,strand,exonStart_0base,exonEnd,upstreamES,upstreamEE,downstreamES,...,IJC_SAMPLE_2,SJC_SAMPLE_2,IncFormLen,SkipFormLen,PValue,FDR,IncLevel1,IncLevel2,IncLevelDifference,miso
0,12823,ENSG00000131504.11,DIAPH1,chr5,-,140967790,140967817,140966608,140966764,140998364,...,70,7429,126,100,0,0,"0.914,0.76","0.07,0.0",0.802,chr5:140998364:140998566:-@chr5:140967790:1409...
1,1579,ENSG00000122591.7,FAM126A,chr7,-,22986570,22986866,22980887,22985782,22999874,...,17,2657,199,100,0,0,"0.616,0.622","0.019,0.058",0.581,chr7:22999874:23000034:-@chr7:22986570:2298686...
2,16290,ENSG00000085733.11,CTTN,chr11,+,70267575,70267642,70266328,70266616,70269045,...,9779,152150,166,100,0,0,"0.701,0.642","0.278,0.241",0.412,chr11:70266328:70266616:+@chr11:70267575:70267...
3,16299,ENSG00000085733.11,CTTN,chr11,+,70267575,70267686,70266505,70266616,70269045,...,462329,152150,199,100,0,0,"0.896,0.831","0.604,0.524",0.299,chr11:70266505:70266616:+@chr11:70267575:70267...
4,19274,ENSG00000111206.8,FOXM1,chr12,-,2974520,2974565,2973848,2973918,2975558,...,4367,399520,144,100,0,0,"0.371,0.405","0.07,0.082",0.312,chr12:2975558:2975687:-@chr12:2974520:2974565:...


In [62]:
# This is testing against emily's function: 
import os
def make_triplet_annotations(mats_output_file, output_filename, Inclevel, fdr, pval):
    """Filter an rMATS skipped-exon table and tag each event with a MISO-style ID.

    Rows are kept when ``|IncLevelDifference| > Inclevel``, ``FDR < fdr`` and
    ``PValue < pval`` (all strict).  Each kept row gets a ``miso`` column of
    the form ``exon@exon@exon`` where every exon is ``chr:start:end:strand``
    and the skipped exon is in the middle.  On the ``+`` strand the upstream
    exon comes first; on the ``-`` strand the downstream exon comes first.

    The number of kept events with positive and with negative
    ``IncLevelDifference`` is printed.

    Parameters
    ----------
    mats_output_file : str
        Path of the tab-separated rMATS table to read.
    output_filename : str
        Unused; kept so existing callers keep working.  Nothing is written
        to disk.
    Inclevel : float
        Exclusive lower bound on ``abs(IncLevelDifference)``.
    fdr : float
        Exclusive upper bound on ``FDR``.
    pval : float
        Exclusive upper bound on ``PValue``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, re-indexed from 0, with the previous row labels in
        an ``index`` column and the identifiers in a ``miso`` column.

    Raises
    ------
    ValueError
        If a kept row has a strand other than ``'+'`` or ``'-'``.
    """
    df = pd.read_table(mats_output_file)
    significant = ((df['IncLevelDifference'].abs() > Inclevel) &
                   (df['FDR'] < fdr) &
                   (df['PValue'] < pval))
    # reset_index() returns a new frame, so adding 'miso' below writes to an
    # independent object instead of a slice of ``df`` (which made pandas emit
    # SettingWithCopyWarning).  The old row labels land in an 'index' column.
    df_clean = df.loc[significant].reset_index()

    strand = df_clean['strand']
    unsupported = ~strand.isin(['+', '-'])
    if unsupported.any():
        # Without this check an unexpected strand would either fail with a
        # NameError or silently reuse exon strings from another row.
        raise ValueError(
            'unsupported strand value(s): {}'.format(
                strand[unsupported].unique().tolist()))

    def _exon(start_col, end_col):
        # 'chr:start:end:strand' for every row at once.
        return (df_clean['chr'].astype(str) + ':' +
                df_clean[start_col].astype(str) + ':' +
                df_clean[end_col].astype(str) + ':' +
                strand.astype(str))

    upstream = _exon('upstreamES', 'upstreamEE')
    skipped = _exon('exonStart_0base', 'exonEnd')
    downstream = _exon('downstreamES', 'downstreamEE')

    # Minus-strand events list the downstream exon first and the upstream last.
    is_plus = strand == '+'
    first = upstream.where(is_plus, downstream)
    last = downstream.where(is_plus, upstream)
    df_clean['miso'] = first + '@' + skipped + '@' + last  # changed from triplet to miso

    n_wt = int((df_clean['IncLevelDifference'] > 0).sum())
    n_mut = int((df_clean['IncLevelDifference'] < 0).sum())
    # Single-argument print() behaves the same under Python 2 and Python 3.
    print('more included in wt = ' + str(n_wt))
    print('more included in mut = ' + str(n_mut))

    # Return the full annotated table rather than writing wt/mut files.
    return df_clean

# Same rMATS table as above, this time run through emily's function.
rmats_dir = '/home/gpratt/projects/encode/analysis/ad-hoc/rMATS/'
rbfox2 = "{}_vs_{}".format(rbp, control)
mats_output_file = os.path.join(
    rmats_dir, rbfox2, 'MATS_output/SE.MATS.JunctionCountOnly.txt'
)

# Filter thresholds (make_triplet_annotations applies them as strict inequalities).
fdr = 0.05  # default
Inclevel = 0.0  # default
pval = 1  # we don't really need this; only rows with PValue < 1 are kept

# Required by the signature, but make_triplet_annotations does not write any files.
output_filename = '/home/bay001/projects/maps_20160420/data/emily_rmats_to_miso_test.txt'

emily_df = make_triplet_annotations(
    mats_output_file=mats_output_file,
    output_filename=output_filename,
    Inclevel=Inclevel,
    fdr=fdr,
    pval=pval,
)
emily_df.head()

more included in wt = 153
more included in mut = 109


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,index,ID,GeneID,geneSymbol,chr,strand,exonStart_0base,exonEnd,upstreamES,upstreamEE,...,IJC_SAMPLE_2,SJC_SAMPLE_2,IncFormLen,SkipFormLen,PValue,FDR,IncLevel1,IncLevel2,IncLevelDifference,miso
0,0,12823,ENSG00000131504.11,DIAPH1,chr5,-,140967790,140967817,140966608,140966764,...,70,7429,126,100,0,0,"0.914,0.76","0.07,0.0",0.802,chr5:140998364:140998566:-@chr5:140967790:1409...
1,1,1579,ENSG00000122591.7,FAM126A,chr7,-,22986570,22986866,22980887,22985782,...,17,2657,199,100,0,0,"0.616,0.622","0.019,0.058",0.581,chr7:22999874:23000034:-@chr7:22986570:2298686...
2,2,16290,ENSG00000085733.11,CTTN,chr11,+,70267575,70267642,70266328,70266616,...,9779,152150,166,100,0,0,"0.701,0.642","0.278,0.241",0.412,chr11:70266328:70266616:+@chr11:70267575:70267...
3,3,16299,ENSG00000085733.11,CTTN,chr11,+,70267575,70267686,70266505,70266616,...,462329,152150,199,100,0,0,"0.896,0.831","0.604,0.524",0.299,chr11:70266505:70266616:+@chr11:70267575:70267...
4,4,19274,ENSG00000111206.8,FOXM1,chr12,-,2974520,2974565,2973848,2973918,...,4367,399520,144,100,0,0,"0.371,0.405","0.07,0.082",0.312,chr12:2975558:2975687:-@chr12:2974520:2974565:...


In [63]:
# emily_df['miso'] = identifiers from emily's function
# splice['miso']   = identifiers from rmats_to_miso
# They should match: left-merge on 'miso' and collect the positions of merged
# rows that contain any null value (e.g. an emily_df event with no partner
# in splice).
print(emily_df.shape)
print(splice.shape)
both = pd.merge(emily_df, splice, on=['miso'], how="left")
# Series.nonzero() and the positional axis argument of DataFrame.any() are
# gone from current pandas; np.flatnonzero on the boolean values yields the
# same integer row positions on old and new versions alike.
inds = np.flatnonzero(pd.isnull(both).any(axis=1).values)

(262, 25)
(262, 24)


In [69]:
# Reload the unfiltered rMATS table and display it without the identifier
# columns.  drop() is not in-place here, so x itself keeps every column.
hidden_columns = ['ID', 'geneSymbol', 'chr', 'strand']
x = pd.read_table(mats_output_file)
x.drop(hidden_columns, axis=1)

Unnamed: 0,GeneID,exonStart_0base,exonEnd,upstreamES,upstreamEE,downstreamES,downstreamEE,ID.1,IJC_SAMPLE_1,SJC_SAMPLE_1,IJC_SAMPLE_2,SJC_SAMPLE_2,IncFormLen,SkipFormLen,PValue,FDR,IncLevel1,IncLevel2,IncLevelDifference
0,ENSG00000131504.11,140967790,140967817,140966608,140966764,140998364,140998566,12823,4024,36,70,7429,126,100,0.000000e+00,0.000000e+00,"0.914,0.76","0.07,0.0",0.802
1,ENSG00000122591.7,22986570,22986866,22980887,22985782,22999874,23000034,1579,5159,1618,17,2657,199,100,0.000000e+00,0.000000e+00,"0.616,0.622","0.019,0.058",0.581
2,ENSG00000085733.11,70267575,70267642,70266328,70266616,70269045,70269101,16290,78128,2043,9779,152150,166,100,0.000000e+00,0.000000e+00,"0.701,0.642","0.278,0.241",0.412
3,ENSG00000085733.11,70267575,70267686,70266505,70266616,70269045,70269101,16299,342422,2043,462329,152150,199,100,0.000000e+00,0.000000e+00,"0.896,0.831","0.604,0.524",0.299
4,ENSG00000111206.8,2974520,2974565,2973848,2973918,2975558,2975687,19274,159295,187301,4367,399520,144,100,0.000000e+00,0.000000e+00,"0.371,0.405","0.07,0.082",0.312
5,ENSG00000136153.15,76383289,76383319,76378424,76378677,76391296,76391414,22083,88106,1048,1517,82126,129,100,0.000000e+00,0.000000e+00,"0.872,0.631","0.124,0.095",0.642
6,ENSG00000173905.4,167758573,167758657,167754623,167754782,167759179,167759262,22110,91162,133,1732,5257,183,100,0.000000e+00,0.000000e+00,"0.793,0.967","0.152,0.235",0.686
7,ENSG00000221995.4,27412621,27412666,27409333,27409456,27413455,27413595,22448,17360,8417,1310,251102,144,100,0.000000e+00,0.000000e+00,"0.589,0.71","0.035,0.064",0.600
8,ENSG00000159023.14,29386933,29386996,29379615,29379824,29391493,29391670,2287,104209,4052,1512,132119,162,100,0.000000e+00,0.000000e+00,"0.616,0.713","0.066,0.059",0.602
9,ENSG00000154380.12,225692692,225692755,225688693,225688772,225695652,225695719,26594,133241,43136,419,106148,162,100,0.000000e+00,0.000000e+00,"0.656,0.522","0.023,0.073",0.541
