In [145]:
import pandas as pd
# Show 12 decimal places when rendering floats. The fully qualified key is used
# because the bare 'precision' alias is rejected by pandas >= 2.0.
pd.set_option('display.precision', 12)

def bottom_top_values_from_dataframe(df, bottom_percent=0.5, top_percent=0.5):
    """
    Compute, for every column of a dataframe, the value that bounds the
    bottom ``bottom_percent`` % and the value that bounds the top
    ``top_percent`` % of its non-NaN entries. This helps with the
    "permutation" background significance calculations.

    For a column with ``n`` non-NaN values sorted ascending and
    ``k = int(percent * 0.01 * n)``:

    * the bottom value is the ``k``-th smallest entry (position ``k - 1``),
    * the top value is the ``k``-th largest entry (position ``n - k``).

    When ``k`` is 0 (the column is too short for the requested percentage)
    the column minimum / maximum is returned instead, and positions are
    always clamped to the valid range ``[0, n - 1]``.

    Parameters
    ----------
    df : pandas.DataFrame
        One cutoff pair is computed per column; NaNs are ignored.
    bottom_percent : float
        Size of the lower tail, expressed in percent
        (the bottom 25% would be bottom_percent=25).
    top_percent : float
        Size of the upper tail, expressed in percent
        (the top 25% would be top_percent=25).

    Returns
    -------
    bottom_values : list
        Lower cutoff for each column, in column order. NaN for a column
        that has no non-NaN values.
    top_values : list
        Upper cutoff for each column, in column order. NaN for a column
        that has no non-NaN values.
    """
    # Percent -> fraction; identical for every column, so computed once.
    bottom_fraction = bottom_percent * 0.01
    top_fraction = top_percent * 0.01

    bottom_values = []
    top_values = []
    # DataFrame.items() is the supported spelling; iteritems() was removed
    # in pandas 2.0.
    for _, column in df.items():
        ordered = column.dropna().sort_values()
        nums = len(ordered)
        if nums == 0:
            # Nothing to rank: keep the output aligned with the columns.
            bottom_values.append(float('nan'))
            top_values.append(float('nan'))
            continue

        # Number of observations that fall inside each tail.
        bottom_subset = int(bottom_fraction * nums)
        top_subset = int(top_fraction * nums)

        # Clamp to [0, nums - 1]. Without this, an empty tail made the bottom
        # position -1 (silently returning the maximum) and the top position
        # nums (IndexError).
        bottom_pos = min(max(bottom_subset - 1, 0), nums - 1)
        top_pos = min(max(nums - top_subset, 0), nums - 1)

        bottom_values.append(ordered.iloc[bottom_pos])
        top_values.append(ordered.iloc[top_pos])
    return bottom_values, top_values

In [146]:
# Read the tab-separated table, using its first column as the row index,
# then preview the first rows.
df = pd.read_csv(
    '/projects/ps-yeolab3/bay001/maps/current/se/204_02_RBFOX2.merged.r2.4.svg.HepG2_native_cassette_exons_all.RBFOX2-BGHLV26-HepG2.set26-excluded-upon-knockdown.randsample.tsv',
    sep='\t',
    index_col=0,
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1390,1391,1392,1393,1394,1395,1396,1397,1398,1399
0,-0.000210021219,-0.000197580574,-0.000197749473,-0.00021658278,-0.000213546886,-0.00020411458,-0.000189650647,-0.000173401362,-0.000193566749,-0.000189297243,...,-0.000133529179,-0.000136455805,-0.000151136394,-0.000155321583,-0.000161276411,-0.000167879232,-0.000170630101,-0.000176285963,-0.000169225116,-0.000165713204
1,-6.2421888e-05,-6.9836949e-05,-7.8507049e-05,-7.9916568e-05,-8.2928043e-05,-7.0179228e-05,-7.4009443e-05,-5.9729124e-05,-5.5511151e-05,-5.7981977e-05,...,-0.000139558986,-0.000128763212,-0.000147852213,-0.000148030381,-0.000146443039,-0.000153706052,-0.000147209515,-0.000141453701,-0.000146171962,-0.00014612456
2,-0.000297883066,-0.000288707554,-0.000292559135,-0.000278691544,-0.000290259996,-0.000277611073,-0.000271715259,-0.000264338383,-0.000263080314,-0.000257676741,...,-0.000108355567,-0.000118077833,-0.000131665155,-0.000134246412,-0.000129168305,-0.000142628607,-0.000156388628,-0.000166886871,-0.000155908899,-0.000163215186
3,-0.000139808668,-0.000159049036,-0.000160517079,-0.000167431531,-0.000162197579,-0.000136909349,-0.000130166534,-0.000131373144,-0.000131832916,-0.000130534804,...,-0.000309978002,-0.000308542542,-0.000325605998,-0.000315044653,-0.000323661248,-0.000320364569,-0.000321744635,-0.00031402274,-0.000309628566,-0.00030958731
4,-0.000143416541,-0.000133673918,-0.000129838541,-0.000114999433,-0.00011893102,-0.000102051503,-0.000101275117,-9.4165798e-05,-0.000104348531,-0.000103985672,...,-6.1438711e-05,-6.0525065e-05,-7.0294382e-05,-7.807966e-05,-6.8739334e-05,-7.853912e-05,-7.1068417e-05,-6.4767594e-05,-6.344466e-05,-6.5492782e-05


In [147]:
# Column "0" on its own, ordered from its largest value to its smallest.
df.loc[:, ["0"]].sort_values(by="0", ascending=False)

Unnamed: 0,0
173,-0.000005873441
481,-0.000013256499
890,-0.000014839813
775,-0.000023198402
580,-0.000034736815
543,-0.000036552002
171,-0.000038992428
438,-0.000039081432
639,-0.000039183978
665,-0.000039776278


In [139]:
# Per-column lower and upper cutoffs, using the 0.5% tails (the function's
# defaults, spelled out here for the reader).
bottom, top = bottom_top_values_from_dataframe(
    df, bottom_percent=0.5, top_percent=0.5
)

995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995
995


In [148]:
# Display the list of per-column lower cutoffs returned by
# bottom_top_values_from_dataframe (one entry per dataframe column).
bottom

[-0.00032539457502100001,
 -0.00032548518688800001,
 -0.00032586756968299997,
 -0.00031840106258300004,
 -0.00031965764830799994,
 -0.00031079625200499999,
 -0.00032129441243,
 -0.00030660972043500002,
 -0.00030439279460199998,
 -0.00031630933457200007,
 -0.00032535672239700003,
 -0.00031781548202300001,
 -0.00031813670222000004,
 -0.00032808573554100006,
 -0.00031852364961400004,
 -0.00032496862369800001,
 -0.00031788720461199997,
 -0.000321206459176,
 -0.00032719666083499998,
 -0.000320710853141,
 -0.000317742853743,
 -0.00031192745138700003,
 -0.00032104929745899999,
 -0.00031225444116599998,
 -0.00031430030940700002,
 -0.00031749154836299996,
 -0.00032120991230799996,
 -0.00033145327914399996,
 -0.00032339027755000002,
 -0.00032218505928500004,
 -0.00032917688407300006,
 -0.00033233368290499994,
 -0.00033340090829799999,
 -0.00031773012897699999,
 -0.00032334846570599996,
 -0.00032744032561099999,
 -0.00033695324722799998,
 -0.00032385566534900003,
 -0.00032878739472500006,
 -0.000

In [60]:
# Read the tab-separated table with the default integer row index,
# then preview the first rows.
df = pd.read_csv(
    '/projects/ps-yeolab3/bay001/maps/current_annotations/a3ss/IGF2BP1-BGHcLV03-HepG2.BGHcLV03.A3SS.MATS.JunctionCountOnly.txt',
    sep='\t',
)
df.head()

Unnamed: 0,ID,GeneID,geneSymbol,chr,strand,longExonStart_0base,longExonEnd,shortES,shortEE,flankingES,...,SJC_SAMPLE_1,IJC_SAMPLE_2,SJC_SAMPLE_2,IncFormLen,SkipFormLen,PValue,FDR,IncLevel1,IncLevel2,IncLevelDifference
0,12788,ENSG00000113924.7,HGD,chr3,-,120363218,120363466,120363218,120363290,120365113,...,221267,3578,193421,197,99,1.2e-05,0.208408,"0.162,0.162","0.084,0.085",0.077
1,14860,ENSG00000165792.13,METTL17,chr14,+,21460250,21460364,21460282,21460364,21458622,...,108170,44140,144345,130,99,2.3e-05,0.210563,"0.358,0.361","0.189,0.236",0.147
2,11231,ENSG00000173531.11,MST1,chr3,-,49722444,49722558,49722444,49722522,49722694,...,231347,2562,79172,134,99,6.6e-05,0.23712,"0.123,0.089","0.189,0.21",-0.094
3,21978,ENSG00000100911.9,PSME2,chr14,-,24614918,24615105,24614918,24614981,24615416,...,616712,162367,6381482,197,99,5.7e-05,0.23712,"0.167,0.176","0.113,0.111",0.059
4,2268,ENSG00000101150.13,TPD52L2,chr20,+,62514071,62514173,62514102,62514173,62505020,...,93,134291,0,129,99,5.2e-05,0.23712,"0.9,0.972","1.0,1.0",-0.064


In [62]:
# Keep only rows that satisfy all three thresholds at once:
# PValue <= 0.05, FDR <= 0.1 and IncLevelDifference < 0.05.
df = df.loc[
    df['PValue'].le(0.05)
    & df['FDR'].le(0.1)
    & df['IncLevelDifference'].lt(0.05)
]
df

Unnamed: 0,ID,GeneID,geneSymbol,chr,strand,longExonStart_0base,longExonEnd,shortES,shortEE,flankingES,...,SJC_SAMPLE_1,IJC_SAMPLE_2,SJC_SAMPLE_2,IncFormLen,SkipFormLen,PValue,FDR,IncLevel1,IncLevel2,IncLevelDifference
