# hm ok - can you think of some ways to put a quantitative number on this (besides ks test p value which can have issues)

- maybe median delta psi? or fold-enrichment in total number of incl / (spl + incl) reads (summing all events)? then scatter plot each dataset in CI vs atac and we see if there's a difference?

- it might be interesting to look at which splicing factors are on diagonal vs which are off


In [34]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import glob
from scipy import stats
from tqdm import tnrange, tqdm_notebook

import matplotlib
matplotlib.rcParams.update({'font.size': 18})
sns.set_style("whitegrid")

In [36]:
# Get all knockdown expts and controls
manifest_directory = '/projects/ps-yeolab3/encode/'
k562 = pd.read_table(os.path.join(
    manifest_directory,
    'k562_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt'
), index_col=0)
hepg2 = pd.read_table(os.path.join(
    manifest_directory,
    'hepg2_brenton-graveley_ambiguous_bams_for_integrated_analysis.txt'
), index_col=0)

manifest = pd.concat([k562, hepg2])
manifest.head()

Unnamed: 0,control_rep1,control_rep1_md5sum,control_rep2,control_rep2_md5sum,expt_rep1,expt_rep1_md5sum,expt_rep2,expt_rep2_md5sum,name
ENCSR000KYM,ENCFF309OOI.bam,006834b1e534abfc0a766464ce4f60d9,ENCFF209PLP.bam,b50208a4553de637758e93ceaa099569,ENCFF354JPP.bam,aa41d43a1f51b87c0691d0526f4f3ac5,ENCFF243AMZ.bam,dea667416a4d4c3c582637c2f9ad0f19,DDX3X
ENCSR000YYN,ENCFF695KEA.bam,14e5ef8ccef0d53859e5f67375f4c13f,ENCFF848JJM.bam,c04240081b7bb137e69d298837baa648,ENCFF065KHB.bam,84a492145ce5bf8f806aa2b7f9b30765,ENCFF640HPA.bam,7f4e51a76b95e8699789d5bb36c5df85,AKAP8
ENCSR004RGI,ENCFF098BEA.bam,d3be652a5058aacf2e9d95311bcba657,ENCFF178NNK.bam,7b1be9be800298f9f8958b2adc14090c,ENCFF459YMO.bam,22e6c3066888702577d180c635d0280b,ENCFF922IOC.bam,a2588d2a5a56edeb4f96ea6f4c17d4d9,RPS10
ENCSR007XKL,ENCFF985IXD.bam,2e7f51c61c5c545dfba6906772ac5716,ENCFF461TSD.bam,9e3ce9878e3352b44855f4a2d1c66d37,ENCFF878JKR.bam,4ee27925a7f0f3ce95964686da0a8927,ENCFF466OZP.bam,c69b58950871a86055b9f24744add8c6,NFX1
ENCSR023HWI,ENCFF804JHE.bam,f14e222eebd2030a3da0d086acd48bf7,ENCFF819ORB.bam,14d4a76c9237f92eff0e40f19ef4ffcb,ENCFF656QEN.bam,f409a646dd82e6b47c88869cd1921a55,ENCFF738SSD.bam,83fe79368dde05a9405107b333b0eca3,KHDRBS1


In [41]:
def get_avg(row):
    n1 = row['dpsi_x']
    n2 = row['dpsi_y']
    if np.isnan(n1):
        return n2
    elif np.isnan(n2):
        return n1
    else:
        return (n1 + n2) / 2.0

def get_psi(fn):
    """
    returns the percent spliced in (inc/(inc+exc)) for a dataset.
    """
    names = ['exc_count', 'inc_count', 'dpsi']
    df = pd.read_table(fn, sep='\t', index_col=0, names=names).drop_duplicates()
    sum_exc = df['exc_count'].sum()
    sum_inc = df['inc_count'].sum()
    psi = sum_inc / float(sum_exc + sum_inc)
    return psi

def get_fold_enrichment(kd, ctrl):
    """
    Given two datasets, calculate the fold enrichment 
    """
    kd_psi = get_psi(kd)
    ctrl_psi = get_psi(ctrl)
    return kd_psi/ctrl_psi

def get_all_required_files(expt_id, jxc_dir, df=manifest):
    """
    For each experiment ID, return: distributions of 
    the average dpsi vals for both reps for each condition
    """
    datapoints = []
    
    sub = df.ix[expt_id]
    ciri_prefix = '.primary.namesort.bam.CIandRI.psi'
    atac_prefix = '.primary.namesort.bam.ATAC.psi'
    
    ciri_expt_files = [
        os.path.join(jxc_dir, (sub['expt_rep1'] + ciri_prefix)), 
        os.path.join(jxc_dir, (sub['expt_rep2'] + ciri_prefix))
    ]
    ciri_ctrl_files = [
        os.path.join(jxc_dir, (sub['control_rep1'] + ciri_prefix)), 
        os.path.join(jxc_dir, (sub['control_rep2'] + ciri_prefix))
    ]
    
    atac_expt_files = [
        os.path.join(jxc_dir, (sub['expt_rep1'] + atac_prefix)), 
        os.path.join(jxc_dir, (sub['expt_rep2'] + atac_prefix))
    ]
    atac_ctrl_files = [
        os.path.join(jxc_dir, (sub['control_rep1'] + atac_prefix)), 
        os.path.join(jxc_dir, (sub['control_rep2'] + atac_prefix))
    ]
    try:
        ciri_r1_foldenr = get_fold_enrichment(ciri_expt_files[0], ciri_ctrl_files[0])
        ciri_r2_foldenr = get_fold_enrichment(ciri_expt_files[1], ciri_ctrl_files[1])
        atac_r1_foldenr = get_fold_enrichment(atac_expt_files[0], atac_ctrl_files[0])
        atac_r2_foldenr = get_fold_enrichment(atac_expt_files[1], atac_ctrl_files[1])
    
        datapoints.append([ciri_r1_foldenr,atac_r1_foldenr])
        datapoints.append([ciri_r2_foldenr,atac_r2_foldenr])
    except KeyError:
        return []
    return datapoints

In [42]:
jxc_dir = '/home/bay001/projects/encode/analysis/atac_intron_analysis/jxc_from_eric'

progress = tnrange(len(k562.index))
for index in k562.index:
    rbp = k562.ix[index]['name']
    datapoints = get_all_required_files(index,jxc_dir)
    print(rbp, datapoints)
    progress.update(1)

('DDX3X', [])
('AKAP8', [])
('RPS10', [])
('NFX1', [])
('KHDRBS1', [])
('DDX51', [])
('HNRNPUL1', [])
('EIF4G2', [])
('SUB1', [])
('PABPC4', [])
('HNRNPU', [])
('SF3B1', [])
('HNRNPA1', [])
('CIRBP', [])
('TROVE2', [])
('SRSF1', [])
('DDX24', [])
('EIF2S2', [])
('ESF1', [])
('TBRG4', [])
('DNAJC21', [])
('SF3B4', [])
('STIP1', [])
('EIF3H', [])
('RPS19', [])
('SUCLG1', [])
('SLBP', [])
('SRSF9', [])
('CNOT7', [])
('EFTUD2', [])
('PUM2', [])
('DDX6', [])
('ILF2', [])
('SMN1', [])
('TARDBP', [])
('PRPF8', [])
('SRSF7', [])
('EIF3G', [])
('RBM25', [])
('TRIP6', [])
('DDX47', [])
('ADAR', [])
('WRN', [])
('UTP18', [])
('MSI2', [])
('EEF2', [])
('GTF2F1', [])
('PNPT1', [])
('SUGP2', [])
('PABPC1', [])
('NELFE', [])
('DDX28', [])
('DDX1', [])
('RRP9', [])
('SBDS', [])
('HSPD1', [])
('MBNL1', [])
('CPSF7', [])
('NOL12', [])
('ATP5C1', [])
('XRCC6', [])
('SND1', [])
('SLTM', [])
('KRR1', [])
('G3BP2', [])
('UPF1', [])
('ASCC1', [])
('QKI', [])
('EIF3A', [])
('BUD13', [])
('AGO1', [])
('ILF3', 

In [27]:
get_psi('/home/bay001/projects/encode/analysis/atac_intron_analysis/jxc_from_eric/ENCFF074LFN.bam.primary.namesort.bam.CIandRI.psi')

0.045514557231856753

In [26]:
pd.read_table("/home/bay001/projects/encode/analysis/atac_intron_analysis/jxc_from_eric/ENCFF074LFN.bam.primary.namesort.bam.CIandRI.psi",
             names=['exc_count', 'inc_count', 'dpsi'], index_col=0).drop_duplicates()['exc_count'].sum()

694035

In [28]:
pd.read_table("/home/bay001/projects/encode/analysis/atac_intron_analysis/jxc_from_eric/ENCFF074LFN.bam.primary.namesort.bam.CIandRI.psi",
             names=['exc_count', 'inc_count', 'dpsi'], index_col=0).drop_duplicates()['inc_count'].sum()

33095