In [1]:
from __future__ import print_function

%load_ext autoreload
%autoreload 2


import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

sns.set(style='ticks', context='talk', rc={'font.sans-serif':'Arial', 'pdf.fonttype': 42})

%matplotlib inline


import flotilla
# Root directory holding the saved flotilla datapackages.
flotilla_dir = '/projects/ps-yeolab/obotvinnik/flotilla_projects'

# Load the previously saved study (datapackage name is the first argument).
study = flotilla.embark('singlecell_pnm_figure2_modalities_bayesian', flotilla_dir=flotilla_dir)

# Labels present in the `singles` splicing table but absent from the outlier table.
not_outliers = study.splicing.singles.index.difference(study.splicing.outliers.index)

# `.ix` is deprecated (and removed in pandas >= 1.0). `not_outliers` is a set of
# index *labels* taken from this very table, so `.loc` is the exact equivalent.
psi = study.splicing.singles.loc[not_outliers]
grouped = psi.groupby(study.sample_id_to_phenotype)

# Within each phenotype group, keep only the columns that have at least 20
# non-null values (presumably columns are splicing events -- TODO confirm).
psi_filtered = grouped.apply(lambda x: x.dropna(axis=1, thresh=20))

2017-01-13 10:54:34	Reading datapackage from /projects/ps-yeolab/obotvinnik/flotilla_projects/singlecell_pnm_figure2_modalities_bayesian/datapackage.json
2017-01-13 10:54:34	Parsing datapackage to create a Study object
https://s3-us-west-2.amazonaws.com/flotilla-projects/ercc/ERCC_Controls.txt has not been downloaded before.
	Downloading now to /home/obotvinnik/flotilla_projects/hg19/ERCC_Controls.txt


Error loading species hg19 data: HTTP Error 404: Not Found

2017-01-13 10:55:24 	Initializing Study
2017-01-13 10:55:24 	Initializing Predictor configuration manager for Study
2017-01-13 10:55:24	Predictor ExtraTreesClassifier is of type <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>
2017-01-13 10:55:24	Added ExtraTreesClassifier to default predictors
2017-01-13 10:55:24	Predictor ExtraTreesRegressor is of type <class 'sklearn.ensemble.forest.ExtraTreesRegressor'>
2017-01-13 10:55:24	Added ExtraTreesRegressor to default predictors
2017-01-13 10:55:24	Predictor GradientBoostingClassifier is of type <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>
2017-01-13 10:55:24	Added GradientBoostingClassifier to default predictors
2017-01-13 10:55:24	Predictor GradientBoostingRegressor is of type <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>
2017-01-13 10:55:24	Added GradientBoostingRegressor to default predictors
2017-01-13 10:55:24 	Loading metadata
2017-01-13 10:55:24	Loading expression data
2017-0

In [2]:
# study.splicing.may

In [3]:
# folder = '/home/obotvinnik/Dropbox/figures2/singlecell_pnm/figure2_modalities/bayesian'
# !mkdir $folder

figure_folder = 'figures/030_modality_kmer_z_scores'
! mkdir -p $figure_folder

In [4]:
# Number of rows in the tidy modality table for every (phenotype, modality) pair.
(study.supplemental.modalities_tidy
      .groupby(['phenotype', 'modality'])
      .size())

phenotype  modality     
MN         bimodal          1235
           excluded         2216
           included         3137
           uncategorized       9
NPC        bimodal          1910
           excluded         2970
           included         4514
           uncategorized      20
iPSC       bimodal          2679
           excluded         3725
           included         6182
           middle              1
           uncategorized      74
dtype: int64


## Background = phenotype events

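Use all events assigned a modality within that cell type (every modality, the foreground one included) as background.

  - What properties are unique to events that are bimodal in iPSC, rather than excluded (~0) or included (~1)?
    - Foreground: Bimodal events in iPSC
    - Background: All events in iPSC (note that the bimodal events themselves are part of this set, because the background is built from every event of the phenotype)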

In [5]:
# Root folder of the per-exon sequence feature tables used below.
splicing_feature_folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data'

# One sub-folder per exon class, both directly under the root folder.
alternative_folder, constitutive_folder = (
    template.format(splicing_feature_folder)
    for template in ('{}/alternative', '{}/constitutive')
)

In [6]:
ls /projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data/alternative/exons_upstream200nt*

/projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data/alternative/exons_upstream200nt.bed
/projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data/alternative/exons_upstream200nt_kmers.csv


In [7]:
# File-name suffixes selecting which k-mer table to read ('exons{suffix}_kmers.csv').
# The empty suffix selects the plain 'exons_kmers.csv' table.
suffixes = ('', '_upstream200nt', '_downstream200nt')

# Exon class -> folder holding that class's k-mer tables.
exon_types = {
    'alternative': alternative_folder,
    'constitutive': constitutive_folder,
}

In [8]:
# (exon_type, suffix) -> k-mer table read from '<folder>/exons<suffix>_kmers.csv'.
# The first CSV column becomes the index of each table.
kmers = {}

for exon_type, folder in exon_types.items():
    print(exon_type)
    for suffix in suffixes:
        print('\tsuffix:', suffix)
        csv_path = '{}/exons{}_kmers.csv'.format(folder, suffix)
        kmer_table = pd.read_csv(csv_path, index_col=0)
        # Report the table size as a quick check that the file was read.
        print('\t\t', kmer_table.shape)
        kmers[exon_type, suffix] = kmer_table


alternative
	suffix: 
		 (34979, 5376)
	suffix: _upstream200nt
		 (34982, 5376)
	suffix: _downstream200nt
		 (34982, 5376)
constitutive
	suffix: 
		 (17540, 5376)
	suffix: _upstream200nt
		 (17541, 5376)
	suffix: _downstream200nt
		 (17541, 5376)


In [9]:
def kmer_location(suffix):
    """Return the location label used in z-score column names.

    The empty suffix is mapped to ``'_exonbody'``; any other suffix is
    returned unchanged.
    """
    return '_exonbody' if suffix == '' else suffix

In [10]:
# One Series of per-k-mer z-scores for each (phenotype, modality, location).
kmer_zscores = []


for phenotype, phenotype_df in study.supplemental.modalities_tidy.groupby('phenotype'):
    # NOTE(review): the background is *every* event of this phenotype, so it
    # also contains the foreground modality's own events.
    background_events = set(phenotype_df.event_id)

    # The background statistics depend only on (phenotype, suffix), so compute
    # them once here instead of once per modality.
    background_stats = {}
    for suffix in suffixes:
        kmer_counts = kmers[('alternative', suffix)]
        # Boolean mask instead of `.loc[set]`: modern pandas rejects sets as
        # indexers and raises KeyError for ids missing from the table. Rows for
        # missing ids never contributed to mean/std anyway (NaN is skipped).
        background = kmer_counts.loc[kmer_counts.index.isin(background_events)]
        background_stats[suffix] = (background.mean(), background.std())

    for modality, modality_df in phenotype_df.groupby('modality'):
        print('---\n', phenotype, modality)
        event_ids = set(modality_df.event_id)

        for suffix in suffixes:
            print('\tsuffix:', suffix)
            kmer_counts = kmers[('alternative', suffix)]
            foreground = kmer_counts.loc[kmer_counts.index.isin(event_ids)]
            background_mean, background_std = background_stats[suffix]
            # Shift of the foreground mean, in units of the background std.
            kmer_zscore = (foreground.mean() - background_mean) / background_std
            kmer_zscore.name = '{phenotype}_{modality}_{location}'.format(
                phenotype=phenotype, modality=modality, location=kmer_location(suffix))
            kmer_zscores.append(kmer_zscore)

# Rows: k-mers; columns: '<phenotype>_<modality>_<location>'.
kmer_zscores_background_phenotype = pd.concat(kmer_zscores, axis=1)
kmer_zscores_background_phenotype.head()

---
 MN bimodal
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 MN excluded
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 MN included
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 MN uncategorized
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 NPC bimodal
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 NPC excluded
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 NPC included
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 NPC uncategorized
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 iPSC bimodal
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 iPSC excluded
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 iPSC included
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 iPSC middle
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 iPSC uncategorized
	suffix: 
	suffix: _upstream200nt
	

Unnamed: 0,MN_bimodal__exonbody,MN_bimodal__upstream200nt,MN_bimodal__downstream200nt,MN_excluded__exonbody,MN_excluded__upstream200nt,MN_excluded__downstream200nt,MN_included__exonbody,MN_included__upstream200nt,MN_included__downstream200nt,MN_uncategorized__exonbody,...,iPSC_excluded__downstream200nt,iPSC_included__exonbody,iPSC_included__upstream200nt,iPSC_included__downstream200nt,iPSC_middle__exonbody,iPSC_middle__upstream200nt,iPSC_middle__downstream200nt,iPSC_uncategorized__exonbody,iPSC_uncategorized__upstream200nt,iPSC_uncategorized__downstream200nt
AAAA,0.056405,0.175361,0.152522,-0.336768,0.002428,0.008287,0.212556,-0.072078,-0.06894,0.885871,...,0.038945,0.167091,-0.051938,-0.043259,-0.354903,1.303387,-0.220022,0.079013,0.405857,0.384235
AAAC,0.050008,0.181014,0.139388,-0.362839,0.00211,-0.001317,0.233482,-0.074146,-0.05347,0.879162,...,0.012722,0.186482,-0.042672,-0.048479,-0.399564,-0.235414,-0.223115,0.042905,0.252028,0.285885
AAAG,0.007504,0.130289,0.11182,-0.381989,0.030548,0.032929,0.263989,-0.073617,-0.069507,0.805528,...,0.025699,0.194283,-0.042919,-0.045002,-0.399528,2.121747,-0.257663,0.175131,0.214095,0.19314
AAAT,0.071756,0.213067,0.141952,-0.364139,-0.014273,0.016375,0.224952,-0.076973,-0.069441,1.174909,...,0.022285,0.213385,-0.063287,-0.040413,-0.447026,1.444946,-0.283506,0.063098,0.806763,0.30728
AACA,0.056258,0.151192,0.153538,-0.381541,-0.010809,0.014833,0.24387,-0.052223,-0.071286,0.992083,...,0.029234,0.199863,-0.032605,-0.033755,-0.435454,-0.253286,-0.215715,0.048602,0.53487,0.261458


## Background = modality

Use all events of the same modality, pooled across cell types, as background.

  - What properties are unique to events that are bimodal in iPSC, rather than bimodal in NPC or MN?
    - Foreground: Bimodal events in iPSC
    - Background: All bimodal events

In [11]:
# One Series of per-k-mer z-scores for each (phenotype, modality, location).
kmer_zscores = []


for modality, modality_df in study.supplemental.modalities_tidy.groupby('modality'):
    # Background: every event carrying this modality in any phenotype
    # (the foreground phenotype's own events are part of it).
    background_events = set(modality_df.event_id)

    # The background statistics depend only on (modality, suffix), so compute
    # them once here instead of once per phenotype.
    background_stats = {}
    for suffix in suffixes:
        kmer_counts = kmers[('alternative', suffix)]
        # Boolean mask instead of `.loc[set]`: modern pandas rejects sets as
        # indexers and raises KeyError for ids missing from the table. Rows for
        # missing ids never contributed to mean/std anyway (NaN is skipped).
        background = kmer_counts.loc[kmer_counts.index.isin(background_events)]
        background_stats[suffix] = (background.mean(), background.std())

    for phenotype, phenotype_df in modality_df.groupby('phenotype'):
        print('---\n', modality, phenotype)
        event_ids = set(phenotype_df.event_id)

        for suffix in suffixes:
            print('\tsuffix:', suffix)
            kmer_counts = kmers[('alternative', suffix)]
            foreground = kmer_counts.loc[kmer_counts.index.isin(event_ids)]
            background_mean, background_std = background_stats[suffix]
            # Shift of the foreground mean, in units of the background std.
            # A background with a single event has an undefined (NaN) std, so
            # its z-scores come out as NaN.
            kmer_zscore = (foreground.mean() - background_mean) / background_std
            kmer_zscore.name = '{phenotype}_{modality}_{location}'.format(
                phenotype=phenotype, modality=modality, location=kmer_location(suffix))
            kmer_zscores.append(kmer_zscore)

# Rows: k-mers; columns: '<phenotype>_<modality>_<location>'.
kmer_zscores_background_modality = pd.concat(kmer_zscores, axis=1)
kmer_zscores_background_modality.head()

---
 bimodal MN
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 bimodal NPC
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 bimodal iPSC
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 excluded MN
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 excluded NPC
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 excluded iPSC
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 included MN
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 included NPC
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 included iPSC
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 middle iPSC
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 uncategorized MN
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 uncategorized NPC
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 uncategorized iPSC
	suffix: 
	suffix: _upstream200nt
	

Unnamed: 0,MN_bimodal__exonbody,MN_bimodal__upstream200nt,MN_bimodal__downstream200nt,NPC_bimodal__exonbody,NPC_bimodal__upstream200nt,NPC_bimodal__downstream200nt,iPSC_bimodal__exonbody,iPSC_bimodal__upstream200nt,iPSC_bimodal__downstream200nt,MN_excluded__exonbody,...,iPSC_middle__downstream200nt,MN_uncategorized__exonbody,MN_uncategorized__upstream200nt,MN_uncategorized__downstream200nt,NPC_uncategorized__exonbody,NPC_uncategorized__upstream200nt,NPC_uncategorized__downstream200nt,iPSC_uncategorized__exonbody,iPSC_uncategorized__upstream200nt,iPSC_uncategorized__downstream200nt
AAAA,0.037405,0.11304,0.096247,0.049721,0.058547,0.073591,-0.021084,-0.013773,-0.030383,-0.016963,...,,0.61134,0.036702,0.230451,-0.141569,-0.155843,0.093067,-0.016922,0.016596,-0.05917
AAAC,0.017962,0.119641,0.083922,0.056093,0.041125,0.036119,-0.01682,-0.008099,-0.012743,-0.014208,...,,0.701052,0.172474,-0.341014,0.024471,0.239308,0.187558,-0.030387,0.027969,0.001843
AAAG,-0.013528,0.082009,0.063479,0.061544,0.045198,0.030874,-0.026754,-0.023283,-0.015227,-0.022642,...,,0.433663,0.037405,0.283346,-0.322135,0.095383,0.443817,0.041639,-0.003336,-0.104708
AAAT,0.034062,0.11048,0.088746,0.057628,0.053855,0.077369,-0.037204,-0.011433,-0.018389,-0.023534,...,,0.629176,0.135574,0.2096,-0.083487,0.030504,0.148302,-0.056116,0.018281,-0.030622
AACA,0.024849,0.082493,0.113188,0.052891,0.053867,0.026185,-0.009851,-0.001861,-0.045592,-0.01622,...,,0.659754,-0.241592,-0.121214,-0.208975,-0.042407,0.055616,-0.021813,0.006043,-0.019656



## Background = constitutive

Use constitutive exons as background

  - What properties distinguish events that are bimodal in iPSC from constitutive exons?
    - Foreground: Bimodal events in iPSC
    - Background: Constitutive exons across all samples

In [12]:
# One Series of per-k-mer z-scores for each (phenotype, modality, location).
kmer_zscores = []

# The constitutive background is identical for every phenotype and modality,
# so its mean/std are computed once per suffix rather than inside the loops.
constitutive_stats = {}
for suffix in suffixes:
    background = kmers[('constitutive', suffix)]
    constitutive_stats[suffix] = (background.mean(), background.std())


for phenotype, phenotype_df in study.supplemental.modalities_tidy.groupby('phenotype'):
    for modality, modality_df in phenotype_df.groupby('modality'):
        print('---\n', phenotype, modality)
        event_ids = set(modality_df.event_id)

        for suffix in suffixes:
            print('\tsuffix:', suffix)
            kmer_counts = kmers[('alternative', suffix)]
            # Boolean mask instead of `.loc[set]`: modern pandas rejects sets as
            # indexers and raises KeyError for ids missing from the table. Rows
            # for missing ids never contributed to the mean (NaN is skipped).
            foreground = kmer_counts.loc[kmer_counts.index.isin(event_ids)]
            background_mean, background_std = constitutive_stats[suffix]
            # Shift of the foreground mean, in units of the constitutive std.
            kmer_zscore = (foreground.mean() - background_mean) / background_std
            kmer_zscore.name = '{phenotype}_{modality}_{location}'.format(
                phenotype=phenotype, modality=modality, location=kmer_location(suffix))
            kmer_zscores.append(kmer_zscore)

# Rows: k-mers; columns: '<phenotype>_<modality>_<location>'.
kmer_zscores_background_constitutive = pd.concat(kmer_zscores, axis=1)
kmer_zscores_background_constitutive.head()

---
 MN bimodal
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 MN excluded
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 MN included
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 MN uncategorized
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 NPC bimodal
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 NPC excluded
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 NPC included
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 NPC uncategorized
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 iPSC bimodal
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 iPSC excluded
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 iPSC included
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 iPSC middle
	suffix: 
	suffix: _upstream200nt
	suffix: _downstream200nt
---
 iPSC uncategorized
	suffix: 
	suffix: _upstream200nt
	

Unnamed: 0,MN_bimodal__exonbody,MN_bimodal__upstream200nt,MN_bimodal__downstream200nt,MN_excluded__exonbody,MN_excluded__upstream200nt,MN_excluded__downstream200nt,MN_included__exonbody,MN_included__upstream200nt,MN_included__downstream200nt,MN_uncategorized__exonbody,...,iPSC_excluded__downstream200nt,iPSC_included__exonbody,iPSC_included__upstream200nt,iPSC_included__downstream200nt,iPSC_middle__exonbody,iPSC_middle__upstream200nt,iPSC_middle__downstream200nt,iPSC_uncategorized__exonbody,iPSC_uncategorized__upstream200nt,iPSC_uncategorized__downstream200nt
AAAA,-0.169238,0.194292,0.155678,-0.379129,0.012273,0.01034,-0.085879,-0.066146,-0.067477,0.273562,...,0.008401,-0.073921,-0.078576,-0.070874,-0.433035,1.232219,-0.24134,-0.134515,0.364178,0.34139
AAAC,-0.175832,0.177417,0.135566,-0.423041,-0.005321,-0.008769,-0.06597,-0.083211,-0.062267,0.320657,...,-0.03675,-0.054983,-0.071007,-0.091623,-0.490582,-0.259703,-0.248203,-0.161702,0.217508,0.20817
AAAG,-0.220533,0.067076,0.030502,-0.446407,-0.021893,-0.034056,-0.071792,-0.114808,-0.11788,0.242258,...,-0.068454,-0.067792,-0.112853,-0.123686,-0.502143,1.668081,-0.289821,-0.081801,0.0986,0.062354
AAAT,-0.152331,0.32243,0.2158,-0.418774,0.052622,0.067782,-0.058689,-0.021791,-0.033371,0.521976,...,0.047766,-0.03411,-0.035617,-0.021884,-0.48884,1.682824,-0.29193,-0.137591,0.955695,0.36436
AACA,-0.18859,0.112296,0.113357,-0.477171,-0.042433,-0.021776,-0.064922,-0.081989,-0.105678,0.428274,...,-0.051716,-0.066135,-0.078912,-0.105833,-0.547646,-0.28213,-0.262163,-0.180777,0.443658,0.147798


In [None]:
# Attach the three z-score tables (one per background definition) to the
# study's supplemental data under the same names as the notebook variables.
# NOTE(review): presumably attributes set here are written out by `study.save`
# -- confirm against flotilla's supplemental-data handling.
study.supplemental.kmer_zscores_background_phenotype = kmer_zscores_background_phenotype
study.supplemental.kmer_zscores_background_modality = kmer_zscores_background_modality
study.supplemental.kmer_zscores_background_constitutive = kmer_zscores_background_constitutive

In [None]:
# Save under a new study name (note the '_kmers' suffix) in the same flotilla
# directory the study was loaded from.
study.save(
    'singlecell_pnm_figure2_modalities_bayesian_kmers',
    flotilla_dir=flotilla_dir
)