In [None]:
import modish
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Global plot styling: 'ticks' style at 'talk' scale, Arial as the sans-serif font.
# pdf.fonttype 42 selects TrueType font embedding in matplotlib's PDF backend
# (presumably so text in exported figures stays editable -- confirm).
sns.set(style='ticks', context='talk', rc={'font.sans-serif':'Arial', 'pdf.fonttype': 42})

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Reload edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# Absolute, machine-specific path; presumably the output directory for this figure.
# NOTE(review): not referenced by any cell visible here -- confirm it is still needed.
folder = '/home/obotvinnik/Dropbox/figures2/singlecell_pnm/figure4_voyages'

# Load the flotilla study; the cells below read study.splicing and study.expression.
import flotilla
study = flotilla.embark('singlecell_pnm_figure2_modalities')
# Alternative datapackage, currently disabled:
# study = flotilla.embark('singlecell_pnm_figure1_supplementary_post_splicing_filtering')

## Predict nonsense-mediated decay (NMD) splicing events

In [None]:
import itertools
from scipy.stats import spearmanr

# Correlate each variable splicing event's Psi with the expression of its annotated gene(s).
# Result: ``psi_expression_correls``, one row per (event_name, ensembl_id) pair with columns
# 'spearmanr' and 'spearmanp'; both stay NaN when the pair could not be evaluated.
# Single-argument print(...) calls are used so the cell runs on both Python 2 and Python 3.

# Minimum variance of Psi (over all samples) for an event to be tested.
MIN_PSI_VARIANCE = 0.1

# Keep only the events (columns) whose Psi varies enough.
# .loc replaces the deprecated .ix indexer; the boolean mask selects columns.
splicing_variant = study.splicing.data.loc[:, study.splicing.data.var() > MIN_PSI_VARIANCE]

# Comma-separated Ensembl IDs for each retained event. Events without an annotation
# (NaN) are dropped up front: NaN has no .split() and would abort the whole cell.
s = study.splicing.feature_data.ensembl_id[splicing_variant.columns].dropna()

# Expand "id1,id2" annotations into one (event_name, ensembl_id) tuple per gene.
# zip(index, values) avoids the Python-2-only Series.iteritems().
event_name_to_ensembl_ids = list(itertools.chain.from_iterable(
    [(event_name, ensembl_id) for ensembl_id in ensembl_ids.split(',')]
    for event_name, ensembl_ids in zip(s.index, s)))
psi_expression_correls = pd.DataFrame(event_name_to_ensembl_ids,
                                      columns=['event_name', 'ensembl_id'])
print(psi_expression_correls.shape)

# Results are collected in plain arrays and attached to the frame once at the end;
# assigning into the DataFrame row by row inside the loop is needlessly slow.
n_pairs = len(event_name_to_ensembl_ids)
spearman_r = np.full(n_pairs, np.nan)
spearman_p = np.full(n_pairs, np.nan)
n_missing = 0

for i, (event_name, ensembl_id) in enumerate(event_name_to_ensembl_ids):
    if i % 1000 == 0:
        # Progress indicator, same text as the original "print '\t', i".
        print('\t{}'.format(i))
    try:
        # Only the lookups are guarded, so a KeyError raised anywhere else is not hidden.
        x = study.splicing.data[event_name].dropna()
        y = study.expression.data[ensembl_id]
    except KeyError:
        # Event or gene ID absent from the study matrices: leave NaN, but count it.
        n_missing += 1
        continue
    # Restrict both series to the samples they share.
    x, y = x.align(y, join='inner')
    if len(x) < 2:
        # A rank correlation is undefined for fewer than two paired observations.
        continue
    spearman_r[i], spearman_p[i] = spearmanr(x, y)

psi_expression_correls['spearmanr'] = spearman_r
psi_expression_correls['spearmanp'] = spearman_p

if n_missing:
    print('{} event/gene pairs skipped: ID missing from splicing or expression data'.format(
        n_missing))

In [None]:
# Annotate every (event, gene) pair with the gene-level metadata, matched on 'ensembl_id'.
# Only metadata columns that are not already present are joined, so re-running this cell
# is a no-op instead of failing with pandas' "columns overlap but no suffix specified".
# On a first run all metadata columns are new and are added in their original order.
expression_feature_data = study.expression.feature_data
new_feature_columns = [column for column in expression_feature_data.columns
                       if column not in psi_expression_correls.columns]
psi_expression_correls = psi_expression_correls.join(
    expression_feature_data[new_feature_columns], on='ensembl_id')

In [None]:
# Distribution of Psi-expression Spearman correlations for the pairs selected by the
# boolean 'gerstberger2014_tf' column; pairs without a computed correlation are dropped.
sns.distplot(
    psi_expression_correls.loc[psi_expression_correls['gerstberger2014_tf'],
                               'spearmanr'].dropna())

In [None]:
# Pairs flagged in 'gerstberger2014_rbp' but not in 'gerstberger2014_rbp_target_ribosome'.
rbp_not_ribosome = (
    psi_expression_correls['gerstberger2014_rbp']
    & ~psi_expression_correls['gerstberger2014_rbp_target_ribosome'])

# Number of selected pairs, then the distribution of their (non-missing) correlations.
print(rbp_not_ribosome.sum())
sns.distplot(psi_expression_correls.loc[rbp_not_ribosome, 'spearmanr'].dropna())

In [None]:
# Widen displayed columns to 200 characters so long cell values (e.g. the event names
# shown below) are not truncated in the rendered tables.
pd.set_option('display.max_colwidth', 200)

Check a positive control: the Psi-expression correlations for events in *EIF4A2*.

In [None]:
# Show every (event, gene) pair whose gene_name is EIF4A2.
psi_expression_correls[psi_expression_correls['gene_name'] == 'EIF4A2']

In [None]:
# Minimum |Spearman r| between Psi and expression for a pair to count as a candidate.
MIN_ABS_SPEARMAN_R = 0.3

# Candidate NMD pairs: correlation magnitude above the threshold. Pairs with a NaN
# correlation compare False and are excluded.
# .loc replaces the deprecated .ix indexer. .copy() makes potential_nmd an independent
# frame, so later in-place changes to it (such as replacing its index) cannot affect or
# warn about psi_expression_correls.
potential_nmd = psi_expression_correls.loc[
    psi_expression_correls['spearmanr'].abs() > MIN_ABS_SPEARMAN_R].copy()
print(potential_nmd.shape)
potential_nmd.head()

In [None]:
# Renumber the rows 0..n-1 so that index labels coincide with row positions
# (the random-event cell draws from this index and looks rows up with .iloc).
potential_nmd.index = np.arange(len(potential_nmd))

In [None]:
# Write the candidate table to disk without the row index.
# NOTE(review): absolute, machine-specific path -- consider deriving it from a configurable directory.
potential_nmd.to_csv(
    path_or_buf='/home/obotvinnik/Dropbox/figures2/singlecell_pnm/potential_nmd_targets.csv',
    index=False)

### There is a true NMD event in EIF4A2

In [None]:
# Junction-level feature used for the EIF4A2 check.
# An alternative event ID, kept for reference but not used:
# eif4a2_event = 'chr3:186505592:186505671:+@chr3:186506099:186506205:+@chr3:186506914:186507685:+'
eif4a2_junction = 'chr3:186505672-186506098:+|5p'

# First feature ID that the expression data associates with the name 'EIF4A2'.
eif4a2_ensembl_id = study.expression.maybe_renamed_to_feature_id('EIF4A2')[0]

# Psi of the junction (samples with missing Psi removed) and expression of the gene,
# both restricted to the samples they have in common.
y = study.splicing.data[eif4a2_junction].dropna()
x = study.expression.data[eif4a2_ensembl_id]
x, y = x.align(y, join='inner')

# Label the two series (presumably picked up by jointplot as axis labels).
# Raw strings keep the LaTeX backslashes literal; the resulting text is unchanged.
x.name = r'EIF4A2 $\log_2(\mathrm{TPM}+1)$'
y.name = r'EIF4A2 {} $\Psi$'.format(eif4a2_junction)

# Per-sample colors, looked up by the aligned sample IDs.
color = study.sample_id_to_color[x.index]

# annot_kws=dict(loc='lower left') was tried here previously and is left disabled.
sns.jointplot(x, y, stat_func=spearmanr, ylim=(-0.05, 1.05), color='#262626',
              joint_kws=dict(color=color))

### Plot randomly chosen candidate NMD events


In [None]:
# Plot Psi against expression for a random sample of the candidate NMD pairs.

# Number of candidates to draw, and a fixed seed so that a fresh
# "Restart & Run All" plots the same events every time.
N_RANDOM_EVENTS = 10
RANDOM_SEED = 0

# A private RandomState keeps the draw reproducible without touching NumPy's global RNG.
rng = np.random.RandomState(RANDOM_SEED)

# Draw row *positions* without replacement (a shuffled 0..n-1, truncated). This is
# correct for any index on potential_nmd -- the rows are read with .iloc below -- and
# simply yields fewer plots when there are fewer than N_RANDOM_EVENTS candidates.
events = list(rng.permutation(len(potential_nmd))[:N_RANDOM_EVENTS])
for i in events:
    row = potential_nmd.iloc[i]
    x = study.expression.data[row.ensembl_id]
    y = study.splicing.data[row.event_name].dropna()

    # Same text as the original Python 2 print statement, written as a single-argument
    # print(...) so the cell also runs on Python 3.
    print('{} {} \t{} \t{}'.format(row.gene_name, row.event_name, row.spearmanr, y.var()))

    # Restrict both series to the samples they have in common.
    x, y = x.align(y, join='inner')

    # Label the two series (presumably picked up by jointplot as axis labels).
    # Raw strings keep the LaTeX backslashes literal; the resulting text is unchanged.
    y.name = '{} {} '.format(row.gene_name, row.event_name) + r'$\Psi$'
    x.name = row.gene_name + r' $\log_2(\mathrm{TPM}+1)$'

    # Per-sample colors, looked up by the aligned sample IDs.
    color = study.sample_id_to_color[x.index]

    # annot_kws=dict(loc='lower left') was tried here previously and is left disabled.
    sns.jointplot(x, y, joint_kws=dict(color=color), color='#262626', ylim=(-0.05, 1.05),
                  stat_func=spearmanr)