In [None]:
# Notebook setup: plotting libraries, modality colour constants, figure output
# folder, and the flotilla study that every later cell reads from.
import modish
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from modish.visualize import MODALITY_TO_COLOR, MODALITY_ORDER, MODALITY_PALETTE
# Lower-case alias of the imported modality ordering
modality_order = MODALITY_ORDER

# Global seaborn/matplotlib style. 'pdf.fonttype': 42 asks matplotlib to embed
# TrueType fonts in saved PDFs (keeps text editable in vector-graphics editors).
sns.set(style='ticks', context='talk', rc={'font.sans-serif':'Arial', 'pdf.fonttype': 42})

# Render figures inline in the notebook
%matplotlib inline

# Re-import edited modules automatically before each cell execution
%load_ext autoreload
%autoreload 2

# Output directory for every figure saved by this notebook.
# NOTE(review): absolute, user-specific path — must exist before any savefig call.
folder = '/home/obotvinnik/Dropbox/figures2/singlecell_pnm/figure4_voyages'

# Load the flotilla study by name; `study` is used by most cells below.
# (presumably resolved from ~/flotilla_projects — TODO confirm)
import flotilla
study = flotilla.embark('singlecell_pnm_figure4_voyages')

In [4]:
# List every flotilla project directory whose name starts with "singlecell_pnm"
ls /home/obotvinnik/flotilla_projects/singlecell_pnm*

/home/obotvinnik/flotilla_projects/singlecell_pnm:
[0m[01;32mdatapackage.json[0m*           [01;32mmapping_stats.csv.gz[0m*  [01;32msplicing_feature.csv.gz[0m*
[01;32mexpression.csv.gz[0m*          [01;32mmetadata.csv.gz[0m*
[01;32mexpression_feature.csv.gz[0m*  [01;32msplicing.csv.gz[0m*

/home/obotvinnik/flotilla_projects/singlecell_pnm_0.3:
[01;32mdatapackage.json[0m*           [01;32mmetadata.csv.gz[0m*          splicing.hdf
[01;32mexpression.csv.gz[0m*          [01;32mpsi_constitutive.csv.gz[0m*  [01;32msplicing_feature.csv.gz[0m*
[01;32mexpression_feature.csv.gz[0m*  psi_constitutive.hdf      splicing_feature.hdf
[01;32mmapping_stats.csv.gz[0m*       [01;32msplicing.csv.gz[0m*

/home/obotvinnik/flotilla_projects/singlecell_pnm_figure1_post_kmeans_outliers:
datapackage.json           gene_ontology.hdf     splicing.csv.gz
expression.csv.gz          mapping_stats.csv.gz  splicing_feature.csv.gz
expression_feature.csv.gz  metadata.csv.gz

/home/obotvinn

In [None]:
# The study's own phenotype transitions, plus the direct iPSC -> MN comparison
extra_transition = ('iPSC', 'MN')
transitions = study.phenotype_transitions + [extra_transition]
transitions

In [None]:
# "group1-group2" labels, in the same order as `transitions`; used below as the
# facet ordering of the per-transition plots.
transition_strs = []
for transition in transitions:
    transition_strs.append('{}-{}'.format(*transition))
transition_strs

In [None]:
import matplotlib as mpl

# Hexbin of the (Delta x, Delta y) columns of the voyages table, saved as a PDF.
voyages = study.supplemental.voyages
x, y = voyages['$\Delta x$'], voyages['$\Delta y$']

# Combined extent of both columns, so the two zero lines get the same length
xmin, xmax = x.min(), x.max()
ymin, ymax = y.min(), y.max()
vmin, vmax = min(xmin, ymin), max(xmax, ymax)

fig, ax = plt.subplots(figsize=(4, 3))
hexbin_kws = dict(mincnt=1, bins='log', gridsize=25,
                  cmap='YlGnBu', edgecolor='darkgrey')
image = ax.hexbin(x, y, **hexbin_kws)

# Cross-hairs through the origin, drawn underneath the hexagons (zorder=-1)
for draw_zero_line in (ax.hlines, ax.vlines):
    draw_zero_line(0, vmin, vmax, zorder=-1)

ax.set(xlabel='$\Delta x$', ylabel='$\Delta y$')
sns.despine(left=True, bottom=True)

# Colorbar for the log-scaled bin counts, limited to a handful of ticks
count_ticks = mpl.ticker.MaxNLocator(4)
plt.colorbar(image, label='$\log_{10} \mathrm{count}$', ticks=count_ticks)

fig.tight_layout()
fig.savefig('{}/voyage_distances_deltas.pdf'.format(folder))

In [None]:
# One hexbin panel per transition, with linear bin counts (bins='log' is left disabled)
facet_hexbin_kws = dict(mincnt=1, gridsize=25, cmap='YlGnBu', edgecolor='darkgrey')
g = sns.FacetGrid(study.supplemental.voyages,
                  col='transition', col_order=transition_strs)
g.map(plt.hexbin, '$\Delta x$', '$\Delta y$', **facet_hexbin_kws)

In [None]:
def mean_cutoff(x, *args, **kwargs):
    """Draw a vertical line at ``x.mean() + x.std()`` spanning the axes' y-limits.

    Intended to be passed to ``FacetGrid.map`` so each facet gets its own
    cutoff marker.

    Parameters
    ----------
    x : object exposing ``.mean()`` and ``.std()``
        The values whose mean-plus-one-standard-deviation is marked.
    *args
        Ignored; accepted so extra positional data columns do not raise.
    **kwargs
        ``ax`` (optional) selects the axes to draw on; when it is missing or
        ``None`` the current axes (``plt.gca()``) are used. Every other keyword
        is forwarded unchanged to ``ax.vlines``.

    Returns
    -------
    None
    """
    # Pop 'ax' rather than just reading it: leaving it in kwargs would forward
    # an unexpected 'ax' keyword to vlines below.
    ax = kwargs.pop('ax', None)
    if ax is None:
        ax = plt.gca()
    ymin, ymax = ax.get_ylim()
    cutoff = x.mean() + x.std()
    ax.vlines(cutoff, ymin, ymax, **kwargs)

# Distribution of voyage distances for every (transition, direction) pair, with
# a dashed line at mean + 1 standard deviation drawn by mean_cutoff.
distance_bins = np.linspace(0, 1.5, 20)
g = sns.FacetGrid(study.supplemental.voyages,
                  row='transition', row_order=transition_strs,
                  col='direction', margin_titles=True)
g.map(sns.distplot, 'voyage_distance', bins=distance_bins, kde=True, color='teal')
g.map(mean_cutoff, 'voyage_distance', color='k', linestyle='--')
g.set(xlim=(0, 1.5))
g.savefig('{}/voyages_histogram_facetgrid.pdf'.format(folder))

## Features of changing events

In [None]:
delta_cols = ['$\Delta x$', '$\Delta y$']


def _above_group_quantile(group):
    """Keep the rows of ``group`` whose voyage_distance exceeds the group's 0.8 quantile."""
    threshold = group.voyage_distance.quantile(0.8)
    return group.loc[group.voyage_distance > threshold]


# Filter within every (transition, direction) group separately
grouped_voyages = study.supplemental.voyages.groupby(
    ['transition', 'direction'], as_index=False, group_keys=False)
voyages_filtered = grouped_voyages.apply(_above_group_quantile)

# Disabled alternative: keep rows where either |delta| exceeds mean + 1 std
# voyages_filtered = study.supplemental.voyages.groupby(['transition', 'direction']).apply(
#     lambda x: x.loc[(x[delta_cols].abs() > (x[delta_cols].abs().mean() + 1*x[delta_cols].abs().std())).any(axis=1)])

# Number of retained rows per transition
print(voyages_filtered.groupby('transition').size())

In [None]:
# Candidate shared upper colour limit; currently NOT passed to hexbin
vmax = 35

# bins='log' and vmax=vmax are intentionally left out of the hexbin options
filtered_hexbin_kws = dict(cmap='YlGnBu', mincnt=1, edgecolor='darkgrey',
                           gridsize=25, vmin=0)
g = sns.FacetGrid(voyages_filtered, col='transition', col_order=transition_strs)
g.map(plt.hexbin, '$\Delta x$', '$\Delta y$', **filtered_hexbin_kws)

for ax in g.axes.flat:
    # Zero cross-hairs behind the hexagons
    for draw_zero_line in (ax.hlines, ax.vlines):
        draw_zero_line(0, -1, 1, color='k', zorder=-1)
    # Largest bin count in this panel (collections[0] is the hexbin itself)
    panel_counts = ax.collections[0].get_array()
    print(np.max(panel_counts))

g.set(xlim=(-1, 1), ylim=(-1, 1))
g.savefig('{}/per_transition_voyages_facetgrid.pdf'.format(folder))

# Stand-alone colorbar figure. `ax` is whatever the loop above left behind,
# i.e. the last panel of the grid.
# NOTE(review): vmax is not shared between panels, so this colorbar only
# reflects the last panel's colour scale — confirm that is intended.
cfig, cax = plt.subplots(figsize=(1.25, 2))
plt.colorbar(ax.collections[0], cax=cax, orientation='vertical', label='Count')
cfig.tight_layout()
cfig.savefig('{}/per_transition_voyages_facetgrid_colorbar.pdf'.format(folder))

In [None]:
# Preview the first rows of the quantile-filtered voyages table
voyages_filtered.head()

In [None]:
import pybedtools

bed_folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges'
! mkdir $bed_folder

DIR = '/projects/ps-yeolab/obotvinnik/singlecell_pnms'

exon_bedfile = '{}/exon2.bed'.format(DIR)
exon_bed = pybedtools.BedTool(exon_bedfile)



## Features of changing events

In [38]:
# (rows, columns) of the splicing feature-annotation table
study.splicing.feature_data.shape

(33693, 103)

In [40]:
# Attach the splicing feature annotations to every voyage row, matching the
# voyages' 'event_id' column against the index of the feature table.
event_features = study.splicing.feature_data
voyages_annotated = study.supplemental.voyages.join(event_features, on='event_id')
voyages_annotated.head()

Unnamed: 0,event_id,$\Delta x$,$\Delta y$,voyage_distance,group1,group2,direction,transition,criteria,criteria_additional,...,isoform1_pfam_go_id,isoform1_pfam_go_name,isoform1_pfam_name,isoform2_pfam_accession,isoform2_pfam_go_id,isoform2_pfam_go_name,isoform2_pfam_name,isoform_domain_effects,custom_1,custom_2
0,exon:chr10:102114184-102114389:+@exon:chr10:10...,0.053571,-0.054198,0.076206,iPSC,NPC,$\searrow$,iPSC-NPC,only one,,...,,,,PF00487.19[1],GO:0006629[1],(lipid metabolic process)[1],FA_desaturase[1],No annotated reading frame --> Protein +/- domain,False,False
1,exon:chr10:102286732-102286831:-@exon:chr10:10...,0.0,0.0,0.0,iPSC,NPC,$\swarrow$,iPSC-NPC,best,appris_principal,...,,,,PF05821.6[1],"GO:0003954[1],GO:0008137[1],GO:0005739[1]","(NADH dehydrogenase activity)[1],(NADH dehydro...",NDUF_B8[1],No annotated reading frame --> Protein +/- domain,True,True
2,exon:chr10:102747070-102747190:-@exon:chr10:10...,0.00448,-0.00448,0.006336,iPSC,NPC,$\searrow$,iPSC-NPC,one event with gencode transcripts,,...,,,,PF05047.11[1],nan[1],(nan)[1],L51_S25_CI-B8[1],No annotated reading frame --> Protein +/- domain,True,True
3,exon:chr10:103348089-103348157:+@exon:chr10:10...,0.0,-0.027778,0.027778,iPSC,NPC,$\swarrow$,iPSC-NPC,only one,,...,,,,PF14913.1[1],nan[1],(nan)[1],DPCD[1],No annotated reading frame --> Protein +/- domain,True,True
4,exon:chr10:103360960-103361093:+@exon:chr10:10...,-0.056306,0.055556,0.0791,iPSC,NPC,$\nwarrow$,iPSC-NPC,only one,,...,nan[1],(nan)[1],DPCD[1],,,,,Protein +/- domain --> No annotated reading frame,True,True


## Submit HOMER motifs

### Get upstream/downstream sequences

In [18]:
import os
import glob
import pybedtools

nt = 200

directions = 'upstream', 'downstream'

for bedfile in glob.iglob('{}/exon2*.bed'.format(bed_folder)):
    if 'stream' in bedfile:
        continue
    
    basename = os.path.basename(bedfile)
    print basename
    prefix = basename.split('.bed')[0]
    bed = pybedtools.BedTool(bedfile)
    
    for direction in directions:
        # Get downstream intron
        if direction == 'downstream':
            intron = bed.flank(l=0, r=nt, s=True, g=pybedtools.chromsizes('hg19'))
        elif direction == 'upstream':
            intron = bed.flank(l=nt, r=0, s=True, g=pybedtools.chromsizes('hg19'))
            
    # Saved every exon that was exactly upstream or downstream of a junction,
    # So when taking the flanking sequence, there's a lot of repetition
    unique = pybedtools.BedTool(list(set(x for x in intron)))
    
    
    print '\t', len(intron), len(unique)
    saveas = '{}/{}_{}{}.bed'.format(bed_folder, prefix, direction, nt)
    print '\t', saveas
    unique.saveas(saveas)
    ! wc -l $saveas

exon2_iPSC-NPC_swarrow.bed
	371 371
	/projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-NPC_swarrow_downstream200.bed
371 /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-NPC_swarrow_downstream200.bed
exon2_iPSC-MN_nearrow_background.bed
	2331 2331
	/projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-MN_nearrow_background_downstream200.bed
2331 /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-MN_nearrow_background_downstream200.bed
exon2_iPSC-NPC_nearrow.bed
	93 93
	/projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-NPC_nearrow_downstream200.bed
93 /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-NPC_nearrow_downstream200.bed
exon2_iPSC-MN_searrow_background.bed
	2265 2265
	/projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-MN_searrow_background_downstream200.bed
2265 /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_

In [3]:
# Inspect the HOMER output directory.
# NOTE(review): this path spells "figure4_voyages", while bed_folder (and every
# HOMER command built from it) uses "figure4_voayges" — confirm which directory
# is the intended one.
ls /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voyages/homer

[0m[01;34mexon2_NPC-MN_nearrow[0m/                  homer.err-10  homer.out-11
[01;34mexon2_NPC-MN_nearrow_downstream200[0m/    homer.err-11  homer.out-12
[01;34mexon2_NPC-MN_nearrow_upstream200[0m/      homer.err-12  homer.out-13
[01;34mexon2_NPC-MN_nwarrow[0m/                  homer.err-13  homer.out-14
[01;34mexon2_NPC-MN_nwarrow_downstream200[0m/    homer.err-14  homer.out-15
[01;34mexon2_NPC-MN_nwarrow_upstream200[0m/      homer.err-15  homer.out-16
[01;34mexon2_NPC-MN_searrow[0m/                  homer.err-16  homer.out-17
[01;34mexon2_NPC-MN_searrow_downstream200[0m/    homer.err-17  homer.out-18
[01;34mexon2_NPC-MN_searrow_upstream200[0m/      homer.err-18  homer.out-19
[01;34mexon2_NPC-MN_swarrow[0m/                  homer.err-19  homer.out-2
[01;34mexon2_NPC-MN_swarrow_downstream200[0m/    homer.err-2   homer.out-20
[01;34mexon2_NPC-MN_swarrow_upstream200[0m/      homer.err-20  homer.out-21
[01;34mexon2_iPSC-MN_nearrow[0m/               

In [22]:
cd $homer_dir

/projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/homer


In [23]:

import os
import fileinput

from gscripts.qtools import Submitter
import glob
import itertools

findMotifsGenome = '/home/yeo-lab/software/homer/bin/findMotifsGenome.pl'
n_processors = 4
homer_flags = '-rna -len 4,5,6,7 -mset vertebrates -mis 1 -p {}'.format(n_processors)

# One findMotifsGenome.pl command per foreground BED file
commands = []

for bedfile in glob.iglob('{}/exon2*.bed'.format(bed_folder)):

    basename = os.path.basename(bedfile)
    # Background files are only ever used via -bg, never as foreground
    if 'background' in basename:
        continue

    # Basenames look like exon2_<transition>_<direction>[_<flank>].bed
    prefix = basename.split('.')[0]
    transition = prefix.split('_')[1]
    direction = prefix.split('_')[2]

    # Matching background file: same name with '_background' inserted right
    # after the direction. The replacement is applied to the basename only (and
    # only once) so a folder name can never be rewritten by accident.
    background = os.path.join(
        os.path.dirname(bedfile),
        basename.replace(direction, direction + '_background', 1))

    out_dir = '{}/{}'.format(homer_dir, basename.replace('.bed', ''))

    # Skip inputs whose results already exist. The path was previously never
    # formatted (the literal '{}/homerResults.html' was tested), so this check
    # was always False and every job was resubmitted.
    if os.path.exists('{}/homerResults.html'.format(out_dir)):
        print('Already finished {}, moving on'.format(basename))
        continue

    command = '{} {} hg19 {} -bg {} {}'.format(
        findMotifsGenome, bedfile, out_dir, background, homer_flags)
    print('\n{}'.format(command))
    commands.append(command)
#     ! $command

jobname = 'homer'
if commands:
    sub = Submitter(commands, jobname, walltime='1:00:00', queue='home-scrm',
                    ppn=n_processors, write_and_submit=True, array=True,
                    out_filename='{}/{}.out'.format(homer_dir, jobname),
                    err_filename='{}/{}.err'.format(homer_dir, jobname),)
else:
    # Reachable now that finished runs are skipped: do not submit an empty array job
    sub = None
    print('No HOMER commands to submit')

running 24 tasks as an array-job.
job ID: 3670513



/home/yeo-lab/software/homer/bin/findMotifsGenome.pl /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-NPC_swarrow.bed hg19 /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/homer/exon2_iPSC-NPC_swarrow -bg /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-NPC_swarrow_background.bed -rna -len 4,5,6,7 -mset vertebrates -mis 1 -p 4

/home/yeo-lab/software/homer/bin/findMotifsGenome.pl /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-NPC_swarrow_downstream200.bed hg19 /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/homer/exon2_iPSC-NPC_swarrow_downstream200 -bg /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-NPC_swarrow_background_downstream200.bed -rna -len 4,5,6,7 -mset vertebrates -mis 1 -p 4

/home/yeo-lab/software/homer/bin/findMotifsGenome.pl /projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voayges/exon2_iPSC-NPC_nearrow.bed hg19 /projects/ps-yeolab/o

In [20]:
# Debugging leftover: show the `prefix` left behind by the most recent loop.
# NOTE(review): this cell's execution count is lower than the cell above it, so
# it was run out of order — consider deleting it.
prefix

'exon2_iPSC-NPC_nwarrow_downstream200'