In [1]:
import ast
import re

import gffutils
import numpy as np
import pandas as pd
import pybedtools

# gffutils database built from the Gencode v19 (hg19) annotation GTF
v19db_filename = '/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf.db'
v19db = gffutils.FeatureDB(v19db_filename)

folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms'

# The CSV has no header row: the first three columns (read as the index) are
# the exon triplet and the remaining column holds the junctions as text.
exons_to_junctions = pd.read_csv('{}/exons_to_junctions_se.csv'.format(folder), index_col=[0, 1, 2], squeeze=True, header=None)
exons_to_junctions = exons_to_junctions.reset_index()
exons_to_junctions = exons_to_junctions.rename(columns={0: 'exon1', 1: 'exon2', 2: 'exon3', 3:'junctions'})

# Parse the serialized junctions back into Python objects. literal_eval only
# accepts Python literals, so unlike eval() it cannot execute arbitrary code
# that happens to be stored in the file.
# NOTE(review): assumes every cell is a plain literal (the displayed output
# shows tuples of junction IDs) -- confirm against the CSV.
exons_to_junctions['junctions'] = exons_to_junctions.junctions.map(ast.literal_eval)
exons_to_junctions.head()

Unnamed: 0,exon1,exon2,exon3,junctions
0,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-..."
1,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-..."
2,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-..."
3,exon:chr10:100193697-100193848:-,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,"(chr10:100191049-100193696:-, chr10:100190428-..."
4,exon:chr10:100195392-100195529:-,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,"(chr10:100195172-100195391:-, chr10:100193849-..."


In [2]:
# Event ID = the three exon IDs joined with '@' (exon1@exon2@exon3)
exon_columns = ['exon1', 'exon2', 'exon3']
event_ids = exons_to_junctions[exon_columns[0]]
for column in exon_columns[1:]:
    event_ids = event_ids + '@' + exons_to_junctions[column]
exons_to_junctions['event_id'] = event_ids
exons_to_junctions.head()

Unnamed: 0,exon1,exon2,exon3,junctions,event_id
0,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...
1,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...
2,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...
3,exon:chr10:100193697-100193848:-,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,"(chr10:100191049-100193696:-, chr10:100190428-...",exon:chr10:100193697-100193848:-@exon:chr10:10...
4,exon:chr10:100195392-100195529:-,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,"(chr10:100195172-100195391:-, chr10:100193849-...",exon:chr10:100195392-100195529:-@exon:chr10:10...


In [3]:
# One row per (junction, exon) pair, with the junction's direction
# relative to the exon ("upstream" / "downstream")
junction_exons_csv = '{}/junction_exons.csv'.format(folder)
exon_junction_direction = pd.read_csv(junction_exons_csv, index_col=0)
print(exon_junction_direction.shape)
exon_junction_direction.head()

(706732, 3)


Unnamed: 0,junction_location,exon,direction
0,chr10:100008749-100010821:-,exon:chr10:100010822-100010933:-,upstream
1,chr10:100010934-100011322:-,exon:chr10:100011323-100011459:-,upstream
2,chr10:100143626-100144703:-,exon:chr10:100144704-100144824:-,upstream
3,chr10:100144825-100146957:-,exon:chr10:100146958-100147064:-,upstream
4,chr10:100147065-100155147:-,exon:chr10:100155148-100155209:-,upstream


## Write alternative exons to bed file

In [57]:
from poshsplice.region import Region

# Parse each alternative (middle) exon ID into a Region object
exons_to_junctions['exon2_region'] = exons_to_junctions.exon2.map(Region)

# Build the table from plain dicts in a single DataFrame call instead of
# creating one pandas Series per exon, which is slow for tens of thousands of
# rows. The index is copied from exons_to_junctions so that the event IDs
# assigned below line up row-for-row.
# NOTE(review): start is written exactly as Region._start; BED starts are
# 0-based -- confirm whether these coordinates need a -1 shift.
# NOTE(review): _start/_stop are private attributes of Region -- confirm they
# are the intended (genome-order) coordinates.
exon2_bed = pd.DataFrame(
    [dict(chrom=region.chrom, start=region._start, stop=region._stop,
          strand=region.strand, score=1000)
     for region in exons_to_junctions.exon2_region],
    index=exons_to_junctions.index)

# BED "name" column is the full exon1@exon2@exon3 event ID
exon2_bed['name'] = exons_to_junctions.event_id
exon2_bed = exon2_bed.reindex(columns=['chrom', 'start', 'stop', 'name', 'score', 'strand'])

print(exon2_bed.shape)
exon2_bed.head()

(58842, 6)


Unnamed: 0,chrom,start,stop,name,score,strand
0,chr10,100189548,100189646,exon:chr10:100190328-100190427:-@exon:chr10:10...,1000,-
1,chr10,100189548,100189646,exon:chr10:100190328-100190427:-@exon:chr10:10...,1000,-
2,chr10,100189548,100189646,exon:chr10:100190328-100190427:-@exon:chr10:10...,1000,-
3,chr10,100190888,100191048,exon:chr10:100193697-100193848:-@exon:chr10:10...,1000,-
4,chr10,100195029,100195171,exon:chr10:100195392-100195529:-@exon:chr10:10...,1000,-


In [56]:
# Tab-separated, no header and no index column
exon2_bed_filename = '{}/skipped_exon_exon2.bed'.format(folder)
exon2_bed.to_csv(exon2_bed_filename, sep='\t', header=False, index=False)

## Get internal, constitutive exons

### Internal: have both downstream and upstream junctions

In [4]:
# Number of junctions each exon has in each direction
by_exon_and_direction = exon_junction_direction.groupby(['exon', 'direction'])
exon_direction = by_exon_and_direction.size()
exon_direction.head()

exon                              direction 
exon:chr10:100003848-100004106:+  downstream    1
exon:chr10:100003848-100004321:+  downstream    1
exon:chr10:100003848-100004651:+  downstream    1
exon:chr10:100003848-100004654:+  downstream    1
exon:chr10:100007447-100008748:-  downstream    1
dtype: int64

Count how many distinct junction directions ("upstream", "downstream") each exon has. An exon must have both, i.e. a count of two, to be internal.

In [5]:
# Count how many distinct directions each exon has; keep exons with exactly two
n_directions = exon_direction.groupby(level=0).size()
internal_exons = n_directions[n_directions == 2]
internal_exons.head()

exon
exon:chr10:100010822-100010933:-    2
exon:chr10:100144704-100144824:-    2
exon:chr10:100146958-100147064:-    2
exon:chr10:100155148-100155209:-    2
exon:chr10:100157102-100157255:-    2
dtype: int64

In [6]:
# Restrict the junction/exon table to rows whose exon is internal
is_internal = exon_junction_direction.exon.isin(internal_exons.index)
exon_juction_direction_internal = exon_junction_direction.loc[is_internal]
exon_juction_direction_internal.head()

Unnamed: 0,junction_location,exon,direction
0,chr10:100008749-100010821:-,exon:chr10:100010822-100010933:-,upstream
2,chr10:100143626-100144703:-,exon:chr10:100144704-100144824:-,upstream
3,chr10:100144825-100146957:-,exon:chr10:100146958-100147064:-,upstream
4,chr10:100147065-100155147:-,exon:chr10:100155148-100155209:-,upstream
6,chr10:100155210-100157101:-,exon:chr10:100157102-100157255:-,upstream


In [15]:
# Junction count per (exon, direction) among internal exons
junctions_per_direction = exon_juction_direction_internal.groupby(['exon', 'direction']).size()

# Keep (exon, direction) pairs with exactly one junction ...
exactly_one_junction = junctions_per_direction[junctions_per_direction == 1]

# ... and then only exons for which both directions survived that filter,
# i.e. exons with a single upstream and a single downstream junction.
single_upstream_downstream = exactly_one_junction.groupby(level=0).filter(lambda x: len(x) == 2)
print(single_upstream_downstream.shape)

# Show a preview only instead of the full Series
single_upstream_downstream.head(20)

(168130,)


exon                              direction 
exon:chr10:100010822-100010933:-  downstream    1
                                  upstream      1
exon:chr10:100144704-100144824:-  downstream    1
                                  upstream      1
exon:chr10:100146958-100147064:-  downstream    1
                                  upstream      1
exon:chr10:100155148-100155209:-  downstream    1
                                  upstream      1
exon:chr10:100157102-100157255:-  downstream    1
                                  upstream      1
exon:chr10:100159859-100160014:-  downstream    1
                                  upstream      1
exon:chr10:100167339-100167412:-  downstream    1
                                  upstream      1
exon:chr10:100167661-100167754:-  downstream    1
                                  upstream      1
exon:chr10:100170696-100170715:-  downstream    1
                                  upstream      1
exon:chr10:100177932-100178014:-  downstream    1
     

In [16]:
# Flatten the (exon, direction) MultiIndex into regular columns; the count
# column keeps the default name 0.
# NOTE(review): this rebinds the same name, so re-running this cell on its own
# would call reset_index() again on the already-flattened frame.
single_upstream_downstream = single_upstream_downstream.reset_index()
single_upstream_downstream.tail()

Unnamed: 0,exon,direction,0
168125,exon:chrY:6955308-6955473:+,upstream,1
168126,exon:chrY:7235397-7235474:+,downstream,1
168127,exon:chrY:7235397-7235474:+,upstream,1
168128,exon:chrY:7239784-7239930:+,downstream,1
168129,exon:chrY:7239784-7239930:+,upstream,1


In [20]:
# Number of distinct exons (each exon contributes two rows: one per direction)
single_upstream_downstream.exon.unique().shape

(84065,)

### Remove all exons that appear as exon2 in alternative annotations

In [19]:
# Drop every exon that is annotated as the middle exon (exon2) of an
# alternative event
is_alternative = single_upstream_downstream.exon.isin(exons_to_junctions.exon2)
constitutive = single_upstream_downstream.loc[~is_alternative]
print(constitutive.exon.unique().shape)
constitutive.head()

(74549,)


Unnamed: 0,exon,direction,0
0,exon:chr10:100010822-100010933:-,downstream,1
1,exon:chr10:100010822-100010933:-,upstream,1
2,exon:chr10:100144704-100144824:-,downstream,1
3,exon:chr10:100144704-100144824:-,upstream,1
4,exon:chr10:100146958-100147064:-,downstream,1


In [23]:
from poshsplice.region import Region

# One Region object per distinct candidate constitutive exon
constitutive_exon_ids = pd.Series(constitutive.exon.unique())
constitutive_regions = constitutive_exon_ids.map(Region)
constitutive_regions.head()

0    exon:chr10:100010822-100010933:-
1    exon:chr10:100144704-100144824:-
2    exon:chr10:100146958-100147064:-
3    exon:chr10:100155148-100155209:-
4    exon:chr10:100157102-100157255:-
dtype: object

Check that the junctions for these exons are seen in 20+ single cells per celltype

In [26]:
csv_folder = '/home/obotvinnik/projects/singlecell_pnms/analysis/csvs_for_paper'

# Combined 5' and 3' junction psi matrix (rows: samples, columns: junctions),
# read from the 'psi' key of the HDF file
psi_hdf = '{}/psi5_psi3_combined.hdf'.format(csv_folder)
psi3_psi5 = pd.read_hdf(psi_hdf, 'psi')
print(psi3_psi5.shape)
psi3_psi5.head()

(214, 573064)


Unnamed: 0_level_0,chr10:100008749-100010821:-|5p,chr10:100010934-100011322:-|5p,chr10:100143626-100144703:-|5p,chr10:100144825-100146957:-|5p,chr10:100147065-100155147:-|5p,chr10:100150512-100150766:-|5p,chr10:100155210-100157101:-|5p,chr10:100157256-100159858:-|5p,chr10:100160015-100167338:-|5p,chr10:100167413-100167660:-|5p,...,chrY:7194211-7209155:+|3p,chrY:7194211-7224175:+|3p,chrY:7198575-7201095:+|3p,chrY:7208064-7209155:+|3p,chrY:7209276-7224175:+|3p,chrY:7223339-7223437:+|3p,chrY:7224272-7235396:+|3p,chrY:7225609-7226951:+|3p,chrY:7235475-7239783:+|3p,chrY:7239931-7243752:+|3p
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CVN_01,,,,,,,,,,,...,,,,,,,,,,
CVN_02,,,,,,,,,,,...,0.0,0.0,,1.0,1.0,,1.0,,1.0,1.0
CVN_03,,,,,,,,,,,...,,,,,,,,,,
CVN_04,,,,,,,,,,,...,,,,,,,,,,
CVN_05,,,,,,,,,,,...,,,,,,,,,,


In [29]:
# List the files in csv_folder (IPython shell automagic, not plain Python)
ls $csv_folder

expression.csv              psi5_psi3_combined.hdf
expression_stressed.csv     psi5_psi3_combined_metadata.csv
mapping_stats.csv           psi5_psi3_combined_metadata.hdf
mapping_stats_stressed.csv  psi5_stressed.csv
metadata.csv                sj_raw.csv
metadata_stressed.csv       skipped_exon_psi_from_junctions.csv
psi3.csv                    splice_junction_metadata.csv
psi3_stressed.csv           splice_junctions_psi.csv
psi5.csv                    splicing.csv
psi5_psi3_combined.csv      splicing_stressed.csv


In [30]:
# Per-junction annotation table, indexed by junction ID
metadata_hdf = '{}/psi5_psi3_combined_metadata.hdf'.format(csv_folder)
sj_metadata = pd.read_hdf(metadata_hdf, 'metadata')
sj_metadata.head()

Unnamed: 0_level_0,chrom,intron_start,intron_stop,strand,intron_motif,annotated,gencode_id,ensembl_id,gene_id,gene_name,...,gerstberger2014_rbp_target_ncrna,gerstberger2014_rbp_target_ribosome,gerstberger2014_rbp_target_trna,gerstberger2014_tf,rbp,ribosomal,ribosomal_subunit,synapse,transcription_factor,intron_length
intron_location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1:120933-129054:-|5p,chr1,120933,129054,-,GT/AG,True,ENSG00000238009.2,ENSG00000238009,ENSG00000238009.2,RP11-34P13.7,...,False,False,False,False,False,False,False,False,False,8121
chr1:135767-137718:-|5p,chr1,135767,137718,-,GC/AG,False,ENSG00000237683.5,ENSG00000237683,ENSG00000237683.5,AL627309.1,...,False,False,False,False,False,False,False,False,False,1951
chr1:326417-327587:+|5p,chr1,326417,327587,+,GC/AG,False,"ENSG00000237094.7,ENSG00000250575.1","ENSG00000237094,ENSG00000250575","ENSG00000237094.7,ENSG00000250575.1","RP4-669L17.10,RP4-669L17.8",...,False,False,False,False,False,False,False,False,False,1170
chr1:334298-342391:+|5p,chr1,334298,342391,+,GT/AG,True,ENSG00000237094.7,ENSG00000237094,ENSG00000237094.7,RP4-669L17.10,...,False,False,False,False,False,False,False,False,False,8093
chr1:566538-566579:-|5p,chr1,566538,566579,-,GC/AG,False,ENSG00000230021.3,ENSG00000230021,ENSG00000230021.3,RP5-857K21.4,...,False,False,False,False,False,False,False,False,False,41


In [31]:
# Coordinates of the exon boundaries flanking each junction, in genome order:
# the base just before the intron is where the preceding exon stops ...
sj_metadata['exon_stop'] = sj_metadata['intron_start'] - 1
# ... and the base just after the intron is where the following exon starts.
sj_metadata['exon_start'] = sj_metadata['intron_stop'] + 1

In [27]:
# Per-sample metadata, indexed by sample ID
metadata_csv = '{}/metadata.csv'.format(csv_folder)
metadata = pd.read_csv(metadata_csv, index_col=0)
metadata.head()

Unnamed: 0,single,pooled,phenotype,neuron,progenitor,stressed,craig_venter,differentiation_batch,split
CVN_01,True,False,NPC,False,True,False,True,,
CVN_02,True,False,NPC,False,True,False,True,,
CVN_03,True,False,NPC,False,True,False,True,,
CVN_04,True,False,NPC,False,True,False,True,,
CVN_05,True,False,NPC,False,True,False,True,,


In [10]:
# Phenotype label of every single-cell sample (rows where `single` is True)
phenotype = metadata.loc[metadata.single, 'phenotype']

In [37]:
# (n_samples, n_junctions) before any filtering
psi3_psi5.shape

(214, 573064)

In [36]:
# Non-null psi values per junction within each phenotype
n_detected = psi3_psi5.groupby(phenotype).count()

# Keep junctions detected in at least 20 cells in every phenotype
in_20_cells_every_phenotype = (n_detected >= 20).all()
psi20_all = psi3_psi5.loc[:, in_20_cells_every_phenotype]
psi20_all.head()

Unnamed: 0_level_0,chr10:101157444-101162336:-|5p,chr10:101162480-101163225:-|5p,chr10:101163392-101163480:-|5p,chr10:101163392-101165512:-|5p,chr10:101163632-101165512:-|5p,chr10:101166007-101190204:-|5p,chr10:101437763-101476104:-|5p,chr10:101456299-101476104:-|5p,chr10:101473237-101476104:-|5p,chr10:101474476-101476104:-|5p,...,chrY:2713785-2734833:+|3p,chrY:2714508-2722640:+|3p,chrY:2721733-2722640:+|3p,chrY:2722746-2734833:+|3p,chrY:2722813-2733128:+|3p,chrY:2722813-2734833:+|3p,chrY:2722813-2789783:+|3p,chrY:2722813-2796904:+|3p,chrY:2733287-2734833:+|3p,chrY:2733362-2734833:+|3p
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CVN_01,1.0,1.0,1.0,0.0,1.0,0.0,,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
CVN_02,1.0,1.0,1.0,0.0,1.0,0.0,,,,,...,,,,,,,,,,
CVN_03,1.0,1.0,1.0,0.0,1.0,0.0,,,,,...,,,,,,,,,,
CVN_04,,,,,,,,,,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
CVN_05,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


Get junctions whose psi score is equal to 1 in every cell where the junction was detected, in all cell types

In [39]:
def all_observed_equal_one(psi):
    """Per column: True when every non-null value in `psi` equals 1."""
    observed = psi.notnull()
    # Masking with `observed` blanks out the missing cells so that they are
    # skipped by all() rather than counted as "not equal to 1"
    return (psi == 1)[observed].all(axis=0)


psi_is_1 = psi20_all.groupby(phenotype).apply(all_observed_equal_one)
psi_is_1.head()

Unnamed: 0_level_0,chr10:101157444-101162336:-|5p,chr10:101162480-101163225:-|5p,chr10:101163392-101163480:-|5p,chr10:101163392-101165512:-|5p,chr10:101163632-101165512:-|5p,chr10:101166007-101190204:-|5p,chr10:101437763-101476104:-|5p,chr10:101456299-101476104:-|5p,chr10:101473237-101476104:-|5p,chr10:101474476-101476104:-|5p,...,chrY:2713785-2734833:+|3p,chrY:2714508-2722640:+|3p,chrY:2721733-2722640:+|3p,chrY:2722746-2734833:+|3p,chrY:2722813-2733128:+|3p,chrY:2722813-2734833:+|3p,chrY:2722813-2789783:+|3p,chrY:2722813-2796904:+|3p,chrY:2733287-2734833:+|3p,chrY:2733362-2734833:+|3p
phenotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MN,True,True,False,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
NPC,True,True,True,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,True,False
iPSC,True,True,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [40]:
# Junctions flagged True in every phenotype
always_one = psi_is_1.all()
psi20_is_1_all = psi20_all.loc[:, always_one]
psi20_is_1_all.head()

Unnamed: 0_level_0,chr10:101157444-101162336:-|5p,chr10:101162480-101163225:-|5p,chr10:101163632-101165512:-|5p,chr10:101480826-101483712:-|5p,chr10:101483881-101486724:-|5p,chr10:101487321-101489309:-|5p,chr10:101489492-101491716:-|5p,chr10:102035251-102039880:-|5p,chr10:102107289-102107820:+|5p,chr10:102286312-102286731:-|5p,...,chrX:77378872-77380370:+|3p,chrX:77380923-77381286:+|3p,chrX:80532669-80533829:+|3p,chrX:80533911-80552693:+|3p,chrX:99887566-99888401:-|3p,chrX:99890250-99890554:-|3p,chrY:1455653-1456171:-|3p,chrY:1456313-1458133:-|3p,chrY:14774638-14776570:+|3p,chrY:2710284-2712117:+|3p
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CVN_01,1.0,1.0,1.0,,,,,1.0,,1,...,1.0,1.0,1,1,1.0,1.0,1,1,1.0,1.0
CVN_02,1.0,1.0,1.0,,,,,,1.0,1,...,1.0,1.0,1,1,1.0,1.0,1,1,,
CVN_03,1.0,1.0,1.0,,,,,1.0,1.0,1,...,,,1,1,,,1,1,1.0,
CVN_04,,,,,,,,,,1,...,1.0,1.0,1,1,1.0,1.0,1,1,1.0,1.0
CVN_05,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,1,...,1.0,1.0,1,1,1.0,1.0,1,1,1.0,1.0


In [42]:
# Annotation rows for the junctions that passed both filters
constitutive_junction_ids = psi20_is_1_all.columns
sj20_is_1_all = sj_metadata.loc[constitutive_junction_ids, :]
print(sj20_is_1_all.shape)
sj20_is_1_all.head()

(9313, 43)


Unnamed: 0,chrom,intron_start,intron_stop,strand,intron_motif,annotated,gencode_id,ensembl_id,gene_id,gene_name,...,gerstberger2014_rbp_target_trna,gerstberger2014_tf,rbp,ribosomal,ribosomal_subunit,synapse,transcription_factor,intron_length,exon_stop,exon_start
chr10:101157444-101162336:-|5p,chr10,101157444,101162336,-,GT/AG,True,ENSG00000120053.9,ENSG00000120053,ENSG00000120053.9,GOT1,...,False,False,False,False,False,False,False,4892,101157443,101162337
chr10:101162480-101163225:-|5p,chr10,101162480,101163225,-,GT/AG,True,ENSG00000120053.9,ENSG00000120053,ENSG00000120053.9,GOT1,...,False,False,False,False,False,False,False,745,101162479,101163226
chr10:101163632-101165512:-|5p,chr10,101163632,101165512,-,GT/AG,True,ENSG00000120053.9,ENSG00000120053,ENSG00000120053.9,GOT1,...,False,False,False,False,False,False,False,1880,101163631,101165513
chr10:101480826-101483712:-|5p,chr10,101480826,101483712,-,GT/AG,True,ENSG00000014919.8,ENSG00000014919,ENSG00000014919.8,COX15,...,False,False,False,False,False,False,False,2886,101480825,101483713
chr10:101483881-101486724:-|5p,chr10,101483881,101486724,-,GT/AG,True,ENSG00000014919.8,ENSG00000014919,ENSG00000014919.8,COX15,...,False,False,False,False,False,False,False,2843,101483880,101486725


In [43]:
# Build the BED-style table from plain dicts in one DataFrame call instead of
# creating one pandas Series per exon, which is slow for tens of thousands of
# regions.
# NOTE(review): start is written exactly as Region._start; BED starts are
# 0-based -- confirm whether these coordinates need a -1 shift.
records = [
    dict(chrom=region.chrom, start=region._start, stop=region._stop,
         strand=region.strand, score=1000, name=region.name)
    for region in constitutive_regions
]
constitutive_bed = pd.DataFrame(
    records, columns=['chrom', 'start', 'stop', 'name', 'score', 'strand'])
constitutive_bed.head()

Unnamed: 0,chrom,start,stop,name,score,strand
0,chr10,100010822,100010933,exon:chr10:100010822-100010933:-,1000,-
1,chr10,100144704,100144824,exon:chr10:100144704-100144824:-,1000,-
2,chr10,100146958,100147064,exon:chr10:100146958-100147064:-,1000,-
3,chr10,100155148,100155209,exon:chr10:100155148-100155209:-,1000,-
4,chr10,100157102,100157255,exon:chr10:100157102-100157255:-,1000,-


In [47]:
# NOTE(review): leftover interactive help lookup. In top-to-bottom order
# `constitutive_exon_start` is not defined until the next cell, so this cell
# can be deleted from the final notebook.
constitutive_exon_start.merge?

In [46]:
# Exons whose stop coordinate abuts a retained junction
stop_exon_keys = ['chrom', 'stop', 'strand']
stop_junction_keys = ['chrom', 'exon_stop', 'strand']
constitutive_exon_stop = constitutive_bed.merge(
    sj20_is_1_all, left_on=stop_exon_keys, right_on=stop_junction_keys)
print('constitutive_exon_stop {}'.format(constitutive_exon_stop.shape))

# Exons whose start coordinate abuts a retained junction
start_exon_keys = ['chrom', 'start', 'strand']
start_junction_keys = ['chrom', 'exon_start', 'strand']
constitutive_exon_start = constitutive_bed.merge(
    sj20_is_1_all, left_on=start_exon_keys, right_on=start_junction_keys)
print('constitutive_exon_start {}'.format(constitutive_exon_start.shape))

constitutive_exon_stop (3501, 47)
constitutive_exon_start (3420, 47)


In [48]:
# Preview the exons whose start coordinate matched a junction's exon_start
constitutive_exon_start.head()

Unnamed: 0,chrom,start,stop,name,score,strand,intron_start,intron_stop,intron_motif,annotated,...,gerstberger2014_rbp_target_trna,gerstberger2014_tf,rbp,ribosomal,ribosomal_subunit,synapse,transcription_factor,intron_length,exon_stop,exon_start
0,chr10,101162337,101162479,exon:chr10:101162337-101162479:-,1000,-,101157444,101162336,GT/AG,True,...,False,False,False,False,False,False,False,4892,101157443,101162337
1,chr10,101162337,101162479,exon:chr10:101162337-101162479:-,1000,-,101157444,101162336,GT/AG,True,...,False,False,False,False,False,False,False,4892,101157443,101162337
2,chr10,101483713,101483880,exon:chr10:101483713-101483880:-,1000,-,101480826,101483712,GT/AG,True,...,False,False,False,False,False,False,False,2886,101480825,101483713
3,chr10,101483713,101483880,exon:chr10:101483713-101483880:-,1000,-,101480826,101483712,GT/AG,True,...,False,False,False,False,False,False,False,2886,101480825,101483713
4,chr10,101486725,101486911,exon:chr10:101486725-101486911:-,1000,-,101483881,101486724,GT/AG,True,...,False,False,False,False,False,False,False,2843,101483880,101486725


In [50]:
# Keep exons present in both tables, i.e. matched to a retained junction on
# each side. Overlapping column names get the _x (start side) and _y (stop
# side) suffixes.
constitutive_exons = constitutive_exon_start.merge(constitutive_exon_stop, on='name')
print(constitutive_exons.shape)
constitutive_exons.head()

(5229, 93)


Unnamed: 0,chrom_x,start_x,stop_x,name,score_x,strand_x,intron_start_x,intron_stop_x,intron_motif_x,annotated_x,...,gerstberger2014_rbp_target_trna_y,gerstberger2014_tf_y,rbp_y,ribosomal_y,ribosomal_subunit_y,synapse_y,transcription_factor_y,intron_length_y,exon_stop_y,exon_start_y
0,chr10,101162337,101162479,exon:chr10:101162337-101162479:-,1000,-,101157444,101162336,GT/AG,True,...,False,False,False,False,False,False,False,745,101162479,101163226
1,chr10,101162337,101162479,exon:chr10:101162337-101162479:-,1000,-,101157444,101162336,GT/AG,True,...,False,False,False,False,False,False,False,745,101162479,101163226
2,chr10,101162337,101162479,exon:chr10:101162337-101162479:-,1000,-,101157444,101162336,GT/AG,True,...,False,False,False,False,False,False,False,745,101162479,101163226
3,chr10,101162337,101162479,exon:chr10:101162337-101162479:-,1000,-,101157444,101162336,GT/AG,True,...,False,False,False,False,False,False,False,745,101162479,101163226
4,chr10,101483713,101483880,exon:chr10:101483713-101483880:-,1000,-,101480826,101483712,GT/AG,True,...,False,False,False,False,False,False,False,2843,101483880,101486725


In [52]:
# Keep only the six BED columns and collapse the duplicate rows produced by
# the merges
start_side_columns = ['chrom_x', 'start_x', 'stop_x', 'name', 'score_x', 'strand_x']
bed = constitutive_exons[start_side_columns].drop_duplicates()
print(bed.shape)

# Strip the merge suffix ("chrom_x" -> "chrom"); "name" is left unchanged
bed = bed.rename(columns=lambda column: column.split('_')[0])
bed.head()

(1449, 6)


Unnamed: 0,chrom,start,stop,name,score,strand
0,chr10,101162337,101162479,exon:chr10:101162337-101162479:-,1000,-
4,chr10,101483713,101483880,exon:chr10:101483713-101483880:-,1000,-
8,chr10,101489310,101489491,exon:chr10:101489310-101489491:-,1000,-
12,chr10,104183414,104183474,exon:chr10:104183414-104183474:-,1000,-
16,chr10,104245366,104245490,exon:chr10:104245366-104245490:-,1000,-


In [58]:
# Tab-separated, no header and no index column
constitutive_bed_filename = '{}/constitutive_exons.bed'.format(folder)
bed.to_csv(constitutive_bed_filename, sep='\t', header=False, index=False)

## Submit compute job to calculate exon conservation

In [None]:
folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms'

# The two BED files written earlier in this notebook
constitutive_bedfile = '{}/constitutive_exons.bed'.format(folder)
alt_exons_bedfile = '{}/skipped_exon_exon2.bed'.format(folder)

# Order matters: alternative exons first, then constitutive exons
bedfiles = (alt_exons_bedfile, constitutive_bedfile)

In [60]:
import os
import glob

from gscripts.qtools import Submitter

# phastCons (placental mammal) conservation track for hg19
bw = '/projects/ps-yeolab/genomes/hg19/hg19_phastcons_placental_mammal.bw'


def conservation_command(bedfile):
    """Build the bigWigAverageOverBed command line for one BED file.

    The outputs are written to `folder`, named after the input file with
    '_phastcons_placental_mammal' appended (.txt table and .bed via -bedOut).
    """
    prefix = os.path.basename(bedfile).split('.bed')[0] + '_phastcons_placental_mammal'
    bedout = '{}/{}'.format(folder, prefix + '.bed')
    outtab = '{}/{}'.format(folder, prefix + '.txt')
    return 'bigWigAverageOverBed {} {} {} -bedOut={}'.format(bw, bedfile, outtab, bedout)


commands = [conservation_command(bedfile) for bedfile in bedfiles]
for command in commands:
    print(command)

# Submit the commands as a single array job; logs go next to the data
jobname = 'exonbody_conservation'
Submitter(commands, jobname, array=True, walltime='2:00:00', write_and_submit=True,
          err_filename='{}/{}.err'.format(folder, jobname),
          out_filename='{}/{}.out'.format(folder, jobname))

1449