In [1]:
import ast
import itertools
import re
from collections import OrderedDict

import gffutils
import numpy as np
import pandas as pd
import pybedtools

# GENCODE v19 annotation database, queried below by exon ID to fetch gene attributes.
v19db_filename = '/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf.db'
v19db = gffutils.FeatureDB(v19db_filename)

# Project folder: the input table is read from here and the result is written back here.
folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms'

# The CSV has no header row: three exon-ID columns followed by the junctions column.
# Naming the columns at read time replaces the former
# index_col / squeeze / reset_index / rename round trip; the `squeeze` keyword
# was removed from read_csv in pandas 2.0.
exons_to_junctions = pd.read_csv(
    '{}/exons_to_junctions_se.csv'.format(folder),
    header=None,
    names=['exon1', 'exon2', 'exon3', 'junctions'])

# Each `junctions` cell appears to hold the text of a Python tuple of junction IDs.
# ast.literal_eval only parses literals, so a malformed or tampered cell raises
# an error instead of being executed as code the way eval() would.
exons_to_junctions['junctions'] = exons_to_junctions['junctions'].map(ast.literal_eval)
exons_to_junctions.head()

Unnamed: 0,exon1,exon2,exon3,junctions
0,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-..."
1,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-..."
2,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-..."
3,exon:chr10:100193697-100193848:-,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,"(chr10:100191049-100193696:-, chr10:100190428-..."
4,exon:chr10:100195392-100195529:-,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,"(chr10:100195172-100195391:-, chr10:100193849-..."


In [2]:
# An event ID is the three exon IDs of the row joined with '@'.
first_two_exons = exons_to_junctions['exon1'] + '@' + exons_to_junctions['exon2']
exons_to_junctions['event_id'] = first_two_exons + '@' + exons_to_junctions['exon3']
exons_to_junctions.head()

Unnamed: 0,exon1,exon2,exon3,junctions,event_id
0,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...
1,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...
2,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...
3,exon:chr10:100193697-100193848:-,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,"(chr10:100191049-100193696:-, chr10:100190428-...",exon:chr10:100193697-100193848:-@exon:chr10:10...
4,exon:chr10:100195392-100195529:-,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,"(chr10:100195172-100195391:-, chr10:100193849-...",exon:chr10:100195392-100195529:-@exon:chr10:10...


In [3]:
# Spot check: look up one exon ID that occurs in the table (exon2 of the first rows)
# in the GENCODE database and show its gene name attribute (a list).
exon = v19db['exon:chr10:100189548-100189646:-']
exon.attributes['gene_name']

['HPS1']

In [4]:
# Spot check on the same exon ID: show its versioned GENCODE gene ID attribute (a list).
exon = v19db['exon:chr10:100189548-100189646:-']
exon.attributes['gene_id']

['ENSG00000107521.14']

In [5]:
# Names of the three exon-ID columns, in event order.
exon_cols = list('exon{}'.format(position) for position in (1, 2, 3))

In [7]:

# Versioned GENCODE gene ID(s) of the three exons of each event, comma-joined.
# OrderedDict.fromkeys de-duplicates while keeping first-seen order (exon1, then
# exon2, then exon3). The previous set() left the order arbitrary for events
# spanning several genes, which matters because a later cell keeps only the
# first ID of the comma-separated list.
exons_to_junctions['gencode_id'] = exons_to_junctions[exon_cols].apply(
    lambda exon_ids: ','.join(OrderedDict.fromkeys(
        itertools.chain.from_iterable(
            v19db[exon_id].attributes['gene_id'] for exon_id in exon_ids))),
    axis=1)

In [8]:
# Gene name(s) of the three exons of each event, comma-joined.
# OrderedDict.fromkeys de-duplicates while keeping first-seen order (exon1, then
# exon2, then exon3); the previous set() produced an arbitrary order for events
# that span more than one gene.
exons_to_junctions['gene_name'] = exons_to_junctions[exon_cols].apply(
    lambda exon_ids: ','.join(OrderedDict.fromkeys(
        itertools.chain.from_iterable(
            v19db[exon_id].attributes['gene_name'] for exon_id in exon_ids))),
    axis=1)

In [9]:
# Ensembl gene ID(s) of each event: the GENCODE gene_id with everything from the
# first '.' onwards (the version suffix) dropped, comma-joined.
# OrderedDict.fromkeys de-duplicates while keeping first-seen order (exon1, then
# exon2, then exon3); the previous set() produced an arbitrary order, so the
# "first" ID of a multi-gene event was not well defined.
exons_to_junctions['ensembl_id'] = exons_to_junctions[exon_cols].apply(
    lambda exon_ids: ','.join(OrderedDict.fromkeys(
        gene_id.split('.')[0]
        for exon_id in exon_ids
        for gene_id in v19db[exon_id].attributes['gene_id'])),
    axis=1)

In [10]:
from poshsplice.region import Region

In [11]:
# Length of the second exon of each event, taken as len() of its Region.
# NOTE: the per-exon loop in the next cell assigns exon2_length again using the
# same computation.
exons_to_junctions['exon2_length'] = exons_to_junctions['exon2'].apply(
    lambda exon_id: len(Region(exon_id)))

In [12]:
# For each of the three exon columns, store a parsed Region object and its length.
ns = [1, 2, 3]
for n in ns:
    exon_col = 'exon{}'.format(n)
    region_col = 'exon{}_region'.format(n)
    length_col = 'exon{}_length'.format(n)
    exons_to_junctions[region_col] = exons_to_junctions[exon_col].map(Region)
    exons_to_junctions[length_col] = exons_to_junctions[region_col].map(len)

In [None]:
# The strand is the last character of the exon ID (e.g. '...:-').
exons_to_junctions['strand'] = exons_to_junctions['exon1'].str.get(-1)
exons_to_junctions.head()

In [19]:
# Length of the intron between exon1 and exon2.
# Rows whose strand is not '+' are all handled by the second orientation.
positive_strand = exons_to_junctions['strand'] == '+'
exons_to_junctions['intron1_length'] = np.nan

# (row mask, exon at lower coordinates, exon at higher coordinates)
# Region._start/_stop are private attributes; presumably the inclusive coordinates
# from the exon ID -- TODO confirm. The first displayed row gives 681, which is
# the length of its first junction (100189647-100190327).
intron1_orientations = [
    (positive_strand, 'exon1_region', 'exon2_region'),
    (~positive_strand, 'exon2_region', 'exon1_region'),
]
for strand_mask, lower_col, upper_col in intron1_orientations:
    gap_lengths = exons_to_junctions.loc[strand_mask].apply(
        lambda row: row[upper_col]._start - row[lower_col]._stop - 1, axis=1)
    exons_to_junctions.loc[strand_mask, 'intron1_length'] = gap_lengths
exons_to_junctions.head()

Unnamed: 0,exon1,exon2,exon3,junctions,event_id,gencode_id,gene_name,ensembl_id,exon2_length,exon1_region,exon1_length,exon2_region,exon3_region,exon3_length,strand,intron1_length
0,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,492,-,681
1,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,487,-,681
2,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,70,-,681
3,exon:chr10:100193697-100193848:-,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,"(chr10:100191049-100193696:-, chr10:100190428-...",exon:chr10:100193697-100193848:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,161,exon:chr10:100193697-100193848:-,152,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,100,-,2648
4,exon:chr10:100195392-100195529:-,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,"(chr10:100195172-100195391:-, chr10:100193849-...",exon:chr10:100195392-100195529:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,143,exon:chr10:100195392-100195529:-,138,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,152,-,220


In [20]:
# Length of the intron between exon2 and exon3.
# Rows whose strand is not '+' are all handled by the second orientation.
positive_strand = exons_to_junctions['strand'] == '+'
exons_to_junctions['intron2_length'] = np.nan

# (row mask, exon at lower coordinates, exon at higher coordinates)
# Region._start/_stop are private attributes; presumably the inclusive coordinates
# from the exon ID -- TODO confirm.
intron2_orientations = [
    (positive_strand, 'exon2_region', 'exon3_region'),
    (~positive_strand, 'exon3_region', 'exon2_region'),
]
for strand_mask, lower_col, upper_col in intron2_orientations:
    gap_lengths = exons_to_junctions.loc[strand_mask].apply(
        lambda row: row[upper_col]._start - row[lower_col]._stop - 1, axis=1)
    exons_to_junctions.loc[strand_mask, 'intron2_length'] = gap_lengths
exons_to_junctions.head()

Unnamed: 0,exon1,exon2,exon3,junctions,event_id,gencode_id,gene_name,ensembl_id,exon2_length,exon1_region,exon1_length,exon2_region,exon3_region,exon3_length,strand,intron1_length,intron2_length
0,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,492,-,681,148
1,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,487,-,681,148
2,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,70,-,681,148
3,exon:chr10:100193697-100193848:-,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,"(chr10:100191049-100193696:-, chr10:100190428-...",exon:chr10:100193697-100193848:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,161,exon:chr10:100193697-100193848:-,152,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,100,-,2648,460
4,exon:chr10:100195392-100195529:-,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,"(chr10:100195172-100195391:-, chr10:100193849-...",exon:chr10:100195392-100195529:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,143,exon:chr10:100195392-100195529:-,138,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,152,-,220,1180


In [21]:
# Combined length of the two introns of each event.
exons_to_junctions['intron12_length_summed'] = (
    exons_to_junctions['intron1_length'].add(exons_to_junctions['intron2_length']))
exons_to_junctions.head()

Unnamed: 0,exon1,exon2,exon3,junctions,event_id,gencode_id,gene_name,ensembl_id,exon2_length,exon1_region,exon1_length,exon2_region,exon3_region,exon3_length,strand,intron1_length,intron2_length,intron12_length_summed
0,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,492,-,681,148,829
1,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,487,-,681,148,829
2,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,70,-,681,148,829
3,exon:chr10:100193697-100193848:-,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,"(chr10:100191049-100193696:-, chr10:100190428-...",exon:chr10:100193697-100193848:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,161,exon:chr10:100193697-100193848:-,152,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,100,-,2648,460,3108
4,exon:chr10:100195392-100195529:-,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,"(chr10:100195172-100195391:-, chr10:100193849-...",exon:chr10:100195392-100195529:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,143,exon:chr10:100195392-100195529:-,138,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,152,-,220,1180,1400


In [31]:
# Inspect the column dtypes (the intron lengths are float64 because those columns start as NaN).
# NOTE(review): the saved output of this cell already lists exon2_divisible_by_3,
# which is only created in the following cell -- the cells were run out of order.
# Re-run with Restart & Run All before trusting the saved outputs.
exons_to_junctions.dtypes

exon1                      object
exon2                      object
exon3                      object
junctions                  object
event_id                   object
gencode_id                 object
gene_name                  object
ensembl_id                 object
exon2_length                int64
exon1_region               object
exon1_length                int64
exon2_region               object
exon3_region               object
exon3_length                int64
strand                     object
intron1_length            float64
intron2_length            float64
intron12_length_summed    float64
exon2_divisible_by_3         bool
dtype: object

In [32]:
# Flag events whose second exon has a length that is a multiple of three.
exon2_lengths = exons_to_junctions['exon2_length'].astype(int)
exons_to_junctions['exon2_divisible_by_3'] = (exon2_lengths % 3).eq(0)
exons_to_junctions.head()

Unnamed: 0,exon1,exon2,exon3,junctions,event_id,gencode_id,gene_name,ensembl_id,exon2_length,exon1_region,exon1_length,exon2_region,exon3_region,exon3_length,strand,intron1_length,intron2_length,intron12_length_summed,exon2_divisible_by_3
0,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,492,-,681,148,829,True
1,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,487,-,681,148,829,True
2,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-...",exon:chr10:100190328-100190427:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,99,exon:chr10:100190328-100190427:-,100,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,70,-,681,148,829,True
3,exon:chr10:100193697-100193848:-,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,"(chr10:100191049-100193696:-, chr10:100190428-...",exon:chr10:100193697-100193848:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,161,exon:chr10:100193697-100193848:-,152,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,100,-,2648,460,3108,False
4,exon:chr10:100195392-100195529:-,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,"(chr10:100195172-100195391:-, chr10:100193849-...",exon:chr10:100195392-100195529:-@exon:chr10:10...,ENSG00000107521.14,HPS1,ENSG00000107521,143,exon:chr10:100195392-100195529:-,138,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,152,-,220,1180,1400,False


In [33]:
# Fraction of events whose second exon length is a multiple of three
# (the mean of a boolean column is the share of True values).
exons_to_junctions['exon2_divisible_by_3'].mean()

0.38737636382175999

In [34]:
# Columns kept in the exported per-event metadata table; event_id becomes the index.
cols = [
    'event_id',
    'gencode_id',
    'ensembl_id',
    'gene_name',
    'exon1_length',
    'exon2_length',
    'exon3_length',
    'intron1_length',
    'intron2_length',
    'exon2_divisible_by_3',
    'intron12_length_summed',
]
exon_metadata = exons_to_junctions[cols].set_index('event_id')
exon_metadata.head()

Unnamed: 0_level_0,gencode_id,ensembl_id,gene_name,exon1_length,exon2_length,exon3_length,intron1_length,intron2_length,exon2_divisible_by_3,intron12_length_summed
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
exon:chr10:100190328-100190427:-@exon:chr10:100189548-100189646:-@exon:chr10:100188908-100189399:-,ENSG00000107521.14,ENSG00000107521,HPS1,100,99,492,681,148,True,829
exon:chr10:100190328-100190427:-@exon:chr10:100189548-100189646:-@exon:chr10:100188913-100189399:-,ENSG00000107521.14,ENSG00000107521,HPS1,100,99,487,681,148,True,829
exon:chr10:100190328-100190427:-@exon:chr10:100189548-100189646:-@exon:chr10:100189330-100189399:-,ENSG00000107521.14,ENSG00000107521,HPS1,100,99,70,681,148,True,829
exon:chr10:100193697-100193848:-@exon:chr10:100190888-100191048:-@exon:chr10:100190328-100190427:-,ENSG00000107521.14,ENSG00000107521,HPS1,152,161,100,2648,460,False,3108
exon:chr10:100195392-100195529:-@exon:chr10:100195029-100195171:-@exon:chr10:100193697-100193848:-,ENSG00000107521.14,ENSG00000107521,HPS1,138,143,152,220,1180,False,1400


In [35]:
# Number of events whose ensembl_id holds more than one comma-joined gene ID,
# followed by the shape of the metadata table.
# Single-argument print(...) calls produce the same output on Python 2 and
# Python 3; the former print statements are a SyntaxError on Python 3.
print(exon_metadata.ensembl_id.str.contains(',').sum())
print(exon_metadata.shape)

3928
(58842, 10)


3,928 of the 58,842 splicing events (about 6.7%) have more than one gene ID associated with them. Since we want to intersect this information with existing gene annotations, we need a *single* ID per event. As a coarse but simple solution, we take the first gene ID in each event's comma-separated list for intersecting purposes.

In [36]:
# Keep only the first entry of the comma-separated Ensembl IDs of each event.
ensembl_id_lists = exon_metadata['ensembl_id'].str.split(',')
exon_metadata['one_ensembl_id'] = ensembl_id_lists.str.get(0)
exon_metadata.head()

Unnamed: 0_level_0,gencode_id,ensembl_id,gene_name,exon1_length,exon2_length,exon3_length,intron1_length,intron2_length,exon2_divisible_by_3,intron12_length_summed,one_ensembl_id
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
exon:chr10:100190328-100190427:-@exon:chr10:100189548-100189646:-@exon:chr10:100188908-100189399:-,ENSG00000107521.14,ENSG00000107521,HPS1,100,99,492,681,148,True,829,ENSG00000107521
exon:chr10:100190328-100190427:-@exon:chr10:100189548-100189646:-@exon:chr10:100188913-100189399:-,ENSG00000107521.14,ENSG00000107521,HPS1,100,99,487,681,148,True,829,ENSG00000107521
exon:chr10:100190328-100190427:-@exon:chr10:100189548-100189646:-@exon:chr10:100189330-100189399:-,ENSG00000107521.14,ENSG00000107521,HPS1,100,99,70,681,148,True,829,ENSG00000107521
exon:chr10:100193697-100193848:-@exon:chr10:100190888-100191048:-@exon:chr10:100190328-100190427:-,ENSG00000107521.14,ENSG00000107521,HPS1,152,161,100,2648,460,False,3108,ENSG00000107521
exon:chr10:100195392-100195529:-@exon:chr10:100195029-100195171:-@exon:chr10:100193697-100193848:-,ENSG00000107521.14,ENSG00000107521,HPS1,138,143,152,220,1180,False,1400,ENSG00000107521


### Write to a safe place

In [37]:
# Write the per-event metadata table (indexed by event_id) as CSV into `folder`,
# the same directory the input table was read from.
exon_metadata.to_csv('{}/splicing_feature_data.csv'.format(folder))