# Get translated protein product of isoforms

In [2]:
# IPython autoreload extension in mode 2, so edits to imported modules are
# picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

import ast
import re

import gffutils
import numpy as np
import pandas as pd
import pybedtools

# Folder holding this project's input and output tables
folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms'

# gffutils database file for the annotation, opened once and queried by
# feature id in the cells below
v19db_filename = '/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf.db'
v19db = gffutils.FeatureDB(v19db_filename)

In [94]:
figure_folder = '/home/obotvinnik/Dropbox/figures2/singlecell_pnms/isoform_protein_properties'
! mkdir $figure_folder

In [3]:
splicing_feature_data = pd.read_csv('{}/splicing_feature_data.csv'.format(folder), index_col=0)

In [4]:
exons_to_junctions = pd.read_csv('{}/exons_to_junctions_se.csv'.format(folder), index_col=[0, 1, 2], squeeze=True, header=None)
exons_to_junctions = exons_to_junctions.reset_index()
exons_to_junctions = exons_to_junctions.rename(columns={0: 'exon1', 1: 'exon2', 2: 'exon3', 3:'junctions'})
exons_to_junctions.head()

Unnamed: 0,exon1,exon2,exon3,junctions
0,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,"('chr10:100189647-100190327:-', 'chr10:1001894..."
1,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,"('chr10:100189647-100190327:-', 'chr10:1001894..."
2,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,"('chr10:100189647-100190327:-', 'chr10:1001894..."
3,exon:chr10:100193697-100193848:-,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,"('chr10:100191049-100193696:-', 'chr10:1001904..."
4,exon:chr10:100195392-100195529:-,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,"('chr10:100195172-100195391:-', 'chr10:1001938..."


In [5]:
exons_to_junctions['junctions'] = exons_to_junctions['junctions'].map(eval)
exons_to_junctions.head()

Unnamed: 0,exon1,exon2,exon3,junctions
0,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188908-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-..."
1,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100188913-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-..."
2,exon:chr10:100190328-100190427:-,exon:chr10:100189548-100189646:-,exon:chr10:100189330-100189399:-,"(chr10:100189647-100190327:-, chr10:100189400-..."
3,exon:chr10:100193697-100193848:-,exon:chr10:100190888-100191048:-,exon:chr10:100190328-100190427:-,"(chr10:100191049-100193696:-, chr10:100190428-..."
4,exon:chr10:100195392-100195529:-,exon:chr10:100195029-100195171:-,exon:chr10:100193697-100193848:-,"(chr10:100195172-100195391:-, chr10:100193849-..."


## Translate isoforms

In [6]:
prefix = 'skipped_exon_isoform_translations'
translated_fasta = '{}/{}.fa'.format(folder, prefix)

In [1]:
import itertools
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from Bio.Alphabet import generic_dna, generic_protein

# Genome FASTA handed to `Feature.sequence` when extracting CDS sequences below
hg19_fasta = '/projects/ps-yeolab/genomes/hg19/gencode/v19/GRCh37.p13.genome.fa'

def overlap(x, y):
    """Return True when the closed intervals of ``x`` and ``y`` intersect.

    Both arguments only need ``start`` and ``stop`` attributes. Intervals
    that merely touch (one stops where the other starts) count as
    overlapping.
    """
    if x.start > y.stop:
        # x lies entirely after y
        return False
    if x.stop < y.start:
        # x lies entirely before y
        return False
    return True

# Translate both isoforms of every skipped-exon event.
#
# For each (exon1, exon2, exon3) trio:
#   * isoform1 (exon2 skipped) uses the annotated transcripts that contain
#     exon1 and exon3 but not exon2;
#   * isoform2 (exon2 included) uses the annotated transcripts that contain
#     all three exons.
# Within a transcript, the CDS features overlapping the isoform's exons are
# collected; the transcript is used only when there is exactly one such CDS
# per exon. Each distinct CDS combination is translated once and stored as a
# SeqRecord whose id is '<event_id>|<cds ids>|<isoform>'.
seqrecords = []

for i, row in exons_to_junctions.iterrows():
    # Progress report every 1000 events
    if (i + 1) % 1000 == 0:
        print(i + 1)

    exon1 = v19db[row.exon1]
    exon2 = v19db[row.exon2]
    exon3 = v19db[row.exon3]

    exon_trio = exon1, exon2, exon3
    event_id = '@'.join(exon.id for exon in exon_trio)

    isoform_to_exons = {'isoform1': (exon1, exon3), 'isoform2': exon_trio}

    # One set of parent transcripts per exon, in exon1/exon2/exon3 order
    exon_transcripts = [set(v19db.parents(exon, featuretype='transcript'))
                        for exon in exon_trio]

    # Isoform 1: exclusion of exon2, so all transcripts that have exon1 and
    # exon3 but not exon2
    isoform1_transcripts = (exon_transcripts[0] & exon_transcripts[2]) - exon_transcripts[1]

    # Isoform 2: inclusion of exon2, so all transcripts that have exon1,
    # exon2, and exon3. This must be the intersection of the three sets; a
    # union would also admit transcripts containing only one of the exons.
    isoform2_transcripts = exon_transcripts[0] & exon_transcripts[1] & exon_transcripts[2]

    isoforms = {'isoform1': isoform1_transcripts,
                'isoform2': isoform2_transcripts}

    # CDS combinations already translated for this event, per isoform
    isoform_to_cds = {'isoform1': [], 'isoform2': []}

    # Fixed isoform order and id-sorted transcripts keep the order of the
    # output records reproducible between runs.
    for isoform in ('isoform1', 'isoform2'):
        exons = isoform_to_exons[isoform]
        for transcript in sorted(isoforms[isoform], key=lambda t: t.id):
            reverse = transcript.strand == '-'

            # CDS features in 5'->3' order (descending start on the minus
            # strand), restricted to those overlapping one of the exons
            cdss = [cds for cds in v19db.children(transcript, featuretype='CDS',
                                                  order_by='start', reverse=reverse)
                    if any(overlap(cds, exon) for exon in exons)]

            # Require exactly one CDS per exon; this also guarantees that
            # `cdss` is non-empty below.
            if len(cdss) != len(exons):
                continue

            cds_str = '@'.join(cds.id for cds in cdss)
            if cds_str in isoform_to_cds[isoform]:
                # Same coding region as an already-translated transcript
                continue
            isoform_to_cds[isoform].append(cds_str)

            # Always fetch the plus-strand sequence (use_strand=False) and do
            # the strand handling here, so that minus-strand CDSs are
            # reverse-complemented exactly once.
            if reverse:
                reversed_plus = ''.join(cds.sequence(hg19_fasta, use_strand=False)[::-1]
                                        for cds in cdss)
                coding_sequence = Seq(reversed_plus, generic_dna).complement()
            else:
                coding_sequence = Seq(''.join(cds.sequence(hg19_fasta, use_strand=False)
                                              for cds in cdss), generic_dna)

            # Drop the leading bases given by the first CDS's frame so that
            # translation starts on a codon boundary
            coding_sequence = coding_sequence[int(cdss[0].frame):]
            translated = coding_sequence.translate()
            seqrecord = SeqRecord(translated, id='{0}|{1}|{2}'.format(event_id, cds_str, isoform))
            seqrecords.append(seqrecord)
#     pprint(isoform_to_cds)


# Write all translated isoform records to the output file in FASTA format
with open(translated_fasta, 'w') as fasta_handle:
    SeqIO.write(seqrecords, fasta_handle, 'fasta')

NameError: name 'exons_to_junctions' is not defined