In [1]:
#%run data_phase1_ar3_1.ipynb
import petl as etl
import petlx.bio
import numpy as np

gff_fn = '../../phase2.AR1/geneset/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz'

In [2]:
tbl_features = (
    etl
    .fromgff3(gff_fn)
    .convert('start', lambda v: v-1)
    .rename('end', 'stop')
    .unpackdict('attributes', ['Parent', 'ID'])
    .rename('Parent', 'parent')
    .addfield('length', lambda rec: rec.stop-rec.start, index=5)
    .cutout('source', 'score')
    .cache()
)

In [3]:
tbl_features.display(10)

seqid,type,start,stop,length,strand,phase,parent,ID
2L,contig,0,49364325,49364325,.,.,,2L
2L,gene,157347,186936,29589,-,.,,AGAP004677
2L,mRNA,157347,186936,29589,-,.,AGAP004677,AGAP004677-RB
2L,exon,186859,186936,77,-,.,AGAP004677-RB,AGAP004677-RB-E1B
2L,exon,159191,159366,175,-,.,AGAP004677-RB,AGAP004677-RB-E2B
2L,exon,157678,158297,619,-,.,AGAP004677-RB,AGAP004677-RB-E3B
2L,exon,157347,157623,276,-,.,AGAP004677-RB,AGAP004677-RB-E4B
2L,five_prime_UTR,186859,186936,77,-,.,AGAP004677-RB,
2L,five_prime_UTR,159356,159366,10,-,.,AGAP004677-RB,
2L,CDS,159191,159356,165,-,0,AGAP004677-RB,


In [4]:
def exons2introns(parent, exons):
    exons = list(exons)
    seqid = exons[0].seqid
    strand = exons[0].strand
    type = 'intron'
    for i in range(1, len(exons)):
        start = exons[i-1].stop
        stop = exons[i].start
        if strand == '+':
            ID = '%s:%s' % (exons[i-1].ID, exons[i].ID)
        else:
            ID = '%s:%s' % (exons[i].ID, exons[i-1].ID)
        yield (seqid, type, start, stop, stop-start, strand, '.', parent, ID)
            

tbl_introns = (
    tbl_features
    .eq('type', 'exon')
    .rowgroupmap(key='parent',
                 mapper=exons2introns,
                 header=['seqid', 'type', 'start', 'stop', 'length', 'strand','phase', 'parent', 'ID'])
    .sort(key=('seqid', 'start', 'parent'))
)
tbl_introns.display(20)

seqid,type,start,stop,length,strand,phase,parent,ID
2L,intron,158297,157347,-950,-,.,AGAP004677-RA,AGAP004677-RB-E4A:AGAP004677-RB-E3A
2L,intron,158297,157347,-950,-,.,AGAP004677-RB,AGAP004677-RB-E4B:AGAP004677-RB-E3B
2L,intron,159366,157678,-1688,-,.,AGAP004677-RA,AGAP004677-RB-E3A:AGAP004677-RA-E2A
2L,intron,159366,157678,-1688,-,.,AGAP004677-RB,AGAP004677-RB-E3B:AGAP004677-RB-E2B
2L,intron,181305,159191,-22114,-,.,AGAP004677-RA,AGAP004677-RA-E2A:AGAP004677-RA-E1A
2L,intron,186936,159191,-27745,-,.,AGAP004677-RB,AGAP004677-RB-E2B:AGAP004677-RB-E1B
2L,intron,203924,203980,56,+,.,AGAP004678-RA,AGAP004678-RA-E1A:AGAP004678-RA-E2A
2L,intron,207953,208392,439,+,.,AGAP004679-RB,AGAP004679-RB-E1B:AGAP004679-RB-E2B
2L,intron,208581,208638,57,+,.,AGAP004679-RA,AGAP004679-RA-E1A:AGAP004679-RB-E3A
2L,intron,208581,208638,57,+,.,AGAP004679-RB,AGAP004679-RB-E2B:AGAP004679-RB-E3B


In [5]:
lkp_feature_children = tbl_features.recordlookup('parent')

In [6]:
lkp_feature_children['AGAP004677']

[('2L',
  'mRNA',
  157347,
  186936,
  29589,
  '-',
  '.',
  'AGAP004677',
  'AGAP004677-RB'),
 ('2L',
  'mRNA',
  157347,
  181305,
  23958,
  '-',
  '.',
  'AGAP004677',
  'AGAP004677-RA')]

In [7]:
def transcript_length(row):
    if row['type'] == 'mRNA':
        exons = [f for f in lkp_feature_children[row.ID] if f['type'] == 'exon']
        l = sum(e.length for e in exons)
        return l
    else:
        return None


lkp_transcript_length = tbl_features.eq('type', 'mRNA').addfield('transcript_length', transcript_length).lookupone('ID', 'transcript_length')

In [8]:
lkp_transcript_length['AGAP004677-RA']

1467

In [9]:
# EL: Looking at the code, I think "canonical" just means that it is the longest mRNA for this gene
def is_canonical_transcript(row):
    if row['type'] == 'mRNA':
        length = lkp_transcript_length[row.ID]
        lengths = [lkp_transcript_length[f.ID] for f in lkp_feature_children[row.parent]]
        return length == max(lengths)
    else:
        return None

In [10]:
lkp_transcript_is_canonical = tbl_features.eq('type', 'mRNA').addfield('is_canonical', is_canonical_transcript).lookupone('ID', 'is_canonical')

In [11]:
# EL: If the exon's parent is canonical, then the exon in caninical
def is_canonical(row):
    if row['type'] == 'mRNA':
        return lkp_transcript_is_canonical[row.ID]
    elif row.parent in lkp_transcript_is_canonical:
        return lkp_transcript_is_canonical[row.parent]
    else:
        return None

    
tbl_features_aug = (
    tbl_features
    .cat(tbl_introns)
    .sort(key=('seqid', 'start', 'parent', 'type'), cache=False)
    .addfield('n_children', lambda row: len(lkp_feature_children[row.ID]) 
                                        if row.ID is not None and row.ID in lkp_feature_children
                                        else 0)
    .addfield('transcript_length', transcript_length)
    .addfield('is_canonical', is_canonical)
    .cache()
)

In [12]:
lkp_feature_children_aug = tbl_features_aug.recordlookup('parent')

In [13]:
# lkp_feature_children_aug['AGAP004677-RA']

In [14]:
def first_last(row):
    if row['type'] in {'exon', 'intron', 'CDS'}:
        first, last = False, False
        sibs = [f for f in lkp_feature_children_aug[row.parent]
                if f['type'] == row['type']]
        if row.strand == '+':
            starts = sorted([f.start for f in sibs])
            index = starts.index(row.start)
        else:
            starts = sorted([f.stop for f in sibs], reverse=True)
            index = starts.index(row.stop)
        if index == 0:
            first = True
        if index == len(sibs) - 1:
            last = True
        return first, last
    return (None, None)

In [15]:
tbl_features_aug2 = (
    tbl_features_aug
    .addfield('first_last', first_last)
    .unpack('first_last', newfields=['is_first', 'is_last'])
)
tbl_features_aug2.display(40)

seqid,type,start,stop,length,strand,phase,parent,ID,n_children,transcript_length,is_canonical,is_first,is_last
2L,contig,0,49364325,49364325,.,.,,2L,0,,,,
2L,gene,157347,186936,29589,-,.,,AGAP004677,2,,,,
2L,mRNA,157347,186936,29589,-,.,AGAP004677,AGAP004677-RB,10,1147.0,False,,
2L,mRNA,157347,181305,23958,-,.,AGAP004677,AGAP004677-RA,10,1467.0,True,,
2L,exon,157347,157623,276,-,.,AGAP004677-RA,AGAP004677-RB-E4A,0,,True,False,True
2L,three_prime_UTR,157347,157495,148,-,.,AGAP004677-RA,,0,,True,,
2L,exon,157347,157623,276,-,.,AGAP004677-RB,AGAP004677-RB-E4B,0,,False,False,True
2L,three_prime_UTR,157347,157495,148,-,.,AGAP004677-RB,,0,,False,,
2L,CDS,157495,157623,128,-,2,AGAP004677-RA,,0,,True,False,True
2L,CDS,157495,157623,128,-,2,AGAP004677-RB,,0,,False,False,True


In [16]:
tbl_features_aug2.eq('type', 'mRNA').display(20)

seqid,type,start,stop,length,strand,phase,parent,ID,n_children,transcript_length,is_canonical,is_first,is_last
2L,mRNA,157347,186936,29589,-,.,AGAP004677,AGAP004677-RB,10,1147,False,,
2L,mRNA,157347,181305,23958,-,.,AGAP004677,AGAP004677-RA,10,1467,True,,
2L,mRNA,203778,205293,1515,+,.,AGAP004678,AGAP004678-RA,6,1459,True,,
2L,mRNA,207893,210460,2567,+,.,AGAP004679,AGAP004679-RB,14,1861,False,,
2L,mRNA,208182,210460,2278,+,.,AGAP004679,AGAP004679-RA,12,2011,True,,
2L,mRNA,271284,271815,531,+,.,AGAP004680,AGAP004680-RA,2,531,True,,
2L,mRNA,358328,359280,952,-,.,AGAP004681,AGAP004681-RA,6,857,True,,
2L,mRNA,433502,461627,28125,-,.,AGAP004682,AGAP004682-RA,7,2073,True,,
2L,mRNA,485697,488369,2672,-,.,AGAP004683,AGAP004683-RA,5,2247,True,,
2L,mRNA,493038,493543,505,+,.,AGAP004684,AGAP004684-RA,4,441,True,,


In [17]:
# Is 'RA' transcript always canonical?
tbl_features_aug2.eq('type', 'mRNA').addfield('check', lambda row: row.ID[-2:] == 'RA' and row.is_canonical).valuecounts('check')

check,count,frequency
True,12273,0.8350683813023065
False,2424,0.1649316186976934


In [18]:
# How many transcripts do genes have?
tbl_features_aug2.select(lambda row: row.type == 'gene').valuecounts('n_children').displayall()

n_children,count,frequency
1,12376,0.9083969465648856
2,910,0.0667938931297709
3,211,0.0154873752201996
4,74,0.0054315913094539
5,27,0.001981796829125
6,9,0.0006605989430416
7,5,0.0003669994128009
8,4,0.0002935995302407
11,3,0.0002201996476805
12,1,7.33998825601879e-05


In [19]:
(tbl_features_aug2
 .teetsv('tbl_features.txt')
 .topickle('tbl_features.pickle'))