In [1]:
#%run data_phase1_ar3_1.ipynb
import petl as etl
import pyfasta
import numpy as np

genome_fn = '../../phase2.AR1/genome/agamP4/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'
genome = pyfasta.Fasta(genome_fn)

In [2]:
tbl_features = etl.frompickle('tbl_features.pickle')
tbl_features.display(10)

seqid,type,start,stop,length,strand,phase,parent,ID,n_children,transcript_length,is_canonical,is_first,is_last
2L,contig,0,49364325,49364325,.,.,,2L,0,,,,
2L,gene,157347,186936,29589,-,.,,AGAP004677,2,,,,
2L,mRNA,157347,186936,29589,-,.,AGAP004677,AGAP004677-RB,10,1147.0,False,,
2L,mRNA,157347,181305,23958,-,.,AGAP004677,AGAP004677-RA,10,1467.0,True,,
2L,exon,157347,157623,276,-,.,AGAP004677-RA,AGAP004677-RB-E4A,0,,True,False,True
2L,three_prime_UTR,157347,157495,148,-,.,AGAP004677-RA,,0,,True,,
2L,exon,157347,157623,276,-,.,AGAP004677-RB,AGAP004677-RB-E4B,0,,False,False,True
2L,three_prime_UTR,157347,157495,148,-,.,AGAP004677-RB,,0,,False,,
2L,CDS,157495,157623,128,-,2,AGAP004677-RA,,0,,True,False,True
2L,CDS,157495,157623,128,-,2,AGAP004677-RB,,0,,False,False,True


In [3]:
CLS_UPSTREAM = 1
CLS_DOWNSTREAM = 2
CLS_5UTR = 3
CLS_3UTR = 4
CLS_CDS_FIRST = 5
CLS_CDS_MID = 6
CLS_CDS_LAST = 7
CLS_INTRON_FIRST = 8
CLS_INTRON_MID = 9
CLS_INTRON_LAST = 10

feature_cls_names = [
    'Unknown',
    'Upstream',
    'Downstream',
    "5' UTR",
    "3' UTR",
    "CDS (first)",
    "CDS (mid)",
    "CDS (last)",
    "Intron (first)",
    "Intron (mid)",
    "Intron (last)",
]

In [4]:
seq_cls = {k: np.zeros(len(genome[k]), dtype='u1') for k in genome.keys()}
seq_relpos_start = {k: np.zeros(len(genome[k]), dtype='u4') for k in genome.keys()}
seq_relpos_stop = {k: np.zeros(len(genome[k]), dtype='u4') for k in genome.keys()}
seq_flen = {k: np.zeros(len(genome[k]), dtype='u4') for k in genome.keys()}

In [5]:
seq_cls

{'2R': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 '3R': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 '2L': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'UNKN': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 '3L': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'X': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'Y_unplaced': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'Mt': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)}

In [6]:
# build the upstream and downstream classes
############################################

def build_upstream_downstream():
    prv_gene = None
    for gene in tbl_features.eq('type', 'gene').records():
        seqid = gene.seqid
        if seqid in genome and prv_gene is not None and (gene.start > prv_gene.stop) and gene.seqid == prv_gene.seqid:

            # midpoint between previous and current genes
            m = (prv_gene.stop + gene.start) // 2

            # deal with previous gene
            if prv_gene.strand == '+':
                prv_cls = CLS_DOWNSTREAM
                seq_relpos_start[seqid][prv_gene.stop:m] = np.arange(m - prv_gene.stop)
                seq_relpos_stop[seqid][prv_gene.stop:m] = np.arange(m - prv_gene.stop)[::-1]
            else:
                prv_cls = CLS_UPSTREAM
                seq_relpos_start[seqid][prv_gene.stop:m] = np.arange(m - prv_gene.stop)[::-1]
                seq_relpos_stop[seqid][prv_gene.stop:m] = np.arange(m - prv_gene.stop)
            seq_cls[seqid][prv_gene.stop:m] = prv_cls
            seq_flen[seqid][prv_gene.stop:m] = m - prv_gene.stop

            # deal with current gene
            if gene.strand == '+':
                cls = CLS_UPSTREAM
                seq_relpos_start[seqid][m:gene.start] = np.arange(gene.start - m)
                seq_relpos_stop[seqid][m:gene.start] = np.arange(gene.start - m)[::-1]
            else:
                cls = CLS_DOWNSTREAM
                seq_relpos_start[seqid][m:gene.start] = np.arange(gene.start - m)[::-1]
                seq_relpos_stop[seqid][m:gene.start] = np.arange(gene.start - m)  
            seq_cls[seqid][m:gene.start] = cls
            seq_flen[seqid][m:gene.start] = gene.start - m

        prv_gene = gene

build_upstream_downstream()

In [7]:
x = np.bincount(seq_cls['2L'])
for i, n in enumerate(feature_cls_names):
    if i < x.shape[0]:
        print(n, x[i])

Unknown 18225524
Upstream 17128777
Downstream 14010024


In [8]:
# build the UTR classes
########################

def build_utr():
    for f in tbl_features.records():
        seqid = f.seqid
        if seqid in genome:
            cls = None
            if f['type'] == 'five_prime_UTR':
                cls = CLS_5UTR
            elif f['type'] == 'three_prime_UTR':
                cls = CLS_3UTR

            if cls:
                seq_cls[seqid][f.start:f.stop] = cls
                if f.strand == '+':
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                else:
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                seq_flen[seqid][f.start:f.stop] = f.stop - f.start
            
build_utr()

In [9]:
x = np.bincount(seq_cls['2L'])
for i, n in enumerate(feature_cls_names):
    if i < x.shape[0]:
        print(n, x[i])

Unknown 17491210
Upstream 17114953
Downstream 13996841
5' UTR 339898
3' UTR 421423


In [10]:
# build the CDS classes
#########################

def build_cds():
    for f in tbl_features.eq('type', 'CDS').records():
        seqid = f.seqid
        if seqid in genome:
            cls = None
            if f.is_first and not f.is_last:
                cls = CLS_CDS_FIRST
            elif not f.is_first and f.is_last:
                cls = CLS_CDS_LAST
            elif not f.is_first and not f.is_last:
                cls = CLS_CDS_MID

            if cls:
                seq_cls[seqid][f.start:f.stop] = cls
                if f.strand == '+':
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                else:
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                seq_flen[seqid][f.start:f.stop] = f.stop - f.start
                
build_cds()

In [11]:
x = np.bincount(seq_cls['2L'])
for i, n in enumerate(feature_cls_names):
    if i < x.shape[0]:
        print(n, x[i])

Unknown 13350513
Upstream 17034596
Downstream 13897774
5' UTR 328933
3' UTR 411711
CDS (first) 738146
CDS (mid) 2398508
CDS (last) 1204144


In [12]:
# build the intron classes
###########################

def build_intron():
    for f in tbl_features.eq('type', 'intron').records():
        seqid = f.seqid
        if seqid in genome:
            cls = None
            if f.is_first and not f.is_last:
                cls = CLS_INTRON_FIRST
            elif not f.is_first and f.is_last:
                cls = CLS_INTRON_LAST
            elif not f.is_first and not f.is_last:
                cls = CLS_INTRON_MID

            if cls:
                seq_cls[seqid][f.start:f.stop] = cls
                if f.strand == '+':
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                else:
                    seq_relpos_start[seqid][f.start:f.stop] = np.arange(f.stop - f.start)[::-1]
                    seq_relpos_stop[seqid][f.start:f.stop] = np.arange(f.stop - f.start)
                seq_flen[seqid][f.start:f.stop] = f.stop - f.start

build_intron()

In [13]:
x = np.bincount(seq_cls['2L'])
for i, n in enumerate(feature_cls_names):
    if i < x.shape[0]:
        print(n, x[i])

Unknown 7598307
Upstream 16624416
Downstream 13564793
5' UTR 304615
3' UTR 398948
CDS (first) 707474
CDS (mid) 2342073
CDS (last) 1155577
Intron (first) 2215183
Intron (mid) 3851366
Intron (last) 601573


In [14]:
np.savez_compressed('seq_cls.npz', **seq_cls)

In [15]:
np.savez_compressed('seq_relpos_start.npz', **seq_relpos_start)

In [16]:
np.savez_compressed('seq_relpos_stop.npz', **seq_relpos_stop)

In [17]:
np.savez_compressed('seq_flen.npz', **seq_flen)