In [1]:
#pip install gffutils
from collections import defaultdict

import gffutils
import sqlite3

In [9]:
# Shell escape: delete the local database file ag.db if present
# (-f suppresses the error when the file does not exist).
!rm -f ag.db

In [15]:
# Build the gffutils database from the remote, gzipped GFF3 annotation.
# If creation fails with sqlite3.OperationalError (presumably because ag.db
# was already populated by an earlier run), open the existing file instead.
gff_url = 'https://vectorbase.org/common/downloads/Pre-VEuPathDB%20VectorBase%20files/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz'
db_path = 'ag.db'
try:
    db = gffutils.create_db(gff_url, db_path)
except sqlite3.OperationalError:
    db = gffutils.FeatureDB(db_path)

In [16]:
# Inventory of the annotation: first the distinct feature types, then the
# number of records stored for each type.
feature_types = list(db.featuretypes())
print(feature_types)
for feature_type in feature_types:
    n_features = db.count_features_of_type(feature_type)
    print(feature_type, n_features)

['CDS', 'RNase_P_RNA', 'SRP_RNA', 'contig', 'exon', 'five_prime_UTR', 'gene', 'mRNA', 'miRNA', 'misc_RNA', 'pseudogene', 'rRNA', 'snRNA', 'snoRNA', 'tRNA', 'tRNA_pseudogene', 'three_prime_UTR']
CDS 62408
RNase_P_RNA 1
SRP_RNA 3
contig 8
exon 66485
five_prime_UTR 10520
gene 13624
mRNA 14697
miRNA 187
misc_RNA 10
pseudogene 5
rRNA 53
snRNA 38
snoRNA 12
tRNA 463
tRNA_pseudogene 9
three_prime_UTR 7281


In [17]:
# Print every 'contig' record in the database, one per line.
contig_records = db.features_of_type('contig')
for record in contig_records:
    print(str(record))

2L	VectorBase	contig	1	49364325	.	.	.	ID=2L;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal
3R	VectorBase	contig	1	53200684	.	.	.	ID=3R;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal
UNKN	VectorBase	contig	1	42389979	.	.	.	ID=UNKN;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal
X	VectorBase	contig	1	24393108	.	.	.	ID=X;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal
Y_unplaced	VectorBase	contig	1	237045	.	.	.	ID=Y_unplaced;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal
Mt	VectorBase	contig	1	15363	.	.	.	ID=Mt;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal
2R	VectorBase	contig	1	61545105	.	.	.	ID=2R;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal
3L	VectorBase	contig	1	41963435	.	.	.	ID=3L;molecule_type=dsDNA;translation_table=1;topology=linear;localization=chromosomal

In [11]:
# Gene-structure statistics over every contig in the database:
#   num_mRNAs[k] -> number of genes that have exactly k mRNA children
#   num_exons[k] -> number of transcripts (or genes without any mRNA) that
#                   have exactly k exon children
# It also tracks the gene owning the transcript with the most exons and the
# gene with the largest start/end span, and prints a per-contig gene count.
num_mRNAs = defaultdict(int)
num_exons = defaultdict(int)
max_exons = 0
max_span = 0
# Bind the record holders up front: without this, a database with no genes
# (or only zero-span / zero-exon genes) would hit a NameError in the final
# prints, or silently report values left in the kernel by an earlier run.
max_exons_gene = None
max_span_gene = None
for contig in db.features_of_type('contig'):
    cnt = 0
    for gene in db.region((contig.seqid, contig.start, contig.end), featuretype='gene'):
        cnt += 1
        # abs() makes the result independent of coordinate order.
        # NOTE(review): this is end - start; if the inclusive length in bases
        # is wanted (GFF coordinates are inclusive), that would be span + 1.
        span = abs(gene.start - gene.end)
        # The first gene seen is always recorded; afterwards only a strictly
        # larger span replaces it (ties keep the earliest gene).
        if max_span_gene is None or span > max_span:
            max_span = span
            max_span_gene = gene
        my_mRNAs = list(db.children(gene, featuretype='mRNA'))
        num_mRNAs[len(my_mRNAs)] += 1
        # Genes without mRNA children have their exons counted on the gene
        # itself; otherwise exons are counted per mRNA.
        exon_check = my_mRNAs or [gene]
        for check in exon_check:
            my_exons = list(db.children(check, featuretype='exon'))
            num_exons[len(my_exons)] += 1
            if max_exons_gene is None or len(my_exons) > max_exons:
                max_exons = len(my_exons)
                max_exons_gene = gene
    print('contig %s, number of genes %d' % (contig.seqid, cnt))
if max_span_gene is None:
    # No gene was visited at all, so there is no maximum to report.
    print('No genes found in the database')
else:
    # Both record holders are set while processing the first gene, so
    # max_exons_gene is bound whenever max_span_gene is.
    print('Max number of exons: %s (%d)' % (max_exons_gene.id, max_exons))
    print('Max span: %s (%d)' % (max_span_gene.id, max_span))
print(num_mRNAs)
print(num_exons)

contig 2L, number of genes 3105
contig 3R, number of genes 2763
contig UNKN, number of genes 567
contig X, number of genes 1097
contig Y_unplaced, number of genes 0
contig Mt, number of genes 37
contig 2R, number of genes 3834
contig 3L, number of genes 2221
Max number of exons: AGAP001660 (67)
Max span: AGAP006656 (365621)
defaultdict(<class 'int'>, {2: 910, 1: 11595, 3: 211, 4: 74, 0: 781, 11: 3, 5: 27, 8: 4, 12: 1, 7: 5, 6: 9, 13: 1, 10: 1, 20: 1, 9: 1})
defaultdict(<class 'int'>, {4: 2091, 2: 3359, 5: 1411, 6: 1039, 1: 2019, 3: 2838, 9: 419, 10: 298, 11: 202, 8: 454, 12: 159, 31: 5, 7: 718, 13: 106, 15: 65, 19: 28, 16: 45, 17: 53, 14: 65, 26: 3, 18: 22, 21: 9, 22: 7, 24: 6, 30: 5, 20: 19, 32: 1, 33: 1, 27: 2, 28: 5, 23: 6, 34: 1, 29: 4, 25: 9, 67: 1, 50: 1, 49: 1, 42: 1})
