In [None]:
import gffutils
import gzip
from Bio import Alphabet, Seq, SeqIO

## Retrieving data

In [None]:
# Remove the annotation database and genome archive left by a previous run
# (errors from rm are discarded), then download the gzipped chromosome FASTA
# and save it as gambiae.fa.gz.
# NOTE(review): this FASTA is labelled AgamP3 while the annotation loaded in the
# next cell is labelled AgamP4.2 — confirm the coordinates are compatible.
# NOTE(review): this uses the legacy FTP host; confirm it is still reachable.
!rm -rf ag.db gambiae.fa.gz 2>/dev/null
!wget ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz -O gambiae.fa.gz

In [None]:
!rm -f ag.db

db = gffutils.create_db('https://vectorbase.org/common/downloads/Pre-VEuPathDB%20VectorBase%20files/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz', 'ag.db')

# Getting a gene

In [None]:
# Identifier of the gene to look up in the annotation database.
gene_id = 'AGAP004707'

In [None]:
# Fetch the feature stored under gene_id from the annotation database.
gene = db[gene_id]

In [None]:
# Show the feature itself, then the sequence id and strand it is annotated on.
print(gene)
print(gene.seqid, gene.strand)

In [None]:
# Scan the genome FASTA for the record of the chromosome that carries `gene`.
my_seq = None  # reset so a value left over from an earlier run is never reused
# The context manager closes the gzip handle even though the loop exits early.
with gzip.open('gambiae.fa.gz', 'rt', encoding='utf-8') as fasta_handle:
    recs = SeqIO.parse(fasta_handle, 'fasta')
    for rec in recs:
        print(rec.description)
        # The sequence id is taken from the third ':'-separated field of the
        # description; records whose description has fewer fields are skipped.
        fields = rec.description.split(':')
        if len(fields) > 2 and fields[2] == gene.seqid:
            my_seq = rec.seq
            break
if my_seq is None:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError('no FASTA record in gambiae.fa.gz matches seqid %r' % gene.seqid)
# Sequences only carry an alphabet on Biopython releases older than 1.78.
if hasattr(my_seq, 'alphabet'):
    print(my_seq.alphabet)

In [None]:
def get_sequence(chrom_seq, CDSs, strand):
    """Assemble a transcript's coding sequence from its chromosome.

    Parameters
    ----------
    chrom_seq : sliceable sequence
        Sequence of the chromosome the CDS features lie on.
    CDSs : iterable
        Features exposing ``start`` and ``end`` attributes, treated as 1-based
        and end-inclusive. They are concatenated in the order given, so
        callers are expected to pass them sorted by start.
    strand : str
        ``'+'`` returns the concatenation unchanged; any other value returns
        its reverse complement.

    Returns
    -------
    Seq.Seq
        The joined CDS pieces, oriented according to ``strand``. An empty
        ``CDSs`` yields an empty sequence.

    Notes
    -----
    CDS phase is ignored: the pieces are joined whole. A transcript whose
    5'-most CDS has a non-zero phase would need trimming before translation
    (TODO: confirm whether that case has to be supported).
    """
    # Cut every CDS out of the chromosome; ``start - 1`` converts the 1-based
    # inclusive start to a Python slice index.
    pieces = [str(chrom_seq[CDS.start - 1: CDS.end]) for CDS in CDSs]
    # No ``alphabet=`` argument is passed: Bio.Alphabet was removed in
    # Biopython 1.78, and a plain Seq behaves the same for concatenation,
    # reverse complement and translation on older releases.
    seq = Seq.Seq(''.join(pieces))
    return seq if strand == '+' else seq.reverse_complement()

In [None]:
# Choose the transcript to analyse: the first mRNA whose id ends in 'RA', or,
# when none does, the last mRNA listed for the gene.
mRNA = None  # reset so a transcript left over from an earlier run is never reused
mRNAs = db.children(gene, featuretype='mRNA')
for mRNA in mRNAs:
    print(mRNA.id)
    if mRNA.id.endswith('RA'):
        break
if mRNA is None:
    # Without this check an empty result surfaces later as a NameError.
    raise ValueError('gene %s has no mRNA features' % gene.id)

# Collect the transcript's CDS features in ascending genomic order and join them.
CDSs = db.children(mRNA, featuretype='CDS', order_by='start')
gene_seq = get_sequence(my_seq, CDSs, gene.strand)

print(len(gene_seq), gene_seq)
prot = gene_seq.translate()
print(len(prot), prot)

# Reverse strand

In [None]:
# Identifier of a second transcript, used for the reverse-strand example.
reverse_transcript_id = 'AGAP004708-RA'

In [None]:
# Look the transcript up so that its chromosome and strand come from the
# annotation instead of being assumed.
reverse_transcript = db[reverse_transcript_id]
# my_seq holds only the chromosome of `gene`; slicing it with coordinates from
# a different chromosome would silently produce the wrong sequence.
if reverse_transcript.seqid != gene.seqid:
    raise ValueError(
        '%s is on %s but the loaded chromosome is %s'
        % (reverse_transcript_id, reverse_transcript.seqid, gene.seqid)
    )

reverse_CDSs = db.children(reverse_transcript, featuretype='CDS', order_by='start')
reverse_seq = get_sequence(my_seq, reverse_CDSs, reverse_transcript.strand)

print(len(reverse_seq), reverse_seq)
reverse_prot = reverse_seq.translate()
print(len(reverse_prot), reverse_prot)