In [17]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.SeqFeature import SeqFeature, FeatureLocation

In [58]:
def write_gene_reference(gene_seq, gene_name, gene_description, cov_type, outfile, record_id=None):
    """Write a single-gene GenBank reference file.

    Builds a SeqRecord around ``gene_seq`` with two features that both span
    the whole sequence (a ``source`` feature and a ``CDS`` feature), then
    writes the record to ``outfile`` in GenBank format.

    Args:
        gene_seq: Sequence object for the gene; must support ``len()`` and
            ``.translate()`` (e.g. a ``Bio.Seq.Seq``).
        gene_name: Stored as the record's ``name``.
        gene_description: Stored as the record's ``description``.
        cov_type: Organism name stored in the source feature's ``organism``
            qualifier.
        outfile: Destination passed straight to ``SeqIO.write``.
        record_id: Identifier for the new record. When omitted, the id of the
            module-level ``seq_record`` (the record currently being iterated
            over in the calling cell) is used, which is what this function
            did before the parameter existed.

    Raises:
        NameError: If ``record_id`` is omitted and no global ``seq_record``
            is defined.
    """
    if record_id is None:
        # Backward-compatible fallback: earlier callers relied on the loop
        # variable `seq_record` leaking in from the calling cell.
        record_id = seq_record.id

    gene_record = SeqRecord(gene_seq, id=record_id,
                            name=gene_name,
                            description=gene_description)

    whole_gene = FeatureLocation(0, len(gene_seq))

    # The qualifier key was previously misspelled 'organsism', so the written
    # record carried no real organism qualifier.
    source_feature = SeqFeature(whole_gene, type='source',
                                qualifiers={'organism': cov_type, "mol_type": "genomic RNA"})
    gene_record.features.append(source_feature)

    # translate() is called with its defaults, exactly as before.
    cds_feature = SeqFeature(FeatureLocation(0, len(gene_seq)), type='CDS',
                             qualifiers={'translation': gene_seq.translate()})
    gene_record.features.append(cds_feature)

    SeqIO.write(gene_record, outfile, 'genbank')

In [73]:
#NL63 gene seqs

# Maps the CDS 'product' qualifier found in the full-genome GenBank file to the
# short gene label used in the output record name and file name.
nl63_genes = {"replicase polyprotein 1ab":'replicase1ab', "spike protein":'spike', "protein 3":'protein3', 
              "envelope protein":'envelope', "membrane protein":'membrane', "nucleocapsid protein":'nucleocapsid'}

# NOTE: the loop variable must stay named `seq_record`; write_gene_reference
# reads it as a global when no explicit record id is supplied.
for seq_record in SeqIO.parse("../../nl63/config/nl63_full_reference.gb", "genbank"):
    cov_type = "Human coronavirus NL63"
    for feature in seq_record.features:
        if feature.type != 'CDS':
            continue
        # .get() so a CDS without a 'product' qualifier is skipped rather than
        # aborting the whole cell with a KeyError.
        products = feature.qualifiers.get('product', [])
        # Same match rule as before: exactly one product value, and it must be
        # one of the known names. A dict lookup replaces the scan over items().
        if len(products) != 1 or products[0] not in nl63_genes:
            continue
        gene_label = nl63_genes[products[0]]
        gene_seq = feature.location.extract(seq_record.seq)
        gene_name = 'NL63_' + gene_label
        # Leading space added: the description used to read e.g.
        # "spikesequence extracted ...".
        gene_description = gene_label + ' sequence extracted from whole genome Genbank file'
        outfile = '../../nl63/config/nl63_' + gene_label + '_reference.gb'
        write_gene_reference(gene_seq, gene_name, gene_description, cov_type, outfile)

In [71]:
#229E gene seqs

# Maps the CDS 'product' qualifier found in the full-genome GenBank file to the
# short gene label used in the output record name and file name.
two29e_genes = {"replicase polyprotein 1ab":'replicase1ab', "replicase polyprotein 1a":'replicase1a', "surface glycoprotein":'spike', 
                "4a protein":'protein4a', "4b protein":'protein4b',
                "envelope protein":'envelope', "membrane protein":'membrane', "nucleocapsid protein":'nucleocapsid'}

# NOTE: the loop variable must stay named `seq_record`; write_gene_reference
# reads it as a global when no explicit record id is supplied.
for seq_record in SeqIO.parse("../../229e/config/229e_full_reference.gb", "genbank"):
    cov_type = "Human coronavirus 229E"
    for feature in seq_record.features:
        if feature.type != 'CDS':
            continue
        # .get() so a CDS without a 'product' qualifier is skipped rather than
        # aborting the whole cell with a KeyError.
        products = feature.qualifiers.get('product', [])
        # Same match rule as before: exactly one product value, and it must be
        # one of the known names. A dict lookup replaces the scan over items().
        if len(products) != 1 or products[0] not in two29e_genes:
            continue
        gene_label = two29e_genes[products[0]]
        gene_seq = feature.location.extract(seq_record.seq)
        gene_name = '229E_' + gene_label
        # Leading space added: the description used to read e.g.
        # "spikesequence extracted ...".
        gene_description = gene_label + ' sequence extracted from whole genome Genbank file'
        outfile = '../../229e/config/229e_' + gene_label + '_reference.gb'
        write_gene_reference(gene_seq, gene_name, gene_description, cov_type, outfile)

In [76]:
#HKU1 gene seqs
# Maps the CDS 'product' qualifier found in the full-genome GenBank file to the
# short gene label used in the output record name and file name.
hku1_genes = {"orf1ab polyprotein":'replicase1ab', "hemagglutinin-esterase glycoprotein":'he', 
              "spike glycoprotein":'spike', "non-structural protein":'nonstructural4',
              "small membrane protein":'envelope', "membrane glycoprotein":'membrane', 
              "nucleocapsid phosphoprotein":'nucleocapsid', "nucleocapsid phosphoprotein 2":'nucleocapsid2'}

# NOTE: the loop variable must stay named `seq_record`; write_gene_reference
# reads it as a global when no explicit record id is supplied.
for seq_record in SeqIO.parse("../../hku1/config/hku1_full_reference.gb", "genbank"):
    cov_type = "Human coronavirus HKU1"
    for feature in seq_record.features:
        if feature.type != 'CDS':
            continue
        # .get() so a CDS without a 'product' qualifier is skipped rather than
        # aborting the whole cell with a KeyError.
        products = feature.qualifiers.get('product', [])
        # Same match rule as before: exactly one product value, and it must be
        # one of the known names. A dict lookup replaces the scan over items().
        if len(products) != 1 or products[0] not in hku1_genes:
            continue
        gene_label = hku1_genes[products[0]]
        gene_seq = feature.location.extract(seq_record.seq)
        gene_name = 'HKU1_' + gene_label
        # Leading space added: the description used to read e.g.
        # "spikesequence extracted ...".
        gene_description = gene_label + ' sequence extracted from whole genome Genbank file'
        outfile = '../../hku1/config/hku1_' + gene_label + '_reference.gb'
        write_gene_reference(gene_seq, gene_name, gene_description, cov_type, outfile)

In [78]:
#OC43 gene seqs
# Maps the CDS 'product' qualifier found in the full-genome GenBank file to the
# short gene label used in the output record name and file name.
oc43_genes = {"replicase polyprotein":'replicase1ab', "NS2a protein":'nonstructural2a',
              "HE protein":'he', "S protein":'spike', "NS2 protein":'nonstructural2',
              "NS3 protein":'envelope', "M protein":'membrane', 
              "N protein":'nucleocapsid', "N2 protein":'n2protein'}

# NOTE: the loop variable must stay named `seq_record`; write_gene_reference
# reads it as a global when no explicit record id is supplied.
for seq_record in SeqIO.parse("../../oc43/config/oc43_full_reference.gb", "genbank"):
    cov_type = "Human coronavirus OC43"
    for feature in seq_record.features:
        if feature.type != 'CDS':
            continue
        # .get() so a CDS without a 'product' qualifier is skipped rather than
        # aborting the whole cell with a KeyError.
        products = feature.qualifiers.get('product', [])
        # Same match rule as before: exactly one product value, and it must be
        # one of the known names. A dict lookup replaces the scan over items().
        if len(products) != 1 or products[0] not in oc43_genes:
            continue
        gene_label = oc43_genes[products[0]]
        gene_seq = feature.location.extract(seq_record.seq)
        gene_name = 'OC43_' + gene_label
        # Leading space added: the description used to read e.g.
        # "spikesequence extracted ...".
        gene_description = gene_label + ' sequence extracted from whole genome Genbank file'
        outfile = '../../oc43/config/oc43_' + gene_label + '_reference.gb'
        write_gene_reference(gene_seq, gene_name, gene_description, cov_type, outfile)