In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.SeqFeature import SeqFeature, FeatureLocation

In [33]:
def write_gene_reference(gene_seq, gene_name, gene_description, cov_type, outfile, record_id=None):
    """Write a single-gene reference record to ``outfile`` in GenBank format.

    The record carries two features that both span the whole sequence:
    a 'source' feature and a 'CDS' feature holding the translation of
    ``gene_seq``.

    Parameters
    ----------
    gene_seq : Seq
        Nucleotide sequence of the gene. Must support ``len()`` and
        ``.translate()``.
    gene_name : str
        Stored as the record's ``name``.
    gene_description : str
        Stored as the record's ``description``.
    cov_type : str
        Stored as the 'organism' qualifier of the 'source' feature.
    outfile : str or handle
        Destination passed straight to ``SeqIO.write``.
    record_id : str, optional
        Identifier for the new record. When omitted, the id of the
        notebook-global ``seq_record`` is used, which is what this function
        always did; pass it explicitly to avoid depending on that hidden state.

    Raises
    ------
    NameError
        If ``record_id`` is omitted and no global ``seq_record`` exists.
    """
    if record_id is None:
        # Backward-compatible fallback: existing callers rely on the loop
        # variable `seq_record` left behind by the cell that parsed the genome.
        record_id = seq_record.id

    gene_record = SeqRecord(gene_seq, id=record_id,
                            name=gene_name,
                            description=gene_description)

    whole_gene = FeatureLocation(0, len(gene_seq))

    # Qualifier key fixed from the misspelled 'organsism' to 'organism'.
    source_feature = SeqFeature(whole_gene, type='source',
                                qualifiers={'organism': cov_type, "mol_type": "genomic RNA"})
    gene_record.features.append(source_feature)

    # Store the translation as a plain string rather than a Seq object.
    cds_feature = SeqFeature(FeatureLocation(0, len(gene_seq)), type='CDS',
                             qualifiers={'translation': str(gene_seq.translate())})
    gene_record.features.append(cds_feature)

    SeqIO.write(gene_record, outfile, 'genbank')

In [None]:
#229e
#From UniProt P0C6X1 (R1AB_CVH22)
#https://www.uniprot.org/uniprot/P0C6X1

#In the reference strain I have been using, find the region homologous to:

>sp|P0C6X1|4069-4995
SFDNSYLNRVRGSSAARLEPCNGTDIDYCVRAFDVYNKDASFIGKNLKSNCVRFKNVDKD
DAFYIVKRCIKSVMDHEQSMYNLLKGCNAVAKHDFFTWHEGRTIYGNVSRQDLTKYTMMD
LCFALRNFDEKDCEVFKEILVLTGCCSTDYFEMKNWFDPIENEDIHRVYAALGKVVANAM
LKCVAFCDEMVLKGVVGVLTLDNQDLNGNFYDFGDFVLCPPGMGIPYCTSYYSYMMPVMG
MTNCLASECFMKSDIFGQDFKTFDLLKYDFTEHKEVLFNKYFKYWGQDYHPDCVDCHDEM
CILHCSNFNTLFATTIPNTAFGPLCRKVFIDGVPVVATAGYHFKQLGLVWNKDVNTHSTR
LTITELLQFVTDPTLIVASSPALVDKRTVCFSVAALSTGLTSQTVKPGHFNKEFYDFLRS
QGFFDEGSELTLKHFFFTQKGDAAIKDFDYYRYNRPTMLDIGQARVAYQVAARYFDCYEG
GCITSREVVVTNLNKSAGWPLNKFGKAGLYYESISYEEQDAIFSLTKRNILPTMTQLNLK
YAISGKERARTVGGVSLLATMTTRQFHQKCLKSIVATRNATVVIGTTKFYGGWDNMLKNL
MADVDDPKLMGWDYPKCDRAMPSMIRMLSAMILGSKHVTCCTASDKFYRLSNELAQVLTE
VVYSNGGFYFKPGGTTSGDATTAYANSVFNIFQAVSSNINCVLSVNSSNCNNFNVKKLQR
QLYDNCYRNSNVDESFVDDFYGYLQKHFSMMILSDDSVVCYNKTYAGLGYIADISAFKAT
LYYQNGVFMSTAKCWTEEDLSIGPHEFCSQHTMQIVDENGKYYLPYPDPSRIISAGVFVD
DITKTDAVILLERYVSLAIDAYPLSKHPKPEYRKVFYALLDWVKHLNKTLNEGVLESFSV
TLLDEHESKFWDESFYASMYEKSTVLQ

In [34]:
#229e
# Extract the RdRp region from the 'replicase polyprotein 1ab' CDS.
# Slice bounds are 0-based and end-exclusive: amino acids [4068:4995] are
# 1-based residues 4069-4995 of the polyprotein, and the nucleotide bounds
# are exactly 3x the amino-acid bounds (12204 = 3*4068, 14985 = 3*4995).

# Reset first so a value left over from an earlier run of this cell can never
# be written out if the feature is not found this time.
rdrp_229e_nt = None
rdrp_229e_aa = None

for seq_record in SeqIO.parse("../229e/config/229e_full_reference.gb", "genbank"):
    for feature in seq_record.features:
        if feature.type == 'CDS':
            # .get() skips CDS features without a 'product' qualifier instead of raising KeyError
            if feature.qualifiers.get('product') == ['replicase polyprotein 1ab']:
                rdrp_229e_nt = feature.location.extract(seq_record.seq)[12204:14985]
                rdrp_229e_aa = feature.qualifiers['translation'][0][4068:4995]

if rdrp_229e_nt is None:
    raise ValueError("No 'replicase polyprotein 1ab' CDS found in the 229e reference file")

# write_gene_reference takes the record id from the loop variable `seq_record`,
# which still refers to the last record parsed above.
outfile = '../229e/config/229e_rdrp_reference.gb'
write_gene_reference(rdrp_229e_nt, '229e_rdrp', 'rdrp sequence extracted from whole genome Genbank file', '229e', outfile)

In [None]:
#nl63
#could not find rdrp/nsp12 annotated in any nl63 sequence or protein record,
#so use the region homologous to 229e's nsp12, since both are alpha-CoVs

In [41]:
#nl63
# Extract the RdRp region from the 'replicase polyprotein 1ab' CDS.
# Slice bounds are 0-based and end-exclusive; the nucleotide bounds are
# exactly 3x the amino-acid bounds (12129 = 3*4043, 14910 = 3*4970).

# Reset first so a value left over from an earlier run of this cell can never
# be written out if the feature is not found this time.
rdrp_nl63_nt = None
rdrp_nl63_aa = None

for seq_record in SeqIO.parse("../nl63/config/nl63_full_reference.gb", "genbank"):
    for feature in seq_record.features:
        if feature.type == 'CDS':
            # .get() skips CDS features without a 'product' qualifier instead of raising KeyError
            if feature.qualifiers.get('product') == ['replicase polyprotein 1ab']:
                rdrp_nl63_nt = feature.location.extract(seq_record.seq)[12129:14910]
                rdrp_nl63_aa = feature.qualifiers['translation'][0][4043:4970]

if rdrp_nl63_nt is None:
    raise ValueError("No 'replicase polyprotein 1ab' CDS found in the nl63 reference file")

# write_gene_reference takes the record id from the loop variable `seq_record`,
# which still refers to the last record parsed above.
outfile = '../nl63/config/nl63_rdrp_reference.gb'
write_gene_reference(rdrp_nl63_nt, 'nl63_rdrp',
                     'rdrp sequence extracted from whole genome Genbank file based on homology to 229e rdrp', 'nl63', outfile)



SVDISYLNRARGSSAARLEPCNGTDIDKCVRAFDIYNKNVSFLGKCLKMNCVRFKNADLKDGYFVIKRCTKSVMEHEQSMYNLLNFSGALAEHDFFTWKDGRVIYGNVSRHNLTKYTMMDLVYAMRNFDEQNCDVLKEVLVLTGCCDNSYFDSKGWYDPVENEDIHRVYASLGKIVARAMLKCVALCDAMVAKGVVGVLTLDNQDLNGNFYDFGDFVVSLPNMGVPCCTSYYSYMMPIMGLTNCLASECFVKSDIFGSDFKTFDLLKYDFTEHKENLFNKYFKHWSFDYHPNCSDCYDDMCVIHCANFNTLFATTIPGTAFGPLCRKVFIDGVPLVTTAGYHFKQLGLVWNKDVNTHSVRLTITELLQFVTDPSLIIASSPALVDQRTICFSVAALSTGLTNQVVKPGHFNEEFYNFLRLRGFFDEGSELTLKHFFFAQNGDAAVKDFDFYRYNKPTILDICQARVTYKIVSRYFDIYEGGCIKACEVVVTNLNKSAGWPLNKFGKASLYYESISYEEQDALFALTKRNVLPTMTQLNLKYAISGKERARTVGGVSLLSTMTTRQYHQKHLKSIVNTRNATVVIGTTKFYGGWNNMLRTLIDGVENPMLMGWDYPKCDRALPNMIRMISAMVLGSKHVNCCTATDRFYRLGNELAQVLTEVVYSNGGFYFKPGGTTSGDASTAYANSIFNIFQAVSSNINRLLSVPSDSCNNVNVRDLQRRLYDNCYRLTSVEESFIDDYYGYLRKHFSMMILSDDGVVCYNKDYAELGYIADISAFKATLYYQNNVFMSTSKCWVEEDLTKGPHEFCSQHTMQIVDKDGTYYLPYPDPSRILSAGVFVDDVVKTDAVVLLERYVSLAIDAYPLSKHPNSEYRKVFYVLLDWVKHLNKNLNEGVLESFSVTLLDNQEDKFWCEDFYASMYENSTILQ


In [46]:
#hku1
#full reference file has nsp12 annotated
# Extract the annotated 'nsp12' mature peptide directly; no manual coordinates needed.

# Reset first so a value left over from an earlier run of this cell can never
# be written out if the feature is not found this time.
rdrp_hku1_nt = None
rdrp_hku1_aa = None

for seq_record in SeqIO.parse("../hku1/config/hku1_full_reference.gb", "genbank"):
    for feature in seq_record.features:
        if feature.type == 'mat_peptide':
            # .get() skips mat_peptide features without a 'product' qualifier instead of raising KeyError
            if feature.qualifiers.get('product') == ['nsp12']:
                rdrp_hku1_nt = feature.location.extract(seq_record.seq)
                # Translate the sequence already extracted rather than extracting it a second time
                rdrp_hku1_aa = rdrp_hku1_nt.translate()

if rdrp_hku1_nt is None:
    raise ValueError("No 'nsp12' mat_peptide found in the hku1 reference file")

# write_gene_reference takes the record id from the loop variable `seq_record`,
# which still refers to the last record parsed above.
# NOTE(review): the description says "based on homology to oc43 rdrp", but this cell
# extracts the annotated nsp12 feature - confirm whether the wording is intended.
outfile = '../hku1/config/hku1_rdrp_reference.gb'
write_gene_reference(rdrp_hku1_nt, 'hku1_rdrp',
                     'rdrp sequence extracted from whole genome Genbank file based on homology to oc43 rdrp', 'hku1', outfile)

