In [161]:
import re
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.SeqFeature import SeqFeature, FeatureLocation

In [168]:
# Start motifs of the spike subdomains: the first six amino acids of S1 and
# of S2 for each virus.
# Source (UniProt): NL63 (Q6Q1S2), 229E (P15423), OC43 (P36334), HKU1 (Q0ZME7).
s1_domains = {
    'nl63': 'CNSNAN',
    '229e': 'CQTTNG',
    'oc43': 'AVIGDL',
    'hku1': 'AVIGDF',
}
s2_domains = {
    'nl63': 'SNGGNN',
    '229e': 'SNGTYN',
    'oc43': 'AITTGY',
    'hku1': 'SISASY',
}

In [169]:
def write_gene_reference(gene_seq, gene_id, gene_name, gene_description, cov_type, outfile):
    """Write a nucleotide gene sequence to ``outfile`` as an annotated GenBank record.

    The record carries two features that both span the whole sequence: a
    ``source`` feature naming the virus and a ``CDS`` feature holding the
    translation of ``gene_seq``.

    Args:
        gene_seq: Biopython ``Seq`` with the coding nucleotide sequence; it
            must support ``len()`` and ``translate()``.
        gene_id: Value for the record ``id``.
        gene_name: Value for the record ``name``.
        gene_description: Value for the record ``description``.
        cov_type: Virus label stored in the ``organism`` qualifier of the
            source feature.
        outfile: Destination passed straight to ``SeqIO.write``.
    """
    gene_record = SeqRecord(gene_seq, id=gene_id,
                            name=gene_name,
                            description=gene_description)

    # The qualifier key was previously misspelled 'organsism', which produced
    # a nonstandard qualifier in the written file.
    source_feature = SeqFeature(FeatureLocation(0, len(gene_seq)), type='source',
                                qualifiers={'organism': cov_type, 'mol_type': 'genomic RNA'})
    gene_record.features.append(source_feature)

    # Store the translation as plain text instead of a Seq object, so the
    # qualifier value does not depend on how the writer formats Seq instances.
    cds_feature = SeqFeature(FeatureLocation(0, len(gene_seq)), type='CDS',
                             qualifiers={'translation': str(gene_seq.translate())})
    gene_record.features.append(cds_feature)

    SeqIO.write(gene_record, outfile, 'genbank')

In [170]:
def _locate_s1_s2(aa_seq, s1_motif, s2_motif):
    """Return the amino-acid offsets ``(s1_start, s2_start)`` within ``aa_seq``.

    S1 starts at the first occurrence of ``s1_motif``; S2 starts at the first
    occurrence of ``s2_motif`` located after that S1 motif.

    Raises:
        ValueError: If either motif cannot be found.
    """
    s1_start = aa_seq.find(s1_motif)
    if s1_start == -1:
        raise ValueError(f'S1 start motif {s1_motif!r} not found in translated sequence')

    # Search only downstream of the S1 motif so S2 can never start inside or
    # before S1.
    s2_start = aa_seq.find(s2_motif, s1_start + len(s1_motif))
    if s2_start == -1:
        raise ValueError(f'S2 start motif {s2_motif!r} not found after S1 motif {s1_motif!r}')

    return s1_start, s2_start


def make_s1_s2_reference(cov):
    """Split a spike reference into S1 and S2 GenBank reference files.

    Reads ``../<cov>/config/<cov>_spike_reference.gb``, translates each record,
    locates the subdomains via the start motifs in ``s1_domains`` and
    ``s2_domains``, and writes the matching nucleotide slices to
    ``<cov>_s1_reference.gb`` and ``<cov>_s2_reference.gb`` in the same
    directory.

    S1 runs from its start motif up to the S2 start motif; S2 runs from its
    start motif to the end of the translated sequence. Both subdomains share
    one boundary, so they are contiguous and never overlap.

    Note: the output paths depend only on ``cov``, so every record in the
    input file is written to the same two paths.

    Args:
        cov: Virus key, e.g. ``'oc43'``; must be present in ``s1_domains`` and
            ``s2_domains``.

    Raises:
        KeyError: If ``cov`` has no entry in the motif dictionaries.
        ValueError: If a start motif is not found in a translated record.
    """
    s1_motif = s1_domains[cov]
    s2_motif = s2_domains[cov]

    config_prefix = f'../{cov}/config/{cov}'
    spike_reference = f'{config_prefix}_spike_reference.gb'

    with open(spike_reference, "r") as handle:
        for record in SeqIO.parse(handle, "genbank"):
            nt_seq = record.seq
            aa_seq = str(nt_seq.translate())

            s1_start, s2_start = _locate_s1_s2(aa_seq, s1_motif, s2_motif)

            # Amino-acid offsets map to nucleotide offsets by a factor of three.
            s1_nt_seq = nt_seq[s1_start * 3: s2_start * 3]
            s2_nt_seq = nt_seq[s2_start * 3: len(aa_seq) * 3]

            write_gene_reference(s1_nt_seq, record.id, f'{cov}_S1', 'spike s1 subdomain',
                                 cov, f'{config_prefix}_s1_reference.gb')
            write_gene_reference(s2_nt_seq, record.id, f'{cov}_S2', 'spike s2 subdomain',
                                 cov, f'{config_prefix}_s2_reference.gb')

In [171]:
# Build the S1 and S2 reference files for each virus in turn.
covs = [
    'oc43',
    '229e',
    'nl63',
    'hku1',
]
for cov in covs:
    make_s1_s2_reference(cov)