In [2]:
import re
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.SeqFeature import SeqFeature, FeatureLocation

In [25]:
# Each value is the first 6 amino acids of the named domain for that virus.
# Spike sequences from UniProt: NL63 (Q6Q1S2), 229E (P15423), OC43 (P36334), HKU1 (Q0ZME7).
# NL63 S1 domain definition: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2693060/
s1_domains = {
    'nl63': 'FFTCNS',
    '229e': 'CQTTNG',
    'oc43': 'AVIGDL',
    'hku1': 'AVIGDF',
}
s2_domains = {
    'nl63': 'SSDNGI',
    '229e': 'IIAVQP',
    'oc43': 'AITTGY',
    'hku1': 'SISASY',
}

# Motifs that open and close the RdRp region (only OC43 is defined so far).
rdrp_domains_start = {
    'oc43': 'SKDTNF',
}
rdrp_domains_end = {
    'oc43': 'RSAVMQ',
}

In [38]:
def write_gene_reference(gene_seq, gene_id, gene_name, gene_description, cov_type, outfile):
    """Write a nucleotide sequence to a GenBank file as a reference record.

    The record carries two features, both spanning the entire sequence: a
    ``source`` feature and a ``CDS`` feature whose ``translation`` qualifier
    holds the translation of ``gene_seq``.

    Args:
        gene_seq: Sequence to write; must support ``len()`` and ``.translate()``.
        gene_id: Used as the record ``id``.
        gene_name: Used as the record ``name``.
        gene_description: Used as the record ``description``.
        cov_type: Stored in the ``organism`` qualifier of the source feature.
        outfile: Destination passed straight to ``SeqIO.write``.
    """
    seq_length = len(gene_seq)
    gene_record = SeqRecord(gene_seq, id=gene_id,
                            name=gene_name,
                            description=gene_description)

    # The qualifier key used to be misspelled 'organsism', which produced a
    # non-standard qualifier instead of the intended /organism entry.
    source_feature = SeqFeature(FeatureLocation(0, seq_length), type='source',
                                qualifiers={'organism': cov_type, "mol_type": "genomic RNA"})
    gene_record.features.append(source_feature)

    # Store the translation as plain text so the qualifier value does not
    # depend on how a sequence object is rendered by the writer.
    cds_feature = SeqFeature(FeatureLocation(0, seq_length), type='CDS',
                             qualifiers={'translation': str(gene_seq.translate())})
    gene_record.features.append(cds_feature)

    SeqIO.write(gene_record, outfile, 'genbank')

In [15]:
def make_s1_s2_reference(cov):
    """Split a spike reference into S1 and S2 GenBank reference files.

    Reads ``../<cov>/config/<cov>_spike_reference.gb``, translates each record
    from its first nucleotide, and locates the two subdomains on the protein:

    * S1 starts at ``s1_domains[cov]`` and ends right before the S2 start
      motif (the pattern is greedy, so the last occurrence of that motif).
    * S2 starts at the first occurrence of ``s2_domains[cov]`` and runs to
      the end of the translation.

    The matching nucleotide ranges are written to
    ``<cov>_s1_reference.gb`` and ``<cov>_s2_reference.gb`` in the same
    config directory via ``write_gene_reference``.

    Args:
        cov: Virus key (e.g. ``'oc43'``) used for file paths and motif lookup.

    Raises:
        KeyError: If ``cov`` has no entry in ``s1_domains``/``s2_domains``.
        ValueError: If either motif pattern is not found in the translation.
    """
    config_dir = '../' + str(cov) + '/config/'
    spike_reference = config_dir + str(cov) + '_spike_reference.gb'

    # Patterns depend only on `cov`, so build them once up front.
    s1_regex = re.compile(f'{s1_domains[cov]}.*(?={s2_domains[cov]})')
    s2_regex = re.compile(f'{s2_domains[cov]}.*')

    with open(spike_reference, "r") as handle:
        for record in SeqIO.parse(handle, "genbank"):
            nt_seq = record.seq
            aa_str = str(nt_seq.translate())

            s1_match = s1_regex.search(aa_str)
            s2_match = s2_regex.search(aa_str)
            if s1_match is None or s2_match is None:
                raise ValueError(
                    f'Could not locate S1/S2 domain motifs for {cov!r} '
                    f'in record {record.id!r} of {spike_reference}'
                )

            # Translation starts at nucleotide 0, so amino-acid index i maps
            # to nucleotides [3*i, 3*i + 3).
            s1_nt_seq = nt_seq[s1_match.start() * 3: s1_match.end() * 3]
            s2_nt_seq = nt_seq[s2_match.start() * 3: s2_match.end() * 3]

            write_gene_reference(s1_nt_seq, record.id, str(cov)+'_S1', 'spike s1 subdomain',
                                 cov, config_dir + str(cov) + '_s1_reference.gb')
            write_gene_reference(s2_nt_seq, record.id, str(cov)+'_S2', 'spike s2 subdomain',
                                 cov, config_dir + str(cov) + '_s2_reference.gb')

In [26]:
# Alternative full list of viruses, kept for convenience: ['oc43', '229e', 'nl63', 'hku1']
# Only the viruses named in `covs` get S1/S2 reference files when this cell runs.
covs = ['229e']
for cov in covs:
    make_s1_s2_reference(cov)

In [20]:
def make_rdrp_reference(cov):
    """Extract the RdRp region from a replicase 1ab reference into its own file.

    Reads ``../<cov>/config/<cov>_replicase1ab_reference.gb``, translates each
    record from its first nucleotide, and finds the protein stretch that
    begins with ``rdrp_domains_start[cov]`` and ends with
    ``rdrp_domains_end[cov]`` (greedy, so it extends to the last occurrence of
    the end motif). The corresponding nucleotides are written to
    ``<cov>_rdrp_reference.gb`` via ``write_gene_reference``.

    Args:
        cov: Virus key used for file paths and motif lookup.

    Raises:
        KeyError: If ``cov`` has no entry in the RdRp motif dictionaries.
        ValueError: If the motif pattern is not found in the translation.
    """
    config_dir = '../' + str(cov) + '/config/'
    replicase_reference = config_dir + str(cov) + '_replicase1ab_reference.gb'

    # The pattern depends only on `cov`, so build it once up front.
    rdrp_regex = re.compile(f'{rdrp_domains_start[cov]}.*{rdrp_domains_end[cov]}')

    with open(replicase_reference, "r") as handle:
        for record in SeqIO.parse(handle, "genbank"):
            nt_seq = record.seq
            aa_str = str(nt_seq.translate())

            rdrp_match = rdrp_regex.search(aa_str)
            if rdrp_match is None:
                raise ValueError(
                    f'Could not locate RdRp domain motifs for {cov!r} '
                    f'in record {record.id!r} of {replicase_reference}'
                )

            # Translation starts at nucleotide 0, so amino-acid index i maps
            # to nucleotides [3*i, 3*i + 3).
            rdrp_nt_seq = nt_seq[rdrp_match.start() * 3: rdrp_match.end() * 3]

            write_gene_reference(rdrp_nt_seq, record.id, str(cov)+'_rdrp', 'rna-dependent rna polymerase',
                                 cov, config_dir + str(cov) + '_rdrp_reference.gb')

In [36]:
def make_left_right_ofspike_reference(cov):
    """Write the genome segments on either side of the spike gene to GenBank files.

    Reads ``../<cov>/config/<cov>_full_reference.gb`` and, for each record,
    looks for a CDS feature whose first ``gene`` qualifier is ``s`` or
    ``spike`` (case-insensitive). Everything before the spike start is written
    to ``<cov>_left_reference.gb`` and everything after the spike end to
    ``<cov>_right_reference.gb`` via ``write_gene_reference``. If several CDS
    features match, the last one is used.

    Args:
        cov: Virus key used to build the input and output paths.

    Raises:
        ValueError: If a record has no CDS feature annotated as the spike gene.
    """
    config_dir = '../' + str(cov) + '/config/'
    full_reference = config_dir + str(cov) + '_full_reference.gb'

    with open(full_reference, "r") as handle:
        for record in SeqIO.parse(handle, "genbank"):
            nt_seq = record.seq

            # Reset for every record so a missing annotation is detected
            # instead of raising NameError or reusing a previous record's location.
            spike_location = None
            for feature in record.features:
                if feature.type != 'CDS':
                    continue
                gene_names = feature.qualifiers.get('gene')
                if gene_names and gene_names[0].lower() in ('s', 'spike'):
                    spike_location = feature.location

            if spike_location is None:
                raise ValueError(
                    f'No spike CDS feature found in record {record.id!r} of {full_reference}'
                )

            left_location = SeqFeature(FeatureLocation(0, spike_location.start))
            right_location = SeqFeature(FeatureLocation(spike_location.end, len(nt_seq)))

            left_seq = left_location.extract(nt_seq)
            right_seq = right_location.extract(nt_seq)

            write_gene_reference(left_seq, record.id, str(cov)+'_left', 'genome left of spike',
                                 cov, config_dir + str(cov) + '_left_reference.gb')
            write_gene_reference(right_seq, record.id, str(cov)+'_right', 'genome right of spike',
                                 cov, config_dir + str(cov) + '_right_reference.gb')

In [39]:
# Alternative full list of viruses, kept for convenience: ['oc43', '229e', 'nl63', 'hku1']
# Only the viruses named in `covs` are split into left/right-of-spike references when this cell runs.
covs = ['oc43']
for cov in covs:
    make_left_right_ofspike_reference(cov)

In [None]:
def find_rbd_flu(flu):
    """Print the RBD amino-acid sequence found in a seasonal-flu HA reference.

    Reads ``../../seasonal-flu/config/reference_<flu>_ha.gb``, translates each
    record from its first nucleotide, and searches for the stretch that starts
    with ``rbd_domain_start[flu]`` and ends with ``rbd_domain_end[flu]``. The
    match from the last record in the file is printed; ``None`` is printed if
    the file contains no records.

    Args:
        flu: Flu lineage key used for the file path and motif lookup.

    Raises:
        KeyError: If ``flu`` has no entry in the RBD motif dictionaries.
        ValueError: If the motif pattern is not found in a record's translation.
    """
    # The path must be built from the `flu` argument; it previously read the
    # unrelated notebook-level variable `cov`.
    ha_reference = '../../seasonal-flu/config/reference_'+str(flu)+'_ha.gb'

    rbd_regex = re.compile(f'{rbd_domain_start[flu]}.*{rbd_domain_end[flu]}')

    rbd_aa = None
    with open(ha_reference, "r") as handle:
        for record in SeqIO.parse(handle, "genbank"):
            aa_str = str(record.seq.translate())

            rbd_match = rbd_regex.search(aa_str)
            if rbd_match is None:
                raise ValueError(
                    f'Could not locate RBD motifs for {flu!r} '
                    f'in record {record.id!r} of {ha_reference}'
                )
            rbd_aa = rbd_match.group()
    print(rbd_aa)

In [71]:
def find_rbd(cov):
    """Print the amino-acid coordinates of the RBD within an S1 reference.

    Reads ``../<cov>/config/<cov>_s1_reference.gb``, translates each record
    from its first nucleotide, and searches for the stretch that starts with
    ``rbd_domain_start[cov]`` and ends with ``rbd_domain_end[cov]``. The
    ``(start, end)`` span of the match in the last record is printed, as
    0-based, end-exclusive indices into the translated sequence; ``None`` is
    printed if the file contains no records.

    Args:
        cov: Virus key used for the file path and motif lookup.

    Raises:
        KeyError: If ``cov`` has no entry in the RBD motif dictionaries.
        ValueError: If the motif pattern is not found in a record's translation.
    """
    spike_reference = '../'+str(cov)+'/config/'+str(cov)+'_s1_reference.gb'

    rbd_regex = re.compile(f'{rbd_domain_start[cov]}.*{rbd_domain_end[cov]}')

    rbd_aa_coords = None
    with open(spike_reference, "r") as handle:
        for record in SeqIO.parse(handle, "genbank"):
            aa_str = str(record.seq.translate())

            rbd_match = rbd_regex.search(aa_str)
            if rbd_match is None:
                raise ValueError(
                    f'Could not locate RBD motifs for {cov!r} '
                    f'in record {record.id!r} of {spike_reference}'
                )
            # span() yields the same (start, end) tuple that was printed before.
            rbd_aa_coords = rbd_match.span()
    print(rbd_aa_coords)

In [63]:
# Motifs that open and close the receptor-binding domain (RBD), keyed by virus.
# OC43: https://virologyj.biomedcentral.com/articles/10.1186/1743-422X-2-73/figures/5
# TODO: add 'h3n2' motifs. The keys were previously present without values,
# which is a SyntaxError, so they are left out until the motifs are known.
rbd_domain_start = {'oc43': 'NLPNCN'}
rbd_domain_end = {'oc43': 'TDLQKA'}

In [72]:
# Print the RBD amino-acid coordinates found in the OC43 S1 reference.
find_rbd('oc43')

(323, 607)
