In [74]:
import re
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import AlignIO
from collections import Counter 

In [75]:
#Gene names mapped to the GenBank CDS product annotation used for each virus
def get_virus_genes(virus):
    """Return the gene-name -> product-name mapping for one virus.

    Keys are the short gene names used by the rest of this notebook; values
    are the strings compared against the ``product`` qualifier of CDS features
    in the reference GenBank file. The ``'s1'`` and ``'s2'`` entries are
    placeholder labels for the spike subdomains rather than product names.

    Args:
        virus: one of ``'nl63'``, ``'229e'``, ``'hku1'`` or ``'oc43'``.

    Returns:
        dict mapping gene name to product name, in a fixed insertion order.

    Raises:
        ValueError: if ``virus`` is not one of the supported names.
    """
    genes_by_virus = {
        'nl63': {
            'replicase1ab': "replicase polyprotein 1ab",
            'spike': "spike protein",
            'protein3': "protein 3",
            'envelope': "envelope protein",
            'membrane': "membrane protein",
            'nucleocapsid': "nucleocapsid protein",
            's1': 'spike_subdomain1',
            's2': 'spike_subdomain2',
        },
        '229e': {
            'replicase1ab': "replicase polyprotein 1ab",
            'replicase1a': "replicase polyprotein 1a",
            'spike': "surface glycoprotein",
            'protein4a': "4a protein",
            'protein4b': "4b protein",
            'envelope': "envelope protein",
            'membrane': "membrane protein",
            'nucleocapsid': "nucleocapsid protein",
            's1': 'spike_subdomain1',
            's2': 'spike_subdomain2',
        },
        'hku1': {
            'replicase1ab': "orf1ab polyprotein",
            'he': "hemagglutinin-esterase glycoprotein",
            'spike': "spike glycoprotein",
            'nonstructural4': "non-structural protein",
            'envelope': "small membrane protein",
            'membrane': "membrane glycoprotein",
            'nucleocapsid': "nucleocapsid phosphoprotein",
            'nucleocapsid2': "nucleocapsid phosphoprotein 2",
            's1': 'spike_subdomain1',
            's2': 'spike_subdomain2',
        },
        'oc43': {
            'replicase1ab': "replicase polyprotein",
            'nonstructural2a': "NS2a protein",
            'he': "HE protein",
            'spike': "S protein",
            'nonstructural2': "NS2 protein",
            'envelope': "NS3 protein",
            'membrane': "M protein",
            'nucleocapsid': "N protein",
            'n2protein': "N2 protein",
            's1': 'spike_subdomain1',
            's2': 'spike_subdomain2',
        },
    }

    # Previously an unknown virus fell through every branch and failed with an
    # UnboundLocalError on the return statement; fail with a clear message.
    if virus not in genes_by_virus:
        supported = ', '.join(sorted(genes_by_virus))
        raise ValueError(f"Unknown virus {virus!r}; expected one of: {supported}")
    return genes_by_virus[virus]


In [32]:
#first 6 aas of each domain
#from uniprot: NL63 (Q6Q1S2), 229e(P15423), oc43 (P36334), hku1 (Q0ZME7)
# Six-residue motif at the start of the S1 subdomain, keyed by virus
s1_domains = {
    'nl63': 'CNSNAN',
    '229e': 'CQTTNG',
    'oc43': 'AVIGDL',
    'hku1': 'AVIGDF',
}
# Six-residue motif at the start of the S2 subdomain, keyed by virus
s2_domains = {
    'nl63': 'SNGGNN',
    '229e': 'SNGTYN',
    'oc43': 'AITTGY',
    'hku1': 'SISASY',
}

In [87]:
def get_s1_coords(virus):
    """Locate the S1 subdomain within the spike reference sequence.

    Reads ``../<virus>/config/<virus>_spike_reference.gb``, translates each
    record's sequence and searches the translation for the stretch that starts
    at the virus's S1 motif (``s1_domains``) and runs up to, but not including,
    its S2 motif (``s2_domains``). The match is greedy, so it extends to the
    last occurrence of the S2 motif. If the file holds several records, the
    coordinates of the last one are returned.

    Args:
        virus: key into ``s1_domains`` / ``s2_domains``; also used to build
            the reference file path.

    Returns:
        ``[start, end]`` nucleotide offsets (amino-acid offsets times 3)
        relative to the start of the reference record's sequence.

    Raises:
        ValueError: if the file contains no records, or if the S1 region
            cannot be found in a record's translation (this used to surface
            as an AttributeError on ``None``).
    """
    spike_reference = '../'+str(virus)+'/config/'+str(virus)+'_spike_reference.gb'

    s1_nt_coords = None
    with open(spike_reference, "r") as handle:
        s1_regex = re.compile(f'{s1_domains[virus]}.*(?={s2_domains[virus]})')
        for record in SeqIO.parse(handle, "genbank"):
            aa_seq = str(record.seq.translate())

            # One search is enough: the first finditer hit is the same match.
            s1_match = s1_regex.search(aa_seq)
            if s1_match is None:
                raise ValueError(
                    f"S1 region ({s1_domains[virus]}...{s2_domains[virus]}) "
                    f"not found in translation of {spike_reference}")
            # Convert amino-acid offsets to nucleotide offsets.
            s1_nt_coords = [s1_match.start()*3, s1_match.end()*3]

    if s1_nt_coords is None:
        raise ValueError(f"No GenBank records found in {spike_reference}")
    return s1_nt_coords

In [88]:
def get_s2_coords(virus):
    """Locate the S2 subdomain within the spike reference sequence.

    Reads ``../<virus>/config/<virus>_spike_reference.gb``, translates each
    record's sequence and searches the translation for the stretch that starts
    at the first occurrence of the virus's S2 motif (``s2_domains``) and runs
    to the end of the translation. If the file holds several records, the
    coordinates of the last one are returned.

    Args:
        virus: key into ``s2_domains``; also used to build the reference
            file path.

    Returns:
        ``[start, end]`` nucleotide offsets (amino-acid offsets times 3)
        relative to the start of the reference record's sequence.

    Raises:
        ValueError: if the file contains no records, or if the S2 motif
            cannot be found in a record's translation (this used to surface
            as an AttributeError on ``None``).
    """
    spike_reference = '../'+str(virus)+'/config/'+str(virus)+'_spike_reference.gb'

    s2_nt_coords = None
    with open(spike_reference, "r") as handle:
        s2_regex = re.compile(f'{s2_domains[virus]}.*')
        for record in SeqIO.parse(handle, "genbank"):
            aa_seq = str(record.seq.translate())

            # One search is enough: the first finditer hit is the same match.
            s2_match = s2_regex.search(aa_seq)
            if s2_match is None:
                raise ValueError(
                    f"S2 motif {s2_domains[virus]} not found in translation "
                    f"of {spike_reference}")
            # Convert amino-acid offsets to nucleotide offsets.
            s2_nt_coords = [s2_match.start()*3, s2_match.end()*3]

    if s2_nt_coords is None:
        raise ValueError(f"No GenBank records found in {spike_reference}")
    return s2_nt_coords

In [90]:
#Extract the nucleotides of one gene (or spike subdomain) from a genome-aligned sequence
def get_gene_position_test(virus, gene, sequence):
    """Extract one gene, or one spike subdomain, from an aligned sequence.

    The gene's location is taken from the CDS feature in
    ``../<virus>/config/<virus>_full_reference.gb`` whose ``product``
    qualifier equals the name given by ``get_virus_genes``. For ``'s1'`` and
    ``'s2'`` the spike CDS is extracted first and then sliced with the
    coordinates from ``get_s1_coords`` / ``get_s2_coords``. If several CDS
    features match, the last one wins.

    Args:
        virus: virus name accepted by ``get_virus_genes``.
        gene: key of the gene dictionary for ``virus`` (including 's1'/'s2').
        sequence: sequence to slice; it is passed to
            ``feature.location.extract`` and so is presumably in the same
            coordinate system as the full reference - TODO confirm.

    Returns:
        The extracted (sub)sequence.

    Raises:
        KeyError: if ``gene`` is not a gene name for ``virus``.
        ValueError: if no CDS feature with the expected product exists in the
            reference (this used to be an UnboundLocalError).
    """
    genes_dict = get_virus_genes(virus)

    # S1 and S2 are slices of the spike CDS, not features of their own.
    subdomain_coord_getters = {'s1': get_s1_coords, 's2': get_s2_coords}
    is_subdomain = gene in subdomain_coord_getters
    target_product = genes_dict['spike'] if is_subdomain else genes_dict[gene]

    reference_gb = "../"+str(virus)+"/config/"+str(virus)+"_full_reference.gb"
    gene_nt = None
    for seq_record in SeqIO.parse(reference_gb, "genbank"):
        for feature in seq_record.features:
            if feature.type != 'CDS':
                continue
            # .get(): a CDS with no product qualifier is skipped, not a KeyError.
            if feature.qualifiers.get('product') != [target_product]:
                continue
            gene_nt = feature.location.extract(sequence)
            if is_subdomain:
                start, end = subdomain_coord_getters[gene](virus)
                gene_nt = gene_nt[start:end]

    if gene_nt is None:
        raise ValueError(
            f"No CDS with product {target_product!r} found in {reference_gb}")
    return gene_nt

In [94]:
#From aligned .fasta, extract just the portion of the genome encoding each gene
def extract_genes(virus):
    """Write one FASTA file per gene, cut out of the genome-wide alignment.

    For every gene name returned by ``get_virus_genes(virus)`` the gene is
    extracted from each record of
    ``../<virus>/results/aligned_<virus>_full.fasta``. Records in which half
    or more of the gene is ``N`` are dropped. The remaining sequences are
    matched to the records of ``../<virus>/data/<virus>_full.fasta`` through
    the second ``|``-separated field of the original record id, and written to
    ``../<virus>/data/<virus>_<gene>.fasta`` under the original record id.

    Args:
        virus: virus name accepted by ``get_virus_genes``.

    Returns:
        None. The output FASTA files are the only result.

    Raises:
        IndexError: if an id in the original FASTA has no ``|`` separator.
    """
    aligned_fasta = "../"+str(virus)+"/results/aligned_"+str(virus)+"_full.fasta"
    original_fasta = "../"+str(virus)+"/data/"+str(virus)+"_full.fasta"

    genes = list(get_virus_genes(virus))

    # Both inputs are the same for every gene, so parse each file once
    # instead of once per gene.
    with open(aligned_fasta, "r") as handle:
        aligned_records = [(aligned_record.id, aligned_record.seq)
                           for aligned_record in SeqIO.parse(handle, "fasta")]
    with open(original_fasta, "r") as handle_2:
        # (full original id, strain name taken from the second '|' field)
        meta_ids = [(meta_record.id, meta_record.id.split('|')[1])
                    for meta_record in SeqIO.parse(handle_2, "fasta")]

    for gene in genes:
        output_fasta = "../"+str(virus)+"/data/"+str(virus)+"_"+str(gene)+".fasta"

        gene_sequences = {}
        for aligned_id, aligned_seq in aligned_records:
            gene_nt = get_gene_position_test(virus, gene, aligned_seq)
            #Throw out sequences that don't cover gene
            num_unaligned_gene = str(gene_nt).count('N')
            if num_unaligned_gene < (len(gene_nt)/2):
                gene_sequences[aligned_id] = gene_nt

        gene_entries = [
            SeqRecord(gene_sequences[strain_name], id=meta_id, description=meta_id)
            for meta_id, strain_name in meta_ids
            if strain_name in gene_sequences
        ]

        SeqIO.write(gene_entries, output_fasta, "fasta")


In [98]:
# Write one FASTA per gene (and per spike subdomain) for NL63
extract_genes('nl63')

In [None]:
###Old below

In [None]:
#Extract the nucleotides of one gene from a genome-aligned sequence (no spike subdomain support)
def get_gene_position(virus, gene, sequence):
    """Extract one gene from an aligned sequence using the reference CDS.

    The gene's location is taken from the CDS feature in
    ``../<virus>/config/<virus>_full_reference.gb`` whose ``product``
    qualifier equals ``get_virus_genes(virus)[gene]``. If several CDS features
    match, the last one wins. The 's1'/'s2' entries are not handled
    specially, so they only match if a CDS carries that literal product name.

    Args:
        virus: virus name accepted by ``get_virus_genes``.
        gene: key of the gene dictionary for ``virus``.
        sequence: sequence to slice with the feature location.

    Returns:
        The extracted subsequence.

    Raises:
        KeyError: if ``gene`` is not a gene name for ``virus``.
        ValueError: if no CDS feature with the expected product exists in the
            reference (this used to be an UnboundLocalError).
    """
    target_product = get_virus_genes(virus)[gene]
    reference_gb = "../"+str(virus)+"/config/"+str(virus)+"_full_reference.gb"

    gene_nt = None
    for seq_record in SeqIO.parse(reference_gb, "genbank"):
        for feature in seq_record.features:
            # .get(): a CDS with no product qualifier is skipped, not a KeyError.
            if feature.type == 'CDS' and feature.qualifiers.get('product') == [target_product]:
                gene_nt = feature.location.extract(sequence)

    if gene_nt is None:
        raise ValueError(
            f"No CDS with product {target_product!r} found in {reference_gb}")
    return gene_nt


In [122]:
#From aligned .fasta, extract just the portions of the genome encoding Spike and HE
def _extract_spike_region(aligned_seq, coords):
    """Return ``aligned_seq[coords[0]:coords[1]]``, or None if half or more of it is 'N'."""
    region_nt = aligned_seq[coords[0]:coords[1]]
    if str(region_nt).count('N') < (len(region_nt)/2):
        return region_nt
    return None


def extract_spike(aligned_fasta, original_fasta, output_spike_fasta, output_he_fasta,
                  spike_coords=(23643, 27729), he_coords=(22354, 23629)):
    """Cut the spike and HE regions out of a genome alignment into two FASTA files.

    Each record of ``aligned_fasta`` is sliced at ``spike_coords`` and
    ``he_coords``. A slice is kept only if fewer than half of its characters
    are ``N``. Kept slices are matched to the records of ``original_fasta``
    through the first ``|``-separated field of the original record id and
    written out under that original id. The number of kept spike and HE
    sequences is printed.

    Args:
        aligned_fasta: path of the genome-wide alignment (FASTA).
        original_fasta: path of the FASTA whose record ids carry the metadata.
        output_spike_fasta: path the spike sequences are written to.
        output_he_fasta: path the HE sequences are written to.
        spike_coords: ``(start, end)`` slice bounds of spike in alignment
            coordinates. The default is the previously hard-coded value.
        he_coords: ``(start, end)`` slice bounds of HE in alignment
            coordinates. The default is the previously hard-coded value.

    Returns:
        None. The two output FASTA files are the only result.
    """
    spike_sequences = {}
    he_sequences = {}
    with open(aligned_fasta, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            #Throw out sequences that don't cover Spike
            spike_nt = _extract_spike_region(record.seq, spike_coords)
            if spike_nt is not None:
                spike_sequences[record.id] = spike_nt

            #Throw out sequences that don't cover HE
            he_nt = _extract_spike_region(record.seq, he_coords)
            if he_nt is not None:
                he_sequences[record.id] = he_nt

    print(len(spike_sequences))
    print(len(he_sequences))

    spike_entries = []
    he_entries = []
    with open(original_fasta, "r") as handle_2:
        for record in SeqIO.parse(handle_2, "fasta"):
            # Alignment ids correspond to the first '|' field of the original id.
            gb_id = record.id.split('|')[0]
            if gb_id in spike_sequences:
                spike_entries.append(
                    SeqRecord(spike_sequences[gb_id], id=record.id, description=record.id))
            if gb_id in he_sequences:
                he_entries.append(
                    SeqRecord(he_sequences[gb_id], id=record.id, description=record.id))

    SeqIO.write(spike_entries, output_spike_fasta, "fasta")
    SeqIO.write(he_entries, output_he_fasta, "fasta")

In [127]:
# Extract spike and HE from the genome-wide HKU1 alignment; the call prints
# the number of spike sequences and then the number of HE sequences kept.
extract_spike("../nextstrain/seasonal-corona-genome/results/aligned_hku1.fasta", 
              "../nextstrain/seasonal-corona-genome/data/hku1_datefix.fasta", 
              "../nextstrain/seasonal-corona-beta/data/hku1_spike_genomealign.fasta", 
              "../nextstrain/seasonal-corona-beta/data/hku1_he_genomealign.fasta")


39
29
