In [2]:
import re
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import AlignIO
from collections import Counter 
from ete3 import Tree

In [None]:
#1. Download all coronavirus genomes with a human host from ViPR. Select all possible metadata fields and download the result as a fasta file

In [None]:
#2. Name this file 'human_cov_genome.fasta' and put it in the seasonal-cov/data/ directory

In [None]:
#3. Run add_subtype_to_virus_fastas function
# The output is written next to the input, with '.fasta' replaced by '_annotated.fasta'.
add_subtype_to_virus_fastas("../data/human_cov_genome.fasta")

In [None]:
#4. Run fix_fasta_dateformat function
# The reformatted records are always written to '../data/human_cov_full.fasta'.
fix_fasta_dateformat("../data/human_cov_genome_annotated.fasta")

In [None]:
#5. In terminal, run: `snakemake` from seasonal-cov/

In [14]:
#6. Run find_unlabled_subtypes function (note: the function name is spelled 'unlabled' in this notebook)
# Needs '../results/tree_cov_full.nwk'; writes the input path with '.fasta' replaced by '_subtyped.fasta'.
find_unlabled_subtypes('../data/human_cov_full.fasta')

In [23]:
#7. Run separate_virus_fastas function for each virus
# Each call reads '../data/human_cov_full_subtyped.fasta' and writes '../<virus>/data/<virus>_full.fasta'.
viruses = ['oc43', '229e', 'hku1', 'nl63']
for virus in viruses:
    separate_virus_fastas(virus)

In [None]:
#8. Toggle each virus's Snakefile to GENE = ["full"]. 
# In terminal, run: `snakemake` from within each virus directory

In [13]:
#9. Run extract_genes function for each virus
# NOTE(review): the list below is currently narrowed to 'hku1' only, so the other
# viruses are NOT processed by this cell. Full list, kept for reference:
# viruses = ['oc43', '229e', 'hku1', 'nl63']
viruses = ['hku1']
for virus in viruses:
    extract_genes(virus)

In [None]:
#10. Toggle each virus's Snakefile to GENE = ["replicase1ab", ...]. 
# In terminal, run: `snakemake` from within each virus directory

In [3]:
def add_subtype_to_virus_fastas(input_fasta):
    """Append subtype, type and locus fields to every header of a fasta file.

    Three '|'-separated fields are added to the end of each header: the
    subtype (lower-cased), its type ('alpha' or 'beta') and the literal
    'full'. Subtype and type stay 'None' when none of the subtype names
    occurs anywhere in the header; when several occur, the one listed last
    in the lookup table wins. The subtype field is ultimately used to create
    a column in fauna.

    The result is written to the input path with '.fasta' replaced by
    '_annotated.fasta'.

    New fasta fields format:
    'gb-id|strain|date|host|country|virus species|subtype|type|sequence_locus'
    """
    annotated_path = str(input_fasta.replace('.fasta', '')) + "_annotated.fasta"

    # subtype name as searched for in the header -> coronavirus type
    type_by_subtype = {'OC43': 'beta', 'HKU1': 'beta', 'NL63': 'alpha', '229E': 'alpha'}

    annotated = []
    with open(input_fasta, "r") as fasta_in:
        for entry in SeqIO.parse(fasta_in, "fasta"):
            header = entry.description

            subtype_field, type_field = 'None', 'None'
            for name, cov_type in type_by_subtype.items():
                if name in header:
                    subtype_field, type_field = name.lower(), cov_type

            fields = header.split('|') + [subtype_field, type_field, 'full']
            new_header = '|'.join(fields)
            annotated.append(SeqRecord(entry.seq, id=new_header, description=new_header))

    SeqIO.write(annotated, annotated_path, "fasta")

In [4]:
def fix_fasta_dateformat(input_fasta):
    """Normalise the date field of every header and drop two header fields.

    The date is the 4th '|'-separated header field. Underscores in it are
    replaced by dashes, and a 4-character ('YYYY') or 7-character ('YYYY-MM')
    date is padded with 'XX' placeholders. The 3rd and the 7th field of the
    original header are then removed. All records are written to
    '../data/human_cov_full.fasta'.
    """
    # suffix that extends a year-only or year-month date to three components
    padding_by_length = {4: '-XX-XX', 7: '-XX'}

    reformatted = []
    with open(input_fasta, "r") as fasta_in:
        for entry in SeqIO.parse(fasta_in, "fasta"):
            fields = entry.description.split('|')

            date = fields[3].replace('_', '-')
            fields[3] = date + padding_by_length.get(len(date), '')

            # After the first removal every later field has shifted left by
            # one, so index 5 now addresses the original 7th field.
            fields.pop(2)
            fields.pop(5)

            header = '|'.join(fields)
            reformatted.append(
                SeqRecord(entry.seq, id=header, name=header, description=header)
            )

    SeqIO.write(reformatted, "../data/human_cov_full.fasta", 'fasta')

In [5]:
# Known viruses for each subtype: subtype name -> accessions whose subtype is
# already established. These are compared against tree leaf names to find
# the clade that belongs to each subtype.
known = {
    'oc43': [
        'KF963229', 'KF530087', 'KF530059', 'LC506876',
        'KU131570', 'LC506782', 'LC506896',
    ],
    'hku1': [
        'KR055515', 'KF430197', 'KF686339',
        'KR055516', 'MF996629', 'MH940245',
    ],
    'nl63': [
        'KM055607', 'KT359837', 'KM055597', 'KM055602',
        'JX104161', 'KY862037', 'MF996663',
    ],
    '229e': [
        'JX503060', 'LC005741', 'KM055524', 'KM055568',
        'KJ866103', 'GU068548', 'KY369908', 'KT359754',
    ],
    'mers': ['KJ156950', 'KT357808', 'MK129253', 'KX034094', 'KJ156890'],
    'sars1': ['AY345986', 'GU553363', 'JN247396', 'DQ182595'],
    'sars2': ['MT326086', 'MT159717', 'MT304486'],
}

def find_unlabled_subtypes(input_fasta):
    """Assign subtypes to tree tips that share a clade with known reference viruses.

    Every non-root node of '../results/tree_cov_full.nwk' is visited. A node
    qualifies for a subtype when its leaf names include every accession
    listed for that subtype in `known` and none of the accessions listed for
    any other subtype. All leaf names under a qualifying node are recorded
    for that subtype.

    The collected names are then used to annotate `input_fasta`. The result
    is written to the input path with '.fasta' replaced by '_subtyped.fasta',
    and strain names are prepended to the headers of that file.

    Only the tree file is read here; no metadata table is required.
    """
    cov_tree = Tree('../results/tree_cov_full.nwk', format=1)

    # These lookups do not depend on the node being inspected, so build them
    # once instead of once per (node, subtype) pair.
    members_by_subtype = {
        subtype: set(accessions) for subtype, accessions in known.items()
    }
    foreign_by_subtype = {
        subtype: {
            accession
            for other, accessions in known.items()
            if other != subtype
            for accession in accessions
        }
        for subtype in known
    }

    subtyped_viruses = {subtype: set() for subtype in known}

    for node in cov_tree.iter_descendants("postorder"):
        tip_names = {leaf.name for leaf in node.get_leaves()}

        for subtype, members in members_by_subtype.items():
            # the node must contain all known viruses of this subtype ...
            if not members <= tip_names:
                continue
            # ... and no known virus of any other subtype
            if tip_names.isdisjoint(foreign_by_subtype[subtype]):
                subtyped_viruses[subtype].update(tip_names)

    output_fasta = str(input_fasta.replace('.fasta', '')) + "_subtyped.fasta"
    add_newly_subtyped_to_virus_fastas(input_fasta, subtyped_viruses, output_fasta)
    make_strainname(output_fasta)

    
def make_strainname(output_fasta):
    """Prepend a generated strain name to every header of a fasta file, in place.

    The strain name is 'subtype/accession/strain/year', built from the
    third-from-last, first and second header fields plus the first four
    characters of the third field (the date). The file is read completely
    and then overwritten.

    New fasta fields format:
    'strain_name|gb-id|strain|date|host|country|virus species|subtype|type|sequence_locus'
    """
    renamed = []

    with open(output_fasta, "r") as fasta_in:
        for entry in SeqIO.parse(fasta_in, "fasta"):
            fields = entry.description.split('|')

            year = str(fields[2])[0:4]
            name_parts = [str(fields[-3]), str(fields[0]), str(fields[1]), year]
            strain_name = '/'.join(name_parts)

            header = '|'.join([strain_name] + fields)
            renamed.append(
                SeqRecord(entry.seq, id=header, name=header, description=header)
            )

    SeqIO.write(renamed, output_fasta, "fasta")
    
                    
def add_newly_subtyped_to_virus_fastas(input_fasta, subtyped_viruses, output_fasta):
    """Write a copy of a fasta file with subtype/type fields filled in.

    input_fasta      -- fasta whose first '|'-separated header field is
                        looked up in `subtyped_viruses`
    subtyped_viruses -- mapping subtype -> collection of first-field values;
                        it must have an entry for every key of `known`
    output_fasta     -- path the annotated records are written to

    For each record found under a subtype, the third-from-last header field
    is set to the subtype and the second-from-last to its type. All other
    records are copied unchanged.
    """
    type_by_subtype = {
        'oc43': 'beta', 'hku1': 'beta',
        'nl63': 'alpha', '229e': 'alpha',
        'mers': 'beta', 'sars1': 'beta', 'sars2': 'beta',
    }

    relabelled = []
    with open(input_fasta, "r") as fasta_in:
        for entry in SeqIO.parse(fasta_in, "fasta"):
            fields = entry.description.split('|')

            # Annotate subtypes
            for subtype in known:
                if fields[0] in subtyped_viruses[subtype]:
                    fields[-3] = subtype
                    fields[-2] = type_by_subtype[subtype]

            header = '|'.join(fields)
            relabelled.append(
                SeqRecord(entry.seq, id=header, name=header, description=header)
            )

    SeqIO.write(relabelled, output_fasta, "fasta")
    

In [6]:
def separate_virus_fastas(virus):
    """Write all records of one subtype to that virus' own fasta file.

    Records are read from '../data/human_cov_full_subtyped.fasta' and written
    to '../<virus>/data/<virus>_full.fasta'.

    A record is selected when its subtype field (third '|'-separated field
    from the end of the header) equals `virus`. The field is compared
    exactly rather than searching the whole header for the name, so an
    accession or strain name that merely contains the same characters does
    not pull a record into the wrong file.

    The date (4th header field) of each selected record is normalised:
    underscores become dashes and a 'YYYY' or 'YYYY-MM' date is padded with
    'XX' placeholders.

    Fasta fields format:
    'strain_name|gb-id|strain|date|host|country|virus species|subtype|type|sequence_locus'
    """
    output_fasta = "../" + str(virus) + "/data/" + str(virus) + "_full.fasta"
    wanted_subtype = str(virus)
    # suffix that extends a year-only or year-month date to three components
    date_padding = {4: '-XX-XX', 7: '-XX'}

    sequences = []
    with open("../data/human_cov_full_subtyped.fasta", "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            fields = record.description.split('|')

            # Headers too short to carry a subtype field cannot match.
            if len(fields) < 3 or fields[-3] != wanted_subtype:
                continue

            # Fix date formatting
            date = fields[3].replace('_', '-')
            fields[3] = date + date_padding.get(len(date), '')

            header = '|'.join(fields)
            sequences.append(SeqRecord(record.seq, id=header, description=header))

    SeqIO.write(sequences, output_fasta, "fasta")

In [7]:
def get_virus_genes(virus):
    """Return the gene-name -> GenBank product-name mapping for one virus.

    virus -- one of 'nl63', '229e', 'hku1', 'oc43'

    The returned dict maps the short gene names used in this notebook to the
    product string that is compared against the CDS features of the
    reference GenBank file. Every virus also gets the two spike subdomain
    entries 's1' and 's2' as the last two keys. A new dict is built on every
    call, so callers may modify the result freely.

    Raises ValueError when `virus` is not one of the supported names.
    """
    products_by_virus = {
        'nl63': {
            'replicase1ab': "replicase polyprotein 1ab",
            'spike': "spike protein",
            'protein3': "protein 3",
            'envelope': "envelope protein",
            'membrane': "membrane protein",
            'nucleocapsid': "nucleocapsid protein",
        },
        '229e': {
            'replicase1ab': "replicase polyprotein 1ab",
            'replicase1a': "replicase polyprotein 1a",
            'spike': "surface glycoprotein",
            'protein4a': "4a protein",
            'protein4b': "4b protein",
            'envelope': "envelope protein",
            'membrane': "membrane protein",
            'nucleocapsid': "nucleocapsid protein",
        },
        'hku1': {
            'replicase1ab': "orf1ab polyprotein",
            'he': "hemagglutinin-esterase glycoprotein",
            'spike': "spike glycoprotein",
            'nonstructural4': "non-structural protein",
            'envelope': "small membrane protein",
            'membrane': "membrane glycoprotein",
            'nucleocapsid': "nucleocapsid phosphoprotein",
            'nucleocapsid2': "nucleocapsid phosphoprotein 2",
        },
        'oc43': {
            'replicase1ab': "replicase polyprotein",
            'nonstructural2a': "NS2a protein",
            'he': "HE protein",
            'spike': "S protein",
            'nonstructural2': "NS2 protein",
            'envelope': "NS3 protein",
            'membrane': "M protein",
            'nucleocapsid': "N protein",
            'n2protein': "N2 protein",
        },
    }

    try:
        genes_dict = dict(products_by_virus[virus])
    except (KeyError, TypeError):
        # Fail with a readable message instead of falling through to an
        # unbound local variable at the return statement.
        supported = ', '.join(repr(name) for name in products_by_virus)
        raise ValueError(
            f"unknown virus {virus!r}; expected one of: {supported}"
        ) from None

    # The spike subdomains are the same pseudo-genes for every virus and
    # always come last.
    genes_dict['s1'] = 'spike_subdomain1'
    genes_dict['s2'] = 'spike_subdomain2'
    return genes_dict


In [8]:
#first 6 aas of each domain
#from uniprot: NL63 (Q6Q1S2), 229e(P15423), oc43 (P36334), hku1 (Q0ZME7)
s1_domains = {
    'nl63': 'CNSNAN',
    '229e': 'CQTTNG',
    'oc43': 'AVIGDL',
    'hku1': 'AVIGDF',
}
s2_domains = {
    'nl63': 'SNGGNN',
    '229e': 'SNGTYN',
    'oc43': 'AITTGY',
    'hku1': 'SISASY',
}

In [9]:
def get_s1_coords(virus):
    """Return [start, end] nucleotide offsets of the S1 subdomain within spike.

    The spike reference '../<virus>/config/<virus>_spike_reference.gb' is
    translated and searched for the stretch that begins with the S1 start
    motif and ends right before the S2 start motif. The amino-acid span of
    that match is multiplied by three to give nucleotide offsets relative to
    the start of the reference sequence. If the file holds several records,
    the offsets of the last one are returned.
    """
    reference_path = '../' + str(virus) + '/config/' + str(virus) + '_spike_reference.gb'

    with open(reference_path, "r") as gb_in:
        for entry in SeqIO.parse(gb_in, "genbank"):
            protein = str(entry.seq.translate())

            # lookahead: stop the match where S2 begins without consuming it
            s1_pattern = re.compile(f'{s1_domains[virus]}.*(?={s2_domains[virus]})')
            aa_start, aa_end = s1_pattern.search(protein).span()
            s1_nt_coords = [aa_start * 3, aa_end * 3]
    return s1_nt_coords

In [10]:
def get_s2_coords(virus):
    """Return [start, end] nucleotide offsets of the S2 subdomain within spike.

    The spike reference '../<virus>/config/<virus>_spike_reference.gb' is
    translated and searched for the stretch that begins with the S2 start
    motif and runs to the end of the translation. The amino-acid span of
    that match is multiplied by three to give nucleotide offsets relative to
    the start of the reference sequence. If the file holds several records,
    the offsets of the last one are returned.
    """
    reference_path = '../' + str(virus) + '/config/' + str(virus) + '_spike_reference.gb'

    with open(reference_path, "r") as gb_in:
        for entry in SeqIO.parse(gb_in, "genbank"):
            protein = str(entry.seq.translate())

            s2_pattern = re.compile(f'{s2_domains[virus]}.*')
            aa_start, aa_end = s2_pattern.search(protein).span()
            s2_nt_coords = [aa_start * 3, aa_end * 3]
    return s2_nt_coords

In [11]:
def get_gene_position_test(virus, gene, sequence):
    """Cut the nucleotides of one gene out of a genome-length sequence.

    virus    -- virus name understood by get_virus_genes
    gene     -- key of the dict returned by get_virus_genes(virus)
    sequence -- sequence to extract from; it is sliced with the feature
                locations of '../<virus>/config/<virus>_full_reference.gb'

    The CDS feature whose 'product' qualifier equals the product name of the
    gene supplies the location. For the pseudo-genes 's1' and 's2' the spike
    CDS is extracted first and then sliced with the offsets from
    get_s1_coords / get_s2_coords. If several CDS features match, the last
    one wins.

    Raises KeyError when `gene` is not a gene of this virus and ValueError
    when the reference contains no CDS with the expected product.
    """
    genes_dict = get_virus_genes(virus)

    # The spike subdomains are located through the spike CDS itself.
    product_key = 'spike' if gene in ('s1', 's2') else gene
    expected_product = [genes_dict[product_key]]

    reference_path = "../" + str(virus) + "/config/" + str(virus) + "_full_reference.gb"

    gene_nt = None
    for seq_record in SeqIO.parse(reference_path, "genbank"):
        for feature in seq_record.features:
            if feature.type != 'CDS':
                continue
            # .get(): a CDS without a /product qualifier simply cannot match
            if feature.qualifiers.get('product') != expected_product:
                continue

            gene_nt = feature.location.extract(sequence)
            if gene == 's1':
                start, end = get_s1_coords(virus)
                gene_nt = gene_nt[start:end]
            elif gene == 's2':
                start, end = get_s2_coords(virus)
                gene_nt = gene_nt[start:end]

    if gene_nt is None:
        # Report the missing feature instead of failing on an unbound local.
        raise ValueError(
            f"no CDS with product {expected_product[0]!r} found in {reference_path}"
        )
    return gene_nt

In [12]:
def extract_genes(virus):
    """From the aligned fasta, extract just the portion of the genome encoding each gene.

    For every gene of `virus` (see get_virus_genes) the gene region is cut
    out of each record of '../<virus>/results/aligned_<virus>_full.fasta'.
    Records in which at least half of the region consists of 'N' are thrown
    out. The remaining sequences are paired with the headers of
    '../<virus>/data/<virus>_full.fasta' (matched on the first '|'-separated
    part of the record id) and written to
    '../<virus>/data/<virus>_<gene>.fasta'.
    """
    aligned_fasta = f"../{virus}/results/aligned_{virus}_full.fasta"
    original_fasta = f"../{virus}/data/{virus}_full.fasta"

    for gene in get_virus_genes(virus):
        output_fasta = f"../{virus}/data/{virus}_{gene}.fasta"

        # aligned record id -> extracted gene sequence, for sufficiently covered genes
        covered = {}
        with open(aligned_fasta, "r") as aligned_in:
            for aligned_record in SeqIO.parse(aligned_in, "fasta"):
                gene_nt = get_gene_position_test(virus, gene, aligned_record.seq)
                #Throw out sequences that don't cover gene
                unaligned = str(gene_nt).count('N')
                if unaligned < (len(gene_nt) / 2):
                    covered[aligned_record.id] = gene_nt

        gene_entries = []
        with open(original_fasta, "r") as meta_in:
            for meta_record in SeqIO.parse(meta_in, "fasta"):
                strain_accession = str(meta_record.id.split('|')[0])
                if strain_accession not in covered:
                    continue
                gene_entries.append(
                    SeqRecord(covered[strain_accession], id=meta_record.id, description=meta_record.id)
                )

        SeqIO.write(gene_entries, output_fasta, "fasta")
