In [1]:
import re
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

In [2]:
#Append virus subtype to fasta fields, to ultimately create column in fauna
def add_subtype_to_virus_fastas(input_fasta):
    """Append subtype, type and locus fields to every record of a FASTA file.

    Each record description is treated as a '|'-separated field list
    ('gb-id|strain|segment|date|host|country|subtype|virus species').
    Three extra fields are appended to it:

    * subtype        -- lower-cased name from ``cov_types`` that occurs anywhere
                        in the description, or the string 'None' if none does
    * type           -- 'alpha' or 'beta' for the matched subtype, else 'None'
    * sequence_locus -- always 'full'

    The annotated records are written to a new file whose name is the input
    path with a trailing '.fasta' removed and '_annotated.fasta' appended.

    Parameters
    ----------
    input_fasta : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    None
    """
    input_fasta = str(input_fasta)

    # Strip the extension only when it really is the suffix. A plain
    # str.replace would also remove '.fasta' from directory names earlier in
    # the path and send the output to the wrong place.
    fasta_suffix = '.fasta'
    if input_fasta.endswith(fasta_suffix):
        output_stem = input_fasta[:-len(fasta_suffix)]
    else:
        output_stem = input_fasta
    output_fasta = output_stem + "_annotated.fasta"

    # Subtype name to look for in the header -> virus type label
    cov_types = {'OC43':'beta', 'HKU1':'beta', 'NL63':'alpha', '229E':'alpha'}

    sequences = []

    with open(input_fasta, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            subtype_field = 'None'
            type_field = 'None'
            # No break on purpose: if several names occur in one header the
            # later entry of cov_types wins.
            for subtype, virus_type in cov_types.items():
                if subtype in record.description:
                    subtype_field = subtype.lower()
                    type_field = virus_type

            #New fasta fields format: 'gb-id|strain|segment|date|host|country|subtype|virus species|subtype|type|sequence_locus'
            new_record_list = record.description.split('|') + [subtype_field, type_field, 'full']
            new_record_description = '|'.join(new_record_list)

            sequences.append(SeqRecord(record.seq, id=new_record_description, description=new_record_description))

    SeqIO.write(sequences, output_fasta, "fasta")

In [3]:
# Annotate the full-genome FASTA; the result is written next to the input
# as "../data/human_cov_genome_annotated.fasta".
add_subtype_to_virus_fastas("../data/human_cov_genome.fasta")

In [94]:
#Add virus subtype info to Spike and HE sequence files as well
#Combine all Spike sequences into one file with virus type annotated
def add_subtype_to_gene_fastas(gene):
    """Merge the per-subtype alignments of one gene into a single annotated FASTA.

    For every combination of virus type ('alpha', 'beta') and subtype
    ('OC43', 'HKU1', 'NL63', '229E') the file
    '../../seasonal-corona-<type>/data/<subtype>_<gene>_genomealign.fasta'
    is read if it exists; combinations without a file are skipped.

    For each record, '_<gene>' is appended to the first two '|'-separated
    description fields (accession number and strain name) and three fields
    are added at the end: lower-cased subtype, type, and the gene name as
    sequence locus.

    All records are written to '../data/human_cov_<gene>_annotated.fasta'.
    The file is written even when no input was found, in which case it
    contains no records.

    Parameters
    ----------
    gene : str
        Gene name used in the input/output file names, e.g. 'spike' or 'he'.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a record description has fewer than two '|'-separated fields.
    """
    cov_subtypes = ['OC43', 'HKU1', 'NL63', '229E']
    cov_types = ['alpha', 'beta']

    sequences = []

    for virus_type in cov_types:
        for virus_subtype in cov_subtypes:
            input_fasta = '../../seasonal-corona-'+virus_type+'/data/'+virus_subtype+'_'+gene+'_genomealign.fasta'
            try:
                handle = open(input_fasta, "r")
            except FileNotFoundError:
                # Every type/subtype pairing is probed, so most paths do not
                # exist. Only a missing file is tolerated; parsing problems
                # must surface instead of silently dropping records.
                continue

            with handle:
                for record in SeqIO.parse(handle, "fasta"):
                    #Fasta fields format: 'gb-id|strain|segment|date|host|country|subtype|virus species'
                    new_record_list = record.description.split('|')

                    #Change accession number and strain name to indicate gene, for fauna
                    new_record_list[0] = new_record_list[0]+'_'+str(gene)
                    new_record_list[1] = new_record_list[1]+'_'+str(gene)

                    #Annotate subtypes
                    new_record_list = new_record_list+[virus_subtype.lower(), virus_type, gene]

                    #New fasta fields format: 'gb-id|strain|segment|date|host|country|subtype|virus species|subtype|type|sequence_locus'
                    new_record_description = '|'.join(new_record_list)

                    sequences.append(SeqRecord(record.seq, id=new_record_description,
                                               description=new_record_description))

    output_fasta = '../data/human_cov_'+str(gene)+"_annotated.fasta"

    SeqIO.write(sequences, output_fasta, "fasta")

In [95]:
# Collect the spike alignments into ../data/human_cov_spike_annotated.fasta
add_subtype_to_gene_fastas('spike')

In [96]:
# Collect the he alignments into ../data/human_cov_he_annotated.fasta
add_subtype_to_gene_fastas('he')

In [97]:
#Combine full_seq, spike, he into one fasta file for upload
def combine_all_loci_fastas():
    """Concatenate the per-locus annotated FASTA files into one file for upload.

    Reads '../data/human_cov_<locus>_annotated.fasta' for the loci 'genome',
    'spike' and 'he' (in that order) and writes every record, unchanged, to
    '../data/human_cov_annotated.fasta'.

    Returns
    -------
    None
    """
    merged_records = []
    for locus_name in ('genome', 'spike', 'he'):
        locus_path = '../data/human_cov_' + locus_name + '_annotated.fasta'
        with open(locus_path, "r") as fasta_handle:
            # Consume the parser while the file is still open
            merged_records.extend(SeqIO.parse(fasta_handle, "fasta"))

    SeqIO.write(merged_records, '../data/human_cov_annotated.fasta', "fasta")

In [98]:
# Merge the genome, spike and he annotated FASTAs into ../data/human_cov_annotated.fasta
combine_all_loci_fastas()

In [None]:
##The following is old

In [174]:
#Make separate fastas for each virus
def separate_virus_fastas(virus):
    """Extract the records of one virus into two FASTA files.

    Scans 'human_cov_genome_with_metadata.fasta' in the working directory and
    keeps every record whose description contains ``virus``. Two files are
    written to the working directory:

    * '<virus>.fasta'         -- the matching records, unchanged
    * '<virus>_datefix.fasta' -- the same records with the date field (4th
      '|'-separated field) rewritten: '_' becomes '-', a 4-character date
      gets '-XX-XX' appended and a 7-character date gets '-XX' appended

    Parameters
    ----------
    virus : str
        Text to look for in the record descriptions, e.g. 'OC43'.

    Returns
    -------
    None
    """
    matching_records = []
    datefixed_records = []

    with open("human_cov_genome_with_metadata.fasta", "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            if virus not in record.description:
                continue

            #Fasta fields format: 'gb-id|strain|segment|date|host|country|subtype|virus species'
            fields = record.description.split('|')

            #Fix date formatting
            fixed_date = fields[3].replace('_', '-')
            if len(fixed_date) == 4:
                fixed_date += '-XX-XX'
            elif len(fixed_date) == 7:
                fixed_date += '-XX'
            fields[3] = fixed_date
            header = '|'.join(fields)

            matching_records.append(record)
            datefixed_records.append(SeqRecord(record.seq, id=header, description=header))

    SeqIO.write(matching_records, str(virus) + ".fasta", "fasta")
    SeqIO.write(datefixed_records, str(virus) + "_datefix.fasta", "fasta")

In [173]:
# Writes OC43.fasta and OC43_datefix.fasta to the current working directory
separate_virus_fastas('OC43')

In [175]:
# Writes HKU1.fasta and HKU1_datefix.fasta to the current working directory
separate_virus_fastas('HKU1')

In [176]:
# Writes NL63.fasta and NL63_datefix.fasta to the current working directory
separate_virus_fastas('NL63')

In [177]:
# Writes 229E.fasta and 229E_datefix.fasta to the current working directory
separate_virus_fastas('229E')