# In[4]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from dateutil.parser import parse
import datetime

# In[20]:
# Build one cleaned FASTA file per HPIV subtype (hpiv_<subtype>.fasta).
#
# For each subtype this:
#   * rewrites the date field of every header into 'yyyy-mm-dd' format,
#     padding an unknown month/day with 'XX',
#   * combines sequences that were stored under different names
#     (human parainfluenzavirus versus human respirovirus/orthorubulavirus),
#     keeping only one copy when the same accession is in both downloads,
#   * leaves out the reference strain, or augur align will get mad.

subtypes = ['1', '2', '3', '4']
reference_strains = {'1':'MH828703', '2': 'LC654458', '3':'ON729325', '4':'ON729322'}

# alternate name of the virus depends on subtype
alternate_names = {"1": "respirovirus", "3": "respirovirus",
                   "2": "orthorubulavirus", "4": "orthorubulavirus"}

# Date values that cannot be turned into a single 'yyyy-mm-dd' date:
# 'NA' means no date at all, and 'May_2016/Dec_2017' is a date range.
# Records carrying one of these are dropped from both input files.
_UNUSABLE_DATES = frozenset({'NA', 'May_2016/Dec_2017'})


def _format_date(date):
    """Return `date` in 'yyyy-mm-dd' form, using 'XX' for missing parts.

    A 4-character value is treated as year-only and gets '-XX-XX' appended.
    Otherwise underscores become dashes and, if the result is shorter than
    a full date (10 characters), '-XX' is appended for the missing day.
    """
    if len(date) == 4:
        return date + '-XX-XX'
    formatted_date = date.replace('_', '-')
    if len(formatted_date) < 10:
        formatted_date += '-XX'
    return formatted_date


def _clean_records(fasta_path, subtype, reference_id, skip_ids=frozenset()):
    """Read one FASTA file and return its cleaned records.

    Headers are '|'-separated; field 0 is read as the accession id and
    field 3 as the collection date. A record is skipped when its accession
    is the reference strain, is in `skip_ids`, or its date is unusable.
    Kept records get their date field reformatted and '|HPIV<subtype>'
    appended to the id.

    Returns a tuple (records, kept_ids): the list of new SeqRecord objects
    and the set of accession ids that were kept.
    Raises IndexError if a header has fewer than four '|'-separated fields.
    """
    records = []
    kept_ids = set()
    # `with` guarantees the input handle is closed, even if parsing fails
    with open(fasta_path, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            fields = record.id.split('|')
            genbank_id = fields[0]
            date = fields[3]

            if genbank_id == reference_id or genbank_id in skip_ids:
                continue
            if date in _UNUSABLE_DATES:
                continue

            # Rewrite only the date field. A plain str.replace on the whole
            # id would also alter any other field that happens to contain
            # the same text (e.g. a year embedded in the strain name).
            fields[3] = _format_date(date)
            fields.append(f'HPIV{subtype}')

            kept_ids.add(genbank_id)
            records.append(SeqRecord(record.seq, id='|'.join(fields), description=''))
    return records, kept_ids


for subtype in subtypes:
    reference_id = reference_strains[subtype]
    alternate_name = alternate_names[subtype]

    seq_records, used_ids = _clean_records(
        f"vipr_parainfluenza{subtype}.fasta", subtype, reference_id)

    # don't want duplicates, if a sequence was submitted with both names:
    # anything already kept from the first file is skipped in the second
    alternate_records, _ = _clean_records(
        f"vipr_{alternate_name}{subtype}.fasta", subtype, reference_id,
        skip_ids=used_ids)
    seq_records.extend(alternate_records)

    SeqIO.write(seq_records, f'hpiv_{subtype}.fasta', "fasta")