In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from dateutil.parser import parse
from collections import Counter
from dateutil import parser
import re

In [2]:
#sequence data downloaded as a multi-genbank file
#compile all sequences into a fasta file with header: accession|strain_name|date|country|host
#Put date in 'yyyy-mm-dd' format



#store all edited sequence records
seq_records = []


def _format_collection_date(collection_date):
    """Return ``collection_date`` as 'yyyy-mm-dd', masking unknown parts with 'XX'.

    dateutil fills any field missing from the input with today's value, so a
    month-only or year-only date would otherwise gain a made-up day/month.
    The precision is inferred from the length of the raw string:

    * 4 characters ('yyyy')                  -> 'yyyy-XX-XX'
    * 7 or 8 characters ('yyyy-mm', 'Mon-yyyy') -> 'yyyy-mm-XX'
    * anything else is treated as a complete date.

    Raises whatever ``dateutil.parser.parse`` raises for an unparseable string.
    """
    collection_date = collection_date.strip()
    formatted_date = parser.parse(collection_date).strftime('%Y-%m-%d')
    if len(collection_date) == 4:
        #year only: keep 'yyyy-' and mask both month and day
        return formatted_date[:5] + 'XX-XX'
    if len(collection_date) in (7, 8):
        #year and month only ('yyyy-mm' or 'Mon-yyyy'): mask the day
        return formatted_date[:-2] + 'XX'
    return formatted_date


#the with-block guarantees the GenBank file handle is closed after parsing
with open("genbank_sequences.gb", "r") as genbank_handle:
    for record in SeqIO.parse(genbank_handle, "genbank"):
        #exclude patent and synthetic sequences, that are not clinical isolates
        if record.annotations['data_file_division'] in ['PAT', 'SYN']:
            continue

        accession = record.id
        collection_date, strain_name, country, host = 'None', 'None', 'None', 'None'

        #take each qualifier from whichever feature carries it (last one wins)
        for feature in record.features:
            qualifiers = feature.qualifiers
            if 'collection_date' in qualifiers:
                collection_date = qualifiers['collection_date'][0]
            if 'strain' in qualifiers:
                strain_name = qualifiers['strain'][0]
            if 'country' in qualifiers:
                country = qualifiers['country'][0]
            if 'host' in qualifiers:
                host = qualifiers['host'][0]

        #fall back to the record-level date annotation when no collection date was given
        if collection_date == 'None':
            collection_date = record.annotations.get('date', 'None')

        #only keep sequences with date
        if collection_date == 'None':
            continue

        formatted_date = _format_collection_date(collection_date)

        #header: accession|strain_name|date|country|host
        list_of_info = [accession, strain_name, formatted_date, country, host]
        new_record_info = '|'.join(list_of_info)
        seq_records.append(SeqRecord(record.seq, id=new_record_info, description=''))


#write fasta sequence file 
SeqIO.write(seq_records, 'adenovirusB_all.fasta', "fasta")

539