In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from dateutil.parser import parse
from collections import Counter
from dateutil import parser
import re

In [5]:
#sequence data downloaded as a multi-genbank file
#compile all sequences into a fasta file with header: accession|strain_name|date|country|host|subtype
#Put date in 'yyyy-mm-dd' format

#genbank divisions for patent and synthetic sequences, which are not clinical isolates
_EXCLUDED_DIVISIONS = ('PAT', 'SYN')


def _find_subtype(record):
    """Return the subtype number of a record as a string, or 'None' if not found.

    The 'organism' annotation is searched first; the record description is
    only consulted when the organism entry yields no subtype number.
    """
    searches = (
        ('rhinovirus A([0-9]*)', record.annotations['organism']),
        ('rhinovirus A strain C([0-9]*)', record.description),
    )
    for pattern, text in searches:
        match = re.search(pattern, text, re.IGNORECASE)
        #the pattern can match with an empty number, which does not count as a subtype
        if match and match.group(1):
            return match.group(1)
    return 'None'


def _format_collection_date(raw_date):
    """Return raw_date as 'yyyy-mm-dd', using 'XX' for an unknown day/month.

    Returns None when the date is missing ('None') or is a range written as
    'start/end' (e.g. 'May-2016/Dec-2017'), since no single date can be
    assigned to such a record.
    """
    raw_date = raw_date.strip()
    if raw_date == 'None' or '/' in raw_date:
        return None

    formatted_date = parser.parse(raw_date).strftime('%Y-%m-%d')
    #dateutil parser will assign a day (today's date) to unknown days, and same for month, want XX instead
    #precision is judged by the number of '-' separated fields rather than string length,
    #so that e.g. both 'Mon-yyyy' and 'yyyy-mm' are recognised as missing the day
    n_fields = len(raw_date.split('-'))
    if n_fields == 2:
        return formatted_date[:-2] + 'XX'
    if n_fields == 1:
        return formatted_date[:5] + 'XX-XX'
    return formatted_date


not_subtyped_count = 0
subtyped_count = 0

#store all edited sequence records
seq_records = []
#context manager makes sure the genbank file is closed once parsing is done
with open("genbank_sequences.gb", "r") as genbank_handle:
    for record in SeqIO.parse(genbank_handle, "genbank"):
        accession = record.annotations['accessions'][0]
        #exclude patent and synthetic sequences, that are not clinical isolates
        if record.annotations['data_file_division'] in _EXCLUDED_DIVISIONS:
            continue

        #pull metadata from the feature qualifiers; if a qualifier appears on
        #several features, the last one wins
        metadata = {'collection_date': 'None', 'strain': 'None', 'country': 'None', 'host': 'None'}
        for feature in record.features:
            for qualifier in metadata:
                if qualifier in feature.qualifiers:
                    metadata[qualifier] = feature.qualifiers[qualifier][0]

        collection_date = metadata['collection_date']
        strain_name = metadata['strain']
        host = metadata['host']
        #only keep country information, not city/state/region
        country = metadata['country'].split(':')[0]

        #the regex searches assign subtypes to about 83% isolates- do the rest by clustering on a rhinovirus-all tree
        subtype = _find_subtype(record)
        if subtype == 'None':
            not_subtyped_count += 1
        else:
            subtyped_count += 1

        #fall back on the date annotation of the genbank record when no collection date is given
        if collection_date == 'None':
            collection_date = record.annotations['date']

        #only keep sequences with date
        formatted_date = _format_collection_date(collection_date)
        if formatted_date is None:
            continue

        list_of_info = [accession, strain_name, formatted_date, country, host, subtype]
        new_record_info = '|'.join(list_of_info)
        seq_records.append(SeqRecord(record.seq, id=new_record_info, description=''))


#write fasta sequence file (returns the number of records written)
SeqIO.write(seq_records, 'rhinovirusA_all.fasta', "fasta")

1172