In [76]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from dateutil.parser import parse
import datetime
import re

In [149]:
#curate fasta header to be accession|genbank_title|collection_date|segment|country|host
#downloaded as species|accession|collection_date|genbank_title|genus|family|nuc_completeness|segment|country|host
#Put date in 'yyyy-mm-dd' format, with 'XX' standing in for an unknown month or day
#not going to worry about retrieving country from the strain name because I don't need that for adaptation analysis

#list of all segments and different names for them that might be present in the genbank names
list_of_segments = ['PB2', 'POLYMERASE 2', 'PB1', 'P3', 'HE', 'SEG 4', 
                    'NP', 'SEG 5', 'M', 'M1', 'MATRIX',
                    'NS', 'NS1', 'NS-2', 'NONSTRUCTURAL PROTEINS']
#map different names for same segment to a common name
segment_mapper = {'1': 'PB2', 'POLYMERASE 2': 'PB2', 'PB2':'PB2',
                  '2': 'PB1', 'PB1':'PB1',
                  '3': 'P3', 'P3':'P3',
                  '4': 'HE', 'SEG 4':'HE', 'HE':'HE',
                  '5': 'NP', 'SEG 5': 'NP', 'NP':'NP',
                  '6': 'M', 'M1':'M', 'MATRIX':'M', 'M': 'M',
                  '7':'NS', 'NS1':'NS', 'NS-2':'NS', 'NONSTRUCTURAL PROTEINS':'NS', 'NS':'NS'}

#keep track of sequences by segment
seq_records = {'PB2':[], 'PB1':[], 'P3':[], 'HE':[], 'NP':[], 'M':[], 'NS':[]}

for record in SeqIO.parse(open(f"ncbi_sequences.fasta","r"), "fasta"):
    if len(record.seq) > 200:
        accession=record.description.split('|')[1]
        date=record.description.split('|')[2]
        genbank_name=record.description.split('|')[3]
        segment=record.description.split('|')[7]
        country=record.description.split('|')[8]
        host=record.description.split('|')[9]
        
        #strain name can consist of 2, 3, or 4 slashes
        #search for them in order to make sure the whole thing is captured
        strain_pattern4 = re.compile(r'C/[a-zA-Z0-9\-\s_]+/[a-zA-Z0-9\-\s_]+/[a-zA-Z0-9\-\s_]+/\d+')
        strain_pattern3 = re.compile(r'C/[a-zA-Z0-9\-\s_]+/[a-zA-Z0-9\-\s_]+/\d+')
        strain_pattern2 = re.compile(r'C/[a-zA-Z0-9\-\s_]+/\d+')
        influenza_strain_name4 = strain_pattern4.search(genbank_name)
        influenza_strain_name3 = strain_pattern3.search(genbank_name)
        influenza_strain_name2 = strain_pattern2.search(genbank_name)
        #look first for the longer one, since shorter one could be false truncation
        if influenza_strain_name4:
            influenza_strain_name = influenza_strain_name4.group()
        elif influenza_strain_name3:
            influenza_strain_name = influenza_strain_name3.group()
        elif influenza_strain_name2:
            influenza_strain_name = influenza_strain_name2.group()
        else:
            influenza_strain_name = ''

    
        
        #assume hosts without annotations are human?
        if host=='Homo sapiens' or host=='':
            
            #FORMAT SEGMENT
            #if segment is not labeled, see if this information is in the genbank title
            segment_formatted=''
            if segment == '':
                for seg in list_of_segments:
                    if f' {seg} ' in genbank_name.upper() or f'({seg})' in genbank_name.upper() or f'{seg},' in genbank_name.upper():
                        segment_formatted = segment_mapper[seg]

            #reassign segment numbers with the name
            else:
                segment_formatted = segment_mapper[segment]
            
            #FORMAT DATE
            #look for date info in the genbank_name
            if date=='NA' or date=='':
                if influenza_strain_name!='':
                    date_from_name = influenza_strain_name.split('/')[-1]
                    if len(date_from_name)==2:
                        date_from_name = '19'+date_from_name
                        
                    formatted_date = date_from_name +'-XX-XX'
                else:
                    formatted_date = ''

            else:
                # if date only has year, add -XX-XX for month and day
                if len(date)==4:
                    formatted_date = date+'-XX-XX'
                else:
                    formatted_date = date.replace('_', '-')

                    # if date only has month, add -XX for day
                    if len(formatted_date)<10:
                        formatted_date = formatted_date+'-XX'
  
                    
            if segment_formatted!='':
                if formatted_date!= '':
                    list_of_info = [accession, genbank_name, formatted_date, segment_formatted, country, host]
                    new_record_info = '|'.join(list_of_info)

                    seq_records[segment_formatted].append(SeqRecord(record.seq, id=new_record_info, description=new_record_info))


for seg, seqs in seq_records.items():
    SeqIO.write(seqs, f'fluC_{seg}.fasta', "fasta")

