In [2]:
#### import libraries ####
import datetime
import os

from Bio import AlignIO
from Bio import SeqIO

# Run date as YYYY-MM-DD; used to stamp the output file names.
date = datetime.date.today().isoformat()


In [3]:

#### base directories ####
# Folder holding the local clones of augur, fauna and zika-usvi.
# Defaults to the original location; set the ZIKA_GITREPOS_DIR environment
# variable to run the notebook on another machine without editing this cell.
gitrepos_dir = os.environ.get("ZIKA_GITREPOS_DIR", "/Users/alliblk/Desktop/gitrepos")
fasta_out_dir = os.path.join(gitrepos_dir, "zika-usvi", "data", "fastas")

#### infile paths ####
zika_msa_stripped = os.path.join(gitrepos_dir, "augur", "zika", "processed", "zika_aligned_stripped.mfa")
fauna_file = os.path.join(gitrepos_dir, "fauna", "data", "zika.fasta")

#### outfile paths ####
# Every output file name carries the run date held in `date`.
americas_file = os.path.join(fasta_out_dir, 'american-zika-{}.fasta'.format(date))
americas_frenchpol_file = os.path.join(fasta_out_dir, 'american-frenchPolyn-zika-{}.fasta'.format(date))

usvi_file = os.path.join(fasta_out_dir, "usvi-{}.fasta".format(date))
usvi_primary_clade_file = os.path.join(fasta_out_dir, 'usvi-primary-clade-{}.fasta'.format(date))


## Combining the augur processed multiple sequence alignment with the fauna-output fasta

Here I want to combine attributes of the `Nextstrain/augur` processed Zika MSA with the fasta output from `Nextstrain/fauna`. The Fauna download has the strain information in the desired fasta format, with all necessary metadata (sampling date, geography) in the header. The processed multiple sequence alignment, however, has been aligned with mafft and stripped to the WHO ZIKV reference genome, and therefore contains the aligned sequences that I want.

The header from the MSA contains the strain name of the sample, which is also in the fauna header. Therefore I will use key matching to make a new fasta file that combines the header from the fauna file with the sequences from the augur msa.

Note to self: Might not be a bad idea to have this capability in Augur, should potentially open an issue to prompt discussion.

In [4]:
# Regions and strains to leave out of the analysis alignment.
# The reason for each exclusion is given in the comment above its list.

#### geographic exclusion criteria ####
# French Polynesia out
regions_to_exclude1 = [
    'southeast_asia',
    'oceania',
    'japan_korea',
    'china',
    'europe',
]
# French Polynesia in (same as above, without 'oceania')
regions_to_exclude2 = [
    'southeast_asia',
    'japan_korea',
    'china',
    'europe',
]

#### sequence characteristic exclusion criteria (based on Augur processing) ####
# Sequence has a large number of indels.
drop_for_indel = ['CX17']

# Possible contamination.
drop_for_contamination = ['ZF36_36S']

# True strains, but duplicates of other strains.
drop_duplicates = [
    'Dominican_Republic/2016/PD2',
    'GD01',
    'GDZ16001',
    'VEN/UF_2/2016',
]

# Excessive terminal branch length, likely indicative of a large amount of sequencing error.
drop_for_excessive_terminal_branch_length = [
    'Bahia04',
    'JAM/2016/WI_JM6',
    'Bahia11',
    'Bahia12',
    'DOM/2016/MA_WGS16_009',
    'VE_Ganxian',
    'BRA/2016/FC_DQ60D1',
    'CX5',
]

# Travel cases where the country of infection acquisition is unknown.
drop_unknown_export = [
    'VR10599/Pavia/2016',
    '34997/Pavia/2016',
]

# Outliers on root-to-tip analyses; they do not adhere to the molecular clock.
drop_molecClock_off = [
    'THA/PLCal_ZV/2013',
    'SK403/13AS',
    'SV0010/15',
    'SK364/13AS',
]


In [5]:
# Collect the fauna fasta headers, keyed by strain name.
# Key: first '|'-separated field of the header line with '>' removed.
# Value: the complete header line, stripped of surrounding whitespace.
with open(fauna_file, 'rU') as fauna_handle:
    headers_by_strain = {}
    for fasta_line in fauna_handle:
        if not fasta_line.startswith('>'):
            continue  # sequence line, not a header
        strain_name = fasta_line.split('|')[0].replace('>', '')
        headers_by_strain[strain_name] = fasta_line.strip()
    strain_header_dict = headers_by_strain

In [6]:
# Spot-check: show the full fauna header stored for one known strain.
example_strain = 'DOM/2016/MA_WGS16_020'
print(strain_header_dict[example_strain])

>DOM/2016/MA_WGS16_020|zika|KY785460|2016-06-30|north_america|dominican_republic|dominican_republic|dominican_republic|genbank|genome|Metsky et al|https://www.ncbi.nlm.nih.gov/nuccore/KY785460


In [7]:
# Tally how many of the Americas sequences in fauna appear on each augur drop list.

# Drop lists in the order they are checked: a strain is attributed to the
# first list that names it and is not counted against any later list.
drop_lists_in_order = [
    ('indel', drop_for_indel),
    ('contamination', drop_for_contamination),
    ('duplicate', drop_duplicates),
    ('terminal_branch_length', drop_for_excessive_terminal_branch_length),
    ('unknown_export', drop_unknown_export),
    ('molecular_clock', drop_molecClock_off),
]

drop_tally = {}
seqs_in_americas_count = 0

for strain, fauna_header in strain_header_dict.items():
    # The region is the 5th '|'-separated field of the fauna header; anything
    # not listed in regions_to_exclude1 is treated as an Americas sequence.
    region = fauna_header.split('|')[4]
    if region in regions_to_exclude1:
        continue
    seqs_in_americas_count += 1
    for reason, dropped_strains in drop_lists_in_order:
        if strain in dropped_strains:
            drop_tally[reason] = drop_tally.get(reason, 0) + 1
            break

indel_drop_count = drop_tally.get('indel', 0)
contam_drop_count = drop_tally.get('contamination', 0)
duplicate_drop_count = drop_tally.get('duplicate', 0)
term_branch_leng_drop_count = drop_tally.get('terminal_branch_length', 0)
export_drop_count = drop_tally.get('unknown_export', 0)
clock_drop_count = drop_tally.get('molecular_clock', 0)

print("There are {} Zika sequences available from fauna".format(len(strain_header_dict)))
print("There are {} American Zika sequences available from fauna".format(seqs_in_americas_count))
print("Augur removed {} sequence(s) sampled from the Americas because they contain excessive indels".format(indel_drop_count))
print("Augur removed {} sequence(s) sampled from the Americas because they show evidence of contamination".format(contam_drop_count))
print("Augur removed {} sequence(s) sampled from the Americas because they are duplicate strains".format(duplicate_drop_count))
print("Augur removed {} sequence(s) sampled from the Americas because they demonstrate excessive terminal branch length".format(term_branch_leng_drop_count))
print("Augur removed {} sequence(s) sampled from the Americas because they are exported cases where country of infection acquistion could not be determined".format(export_drop_count))
print("Augur removed {} sequence(s) sampled from the Americas because they did not follow the molecular clock".format(clock_drop_count))

There are 501 Zika sequences available from fauna
There are 335 American Zika sequences available from fauna
Augur removed 1 sequence(s) sampled from the Americas because they contain excessive indels
Augur removed 0 sequence(s) sampled from the Americas because they show evidence of contamination
Augur removed 3 sequence(s) sampled from the Americas because they are duplicate strains
Augur removed 8 sequence(s) sampled from the Americas because they demonstrate excessive terminal branch length
Augur removed 0 sequence(s) sampled from the Americas because they are exported cases where country of infection acquistion could not be determined
Augur removed 0 sequence(s) sampled from the Americas because they did not follow the molecular clock


In [8]:
# Read in sequences from the multiple sequence alignment.
# The MSA headers currently hold only the strain name; the remaining metadata
# has to be recovered by key matching against the fauna fasta headers.

# Length every sequence must have if the trimmed-down alignment was loaded.
expected_alignment_length = 10769

# Use a context manager so the file handle is closed once the alignment is parsed
# (the original passed a bare open() and never closed it).
with open(zika_msa_stripped) as msa_handle:
    zika_msa = AlignIO.read(msa_handle, 'fasta')
zika_msa_dict = {record.id: record.seq for record in zika_msa}

# Check that you in fact have the trimmed-down alignment loaded in, and say
# which record is off if not.
for strain, sequence in zika_msa_dict.items():
    assert len(sequence) == expected_alignment_length, (
        "sequence {!r} has length {}, expected {}".format(
            strain, len(sequence), expected_alignment_length))

print(len(zika_msa_dict))

482


In [15]:
# Preview the trimmed header format (fields 0, 3, 4 and 5 of the full header)
# using only the first record of output_dict.
# NOTE(review): output_dict is not defined in any cell visible here; its keys look
# like full '|'-separated fauna headers -- confirm against the cell that builds it.
for full_header in output_dict:
    fields = full_header.split('|')
    header = '|'.join([fields[0], fields[3], fields[4], fields[5]])
    print(header)
    break

>SG_114|2016-09-08|southeast_asia|singapore


In [17]:
# Write the three subset alignments. The write loop used to be copy-pasted once
# per output file; it now lives in one helper so the header format and record
# layout are defined in a single place.
# NOTE(review): output_dict is not defined in any cell visible here; from its use
# it maps a full '|'-separated fauna header to the aligned sequence -- confirm
# against the cell that builds it.

def _short_header(full_header):
    """Return fields 0, 3, 4 and 5 of a '|'-separated header, re-joined with '|'."""
    fields = full_header.split('|')
    return '|'.join([fields[0], fields[3], fields[4], fields[5]])


def _write_alignment_subset(out_path, keep):
    """Write the output_dict records selected by `keep` to `out_path`.

    out_path -- path of the fasta file to create (overwritten if it exists).
    keep     -- callable taking the full header and returning True for records
                that should be written.

    Each selected record is written as its trimmed header line followed by its
    sequence on one line.
    """
    with open(out_path, 'w') as out_file:
        for full_header, sequence in output_dict.items():
            if keep(full_header):
                out_file.write(_short_header(full_header) + '\n' + str(sequence) + '\n')


# Americas only multiple sequence alignment
_write_alignment_subset(
    americas_file,
    lambda full_header: full_header.split('|')[4] not in regions_to_exclude1)

# Americas and French Polynesia multiple sequence alignment
_write_alignment_subset(
    americas_frenchpol_file,
    lambda full_header: full_header.split('|')[4] not in regions_to_exclude2)

# USVI only multiple sequence alignment
_write_alignment_subset(
    usvi_file,
    lambda full_header: full_header.split('|')[5] == 'usvi')
        