  ### Dataset curation: making the zika-colombia input fasta dataset
  
This notebook contains the code needed to go from a raw download of all Zika genomes in `nextstrain/fauna` to the input fasta file for the zika-colombia specific analysis (which is done as a custom `nextstrain/augur` build).

Here I am removing sequences from geographic areas that I don't want included in this analysis (e.g. Singapore), as well as ensuring that I only keep genomes that I have permission to include in a published analysis.

In [1]:
#import libraries
from Bio import SeqIO
import pandas as pd
import numpy as np

In [36]:
# Paths to files are kept relative so that they work if someone downloads the repo as is.
fauna_seqs_dict = SeqIO.to_dict(SeqIO.parse('../data/zika-fauna-2018-09-06.fasta', 'fasta'))
print('There are {} sequences downloaded from Fauna.'.format(len(fauna_seqs_dict)))

# Geographic pruning
regions_to_exclude = ['southeast_asia', 'japan_korea', 'china', 'europe', 'africa']

# Build the message from the whole list so that it cannot drift out of sync
# with the regions that are actually filtered on below.
if len(regions_to_exclude) > 1:
    excluded_text = ', '.join(regions_to_exclude[:-1]) + ', and ' + regions_to_exclude[-1]
else:
    excluded_text = ''.join(regions_to_exclude)
print('Genomes from the following regions will be excluded: {}.'.format(excluded_text))

# The fifth '|'-delimited field of the record id is compared against the
# excluded regions; surviving records are re-keyed by their full description.
geoPruned_seqs_dict = {
    record.description: record.seq
    for seq_id, record in fauna_seqs_dict.items()
    if seq_id.split('|')[4] not in regions_to_exclude
}
print('There are {} sequences meet the geographic criteria.'.format(len(geoPruned_seqs_dict)))



There are 694 sequences downloaded from Fauna.
Genomes from the following regions will be excluded: southeast_asia, japan_korea, china, and europe.
There are 504 sequences meet the geographic criteria.


In [None]:
# At this point, list every genome that survived geographic pruning so that
# usage permissions can be checked for each of them.
accessions_to_check = list(geoPruned_seqs_dict.keys())

# Split each '|'-delimited header once and pick out the fields that go in the table.
header_fields = [header.split('|') for header in accessions_to_check]
strain_names = [fields[0] for fields in header_fields]
ncbi_id = [fields[2] for fields in header_fields]
# Drop the literal 'et al' from the author field.
lead_author = [fields[10].replace('et al', '') for fields in header_fields]

accessions_df = pd.DataFrame(
    np.column_stack([strain_names, ncbi_id, lead_author]),
    columns=['strain_name', 'accession_number', 'lead_author']
)
accessions_df.to_csv('../data/all_included_accessions.csv', index=False)

In [33]:
# Load the permissions table and keep the strains that may be used in a
# publishable analysis: permission is not marked 'permission_not_received'
# and the strain is marked 'yes' for preliminary inclusion.
# These strains define the fauna subset we want.

genome_permissions = pd.read_csv('../data/genome-permissions-2018-09-06.txt', delimiter ='\t')

permission_ok = genome_permissions['permission_to_use'] != 'permission_not_received'
marked_for_inclusion = genome_permissions['preliminarily_include'] == 'yes'
publishable_strains = genome_permissions.loc[permission_ok & marked_for_inclusion, 'strain_name'].tolist()

print("There are {} genomes that we can include in published analyses.".format(len(publishable_strains)))

There are 431 genomes that we can include in published analyses.


In [43]:
# Using the strains in the publishable_strains list, pull out the full fauna
# headers (and sequences) for each strain that can be published on, then write
# a new fauna-formatted fasta file that can be read in to Augur for analysis.

# Index the geographically pruned headers by strain name (the first '|' field).
# Matching on the exact strain name, rather than on a header prefix, stops a
# permitted strain from also pulling in a different strain whose name merely
# starts with the same characters.
headers_by_strain = {}
for header in geoPruned_seqs_dict:
    headers_by_strain.setdefault(header.split('|')[0], []).append(header)

publishable_seqs_dict = {}
for strain in publishable_strains:
    for header in headers_by_strain.get(strain, []):
        publishable_seqs_dict[header] = geoPruned_seqs_dict[header]

with open('../data/publishable-zika-fauna-2018-09-06.fasta','w') as fasta_out:
    for header, seq in publishable_seqs_dict.items():
        fasta_out.write('>' + header + '\n' + str(seq) + '\n')

In [32]:
## replace n's with gaps, and count n's in sequences (alignment checks)

def count_n(sequence):
    """Return the number of items in `sequence` that equal lowercase 'n'.

    The comparison is case-sensitive, so an uppercase 'N' is not counted.
    """
    return sum(1 for base in sequence if base == 'n')

In [36]:
# Key each record by its full header line. The default key is record.id, which
# Biopython truncates at the first whitespace; later cells write the key back
# out as the fasta header, so a truncated key would drop header fields.
# NOTE(review): only matters if the fauna headers contain spaces - confirm.
sequences = SeqIO.to_dict(
    SeqIO.parse('../data/publishable-zika-fauna-2018-09-06.fasta', 'fasta'),
    key_function=lambda record: record.description
)

In [55]:
# Denominator used when turning an 'n' count into a fraction of the sequence.
# NOTE(review): presumably the genome/alignment length - confirm.
N_FRACTION_DENOMINATOR = 10769


def _keep_by_n_fraction(n_counts, records, max_n_fraction):
    """Select sequences whose 'n' fraction is strictly below `max_n_fraction`.

    Headers whose sixth '|'-delimited field is 'colombia' are always kept,
    whatever their 'n' count.

    n_counts: dict mapping header -> number of 'n' characters.
    records: dict mapping header -> record with a `.seq` attribute.
    Returns a dict mapping each kept header -> its `.seq`.
    """
    kept = {}
    for header, n_count in n_counts.items():
        is_colombian = header.split('|')[5] == 'colombia'
        # float() keeps this a true division under Python 2 as well.
        if is_colombian or float(n_count) / N_FRACTION_DENOMINATOR < max_n_fraction:
            kept[header] = records[header].seq
    return kept


# Number of lowercase 'n' characters in each sequence.
n_counts_dict = {key: count_n(record.seq) for key, record in sequences.items()}

# The same sequences as plain strings, with every 'n' replaced by a gap character.
gaps_not_n_seqs = {key: str(record.seq).replace('n', '-') for key, record in sequences.items()}

# Two quality tiers that differ only in the tolerated 'n' fraction.
high_qual_seqs = _keep_by_n_fraction(n_counts_dict, sequences, 0.2)
medium_qual_seqs = _keep_by_n_fraction(n_counts_dict, sequences, 0.5)

In [56]:
# Report the size of each quality tier, then write that tier out as a fasta file.
quality_tiers = [
    ('../data/publishable-zika-fauna-2018-09-06-high-quality.fasta', high_qual_seqs),
    ('../data/publishable-zika-fauna-2018-09-06-medium-quality.fasta', medium_qual_seqs),
]
for fasta_path, tier_seqs in quality_tiers:
    print(len(tier_seqs))
    with open(fasta_path, 'w') as fasta_out:
        for header, seq in tier_seqs.items():
            fasta_out.write('>' + header + '\n' + str(seq) + '\n')

347
415


In [46]:
# Write the gap-substituted sequences, one fasta record per header.
gapped_fasta_path = '../data/publishable-zika-fauna-2018-09-06-gapped.fasta'
with open(gapped_fasta_path, 'w') as gapped_out:
    for header, gapped_seq in gaps_not_n_seqs.items():
        gapped_out.write(str(''.join(['>', header, '\n', gapped_seq, '\n'])))