Merge branch 'refactor'
* refactor: (48 commits)
  Adjust color scale.
  Fix auspice genotype frequency display.
  deleted extra ]
  Change default fasta_fields to match default GISAID output.
  fixed some default arguments, moved path addition (only necessary for interactive sessions) and the higher recursion limit to process.py
  defined alphabets early, checked outgroup removal, seems ok
  increased recursion limit
  caught unfinished tree in loading -- only refined trees have strain attributes used to construct the node look-up
  added node_lookup and sequence_lookup to the loading
  changed file name for dumps to accept a common prefix for all files. added a steps argument to run which is fed in from main
  fixed issue with yvalues
  added aa alignment file dump
  * added start and stop flags to run parts of the pipeline.
  * fixed an issue with the clock-cleaning: sometimes the std deviation is very large due to extreme outliers; using interquartile distance now instead.
  * put layout in a separate function from attribute adding
  set forced_strains = [] if not desired
  export of virus statistics to meta.json
  implemented the preferential inclusion of HI strains.
  * corrected various little mistakes in the adoption of the frequency estimation
  * removed a few unneeded imports
  * added virus_stats to meta.json
  deleted nextflu_config.py, added frequencies -- preliminarily
  removed streamline, now part of generic process
  removed unused imports
  ...
trvrb committed Mar 4, 2015
2 parents 80144dc + bad1994 commit 59720dd
Showing 16 changed files with 2,038 additions and 1,435 deletions.
489 changes: 489 additions & 0 deletions augur/source-data/HI_strains.txt

Large diffs are not rendered by default.

307 changes: 307 additions & 0 deletions augur/src/H3N2_process.py

Large diffs are not rendered by default.

706 changes: 318 additions & 388 deletions augur/src/bernoulli_frequency.py

Large diffs are not rendered by default.

302 changes: 250 additions & 52 deletions augur/src/process.py
@@ -1,52 +1,250 @@
import time, os, argparse
import virus_download, virus_filter, virus_align, virus_clean
import tree_infer, tree_ancestral, tree_refine
import bernoulli_frequency
import streamline
from io_util import *

def main(years_back=3, viruses_per_month=50):
"""Run full pipeline"""

print "--- Start processing at " + time.strftime("%H:%M:%S") + " ---"

# Run pipeline
# virus_download.main() # Download from GISAID
virus_fname = 'data/gisaid_epiflu_sequence.fasta'

# Filter sequences
virus_fname = virus_filter.main(virus_fname, years_back=years_back, viruses_per_month=viruses_per_month)

# Align sequences
virus_fname = virus_align.main(virus_fname)

# Clean sequences
virus_fname = virus_clean.main(virus_fname)

# Make tree, creates raxml files
tree_fname = tree_infer.main(virus_fname)

# infer ancestral states using the cleaned viruses and the raxml tree
tree_fname = tree_ancestral.main(tree_fname=tree_fname, virus_fname=virus_fname)

# Clean tree, reads viruses in fname + raxml files
tree_fname = tree_refine.main(tree_fname=tree_fname, virus_fname = virus_fname)

# Estimate frequencies
tree_fname = bernoulli_frequency.main(tree_fname=tree_fname)

# Streamline tree for auspice
tree_fname = streamline.main(tree_fname)

# Write out metadata
print "Writing out metadata"
meta = {"updated": time.strftime("X%d %b %Y").replace('X0','X').replace('X','')}
meta_fname = "../auspice/data/meta.json"
write_json(meta, meta_fname)

if __name__ == "__main__":
	parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare for web visualization')
parser.add_argument('-y', '--years_back', type = int, default=3, help='number of past years to sample sequences from')
parser.add_argument('-v', '--viruses_per_month', type = int, default = 50, help='number of viruses sampled per month')
params = parser.parse_args()
main(**params.__dict__)
import sys, time, os, argparse, shutil, subprocess, glob
sys.path.append('src')
sys.setrecursionlimit(10000) # needed since we are dealing with large trees
from Bio import SeqIO, AlignIO, Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import dendropy
from bernoulli_frequency import virus_frequencies
from tree_util import delimit_newick
import numpy as np

class process(virus_frequencies):
"""generic template class for processing virus sequences into trees"""
def __init__(self, prefix = 'data/', auspice_frequency_fname ='../auspice/data/frequencies.json',
auspice_sequences_fname='../auspice/data/sequences.json', auspice_tree_fname='../auspice/data/tree.json', min_freq = 0.01, **kwargs):
virus_frequencies.__init__(self, **kwargs)
self.tree_fname = prefix+'tree.pkl'
self.virus_fname = prefix+'virus.pkl'
self.frequency_fname = prefix+'frequencies.pkl'
self.aa_seq_fname = prefix+'aa_seq.pkl'
self.min_freq = min_freq
self.auspice_tree_fname = auspice_tree_fname
self.auspice_sequences_fname = auspice_sequences_fname
self.auspice_frequency_fname = auspice_frequency_fname
self.nuc_alphabet = 'ACGT-N'
self.aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY*X'

def dump(self):
import cPickle
if hasattr(self, 'tree'):
with open(self.tree_fname, 'w') as outfile:
cPickle.dump(self.tree, outfile)
if hasattr(self, 'viruses'):
with open(self.virus_fname, 'w') as outfile:
cPickle.dump(self.viruses, outfile)
if hasattr(self, 'frequencies'):
with open(self.frequency_fname, 'w') as outfile:
cPickle.dump(self.frequencies, outfile)
if hasattr(self, 'aa_aln'):
with open(self.aa_seq_fname, 'w') as outfile:
cPickle.dump(self.aa_aln, outfile)

def load(self):
import cPickle
if os.path.isfile(self.tree_fname):
with open(self.tree_fname, 'r') as infile:
self.tree = cPickle.load(infile)
			try:
				self.node_lookup = {l.strain:l for l in self.tree.leaf_iter()}
				self.node_lookup.update({node.strain.lower():node for node in self.tree.leaf_iter()})
			except AttributeError:
				# unfinished trees (not yet refined) lack the strain attributes needed for the look-up
				pass
if os.path.isfile(self.virus_fname):
with open(self.virus_fname, 'r') as infile:
self.viruses = cPickle.load(infile)
			try:
				self.sequence_lookup = {v.strain:v for v in self.viruses}
			except AttributeError:
				# virus objects without strain attributes cannot be indexed
				pass
if os.path.isfile(self.frequency_fname):
with open(self.frequency_fname, 'r') as infile:
self.frequencies = cPickle.load(infile)
if os.path.isfile(self.aa_seq_fname):
with open(self.aa_seq_fname, 'r') as infile:
self.aa_aln = cPickle.load(infile)

def export_to_auspice(self, tree_fields = [], tree_pop_list = []):
from tree_util import dendropy_to_json, all_descendants
from io_util import write_json, read_json
print "--- Streamline at " + time.strftime("%H:%M:%S") + " ---"
# Move sequence data to separate file
print "Writing sequences"
elems = {}
for node in self.tree:
if hasattr(node,"clade"):
elems[node.clade] = node.aa_seq
write_json(elems, self.auspice_sequences_fname, indent=None)

print "writing tree"
self.tree_json = dendropy_to_json(self.tree.seed_node, tree_fields)
for node in all_descendants(self.tree_json):
for attr in tree_pop_list:
if attr in node:
node.pop(attr, None)
if "freq" in node:
for reg in node["freq"]:
					try:
						node["freq"][reg] = [round(x,3) for x in node["freq"][reg]]
					except (TypeError, ValueError):
						# frequency entries may be missing or non-numeric for a region
						node["freq"][reg] = "undefined"

write_json(self.tree_json, self.auspice_tree_fname, indent=None)
try:
read_json(self.auspice_tree_fname)
except:
print "Read failed, rewriting with indents"
write_json(self.tree_json, self.auspice_tree_fname, indent=1)

# Include genotype frequencies
if hasattr(self, 'frequencies'):
write_json(self.frequencies, self.auspice_frequency_fname)

# Write out metadata
print "Writing out metadata"
meta = {"updated": time.strftime("X%d %b %Y").replace('X0','X').replace('X','')}
if hasattr(self,"date_region_count"):
meta["regions"] = self.regions
meta["virus_stats"] = [ [str(y)+'-'+str(m)] + [self.date_region_count[(y,m)][reg] for reg in self.regions]
for y,m in sorted(self.date_region_count.keys()) ]
meta_fname = "../auspice/data/meta.json"
write_json(meta, meta_fname, indent=0)
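		# the resulting meta.json has the shape (hypothetical values):
		#   {"updated": "4 Mar 2015", "regions": [...],
		#    "virus_stats": [["2015-2", 8, 12], ...]}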

	def align(self):
		'''
		aligns viruses using mafft. produces temporary files and deletes them at the end.
		after this step, self.viruses is a Biopython multiple alignment object
		'''
SeqIO.write([SeqRecord(Seq(v['seq']), id=v['strain']) for v in self.viruses], "temp_in.fasta", "fasta")
os.system("mafft --nofft temp_in.fasta > temp_out.fasta")
aln = AlignIO.read('temp_out.fasta', 'fasta')
for tmp_file in ['temp_in.fasta', 'temp_out.fasta']:
try:
os.remove(tmp_file)
except OSError:
pass

self.sequence_lookup = {seq.id:seq for seq in aln}
# add attributes to alignment
for v in self.viruses:
self.sequence_lookup[v['strain']].__dict__.update({k:val for k,val in v.iteritems() if k!='seq'})
self.viruses = aln

def infer_tree(self, raxml_time_limit):
'''
builds a tree from the alignment using fasttree and RAxML. raxml runs for
raxml_time_limit and is terminated thereafter. raxml_time_limit can be 0.
'''
		def cleanup():
			for fname in glob.glob("RAxML_*") + glob.glob("temp*") + ["raxml_tree.newick", "initial_tree.newick"]:
				try:
					os.remove(fname)
				except OSError:
					pass

cleanup()
AlignIO.write(self.viruses, 'temp.fasta', 'fasta')

print "Building initial tree with FastTree"
os.system("fasttree -gtr -nt -gamma -nosupport -mlacc 2 -slownni temp.fasta > initial_tree.newick")
self.tree = dendropy.Tree.get_from_string(delimit_newick('initial_tree.newick'),'newick', as_rooted=True)
self.tree.resolve_polytomies()
self.tree.write_to_path("initial_tree.newick", "newick")

AlignIO.write(self.viruses,"temp.phyx", "phylip-relaxed")
if raxml_time_limit>0:
print "RAxML tree optimization with time limit " + str(raxml_time_limit) + " hours"
# using exec to be able to kill process
end_time = time.time() + int(raxml_time_limit*3600)
process = subprocess.Popen("exec raxml -f d -T 6 -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
while (time.time() < end_time):
if os.path.isfile('RAxML_result.topology'):
break
time.sleep(10)
process.terminate()

			checkpoint_files = sorted(glob.glob("RAxML_checkpoint*"), key=os.path.getmtime)  # newest checkpoint sorts last
if os.path.isfile('RAxML_result.topology'):
checkpoint_files.append('RAxML_result.topology')
if len(checkpoint_files) > 0:
last_tree_file = checkpoint_files[-1]
shutil.copy(last_tree_file, 'raxml_tree.newick')
else:
shutil.copy("initial_tree.newick", 'raxml_tree.newick')
else:
shutil.copy("initial_tree.newick", 'raxml_tree.newick')

print "RAxML branch length optimization and rooting"
os.system("raxml -f e -T 6 -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick -o " + self.outgroup['strain'])

out_fname = "data/tree_infer.newick"
os.rename('RAxML_result.branches', out_fname)
Phylo.write(Phylo.read(out_fname, 'newick'),'temp.newick','newick')
		self.tree = dendropy.Tree.get_from_string(delimit_newick(out_fname), 'newick', as_rooted=True)
cleanup()

	def infer_ancestral(self):
		from tree_ancestral import ancestral_sequences
		anc_seq = ancestral_sequences(self.tree, self.viruses, seqtype='str')
		anc_seq.calc_ancestral_sequences()

def temporal_regional_statistics(self):
'''
		produces a dictionary with (year, month) keys, each entry of which is
		a dictionary that contains the isolate count observed in each region,
stored as:
self.date_region_count
self.regions
self.region_totals
'''
from collections import defaultdict, Counter
self.date_region_count = defaultdict(lambda:defaultdict(int))
regions = set()
# count viruses in every month and every region
for v in self.viruses:
if v.strain != self.outgroup['strain']:
year, month, day = map(int, v.date.split('-'))
self.date_region_count[(year, month)][v.region]+=1
regions.add(v.region)
# add a sorted list of all regions to self and calculate region totals
self.regions = sorted(regions)
self.region_totals = {reg:sum(val[reg] for val in self.date_region_count.values()) for reg in self.regions}
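		# e.g. (hypothetical counts) after calling temporal_regional_statistics():
		#   self.date_region_count[(2015, 2)] -> {'europe': 8, 'north_america': 12}
		#   self.regions -> ['europe', 'north_america']
		#   self.region_totals -> {'europe': 95, 'north_america': 140}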

def determine_variable_positions(self):
'''
		calculates nucleoties_frequencies and aa_frequencies at each position of the alignment.
		also computes consensus sequences and the positions at which the major allele is at frequency less than 1-min_freq.
results are stored as
self.nucleoties_frequencies
self.aa_frequencies
self.variable_nucleotides
self.variable_aa
'''
aln_array = np.array(self.viruses)
self.nucleoties_frequencies = np.zeros((len(self.nuc_alphabet),aln_array.shape[1]))
for ni,nuc in enumerate(self.nuc_alphabet):
self.nucleoties_frequencies[ni,:]=(aln_array==nuc).mean(axis=0)

self.variable_nucleotides = np.where(np.max(self.nucleoties_frequencies,axis=0)<1.0-self.min_freq)[0]
self.consensus_nucleotides = "".join(np.fromstring(self.nuc_alphabet, 'S1')[np.argmax(self.nucleoties_frequencies,axis=0)])

if hasattr(self, 'aa_aln'):
aln_array = np.array(self.aa_aln)
self.aa_frequencies = np.zeros((len(self.aa_alphabet),aln_array.shape[1]))
for ai,aa in enumerate(self.aa_alphabet):
self.aa_frequencies[ai,:]=(aln_array==aa).mean(axis=0)

self.variable_aa = np.where(np.max(self.aa_frequencies,axis=0)<1.0-self.min_freq)[0]
self.consensus_aa = "".join(np.fromstring(self.aa_alphabet, 'S1')[np.argmax(self.aa_frequencies,axis=0)])
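		# e.g. with min_freq=0.01, an alignment column that is 97% 'A' and 3% 'G'
		# counts as variable, since its major-allele frequency 0.97 < 1 - 0.01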

def estimate_frequencies(self, tasks = ['mutations','genotypes', 'clades', 'tree']):
if 'mutations' in tasks:
self.all_mutation_frequencies()
if 'genotypes' in tasks:
self.all_genotypes_frequencies()
if 'clades' in tasks:
self.all_clade_frequencies()
if 'tree' in tasks:
self.all_tree_frequencies()
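
The process class above is a generic template; lineage-specific drivers such as the new augur/src/H3N2_process.py (not rendered above) subclass it and call the pipeline steps in order. A minimal sketch of such a driver -- hypothetical class, prefix, outgroup, and field names, and assuming virus_frequencies accepts these keyword arguments and that a filtering step (not shown) populates proc.viruses -- could look like:

from process import process

class flu_process(process):  # hypothetical subclass
	def __init__(self, **kwargs):
		process.__init__(self, prefix='data/flu_', **kwargs)
		self.outgroup = {'strain': 'A/Beijing/32/1992'}  # placeholder outgroup record

if __name__=="__main__":
	proc = flu_process()
	# assumes a filtering step has set proc.viruses to a list of
	# {'strain': ..., 'seq': ..., 'date': ..., 'region': ...} records
	proc.align()                             # mafft -> Biopython alignment in proc.viruses
	proc.infer_tree(raxml_time_limit=1.0)    # FastTree, then time-limited RAxML
	proc.infer_ancestral()                   # ancestral sequence reconstruction
	proc.temporal_regional_statistics()
	proc.determine_variable_positions()
	proc.estimate_frequencies(tasks=['mutations', 'tree'])
	proc.export_to_auspice(tree_fields=['aa_muts'])  # 'aa_muts' is an assumed field name
	proc.dump()                              # pickle tree/viruses/frequencies for reuse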
51 changes: 5 additions & 46 deletions augur/src/seq_util.py
@@ -1,57 +1,16 @@
from itertools import izip
import numpy as np
epitope_mask = np.fromstring("00000000000000000000000000000000000000000000000000000000000011111011011001010011000100000001001011110011100110101000001100000100000001000110101011111101011010111110001010011111000101011011111111010010001111101110111001010001110011111111000000111110000000101010101110000000000011100100000001011011100000000000001001011000110111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", dtype='S1')

def partition_string(string, length):
	return list(string[0+i:length+i] for i in range(0, len(string), length))

def hamming_distance(seq1, seq2):
	"""Return the fraction of mismatches between seq1 and seq2, ignoring positions gapped in either"""
	aseq1, aseq2 = np.array(seq1), np.array(seq2)
	non_gap = (aseq1!='-')*(aseq2!='-')
	return np.mean(aseq1[non_gap]!=aseq2[non_gap])

def translate(nuc):
"""Translate nucleotide sequence to amino acid"""
from Bio import Seq
return Seq.translate(nuc) #returns string when argument is a string, Bio.Seq otherwise

def epitope_sites(aa):
aaa = np.fromstring(aa, 'S1')
return ''.join(aaa[epitope_mask[:len(aa)]=='1'])

def nonepitope_sites(aa):
aaa = np.fromstring(aa, 'S1')
return ''.join(aaa[epitope_mask[:len(aa)]=='0'])

def receptor_binding_sites(aa):
	"""Receptor binding site mutations from Koel et al. 2014.
	These are (145, 155, 156, 158, 159, 189, 193) in canonical HA numbering.
	When counting from ATG/M, need to offset by 16, giving (161, 171, 172, 174, 175, 205, 209).
	When indexing from 0, these are (160, 170, 171, 173, 174, 204, 208)."""
	sites = [160, 170, 171, 173, 174, 204, 208]
	return ''.join([aa[pos] for pos in sites])

def get_HA1(aa):
'''
return the part of the peptide corresponding to HA1, starts at pos 16, is 329 aa long
'''
return aa[16:(16+329)]

def epitope_distance(aaA, aaB):
"""Return distance of sequences aaA and aaB by comparing epitope sites"""
epA = epitope_sites(aaA)
epB = epitope_sites(aaB)
distance = sum(a != b for a, b in izip(epA, epB))
return distance

def nonepitope_distance(aaA, aaB):
"""Return distance of sequences aaA and aaB by comparing non-epitope sites"""
neA = nonepitope_sites(aaA)
neB = nonepitope_sites(aaB)
distance = sum(a != b for a, b in izip(neA, neB))
	return distance

def receptor_binding_distance(aaA, aaB):
"""Return distance of sequences aaA and aaB by comparing receptor binding sites"""
neA = receptor_binding_sites(aaA)
neB = receptor_binding_sites(aaB)
distance = sum(a != b for a, b in izip(neA, neB))
return distance

def json_to_Bio_alignment(seq_json):
from Bio.Align import MultipleSeqAlignment
from Bio.SeqRecord import SeqRecord
@@ -64,7 +23,7 @@ def json_to_Bio_alignment(seq_json):
def main():
"""Testing with Hong Kong/68"""
nuc = "ATGAAGACCATCATTGCTTTGAGCTACATTTTCTGTCTGGCTCTCGGCCAAGACCTTCCAGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCGGTGCCAAACGGAACACTAGTGAAAACAATCACAGATGATCAGATTGAAGTGACTAATGCTACTGAGCTAGTTCAGAGCTCCTCAACGGGGAAAATATGCAACAATCCTCATCGAATCCTTGATGGAATAGACTGCACACTGATAGATGCTCTATTGGGGGACCCTCATTGTGATGTTTTTCAAAATGAGACATGGGACCTTTTCGTTGAACGCAGCAAAGCTTTCAGCAACTGTTACCCTTATGATGTGCCAGATTATGCCTCCCTTAGGTCACTAGTTGCCTCGTCAGGCACTCTGGAGTTTATCACTGAGGGTTTCACTTGGACTGGGGTCACTCAGAATGGGGGAAGCAATGCTTGCAAAAGGGGACCTGGTAGCGGTTTTTTCAGTAGACTGAACTGGTTGACCAAATCAGGAAGCACATATCCAGTGCTGAACGTGACTATGCCAAACAATGACAATTTTGACAAACTATACATTTGGGGGGTTCACCACCCGAGCACGAACCAAGAACAAACCAGCCTGTATGTTCAAGCATCAGGGAGAGTCACAGTCTCTACCAGAAGAAGCCAGCAAACTATAATCCCGAATATCTGGTCCAGACCCTGGGTAAGGGGTCTGTCTAGTAGAATAAGCATCTATTGGACAATAGTTAAGCCGGGAGACGTACTGGTAATTAATAGTAATGGGAACCTAATCGCTCCTCGGGGTTATTTCAAAATGCGCACTGGGAAAAGCTCAATAATGAGGTCAGATGCACCTATTGATACCTGTATTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAGCCCTTTCAAAACGTAAACAAGATCACATATGGAGCATGCCCCAAGTATGTTAAGCAAAACACC"
aa = translate(nuc)
aa = translate(nuc[48:])
ep = epitope_sites(aa)
ne = nonepitope_sites(aa)
rb = receptor_binding_sites(aa)
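For a quick sense of the helpers above, a toy usage sketch -- made-up short sequences, not real HA peptides; note that hamming_distance returns the fraction of mismatching non-gap sites rather than a raw count:

from seq_util import translate, hamming_distance

aa = translate("ATGGCCTTT")                       # 'MAF'
d = hamming_distance(list("MAF-"), list("MTF-"))  # gap column ignored -> 1/3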
