Merge branch 'refactor'
* refactor: (48 commits)
  Adjust color scale.
  Fix auspice genotype frequency display.
  deleted extra ]
  Change default fasta_fields to match default GISAID output.
  fixed some default arguments, moved path addition (only necessary for interactive sessions) and the higher recursion limit to process.py
  defined alphabets early, checked outgroup removal, seems ok
  increased recursion limit
  caught unfinished tree in loading -- only refined trees have strain attributes used to construct the node look-up
  added node_lookup and sequence_lookup to the loading
  changed file name for dumps to accept a common prefix for all files. added a steps argument to run which is fed in from main
  fixed issue with yvalues
  added aa alignment file dump
  * added start and stop flags to run parts of the pipeline.
  * fixed an issue with the clock-cleaning: sometimes the std deviation is very large due to extreme outliers; using interquartile distance now instead.
  * put layout in a separate function from attribute adding
  set forced_strains = [] if not desired
  export of virus statistics to meta.json
  implemented the preferential inclusion of HI strains.
  * corrected various little mistakes in the adoption of the frequency estimation
  * removed a few unneeded imports
  * added virus_stats to meta.json
  deleted nextflu_config.py, added frequencies -- preliminarily
  removed streamline, now part of generic process
  removed unused imports
  ...
trvrb committed Mar 4, 2015
2 parents 80144dc + bad1994 commit 59720dd
Showing 16 changed files with 2,038 additions and 1,435 deletions.
489 changes: 489 additions & 0 deletions augur/source-data/HI_strains.txt

Large diffs are not rendered by default.

307 changes: 307 additions & 0 deletions augur/src/H3N2_process.py

Large diffs are not rendered by default.

706 changes: 318 additions & 388 deletions augur/src/bernoulli_frequency.py

Large diffs are not rendered by default.

302 changes: 250 additions & 52 deletions augur/src/process.py
@@ -1,52 +1,250 @@
import time, os, argparse
import virus_download, virus_filter, virus_align, virus_clean
import tree_infer, tree_ancestral, tree_refine
import bernoulli_frequency
import streamline
from io_util import *

def main(years_back=3, viruses_per_month=50):
"""Run full pipeline"""

print "--- Start processing at " + time.strftime("%H:%M:%S") + " ---"

# Run pipeline
# virus_download.main() # Download from GISAID
virus_fname = 'data/gisaid_epiflu_sequence.fasta'

# Filter sequences
virus_fname = virus_filter.main(virus_fname, years_back=years_back, viruses_per_month=viruses_per_month)

# Align sequences
virus_fname = virus_align.main(virus_fname)

# Clean sequences
virus_fname = virus_clean.main(virus_fname)

# Make tree, creates raxml files
tree_fname = tree_infer.main(virus_fname)

# infer ancestral states using the cleaned viruses and the raxml tree
tree_fname = tree_ancestral.main(tree_fname=tree_fname, virus_fname=virus_fname)

# Clean tree, reads viruses in fname + raxml files
tree_fname = tree_refine.main(tree_fname=tree_fname, virus_fname = virus_fname)

# Estimate frequencies
tree_fname = bernoulli_frequency.main(tree_fname=tree_fname)

# Streamline tree for auspice
tree_fname = streamline.main(tree_fname)

# Write out metadata
print "Writing out metadata"
meta = {"updated": time.strftime("X%d %b %Y").replace('X0','X').replace('X','')}
meta_fname = "../auspice/data/meta.json"
write_json(meta, meta_fname)

if __name__ == "__main__":
	parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare for web visualization')
parser.add_argument('-y', '--years_back', type = int, default=3, help='number of past years to sample sequences from')
parser.add_argument('-v', '--viruses_per_month', type = int, default = 50, help='number of viruses sampled per month')
params = parser.parse_args()
main(**params.__dict__)
import sys, time, os, argparse, shutil, subprocess, glob
sys.path.append('src')
sys.setrecursionlimit(10000) # needed since we are dealing with large trees
from Bio import SeqIO, AlignIO, Phylo
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import dendropy
from bernoulli_frequency import virus_frequencies
from tree_util import delimit_newick
import numpy as np

class process(virus_frequencies):
"""generic template class for processing virus sequences into trees"""
def __init__(self, prefix = 'data/', auspice_frequency_fname ='../auspice/data/frequencies.json',
auspice_sequences_fname='../auspice/data/sequences.json', auspice_tree_fname='../auspice/data/tree.json', min_freq = 0.01, **kwargs):
virus_frequencies.__init__(self, **kwargs)
self.tree_fname = prefix+'tree.pkl'
self.virus_fname = prefix+'virus.pkl'
self.frequency_fname = prefix+'frequencies.pkl'
self.aa_seq_fname = prefix+'aa_seq.pkl'
self.min_freq = min_freq
self.auspice_tree_fname = auspice_tree_fname
self.auspice_sequences_fname = auspice_sequences_fname
self.auspice_frequency_fname = auspice_frequency_fname
self.nuc_alphabet = 'ACGT-N'
self.aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY*X'

def dump(self):
import cPickle
if hasattr(self, 'tree'):
with open(self.tree_fname, 'w') as outfile:
cPickle.dump(self.tree, outfile)
if hasattr(self, 'viruses'):
with open(self.virus_fname, 'w') as outfile:
cPickle.dump(self.viruses, outfile)
if hasattr(self, 'frequencies'):
with open(self.frequency_fname, 'w') as outfile:
cPickle.dump(self.frequencies, outfile)
if hasattr(self, 'aa_aln'):
with open(self.aa_seq_fname, 'w') as outfile:
cPickle.dump(self.aa_aln, outfile)

def load(self):
import cPickle
if os.path.isfile(self.tree_fname):
with open(self.tree_fname, 'r') as infile:
self.tree = cPickle.load(infile)
			try:
				self.node_lookup = {l.strain:l for l in self.tree.leaf_iter()}
				self.node_lookup.update({node.strain.lower():node for node in self.tree.leaf_iter()})
			except AttributeError:
				# unfinished trees (not yet refined) lack the strain attributes needed for the look-up
				pass
if os.path.isfile(self.virus_fname):
with open(self.virus_fname, 'r') as infile:
self.viruses = cPickle.load(infile)
			try:
				self.sequence_lookup = {v.strain:v for v in self.viruses}
			except AttributeError:
				# virus objects without strain attributes cannot be indexed
				pass
if os.path.isfile(self.frequency_fname):
with open(self.frequency_fname, 'r') as infile:
self.frequencies = cPickle.load(infile)
if os.path.isfile(self.aa_seq_fname):
with open(self.aa_seq_fname, 'r') as infile:
self.aa_aln = cPickle.load(infile)

def export_to_auspice(self, tree_fields = [], tree_pop_list = []):
from tree_util import dendropy_to_json, all_descendants
from io_util import write_json, read_json
print "--- Streamline at " + time.strftime("%H:%M:%S") + " ---"
# Move sequence data to separate file
print "Writing sequences"
elems = {}
for node in self.tree:
if hasattr(node,"clade"):
elems[node.clade] = node.aa_seq
write_json(elems, self.auspice_sequences_fname, indent=None)

print "writing tree"
self.tree_json = dendropy_to_json(self.tree.seed_node, tree_fields)
for node in all_descendants(self.tree_json):
for attr in tree_pop_list:
if attr in node:
node.pop(attr, None)
if "freq" in node:
for reg in node["freq"]:
					try:
						node["freq"][reg] = [round(x,3) for x in node["freq"][reg]]
					except (TypeError, ValueError):
						# frequency entries may be missing or non-numeric for a region
						node["freq"][reg] = "undefined"

write_json(self.tree_json, self.auspice_tree_fname, indent=None)
try:
read_json(self.auspice_tree_fname)
except:
print "Read failed, rewriting with indents"
write_json(self.tree_json, self.auspice_tree_fname, indent=1)

# Include genotype frequencies
if hasattr(self, 'frequencies'):
write_json(self.frequencies, self.auspice_frequency_fname)

# Write out metadata
print "Writing out metadata"
meta = {"updated": time.strftime("X%d %b %Y").replace('X0','X').replace('X','')}
if hasattr(self,"date_region_count"):
meta["regions"] = self.regions
meta["virus_stats"] = [ [str(y)+'-'+str(m)] + [self.date_region_count[(y,m)][reg] for reg in self.regions]
for y,m in sorted(self.date_region_count.keys()) ]
meta_fname = "../auspice/data/meta.json"
write_json(meta, meta_fname, indent=0)
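		# the resulting meta.json has the shape (hypothetical values):
		#   {"updated": "4 Mar 2015", "regions": [...],
		#    "virus_stats": [["2015-2", 8, 12], ...]}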

	def align(self):
		'''
		aligns viruses using mafft. produces temporary files and deletes them at the end.
		after this step, self.viruses is a Biopython multiple alignment object
		'''
SeqIO.write([SeqRecord(Seq(v['seq']), id=v['strain']) for v in self.viruses], "temp_in.fasta", "fasta")
os.system("mafft --nofft temp_in.fasta > temp_out.fasta")
aln = AlignIO.read('temp_out.fasta', 'fasta')
for tmp_file in ['temp_in.fasta', 'temp_out.fasta']:
try:
os.remove(tmp_file)
except OSError:
pass

self.sequence_lookup = {seq.id:seq for seq in aln}
# add attributes to alignment
for v in self.viruses:
self.sequence_lookup[v['strain']].__dict__.update({k:val for k,val in v.iteritems() if k!='seq'})
self.viruses = aln

def infer_tree(self, raxml_time_limit):
'''
builds a tree from the alignment using fasttree and RAxML. raxml runs for
raxml_time_limit and is terminated thereafter. raxml_time_limit can be 0.
'''
		def cleanup():
			for fname in glob.glob("RAxML_*") + glob.glob("temp*") + ["raxml_tree.newick", "initial_tree.newick"]:
				try:
					os.remove(fname)
				except OSError:
					pass

cleanup()
AlignIO.write(self.viruses, 'temp.fasta', 'fasta')

print "Building initial tree with FastTree"
os.system("fasttree -gtr -nt -gamma -nosupport -mlacc 2 -slownni temp.fasta > initial_tree.newick")
self.tree = dendropy.Tree.get_from_string(delimit_newick('initial_tree.newick'),'newick', as_rooted=True)
self.tree.resolve_polytomies()
self.tree.write_to_path("initial_tree.newick", "newick")

AlignIO.write(self.viruses,"temp.phyx", "phylip-relaxed")
if raxml_time_limit>0:
print "RAxML tree optimization with time limit " + str(raxml_time_limit) + " hours"
# using exec to be able to kill process
end_time = time.time() + int(raxml_time_limit*3600)
process = subprocess.Popen("exec raxml -f d -T 6 -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True)
while (time.time() < end_time):
if os.path.isfile('RAxML_result.topology'):
break
time.sleep(10)
process.terminate()

			checkpoint_files = sorted(glob.glob("RAxML_checkpoint*"), key=os.path.getmtime)  # newest checkpoint sorts last
if os.path.isfile('RAxML_result.topology'):
checkpoint_files.append('RAxML_result.topology')
if len(checkpoint_files) > 0:
last_tree_file = checkpoint_files[-1]
shutil.copy(last_tree_file, 'raxml_tree.newick')
else:
shutil.copy("initial_tree.newick", 'raxml_tree.newick')
else:
shutil.copy("initial_tree.newick", 'raxml_tree.newick')

print "RAxML branch length optimization and rooting"
os.system("raxml -f e -T 6 -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick -o " + self.outgroup['strain'])

out_fname = "data/tree_infer.newick"
os.rename('RAxML_result.branches', out_fname)
Phylo.write(Phylo.read(out_fname, 'newick'),'temp.newick','newick')
		self.tree = dendropy.Tree.get_from_string(delimit_newick(out_fname), 'newick', as_rooted=True)
cleanup()

	def infer_ancestral(self):
		from tree_ancestral import ancestral_sequences
		anc_seq = ancestral_sequences(self.tree, self.viruses, seqtype='str')
		anc_seq.calc_ancestral_sequences()

def temporal_regional_statistics(self):
'''
		produces a dictionary with (year, month) keys, each entry of which is
		a dictionary that contains the isolate count observed in each region,
stored as:
self.date_region_count
self.regions
self.region_totals
'''
from collections import defaultdict, Counter
self.date_region_count = defaultdict(lambda:defaultdict(int))
regions = set()
# count viruses in every month and every region
for v in self.viruses:
if v.strain != self.outgroup['strain']:
year, month, day = map(int, v.date.split('-'))
self.date_region_count[(year, month)][v.region]+=1
regions.add(v.region)
# add a sorted list of all regions to self and calculate region totals
self.regions = sorted(regions)
self.region_totals = {reg:sum(val[reg] for val in self.date_region_count.values()) for reg in self.regions}
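		# e.g. (hypothetical counts) after calling temporal_regional_statistics():
		#   self.date_region_count[(2015, 2)] -> {'europe': 8, 'north_america': 12}
		#   self.regions -> ['europe', 'north_america']
		#   self.region_totals -> {'europe': 95, 'north_america': 140}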

def determine_variable_positions(self):
'''
		calculates nucleoties_frequencies and aa_frequencies at each position of the alignment.
		also computes consensus sequences and the positions at which the major allele is at frequency less than 1-min_freq.
results are stored as
self.nucleoties_frequencies
self.aa_frequencies
self.variable_nucleotides
self.variable_aa
'''
aln_array = np.array(self.viruses)
self.nucleoties_frequencies = np.zeros((len(self.nuc_alphabet),aln_array.shape[1]))
for ni,nuc in enumerate(self.nuc_alphabet):
self.nucleoties_frequencies[ni,:]=(aln_array==nuc).mean(axis=0)

self.variable_nucleotides = np.where(np.max(self.nucleoties_frequencies,axis=0)<1.0-self.min_freq)[0]
self.consensus_nucleotides = "".join(np.fromstring(self.nuc_alphabet, 'S1')[np.argmax(self.nucleoties_frequencies,axis=0)])

if hasattr(self, 'aa_aln'):
aln_array = np.array(self.aa_aln)
self.aa_frequencies = np.zeros((len(self.aa_alphabet),aln_array.shape[1]))
for ai,aa in enumerate(self.aa_alphabet):
self.aa_frequencies[ai,:]=(aln_array==aa).mean(axis=0)

self.variable_aa = np.where(np.max(self.aa_frequencies,axis=0)<1.0-self.min_freq)[0]
self.consensus_aa = "".join(np.fromstring(self.aa_alphabet, 'S1')[np.argmax(self.aa_frequencies,axis=0)])
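		# e.g. with min_freq=0.01, an alignment column that is 97% 'A' and 3% 'G'
		# counts as variable, since its major-allele frequency 0.97 < 1 - 0.01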

def estimate_frequencies(self, tasks = ['mutations','genotypes', 'clades', 'tree']):
if 'mutations' in tasks:
self.all_mutation_frequencies()
if 'genotypes' in tasks:
self.all_genotypes_frequencies()
if 'clades' in tasks:
self.all_clade_frequencies()
if 'tree' in tasks:
self.all_tree_frequencies()
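
The process class above is a generic template; lineage-specific drivers such as the new augur/src/H3N2_process.py (not rendered above) subclass it and call the pipeline steps in order. A minimal sketch of such a driver -- hypothetical class, prefix, outgroup, and field names, and assuming virus_frequencies accepts these keyword arguments and that a filtering step (not shown) populates proc.viruses -- could look like:

from process import process

class flu_process(process):  # hypothetical subclass
	def __init__(self, **kwargs):
		process.__init__(self, prefix='data/flu_', **kwargs)
		self.outgroup = {'strain': 'A/Beijing/32/1992'}  # placeholder outgroup record

if __name__=="__main__":
	proc = flu_process()
	# assumes a filtering step has set proc.viruses to a list of
	# {'strain': ..., 'seq': ..., 'date': ..., 'region': ...} records
	proc.align()                             # mafft -> Biopython alignment in proc.viruses
	proc.infer_tree(raxml_time_limit=1.0)    # FastTree, then time-limited RAxML
	proc.infer_ancestral()                   # ancestral sequence reconstruction
	proc.temporal_regional_statistics()
	proc.determine_variable_positions()
	proc.estimate_frequencies(tasks=['mutations', 'tree'])
	proc.export_to_auspice(tree_fields=['aa_muts'])  # 'aa_muts' is an assumed field name
	proc.dump()                              # pickle tree/viruses/frequencies for reuse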
51 changes: 5 additions & 46 deletions augur/src/seq_util.py
@@ -1,57 +1,16 @@
from itertools import izip
import numpy as np
epitope_mask = np.fromstring("00000000000000000000000000000000000000000000000000000000000011111011011001010011000100000001001011110011100110101000001100000100000001000110101011111101011010111110001010011111000101011011111111010010001111101110111001010001110011111111000000111110000000101010101110000000000011100100000001011011100000000000001001011000110111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", dtype='S1')

def partition_string(string, length):
	return list(string[0+i:length+i] for i in range(0, len(string), length))

def hamming_distance(seq1, seq2):
	"""Return the fraction of mismatches between seq1 and seq2, ignoring positions gapped in either"""
	aseq1, aseq2 = np.array(seq1), np.array(seq2)
	non_gap = (aseq1!='-')*(aseq2!='-')
	return np.mean(aseq1[non_gap]!=aseq2[non_gap])

def translate(nuc):
"""Translate nucleotide sequence to amino acid"""
from Bio import Seq
return Seq.translate(nuc) #returns string when argument is a string, Bio.Seq otherwise

def epitope_sites(aa):
aaa = np.fromstring(aa, 'S1')
return ''.join(aaa[epitope_mask[:len(aa)]=='1'])

def nonepitope_sites(aa):
aaa = np.fromstring(aa, 'S1')
return ''.join(aaa[epitope_mask[:len(aa)]=='0'])

def receptor_binding_sites(aa):
	"""Receptor binding site mutations from Koel et al. 2014.
	These are (145, 155, 156, 158, 159, 189, 193) in canonical HA numbering.
	When counting from ATG/M, need to offset by 16, giving (161, 171, 172, 174, 175, 205, 209).
	When indexing from 0, these are (160, 170, 171, 173, 174, 204, 208)."""
	sites = [160, 170, 171, 173, 174, 204, 208]
	return ''.join([aa[pos] for pos in sites])

def get_HA1(aa):
'''
return the part of the peptide corresponding to HA1, starts at pos 16, is 329 aa long
'''
return aa[16:(16+329)]

def epitope_distance(aaA, aaB):
"""Return distance of sequences aaA and aaB by comparing epitope sites"""
epA = epitope_sites(aaA)
epB = epitope_sites(aaB)
distance = sum(a != b for a, b in izip(epA, epB))
return distance

def nonepitope_distance(aaA, aaB):
"""Return distance of sequences aaA and aaB by comparing non-epitope sites"""
neA = nonepitope_sites(aaA)
neB = nonepitope_sites(aaB)
distance = sum(a != b for a, b in izip(neA, neB))
	return distance

def receptor_binding_distance(aaA, aaB):
"""Return distance of sequences aaA and aaB by comparing receptor binding sites"""
neA = receptor_binding_sites(aaA)
neB = receptor_binding_sites(aaB)
distance = sum(a != b for a, b in izip(neA, neB))
return distance

def json_to_Bio_alignment(seq_json):
from Bio.Align import MultipleSeqAlignment
from Bio.SeqRecord import SeqRecord
@@ -64,7 +23,7 @@ def json_to_Bio_alignment(seq_json):
def main():
"""Testing with Hong Kong/68"""
nuc = "ATGAAGACCATCATTGCTTTGAGCTACATTTTCTGTCTGGCTCTCGGCCAAGACCTTCCAGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCGGTGCCAAACGGAACACTAGTGAAAACAATCACAGATGATCAGATTGAAGTGACTAATGCTACTGAGCTAGTTCAGAGCTCCTCAACGGGGAAAATATGCAACAATCCTCATCGAATCCTTGATGGAATAGACTGCACACTGATAGATGCTCTATTGGGGGACCCTCATTGTGATGTTTTTCAAAATGAGACATGGGACCTTTTCGTTGAACGCAGCAAAGCTTTCAGCAACTGTTACCCTTATGATGTGCCAGATTATGCCTCCCTTAGGTCACTAGTTGCCTCGTCAGGCACTCTGGAGTTTATCACTGAGGGTTTCACTTGGACTGGGGTCACTCAGAATGGGGGAAGCAATGCTTGCAAAAGGGGACCTGGTAGCGGTTTTTTCAGTAGACTGAACTGGTTGACCAAATCAGGAAGCACATATCCAGTGCTGAACGTGACTATGCCAAACAATGACAATTTTGACAAACTATACATTTGGGGGGTTCACCACCCGAGCACGAACCAAGAACAAACCAGCCTGTATGTTCAAGCATCAGGGAGAGTCACAGTCTCTACCAGAAGAAGCCAGCAAACTATAATCCCGAATATCTGGTCCAGACCCTGGGTAAGGGGTCTGTCTAGTAGAATAAGCATCTATTGGACAATAGTTAAGCCGGGAGACGTACTGGTAATTAATAGTAATGGGAACCTAATCGCTCCTCGGGGTTATTTCAAAATGCGCACTGGGAAAAGCTCAATAATGAGGTCAGATGCACCTATTGATACCTGTATTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAGCCCTTTCAAAACGTAAACAAGATCACATATGGAGCATGCCCCAAGTATGTTAAGCAAAACACC"
aa = translate(nuc)
aa = translate(nuc[48:])
ep = epitope_sites(aa)
ne = nonepitope_sites(aa)
rb = receptor_binding_sites(aa)
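For a quick sense of the helpers above, a toy usage sketch -- made-up short sequences, not real HA peptides; note that hamming_distance returns the fraction of mismatching non-gap sites rather than a raw count:

from seq_util import translate, hamming_distance

aa = translate("ATGGCCTTT")                       # 'MAF'
d = hamming_distance(list("MAF-"), list("MTF-"))  # gap column ignored -> 1/3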
