From ffcc2e0339c43ad306982779f083d681a4f8e17e Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 24 Feb 2015 05:58:13 +0100 Subject: [PATCH 01/48] moved vaccine strains into separate file, cleaned up a few functions --- .../source-data/H3N2_outgroup_and_vaccine.py | 56 +++++++++ augur/src/virus_filter.py | 117 ++++-------------- 2 files changed, 80 insertions(+), 93 deletions(-) create mode 100644 augur/source-data/H3N2_outgroup_and_vaccine.py diff --git a/augur/source-data/H3N2_outgroup_and_vaccine.py b/augur/source-data/H3N2_outgroup_and_vaccine.py new file mode 100644 index 00000000..0f408853 --- /dev/null +++ b/augur/source-data/H3N2_outgroup_and_vaccine.py @@ -0,0 +1,56 @@ +outgroup = { + 'strain': 'A/Beijing/32/1992', + 'db': 'IRD', + 'accession': 'U26830', + 'date': '1992-01-01', + 'country': 'China', + 'region': 'China', + 'seq': 'ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAATGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGTCCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGGCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGATGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGACTTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAATGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTATATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTCGGCGCAATCGCAGGTTTCATAGAAAATGGTTGGGAGGGAATGGTAGACGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGCACAGGACAAGCAGCAGATCTTAAAAGCACTCAAGCAGCAATCGACCAAATCAACGGGAAACTGAATAGGTTAATCGAGAAAACGAACGAGAAATTCCATCAAATCGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAAGACACTAAAATAGATCTCTGGTCTTACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTTACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAGGAAGCAACTGAGGGAAAATGCTGAGGACATGGGCAATGGTTGCTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGGTCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGACGAAGCATTAAACAACCGGTTCCAGATCAAAGGTGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTGTGGATTTCCTTTGCCATATCATGCTTTTTGCTTTGTGTTGTTTTGCTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGTAACATTTGCATTTGA' +# 'seq': 
'ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAATGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGTCCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGGCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGATGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGACTTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAATGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTATATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCAAAACACT' + } + +vaccine_strains = [ + { + "strain": "A/Wisconsin/67/2005", + "db": "IRD", + "accession": "CY163984", + "date": "2005-08-31", + "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCAATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAGAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" +# "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" + }, { + "strain": "A/Brisbane/10/2007", + "db": "IRD", + "accession": "CY113005", + "date": "2007-02-06", + "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACCAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACAATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" +# "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" + }, { + "strain": "A/Perth/16/2009", + "db": "IRD", + "accession": "GQ293081", + "date": "2009-04-07", + "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTTCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" +# "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" + }, { + "strain": "A/Victoria/361/2011", + "db": "IRD", + "accession": "GQ293081", + "date": "2011-10-24", + "seq": "ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTAAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA" +# "seq": 
"ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT" + }, { + "strain": "A/Texas/50/2012", + "db": "GISAID", + "accession": "EPI_ISL_129858", + "date": "2012-04-15", + "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA", +# "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT", + }, { + "strain": "A/Switzerland/9715293/2013", + "db": "GISAID", + "accession": "EPI_ISL_162149", + "date": "2013-12-06", + "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAGACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGCTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACAAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA", +# "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT", + } +] diff --git a/augur/src/virus_filter.py b/augur/src/virus_filter.py index 9bd47e0e..2846523b 100644 --- a/augur/src/virus_filter.py +++ b/augur/src/virus_filter.py @@ -5,8 +5,10 @@ # - a single sequence per virus strain, taken as first sequence in list # outputs to virus_filter.json -import os, re, time, datetime, csv +import os, re, time, datetime, csv, sys from io_util import * +sys.path.append('../source-data') +from H3N2_outgroup_and_vaccine import outgroup, vaccine_strains def parse_gisaid(fasta): """Parse FASTA file from GISAID with default header formating""" @@ -61,78 +63,14 @@ def filter_passage(viruses): round_one = filter(lambda v: re.match(r'^E\d+', v.get('passage',''), re.I) == None, viruses) return filter(lambda v: re.match(r'^Egg', v.get('passage',''), re.I) == None, round_one) -def add_outgroup(viruses): - viruses.insert(0, { - 'strain': 'A/Beijing/32/1992', - 'db': 'IRD', - 'accession': 'U26830', - 'date': '1992-01-01', - 'country': 'China', - 'region': 'China', - 'seq': 
'ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAATGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGTCCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGGCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGATGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGACTTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAATGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTATATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTCGGCGCAATCGCAGGTTTCATAGAAAATGGTTGGGAGGGAATGGTAGACGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGCACAGGACAAGCAGCAGATCTTAAAAGCACTCAAGCAGCAATCGACCAAATCAACGGGAAACTGAATAGGTTAATCGAGAAAACGAACGAGAAATTCCATCAAATCGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAAGACACTAAAATAGATCTCTGGTCTTACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTTACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAGGAAGCAACTGAGGGAAAATGCTGAGGACATGGGCAATGGTTGCTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGGTCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGACGAAGCATTAAACAACCGGTTCCAGATCAAAGGTGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTGTGGATTTCCTTTGCCATATCATGCTTTTTGCTTTGTGTTGTTTTGCTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGTAACATTTGCATTTGA' -# 'seq': 'ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAATGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGTCCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGGCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGATGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGACTTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAATGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTATATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCAAAACACT' - - }) - -def add_vaccine_strains(viruses): - viruses.insert(0, { # prepending to catch with unique filter - "strain": "A/Wisconsin/67/2005", - "db": "IRD", - "accession": "CY163984", - "date": "2005-08-31", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCAATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAGAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" -# "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" - }) - viruses.insert(0, { - "strain": "A/Brisbane/10/2007", - "db": "IRD", - "accession": "CY113005", - "date": "2007-02-06", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACCAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACAATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" -# "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" - }) - viruses.insert(0, { - "strain": "A/Perth/16/2009", - "db": "IRD", - "accession": "GQ293081", - "date": "2009-04-07", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTTCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" -# "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" - }) - viruses.insert(0, { - "strain": "A/Victoria/361/2011", - "db": "IRD", - "accession": "GQ293081", - "date": "2011-10-24", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTAAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA" -# "seq": "ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT" - }) - viruses.insert(0, { - "strain": "A/Texas/50/2012", - "db": "GISAID", - "accession": "EPI_ISL_129858", - "date": "2012-04-15", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA", -# "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT", - }) - viruses.insert(0, { - "strain": "A/Switzerland/9715293/2013", - "db": "GISAID", - "accession": "EPI_ISL_162149", - "date": "2013-12-06", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAGACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGCTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACAAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA", -# "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT", - }) - def filter_unique(viruses): """Keep only the first isolate of a strain""" - filtered_viruses = [] - strains = set() + filtered_viruses = {} for v in viruses: - if not v['strain'].lower() in strains: - strains.add(v['strain'].lower()) - filtered_viruses.append(v) - return filtered_viruses + label = v['strain'].lower() + if not label in filtered_viruses: + filtered_viruses[label] = v + return filtered_viruses.values() def append_country_and_region(viruses): """Label viruses with geographic location based on strain name""" @@ -163,13 +101,15 @@ def append_country_and_region(viruses): return filter(lambda v: v['region'] != 'Unknown', viruses) def get_virus_tuples(viruses): - virus_tuples = {} + ''' + make dictionary of lists of viruses belonging to a certain date and region + ''' + from collections import defaultdict + virus_tuples = 
defaultdict(list)
 	for v in viruses:
 		vdate = datetime.datetime.strptime(v['date'], '%Y-%m-%d').date()
-		tuple = (vdate.year, vdate.month, v['region'])
-		if tuple not in virus_tuples:
-			virus_tuples[tuple] = []
-		virus_tuples[tuple].append(v)
+		virus_tuples[(vdate.year, vdate.month, v['region'])].append(v)
+
 	return virus_tuples
 
 def streamline(viruses, years_back, viruses_per_month):
@@ -197,24 +137,15 @@ def streamline(viruses, years_back, viruses_per_month):
 	return filtered_viruses
 
 def select_viruses(virus_tuples, y, m, viruses_per_month, regions):
-
+	'''
+	select viruses_per_month strains as evenly as possible from all regions
+	'''
+	from itertools import izip_longest
 	select_set = []
-	counts = [0]
-	for r in regions:
-		if (y, m, r) in virus_tuples:
-			counts.append(len(virus_tuples[(y, m, r)]))
-	max_count = max(counts)
-	for index in range(0, max_count):
-		for r in regions:
-			if (y, m, r) in virus_tuples:
-				viruses = virus_tuples[(y, m, r)]
-				if len(viruses) > index:
-					select_set.append(viruses[index])
-
-	if len(select_set) > viruses_per_month:
-		select_set = select_set[0:viruses_per_month]
-
-	return select_set
+	for representative in izip_longest(*[virus_tuples[(y,m,r)] for r in regions], fillvalue = None):
+		select_set.extend([v for v in representative if v is not None])
+	return select_set[:viruses_per_month]
+
 
 def main(in_fname=None, years_back=3, viruses_per_month=50):
@@ -231,7 +162,7 @@ def main(in_fname=None, years_back=3, viruses_per_month=50):
 	fix_strain_names(viruses)
 
 	# add vaccine strains
-	add_vaccine_strains(viruses)
+	viruses = vaccine_strains + viruses
 	print str(len(viruses)) + " with vaccine strains"
 
 	# filter strain names

From cbc8d7a9750258c79c1ef7efd5255908dbc02c78 Mon Sep 17 00:00:00 2001
From: Richard Neher
Date: Tue, 24 Feb 2015 18:40:12 +0100
Subject: [PATCH 02/48] first shot at modular virus processing

---
 .../source-data/H3N2_outgroup_and_vaccine.py |  56 ---
 augur/src/H3N2_filter.py                     |  76 ++++
 augur/src/virus_filter.py                    | 372 +++++++++---------
 3 files changed, 258 insertions(+), 246 deletions(-)
 delete mode 100644 augur/source-data/H3N2_outgroup_and_vaccine.py
 create mode 100644 augur/src/H3N2_filter.py

diff --git a/augur/source-data/H3N2_outgroup_and_vaccine.py b/augur/source-data/H3N2_outgroup_and_vaccine.py
deleted file mode 100644
index 0f408853..00000000
--- a/augur/source-data/H3N2_outgroup_and_vaccine.py
+++ /dev/null
@@ -1,56 +0,0 @@
-outgroup = {
-	'strain': 'A/Beijing/32/1992',
-	'db': 'IRD',
-	'accession': 'U26830',
-	'date': '1992-01-01',
-	'country': 'China',
-	'region': 'China',
-	'seq': 
'ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAATGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGTCCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGGCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGATGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGACTTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAATGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTATATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTCGGCGCAATCGCAGGTTTCATAGAAAATGGTTGGGAGGGAATGGTAGACGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGCACAGGACAAGCAGCAGATCTTAAAAGCACTCAAGCAGCAATCGACCAAATCAACGGGAAACTGAATAGGTTAATCGAGAAAACGAACGAGAAATTCCATCAAATCGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAAGACACTAAAATAGATCTCTGGTCTTACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTTACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAGGAAGCAACTGAGGGAAAATGCTGAGGACATGGGCAATGGTTGCTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGGTCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGACGAAGCATTAAACAACCGGTTCCAGATCAAAGGTGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTGTGGATTTCCTTTGCCATATCATGCTTTTTGCTTTGTGTTGTTTTGCTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGTAACATTTGCATTTGA' -# 'seq': 'ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAATGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGTCCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGGCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGATGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGACTTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAATGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTATATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCAAAACACT' - } - -vaccine_strains = [ - { - "strain": "A/Wisconsin/67/2005", - "db": "IRD", - "accession": "CY163984", - "date": "2005-08-31", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCAATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAGAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" -# "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" - }, { - "strain": "A/Brisbane/10/2007", - "db": "IRD", - "accession": "CY113005", - "date": "2007-02-06", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACCAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACAATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" -# "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" - }, { - "strain": "A/Perth/16/2009", - "db": "IRD", - "accession": "GQ293081", - "date": "2009-04-07", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTTCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" -# "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" - }, { - "strain": "A/Victoria/361/2011", - "db": "IRD", - "accession": "GQ293081", - "date": "2011-10-24", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTAAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA" -# "seq": "ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT" - }, { - "strain": "A/Texas/50/2012", - "db": "GISAID", - "accession": "EPI_ISL_129858", - "date": "2012-04-15", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA", -# "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT", - }, { - "strain": "A/Switzerland/9715293/2013", - "db": "GISAID", - "accession": "EPI_ISL_162149", - "date": "2013-12-06", - "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAGACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGCTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACAAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA", -# "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT", - } -] diff --git a/augur/src/H3N2_filter.py b/augur/src/H3N2_filter.py new file mode 100644 index 00000000..336fc247 --- /dev/null +++ b/augur/src/H3N2_filter.py @@ -0,0 +1,76 @@ +from virus_filter import flu_filter + +class H3N2_filter(flu_filter): + def __init__(self, fasta_fname): + flu_filter.__init__(fasta_fname) + self.vaccine_strains =[ + { + "strain": "A/Wisconsin/67/2005", + "db": "IRD", + "accession": "CY163984", + "date": "2005-08-31", + "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCAATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAGAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" + # "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" + }, { + "strain": "A/Brisbane/10/2007", + "db": "IRD", + "accession": "CY113005", + "date": "2007-02-06", + "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACCAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACAATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" + # "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" + }, { + "strain": "A/Perth/16/2009", + "db": "IRD", + "accession": "GQ293081", + "date": "2009-04-07", + "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTTCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" + # "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" + }, { + "strain": "A/Victoria/361/2011", + "db": "IRD", + "accession": "GQ293081", + "date": "2011-10-24", + "seq": 
"ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTAAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA" + # "seq": "ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT" + }, { + "strain": "A/Texas/50/2012", + "db": "GISAID", + "accession": "EPI_ISL_129858", + "date": "2012-04-15", + "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA", + # "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT", + }, { + "strain": "A/Switzerland/9715293/2013", + "db": "GISAID", + "accession": "EPI_ISL_162149", + "date": "2013-12-06", + "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAGACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGCTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACAAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA", + # "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT", + } + ] + self.outgroup = { + 'strain': 'A/Beijing/32/1992', + 'db': 'IRD', + 'accession': 'U26830', + 'date': '1992-01-01', + 'country': 'China', + 'region': 'China', + 'seq': 
'ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAATGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGTCCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGGCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGATGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGACTTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAATGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTATATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTCGGCGCAATCGCAGGTTTCATAGAAAATGGTTGGGAGGGAATGGTAGACGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGCACAGGACAAGCAGCAGATCTTAAAAGCACTCAAGCAGCAATCGACCAAATCAACGGGAAACTGAATAGGTTAATCGAGAAAACGAACGAGAAATTCCATCAAATCGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAAGACACTAAAATAGATCTCTGGTCTTACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTTACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAGGAAGCAACTGAGGGAAAATGCTGAGGACATGGGCAATGGTTGCTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGGTCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGACGAAGCATTAAACAACCGGTTCCAGATCAAAGGTGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTGTGGATTTCCTTTGCCATATCATGCTTTTTGCTTTGTGTTGTTTTGCTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGTAACATTTGCATTTGA' + # 'seq': 'ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAATGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGTCCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGGCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGATGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGACTTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAATGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTATATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCAAAACACT' + } + + + +def main(in_fname='data/gisaid_epiflu_sequence.fasta', years_back=3, viruses_per_month=50): + + print "--- Filter at " + time.strftime("%H:%M:%S") + " ---" + myH3N2_filter = H3N2_filter(in_fname) + myH3N2_filter.fiter() + myH3N2_filter.subsample(years_back, viruses_per_month) + + out_fname = 'data/virus_filter.json' + write_json(myH3N3_filter.virus_subsample, out_fname) + return out_fname + +if __name__ == "__main__": + main() diff --git a/augur/src/virus_filter.py b/augur/src/virus_filter.py index 2846523b..9b16d4c1 100644 --- a/augur/src/virus_filter.py +++ b/augur/src/virus_filter.py @@ -7,198 +7,190 @@ import os, re, time, datetime, csv, sys from io_util import * +from collections import defaultdict sys.path.append('../source-data') -from H3N2_outgroup_and_vaccine import outgroup, vaccine_strains - -def 
parse_gisaid(fasta): -	"""Parse FASTA file from GISAID with default header formatting""" -	viruses = [] -	try: -		handle = open(fasta, 'r') -	except IOError: -		print fasta + " not found" -	else: -		for record in SeqIO.parse(handle, "fasta"): -			words = record.description.replace(">","").replace(" ","").split('|') -			strain = words[0] -			accession = words[1] -			passage = words[3] -			date = words[5] -			seq = str(record.seq).upper() -			v = { -				"strain": strain, -				"date": date, -				"accession": accession, -				"db": "GISAID", -				"seq": seq -			} -			if passage != "": -				v['passage'] = passage -			viruses.append(v) -		handle.close() - -	return viruses - -def sort_length(viruses): -	"""Sort by length, but randomize viruses of a given length""" -	from random import shuffle -	shuffle(viruses) -	return sorted(viruses, key = lambda v: len(v['seq']), reverse = True) - -def fix_strain_names(viruses): -	for v in viruses: -		v['strain'] = v['strain'].replace('\'','').replace('(','').replace(')','').replace('H3N2','').replace('Human','').replace('human','').replace('//','/') - -def filter_strain_names(viruses): -	filtered_viruses = filter(lambda v: re.match(r'^A/', v['strain']) != None, viruses) -	return filtered_viruses - -def filter_length(viruses): -	return filter(lambda v: len(v['seq']) >= 987, viruses) - -def filter_date(viruses): -	return filter(lambda v: re.match(r'\d\d\d\d-\d\d-\d\d', v['date']) != None, viruses) - -def filter_passage(viruses): -	round_one = filter(lambda v: re.match(r'^E\d+', v.get('passage',''), re.I) == None, viruses) -	return filter(lambda v: re.match(r'^Egg', v.get('passage',''), re.I) == None, round_one) - -def filter_unique(viruses): -	"""Keep only the first isolate of a strain""" -	filtered_viruses = {} -	for v in viruses: -		label = v['strain'].lower() -		if not label in filtered_viruses: -			filtered_viruses[label] = v -	return filtered_viruses.values() - -def append_country_and_region(viruses): -	"""Label viruses with geographic location based on strain name""" -	"""Location is to the level of country of administrative division when available""" -	reader = csv.DictReader(open("source-data/geo_synonyms.tsv"), delimiter='\t')	# list of dicts -	label_to_country = {} -	for line in reader: -		label_to_country[line['label'].lower()] = line['country'] -	for v in viruses: -		label = re.match(r'^A/([^/]+)/', v['strain']).group(1).lower()	# check first for whole geo match -		v['country'] = 'Unknown' -		if label in label_to_country: -			v['country'] = label_to_country[label] +from H3N2_outgroup_and_vaccine import outgroup, vaccine_strains, min_length + + +class virus_filter(object): + +	def __init__(self,viruses=None): +		if viruses is None: viruses=[] +		self.viruses = viruses +		self.strain_lookup = {} +		self.outgroup = None + +	def filter_generic(self, min_length=None, date_spec = 'full', prepend_strains = None): +		''' +		filter viruses by length and accurate date, sort, add additional strains such +		as vaccine strains that are preferentially retained and prune to unique strains +		''' +		print len(self.viruses) + " initial viruses" +		if min_length is not None: +			self.filter_length(min_length) +			print len(self.viruses) + " after filtering by length >=", min_length + +		self.filter_date(min_length, date_spec) +		print len(self.viruses) + " after filtering for precise dates" +		self.sort_length() +		if prepend_strains is not None: +			self.viruses = prepend_strains + self.viruses +		self.filter_unique() +		print len(self.viruses) + " after filtering for unique strains" + +	def sort_length(self): +		''' +		Sort by length, but
randomize viruses of a given length + ''' + from random import shuffle + shuffle(self.viruses) + self.viruses.sort(key = lambda v: len(v['seq']), reverse = True) + + def filter_unique(self): + ''' + Keep only the first isolate of a strain + ''' + filtered_viruses = [] + for v in viruses: + label = v['strain'].lower() + if not label in self.strain_lookup: + filtered_viruses.append(v) + self.strain_lookup[label]=v + self.viruses=filtered_viruses + + def filter_length(self, min_length): + self.viruses = filter(lambda v: len(v['seq']) >= min_length, self.viruses) + + def filter_date(self, date_spec): + if date_spec=='full': + self.viruses = filter(lambda v: re.match(r'\d\d\d\d-\d\d-\d\d', v['date']) != None, self.viruses) + elif date_spec=='year': + self.viruses = filter(lambda v: re.match(r'\d\d\d\d', v['date']) != None, self.viruses + + def subsample(self, years_back, viruses_per_month, prioritize = None): + ''' + Subsample x viruses per month + Take from beginning of list - this will prefer longer sequences + Take viruses 1 per region in a cycle to get geographic diversity + But pad with additional viruses from populous regions if necessary + ''' + if prioritize is None: + prioritize=[] else: - label = re.match(r'^A/([^\-^\/]+)[\-\/]', v['strain']).group(1).lower() # check for partial geo match + prioritize = [v.lower() for v in prioritize] + + priority_viruses = viruses_by_date_region([v for label,v in self.strain_lookup.iteritems() if v in prioritize]) + other_viruses = viruses_by_date_region([v for label,v in self.strain_lookup.iteritems() if v not in prioritize]) + + filtered_viruses = [] + first_year = datetime.datetime.today().year - years_back + first_month = datetime.datetime.today().month + regions = list(set([v['region'] for v in self.viruses])) + + print "Filtering between " + str(first_month) + "/" + str(first_year) + " and today" + print "Selecting " + str(viruses_per_month) + " viruses per month" + y = first_year + for m in range(first_month,13): + filtered_viruses.extend(select_viruses(priority_viruses,other_viruses, + y, m, viruses_per_month, regions)) + for y in range(first_year+1,datetime.datetime.today().year+1): + filtered_viruses.extend(select_viruses(priority_viruses,other_viruses, + y, m, viruses_per_month, regions)) + if self.outgroup is not None: + self.filtered_viruses.append(self.outgroup) + print len(filtered_viruses), "with outgroup" + self.virus_subsample = filtered_viruses + + def viruses_by_date_region(self, viruses): + ''' + make dictionary of lists of viruses belonging to a certain date and region + ''' + from collections import defaultdict + virus_tuples = defaultdict(list) + for v in viruses: + vdate = datetime.datetime.strptime(v['date'], '%Y-%m-%d').date() + virus_tuples[(vdate.year, vdate.month, v['region'])].append(v) + + return virus_tuples + + def select_viruses(priority_viruses,other_viruses, y, m, viruses_per_month, regions): + ''' + select viruses_per_month strains as evenly as possible from all regions + ''' + from itertools import izip_longest + select_set = [] + for vset in [priority_viruses, other_viruses]: + for representative in izip_longest(*[vset[(y,m,r)] for r in regions], fillvalue = None): + select_set.extend([v in representative if v is not None]) + return select_set[:viruses_per_month] + + +class flu_filter(virus_filter): + + def __init__(self,fasta_fname): + self.fasta_header = {0:'strain', 1:'accession', 3:'passage', 5:'date' } + self.viruses = self.parse_gisaid(fasta_fname) + self.fix_strain_names() + self.vaccine_strains=[] + + 
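# ---- editorial aside (not part of the patch): how fasta_header is used ----
# flu_filter.parse_gisaid below splits each FASTA description on '|' and maps
# fields to virus attributes by position, via the fasta_header dict set in
# __init__ above. A minimal sketch with an assumed, GISAID-style header (the
# example header and its field layout are illustrative assumptions, not taken
# from the patch):
#
#   fasta_header = {0: 'strain', 1: 'accession', 3: 'passage', 5: 'date'}
#   description = "A/Texas/50/2012|EPI_ISL_129858|HA|E3|A/H3N2|2012-04-15"
#   words = description.replace(">", "").replace(" ", "").split('|')
#   v = {key: words[ii] for ii, key in fasta_header.iteritems()}
#   # v == {'strain': 'A/Texas/50/2012', 'accession': 'EPI_ISL_129858',
#   #       'passage': 'E3', 'date': '2012-04-15'}
#
# An 'E3'-style passage annotation would later be discarded by filter_passage,
# which drops egg-passaged isolates via the r'^E\d+' and r'^Egg' regexes.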
def parse_gisaid(self, fasta): +		"""Parse FASTA file from GISAID with default header formatting""" +		viruses = [] +		try: +			handle = open(fasta, 'r') +		except IOError: +			print fasta + " not found" +		else: +			for record in SeqIO.parse(handle, "fasta"): +				words = record.description.replace(">","").replace(" ","").split('|') +				v = {key:words[ii] for ii, key in self.fasta_header.iteritems()} +				v['db']="GISAID", +				v['seq']=str(record.seq).upper() +				if passage != "": +					v['passage'] = passage +				viruses.append(v) +			handle.close() +		return viruses + +	def fix_strain_names(self): +		for v in self.viruses: +			v['strain'] = v['strain'].replace('\'','').replace('(','').replace(')','').replace('H3N2','').replace('Human','').replace('human','').replace('//','/') + +	def filter_passage(self): +		self.viruses = filter(lambda v: re.match(r'^E\d+', v.get('passage',''), re.I) == None, self.viruses) +		self.viruses = filter(lambda v: re.match(r'^Egg', v.get('passage',''), re.I) == None, self.viruses) + +	def filter(self): +		self.filter_generic(prepend_strains = self.vaccine_strains) +		self.filter_passage() +		print len(self.viruses) + " without egg passage" +		self.filter_geo() + +	def self.filter_geo(self): +		"""Label viruses with geographic location based on strain name""" +		"""Location is to the level of country of administrative division when available""" +		reader = csv.DictReader(open("source-data/geo_synonyms.tsv"), delimiter='\t')	# list of dicts +		label_to_country = {} +		for line in reader: +			label_to_country[line['label'].lower()] = line['country'] +		for v in self.viruses: +			label = re.match(r'^A/([^/]+)/', v['strain']).group(1).lower()	# check first for whole geo match +			v['country'] = 'Unknown' 			if label in label_to_country: 				v['country'] = label_to_country[label] +			else: +				label = re.match(r'^A/([^\-^\/]+)[\-\/]', v['strain']).group(1).lower()	# check for partial geo match +				if label in label_to_country: +					v['country'] = label_to_country[label] + +		reader = csv.DictReader(open("source-data/geo_regions.tsv"), delimiter='\t')	# list of dicts +		country_to_region = {} +		for line in reader: +			country_to_region[line['country']] = line['region'] +		for v in self.viruses: +			v['region'] = 'Unknown' +			if v['country'] in country_to_region: +				v['region'] = country_to_region[v['country']] + +		self.viruses = filter(lambda v: v['region'] != 'Unknown', self.viruses) -	reader = csv.DictReader(open("source-data/geo_regions.tsv"), delimiter='\t')	# list of dicts -	country_to_region = {} -	for line in reader: -		country_to_region[line['country']] = line['region'] -	for v in viruses: -		v['region'] = 'Unknown' -		if v['country'] in country_to_region: -			v['region'] = country_to_region[v['country']] - -	return filter(lambda v: v['region'] != 'Unknown', viruses) - -def get_virus_tuples(viruses): -	''' -	make dictionary of lists of viruses belonging to a certain date and region -	''' -	from collections import defaultdict -	virus_tuples = defaultdict(list) -	for v in viruses: -		vdate = datetime.datetime.strptime(v['date'], '%Y-%m-%d').date() -		virus_tuples[(vdate.year, vdate.month, v['region'])].append(v) - -	return virus_tuples - -def streamline(viruses, years_back, viruses_per_month): -	"""Subsample x viruses per month""" -	"""Take from beginning of list - this will prefer longer sequences""" -	"""Take viruses 1 per region in a cycle to get geographic diversity""" -	"""But pad with additional viruses from populous regions if necessary""" - -	virus_tuples = get_virus_tuples(viruses) - -	filtered_viruses = [] -	first_year =
datetime.datetime.today().year - years_back - first_month = datetime.datetime.today().month - regions = [v['region'] for v in viruses] - regions = list(set(regions)) - - print "Filtering between " + str(first_month) + "/" + str(first_year) + " and today" - print "Selecting " + str(viruses_per_month) + " viruses per month" - y = first_year - for m in range(first_month,13): - filtered_viruses.extend(select_viruses(virus_tuples, y, m, viruses_per_month, regions)) - for y in range(first_year+1,datetime.datetime.today().year+1): - for m in range(1,13): - filtered_viruses.extend(select_viruses(virus_tuples, y, m, viruses_per_month, regions)) - return filtered_viruses - -def select_viruses(virus_tuples, y, m, viruses_per_month, regions): - ''' - select viruses_per_month strains as evenly as possible from all regions - ''' - from itertools import izip_longest - select_set = [] - for representative in izip_longest(*[virus_tuples[(y,m,r)] for r in regions], fillvalue = None): - select_set.extend([v in representative if v is not None]) - return select_set[:viruses_per_month] - - -def main(in_fname=None, years_back=3, viruses_per_month=50): - - print "--- Filter at " + time.strftime("%H:%M:%S") + " ---" - - if in_fname is None: in_fname = 'data/gisaid_epiflu_sequence.fasta' - viruses = parse_gisaid(in_fname) - print str(len(viruses)) + " initial viruses" - - # sort by sequence length - viruses = sort_length(viruses) - - # fix strain names - fix_strain_names(viruses) - - # add vaccine strains - viruses = vaccine_strains + viruses - print str(len(viruses)) + " with vaccine strains" - - # filter strain names - viruses = filter_strain_names(viruses) - print str(len(viruses)) + " with proper strain names" - - # filter short sequences - viruses = filter_length(viruses) - print str(len(viruses)) + " without truncation" - - # filter imprecise dates - viruses = filter_date(viruses) - print str(len(viruses)) + " with precise dates" - - # filter passage history - viruses = filter_passage(viruses) - print str(len(viruses)) + " without egg passage" - - # filter to unique strains - viruses = filter_unique(viruses) - print str(len(viruses)) + " with unique strain names" - - # append geo information - viruses = append_country_and_region(viruses) - print str(len(viruses)) + " with geographic information" - - # reduce to manageable volume - viruses = streamline(viruses, years_back, viruses_per_month) - print str(len(viruses)) + " after streamlining" - - # add outgroup - add_outgroup(viruses) - print str(len(viruses)) + " with outgroup" - out_fname = 'data/virus_filter.json' - write_json(viruses, out_fname) - return out_fname - -if __name__ == "__main__": - main() \ No newline at end of file From 13502550c44663481a95626ff316afa0c7c12f5c Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 24 Feb 2015 19:14:35 +0100 Subject: [PATCH 03/48] fixed a number of mistakes --- augur/src/H3N2_filter.py | 14 ++++--- augur/src/virus_filter.py | 81 +++++++++++++++++++++------------------ 2 files changed, 52 insertions(+), 43 deletions(-) diff --git a/augur/src/H3N2_filter.py b/augur/src/H3N2_filter.py index 336fc247..9e2b740a 100644 --- a/augur/src/H3N2_filter.py +++ b/augur/src/H3N2_filter.py @@ -1,8 +1,10 @@ +import time +from io_util import write_json from virus_filter import flu_filter class H3N2_filter(flu_filter): - def __init__(self, fasta_fname): - flu_filter.__init__(fasta_fname) + def __init__(self, fasta_fname, fasta_header): + flu_filter.__init__(self, fasta_fname, fasta_header) self.vaccine_strains =[ { "strain": 
"A/Wisconsin/67/2005", @@ -64,13 +66,13 @@ def __init__(self, fasta_fname): def main(in_fname='data/gisaid_epiflu_sequence.fasta', years_back=3, viruses_per_month=50): print "--- Filter at " + time.strftime("%H:%M:%S") + " ---" - myH3N2_filter = H3N2_filter(in_fname) - myH3N2_filter.fiter() + myH3N2_filter = H3N2_filter(in_fname, {0:'strain', 1:"date", 4:"passage", -1:'accession'}) + myH3N2_filter.filter() myH3N2_filter.subsample(years_back, viruses_per_month) out_fname = 'data/virus_filter.json' - write_json(myH3N3_filter.virus_subsample, out_fname) - return out_fname + write_json(myH3N2_filter.virus_subsample, out_fname) + return out_fname, myH3N2_filter if __name__ == "__main__": main() diff --git a/augur/src/virus_filter.py b/augur/src/virus_filter.py index 9b16d4c1..a0b7a325 100644 --- a/augur/src/virus_filter.py +++ b/augur/src/virus_filter.py @@ -9,8 +9,6 @@ from io_util import * from collections import defaultdict sys.path.append('../source-data') -from H3N2_outgroup_and_vaccine import outgroup, vaccine_strains, min_length - class virus_filter(object): @@ -25,18 +23,19 @@ def filter_generic(self, min_length=None, date_spec = 'full', prepend_strains = filter viruses by length and accurate date, sort, add additioanl strains such as vaccine strains that are preferentially retained and prune to unique strains ''' - print len(self.viruses) + " initial viruses" + print len(self.viruses), "initial viruses" if min_length is not None: self.filter_length(min_length) - print len(self.viruses) + " after filtering by length >=", min_length + print len(self.viruses), "after filtering by length >=", min_length - self.filter_date(min_length, date_spec) - print len(self.viruses) + " after filtering for precise dates" + self.filter_date(date_spec) + print len(self.viruses), "after filtering for precise dates" self.sort_length() if prepend_strains is not None: self.viruses = prepend_strains + self.viruses + print len(self.viruses), "after adding custom strains" self.filter_unique() - print len(self.viruses) + " after filtering for unique strains" + print len(self.viruses), "after filtering for unique strains" def sort_length(self): ''' @@ -51,7 +50,7 @@ def filter_unique(self): Keep only the first isolate of a strain ''' filtered_viruses = [] - for v in viruses: + for v in self.viruses: label = v['strain'].lower() if not label in self.strain_lookup: filtered_viruses.append(v) @@ -65,7 +64,7 @@ def filter_date(self, date_spec): if date_spec=='full': self.viruses = filter(lambda v: re.match(r'\d\d\d\d-\d\d-\d\d', v['date']) != None, self.viruses) elif date_spec=='year': - self.viruses = filter(lambda v: re.match(r'\d\d\d\d', v['date']) != None, self.viruses + self.viruses = filter(lambda v: re.match(r'\d\d\d\d', v['date']) != None, self.viruses) def subsample(self, years_back, viruses_per_month, prioritize = None): ''' @@ -79,41 +78,42 @@ def subsample(self, years_back, viruses_per_month, prioritize = None): else: prioritize = [v.lower() for v in prioritize] - priority_viruses = viruses_by_date_region([v for label,v in self.strain_lookup.iteritems() if v in prioritize]) - other_viruses = viruses_by_date_region([v for label,v in self.strain_lookup.iteritems() if v not in prioritize]) + priority_viruses = self.viruses_by_date_region([v for v in self.viruses if v['strain'].lower() in prioritize]) + other_viruses = self.viruses_by_date_region([v for v in self.viruses if v['strain'].lower() not in prioritize]) filtered_viruses = [] first_year = datetime.datetime.today().year - years_back first_month = 
datetime.datetime.today().month regions = list(set([v['region'] for v in self.viruses])) -		print "Filtering between " + str(first_month) + "/" + str(first_year) + " and today" -		print "Selecting " + str(viruses_per_month) + " viruses per month" +		print "Filtering between " + str(first_month) + "/" + str(first_year), "and today" +		print "Selecting " + str(viruses_per_month), "viruses per month" y = first_year for m in range(first_month,13): -			filtered_viruses.extend(select_viruses(priority_viruses,other_viruses, +			filtered_viruses.extend(self.select_viruses(priority_viruses,other_viruses, y, m, viruses_per_month, regions)) for y in range(first_year+1,datetime.datetime.today().year+1): -			filtered_viruses.extend(select_viruses(priority_viruses,other_viruses, +			for m in range(1,13): +				filtered_viruses.extend(self.select_viruses(priority_viruses,other_viruses, y, m, viruses_per_month, regions)) if self.outgroup is not None: -			self.filtered_viruses.append(self.outgroup) +			filtered_viruses.append(self.outgroup) print len(filtered_viruses), "with outgroup" self.virus_subsample = filtered_viruses -	def viruses_by_date_region(self, viruses): +	def viruses_by_date_region(self, tmp_viruses): ''' make dictionary of lists of viruses belonging to a certain date and region ''' from collections import defaultdict virus_tuples = defaultdict(list) -		for v in viruses: +		for v in tmp_viruses: vdate = datetime.datetime.strptime(v['date'], '%Y-%m-%d').date() virus_tuples[(vdate.year, vdate.month, v['region'])].append(v) return virus_tuples -	def select_viruses(priority_viruses,other_viruses, y, m, viruses_per_month, regions): +	def select_viruses(self, priority_viruses,other_viruses, y, m, viruses_per_month, regions): ''' select viruses_per_month strains as evenly as possible from all regions ''' @@ -121,15 +121,20 @@ def select_viruses(priority_viruses,other_viruses, y, m, viruses_per_month, regi select_set = [] for vset in [priority_viruses, other_viruses]: for representative in izip_longest(*[vset[(y,m,r)] for r in regions], fillvalue = None): -				select_set.extend([v in representative if v is not None]) +				select_set.extend([v for v in representative if v is not None]) +		print "found",len(select_set), 'in year',y,'month',m, 'subsampling to', viruses_per_month return select_set[:viruses_per_month] class flu_filter(virus_filter): -	def __init__(self,fasta_fname): -		self.fasta_header = {0:'strain', 1:'accession', 3:'passage', 5:'date' } -		self.viruses = self.parse_gisaid(fasta_fname) +	def __init__(self,fasta_fname, fasta_header=None): +		if fasta_header is None: +			self.fasta_header = {0:'strain', 1:'accession', 3:'passage', 5:'date' } +		else: +			self.fasta_header = fasta_header +		viruses = self.parse_gisaid(fasta_fname) +		virus_filter.__init__(self, viruses) self.fix_strain_names() self.vaccine_strains=[] @@ -139,15 +144,14 @@ def parse_gisaid(self, fasta): """Parse FASTA file from GISAID with default header formatting""" viruses = [] try: handle = open(fasta, 'r') except IOError: -			print fasta + " not found" +			print fasta, "not found" else: for record in SeqIO.parse(handle, "fasta"): words = record.description.replace(">","").replace(" ","").split('|') v = {key:words[ii] for ii, key in self.fasta_header.iteritems()} -				v['db']="GISAID", +				v['db']="GISAID" v['seq']=str(record.seq).upper() -				if passage != "": -					v['passage'] = passage +				if 'passage' not in v: v['passage']='' viruses.append(v) handle.close() return viruses @@ -163,27 +167,30 @@ def filter_passage(self): def filter(self): self.filter_generic(prepend_strains = self.vaccine_strains) self.filter_passage() -		print len(self.viruses) +
" without egg passage" + print len(self.viruses), "without egg passage" self.filter_geo() - def self.filter_geo(self): + def filter_geo(self): """Label viruses with geographic location based on strain name""" """Location is to the level of country of administrative division when available""" - reader = csv.DictReader(open("source-data/geo_synonyms.tsv"), delimiter='\t') # list of dicts + reader = csv.DictReader(open("../source-data/geo_synonyms.tsv"), delimiter='\t') # list of dicts label_to_country = {} for line in reader: label_to_country[line['label'].lower()] = line['country'] for v in self.viruses: - label = re.match(r'^A/([^/]+)/', v['strain']).group(1).lower() # check first for whole geo match v['country'] = 'Unknown' - if label in label_to_country: - v['country'] = label_to_country[label] - else: - label = re.match(r'^A/([^\-^\/]+)[\-\/]', v['strain']).group(1).lower() # check for partial geo match + try: + label = re.match(r'^A/([^/]+)/', v['strain']).group(1).lower() # check first for whole geo match if label in label_to_country: v['country'] = label_to_country[label] - - reader = csv.DictReader(open("source-data/geo_regions.tsv"), delimiter='\t') # list of dicts + else: + label = re.match(r'^A/([^\-^\/]+)[\-\/]', v['strain']).group(1).lower() # check for partial geo match + if label in label_to_country: + v['country'] = label_to_country[label] + except: + print "couldn't parse", v['strain'] + + reader = csv.DictReader(open("../source-data/geo_regions.tsv"), delimiter='\t') # list of dicts country_to_region = {} for line in reader: country_to_region[line['country']] = line['region'] From 63a36a02d71b07b468f83495fd70a1d456f34a53 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 24 Feb 2015 19:21:25 +0100 Subject: [PATCH 04/48] added few output lines and strain_name_filtering --- augur/src/H3N2_filter.py | 2 +- augur/src/virus_filter.py | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/augur/src/H3N2_filter.py b/augur/src/H3N2_filter.py index 9e2b740a..80181fef 100644 --- a/augur/src/H3N2_filter.py +++ b/augur/src/H3N2_filter.py @@ -72,7 +72,7 @@ def main(in_fname='data/gisaid_epiflu_sequence.fasta', years_back=3, viruses_per out_fname = 'data/virus_filter.json' write_json(myH3N2_filter.virus_subsample, out_fname) - return out_fname, myH3N2_filter + return out_fname if __name__ == "__main__": main() diff --git a/augur/src/virus_filter.py b/augur/src/virus_filter.py index a0b7a325..ac59bcd7 100644 --- a/augur/src/virus_filter.py +++ b/augur/src/virus_filter.py @@ -156,6 +156,18 @@ def parse_gisaid(self, fasta): handle.close() return viruses + def filter(self): + self.filter_generic(prepend_strains = self.vaccine_strains) + self.filter_strain_names() + print len(self.viruses), "with proper strain names" + self.filter_passage() + print len(self.viruses), "without egg passage" + self.filter_geo() + print len(self.viruses), "with geographic information" + + def filter_strain_names(self): + self.viruses = filter(lambda v: re.match(r'^A/', v['strain']) != None, self.viruses) + def fix_strain_names(self): for v in self.viruses: v['strain'] = v['strain'].replace('\'','').replace('(','').replace(')','').replace('H3N2','').replace('Human','').replace('human','').replace('//','/') @@ -164,12 +176,6 @@ def filter_passage(self): self.viruses = filter(lambda v: re.match(r'^E\d+', v.get('passage',''), re.I) == None, self.viruses) self.viruses = filter(lambda v: re.match(r'^Egg', v.get('passage',''), re.I) == None, self.viruses) - def filter(self): - 
self.filter_generic(prepend_strains = self.vaccine_strains) - self.filter_passage() - print len(self.viruses), "without egg passage" - self.filter_geo() - def filter_geo(self): """Label viruses with geographic location based on strain name""" """Location is to the level of country of administrative division when available""" From 8a4115f9a92d54652d12b1e33b3caadd989f08ec Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 25 Feb 2015 18:32:57 +0100 Subject: [PATCH 05/48] * simplified/made more general dendropy to json * added proper taxon to tree in dendropy import * added strain field to tree upon import --- augur/src/tree_refine.py | 6 +- augur/src/tree_util.py | 119 ++++++++------------------------------- 2 files changed, 28 insertions(+), 97 deletions(-) diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index 9a8023cd..169f280c 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -110,9 +110,9 @@ def add_virus_attributes(viruses, tree): strain_to_country = {} strain_to_region = {} for v in viruses: - strain_to_date[v['strain']] = v['date'] - strain_to_country[v['strain']] = v['country'] - strain_to_region[v['strain']] = v['region'] + strain_to_date[v['strain'].lower()] = v['date'] + strain_to_country[v['strain'].lower()] = v['country'] + strain_to_region[v['strain'].lower()] = v['region'] for node in tree.postorder_node_iter(): strain = str(node.taxon).replace("'", '') if strain_to_date.has_key(strain): diff --git a/augur/src/tree_util.py b/augur/src/tree_util.py index 72bfaab5..41f532f3 100644 --- a/augur/src/tree_util.py +++ b/augur/src/tree_util.py @@ -92,49 +92,26 @@ def all_descendants(node): def get_dates(node): """Return ordered list of dates of descendants of a node""" - return sorted([n['date'] for n in tip_descendants(node)]) + return sorted([n['date'] for n in node.leaf_iter()]) -def dendropy_to_json(node): +def dendropy_to_json(node, extra_attr = ['ep', 'ne', 'rb','tol', 'fitness', 'serum', 'dHI', 'cHI', 'HI_info']): json = {} - if hasattr(node, 'clade'): - json['clade'] = node.clade - if hasattr(node, 'taxon'): - if node.taxon != None: - json['strain'] = str(node.taxon).replace("'", '') - if hasattr(node, 'xvalue'): - json['xvalue'] = round(node.xvalue, 5) - if hasattr(node, 'yvalue'): - json['yvalue'] = round(node.yvalue, 5) - if hasattr(node, 'ep'): - json['ep'] = node.ep - if hasattr(node, 'ne'): - json['ne'] = node.ne - if hasattr(node, 'rb'): - json['rb'] = node.rb - if hasattr(node, 'date'): - json['date'] = node.date - if hasattr(node, 'num_date'): - json['num_date'] = node.num_date - if hasattr(node, 'country'): - json['country'] = node.country - if hasattr(node, 'region'): - json['region'] = node.region - if hasattr(node, 'seq'): - json['seq'] = node.seq - if hasattr(node, 'aa_seq'): - json['aa_seq'] = node.aa_seq - if hasattr(node, 'gt'): - json['gt'] = node.gt - if hasattr(node, 'gt_pos'): - json['gt_pos'] = list(node.gt_pos) - if hasattr(node, 'tip_index'): - json['tip_index'] = node.tip_index - if hasattr(node, 'LBI'): - json['LBI'] = round(node.LBI, 5) - if hasattr(node, 'tol'): - json['tol'] = round(node.tol, 5) - if hasattr(node, 'fitness'): - json['fitness'] = round(node.fitness, 5) + str_attr = ['country','region','seq','aa_seq','clade','strain', 'date'] + num_attr = ['xvalue', 'yvalue', 'num_date', 'tip_index'] + for prop in str_attr: + if hasattr(node, prop): + json[prop] = node.__getattribute__(prop) + for prop in num_attr: + if hasattr(node, prop): + json[prop] = round(node.__getattribute__(prop),5) + for prop in 
extra_attr: + if len(prop)==2 and callable(prop[1]): + if hasattr(node, prop[0]): + json[prop] = prop[1](node.__getattribute__(prop[0])) + else: + if hasattr(node, prop): + json[prop] = node.__getattribute__(prop) + try: if hasattr(node, 'freq') and node.freq is not None: json['freq'] = {reg: [round(x, 3) for x in freq] if freq is not None else "undefined" for reg, freq in node.freq.iteritems()} @@ -152,37 +129,6 @@ def dendropy_to_json(node): json["children"].append(dendropy_to_json(ch)) return json -def BioPhylo_to_json(node): - json = {} - if hasattr(node, 'clade'): - json['clade'] = node.clade - if node.name: - json['strain'] = str(node.name).replace("'", '') - if hasattr(node, 'branch_length'): - json['branch_length'] = round(node.branch_length, 5) - if hasattr(node, 'xvalue'): - json['xvalue'] = round(node.xvalue, 5) - if hasattr(node, 'yvalue'): - json['yvalue'] = round(node.yvalue, 5) - if hasattr(node, 'ep'): - json['ep'] = node.ep - if hasattr(node, 'ne'): - json['ne'] = node.ne - if hasattr(node, 'rb'): - json['rb'] = node.rb - if hasattr(node, 'date'): - json['date'] = node.date - if hasattr(node, 'seq'): - json['seq'] = str(node.seq) - if hasattr(node, 'LBI'): - json['LBI'] = round(node.LBI,5) - if len(node.clades): - json["children"] = [] - for ch in node.clades: - json["children"].append(BioPhylo_to_json(ch)) - return json - - def json_to_dendropy(json): ''' read a json dictionary and make a dendropy tree from it. @@ -190,11 +136,12 @@ def json_to_dendropy(json): tree = dendropy.Tree() tree.get_from_string(';', 'newick') root = tree.seed_node - json_to_dendropy_sub(json, root) + json_to_dendropy_sub(json, root, tree.taxon_set) + root.edge_length=0.0 return tree -def json_to_dendropy_sub(json, node): +def json_to_dendropy_sub(json, node, taxon_set): ''' recursively calls itself for all children of node and builds up the tree. 
entries in json are added as node attributes @@ -205,7 +152,7 @@ def json_to_dendropy_sub(json, node): if attr=='children': for sub_json in val: child_node = dendropy.Node() - json_to_dendropy_sub(sub_json, child_node) + json_to_dendropy_sub(sub_json, child_node, taxon_set) if hasattr(child_node, 'xvalue'): node.add_child(child_node, edge_length = child_node.xvalue - node.xvalue) elif hasattr(child_node, 'branch_length'): @@ -218,24 +165,8 @@ def json_to_dendropy_sub(json, node): except: node.__setattr__(attr, val) if len(node.child_nodes())==0: - node.taxon = json['strain'] - -def main(): - - tree = read_json('tree.json') - -# print "Whole tree" -# for tip in descendants(tree): -# print tip['date'] - -# node = tree['children'][0] - -# dates = get_dates(tree) -# print dates + node.taxon = dendropy.Taxon(label=json['strain'].lower()) + node.strain = json['strain'] + taxon_set.add_taxon(node.taxon) - for node in all_descendants(tree): - dates = get_dates(node) - print str(node['clade']) + ": " + str(len(dates)) -if __name__ == "__main__": - main() From 7902d5c6743d6b3346d0e56d4360113aa6d85807 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 25 Feb 2015 18:54:17 +0100 Subject: [PATCH 06/48] removed obsolete BioPhylo_to_json --- augur/src/tree_ancestral.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/augur/src/tree_ancestral.py b/augur/src/tree_ancestral.py index ed87a927..3ae683a5 100644 --- a/augur/src/tree_ancestral.py +++ b/augur/src/tree_ancestral.py @@ -20,7 +20,6 @@ import copy, time from seq_util import json_to_Bio_alignment from io_util import write_json, read_json -from tree_util import BioPhylo_to_json class ancestral_sequences: ''' @@ -266,7 +265,6 @@ def test(): aln = AlignIO.read('../scratch/test_aln.phyx', 'phylip-relaxed') anc_seq = ancestral_sequences(tree=tree, aln=aln, seqtype='str') anc_seq.calc_ancestral_sequences() - write_json(BioPhylo_to_json(anc_seq.T.root), 'test.json') return anc_seq.T if __name__=="__main__": From 697c5619cb17d6be637c7c58a429c55c164bade0 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 25 Feb 2015 19:07:41 +0100 Subject: [PATCH 07/48] added possibility to force include priority strains --- augur/src/H3N2_filter.py | 2 +- augur/src/process.py | 4 ++-- augur/src/virus_filter.py | 20 +++++++++++++------- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/augur/src/H3N2_filter.py b/augur/src/H3N2_filter.py index 80181fef..f099fb60 100644 --- a/augur/src/H3N2_filter.py +++ b/augur/src/H3N2_filter.py @@ -68,7 +68,7 @@ def main(in_fname='data/gisaid_epiflu_sequence.fasta', years_back=3, viruses_per print "--- Filter at " + time.strftime("%H:%M:%S") + " ---" myH3N2_filter = H3N2_filter(in_fname, {0:'strain', 1:"date", 4:"passage", -1:'accession'}) myH3N2_filter.filter() - myH3N2_filter.subsample(years_back, viruses_per_month) + myH3N2_filter.subsample(years_back, viruses_per_month, prioritize = []) out_fname = 'data/virus_filter.json' write_json(myH3N2_filter.virus_subsample, out_fname) diff --git a/augur/src/process.py b/augur/src/process.py index 6a58d384..0e50ee0c 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -1,5 +1,5 @@ import time, os, argparse -import virus_download, virus_filter, virus_align, virus_clean +import virus_download, H3N2_filter, virus_align, virus_clean import tree_infer, tree_ancestral, tree_refine import bernoulli_frequency import streamline @@ -21,7 +21,7 @@ def main(years_back=3, viruses_per_month=50): virus_fname = virus_align.main(virus_fname) # Clean sequences - virus_fname = 
virus_clean.main(virus_fname) + virus_fname = H3N2_filter.main(virus_fname) # Make tree, creates raxml files tree_fname = tree_infer.main(virus_fname) diff --git a/augur/src/virus_filter.py b/augur/src/virus_filter.py index ac59bcd7..a246847d 100644 --- a/augur/src/virus_filter.py +++ b/augur/src/virus_filter.py @@ -66,7 +66,7 @@ def filter_date(self, date_spec): elif date_spec=='year': self.viruses = filter(lambda v: re.match(r'\d\d\d\d', v['date']) != None, self.viruses) - def subsample(self, years_back, viruses_per_month, prioritize = None): + def subsample(self, years_back, viruses_per_month, prioritize = None, all_priority=False): ''' Subsample x viruses per month Take from beginning of list - this will prefer longer sequences @@ -91,11 +91,11 @@ def subsample(self, years_back, viruses_per_month, prioritize = None): y = first_year for m in range(first_month,13): filtered_viruses.extend(self.select_viruses(priority_viruses,other_viruses, - y, m, viruses_per_month, regions)) + y, m, viruses_per_month, regions, all_priority=all_priority)) for y in range(first_year+1,datetime.datetime.today().year+1): for m in range(1,13): filtered_viruses.extend(self.select_viruses(priority_viruses,other_viruses, - y, m, viruses_per_month, regions)) + y, m, viruses_per_month, regions, all_priority=all_priority)) if self.outgroup is not None: filtered_viruses.append(self.outgroup) print len(filtered_viruses), "with outgroup" @@ -113,17 +113,23 @@ def viruses_by_date_region(self, tmp_viruses): return virus_tuples - def select_viruses(self, priority_viruses,other_viruses, y, m, viruses_per_month, regions): + def select_viruses(self, priority_viruses,other_viruses, y, m, viruses_per_month, regions, all_priority = False): ''' select viruses_per_month strains as evenly as possible from all regions ''' from itertools import izip_longest select_set = [] for vset in [priority_viruses, other_viruses]: + select_set.append([]) for representative in izip_longest(*[vset[(y,m,r)] for r in regions], fillvalue = None): - select_set.extend([v for v in representative if v is not None]) - print "found",len(select_set), 'in year',y,'month',m, 'subsampling to', viruses_per_month - return select_set[:viruses_per_month] + select_set[-1].extend([v for v in representative if v is not None]) + print "found",len(select_set[-1]), 'in year',y,'month',m + if all_priority: + n_other = max(0,viruses_per_month-len(select_set[0])) + return select_set[0] + select_set[1][:n_other] + else: + tmp = select_set[0] + select_set[1] + return tmp[:viruses_per_month] class flu_filter(virus_filter): From 6f0684b1d6c31230485059417edd19bc605f1dcb Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 25 Feb 2015 20:55:44 +0100 Subject: [PATCH 08/48] reinstantiated BioPhylo_to_Json --- augur/src/tree_ancestral.py | 1 + augur/src/tree_util.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/augur/src/tree_ancestral.py b/augur/src/tree_ancestral.py index 3ae683a5..e4c5ea09 100644 --- a/augur/src/tree_ancestral.py +++ b/augur/src/tree_ancestral.py @@ -20,6 +20,7 @@ import copy, time from seq_util import json_to_Bio_alignment from io_util import write_json, read_json +from tree_util import BioPhylo_to_json class ancestral_sequences: ''' diff --git a/augur/src/tree_util.py b/augur/src/tree_util.py index 41f532f3..0e601446 100644 --- a/augur/src/tree_util.py +++ b/augur/src/tree_util.py @@ -170,3 +170,24 @@ def json_to_dendropy_sub(json, node, taxon_set): taxon_set.add_taxon(node.taxon) +def BioPhylo_to_json(node): + json = {} + if 
hasattr(node, 'clade'): + json['clade'] = node.clade + if node.name: + json['strain'] = str(node.name).replace("'", '') + if hasattr(node, 'branch_length'): + json['branch_length'] = round(node.branch_length, 5) + if hasattr(node, 'xvalue'): + json['xvalue'] = round(node.xvalue, 5) + if hasattr(node, 'yvalue'): + json['yvalue'] = round(node.yvalue, 5) + if hasattr(node, 'date'): + json['date'] = node.date + if hasattr(node, 'seq'): + json['seq'] = str(node.seq) + if len(node.clades): + json["children"] = [] + for ch in node.clades: + json["children"].append(BioPhylo_to_json(ch)) + return json From c0da468400aa1f1830059eb6b81cb3e817ba934d Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Thu, 26 Feb 2015 23:51:16 +0100 Subject: [PATCH 09/48] added possibilty to select viruses from the global set rather than region specific. the constraint to force even representation of regions seems to mess up the fitness modelling --- augur/src/H3N2_filter.py | 6 ++++-- augur/src/virus_filter.py | 30 ++++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/augur/src/H3N2_filter.py b/augur/src/H3N2_filter.py index f099fb60..5a56424e 100644 --- a/augur/src/H3N2_filter.py +++ b/augur/src/H3N2_filter.py @@ -1,6 +1,7 @@ import time from io_util import write_json from virus_filter import flu_filter +from Bio import SeqIO class H3N2_filter(flu_filter): def __init__(self, fasta_fname, fasta_header): @@ -64,11 +65,12 @@ def __init__(self, fasta_fname, fasta_header): def main(in_fname='data/gisaid_epiflu_sequence.fasta', years_back=3, viruses_per_month=50): - print "--- Filter at " + time.strftime("%H:%M:%S") + " ---" myH3N2_filter = H3N2_filter(in_fname, {0:'strain', 1:"date", 4:"passage", -1:'accession'}) myH3N2_filter.filter() - myH3N2_filter.subsample(years_back, viruses_per_month, prioritize = []) + HI_data_strains = [seq.name for seq in SeqIO.parse('data/strains_with_HI.fasta', 'fasta')] + myH3N2_filter.subsample(years_back, viruses_per_month, prioritize = HI_data_strains, + all_priority = True, region_specific=False) out_fname = 'data/virus_filter.json' write_json(myH3N2_filter.virus_subsample, out_fname) diff --git a/augur/src/virus_filter.py b/augur/src/virus_filter.py index a246847d..adb2fdfd 100644 --- a/augur/src/virus_filter.py +++ b/augur/src/virus_filter.py @@ -8,7 +8,6 @@ import os, re, time, datetime, csv, sys from io_util import * from collections import defaultdict -sys.path.append('../source-data') class virus_filter(object): @@ -66,7 +65,7 @@ def filter_date(self, date_spec): elif date_spec=='year': self.viruses = filter(lambda v: re.match(r'\d\d\d\d', v['date']) != None, self.viruses) - def subsample(self, years_back, viruses_per_month, prioritize = None, all_priority=False): + def subsample(self, years_back, viruses_per_month, prioritize = None, all_priority=False, region_specific = True): ''' Subsample x viruses per month Take from beginning of list - this will prefer longer sequences @@ -77,6 +76,10 @@ def subsample(self, years_back, viruses_per_month, prioritize = None, all_priori prioritize=[] else: prioritize = [v.lower() for v in prioritize] + if region_specific: + select_func = self.select_viruses + else: + select_func = self.select_viruses_global priority_viruses = self.viruses_by_date_region([v for v in self.viruses if v['strain'].lower() in prioritize]) other_viruses = self.viruses_by_date_region([v for v in self.viruses if v['strain'].lower() not in prioritize]) @@ -90,11 +93,11 @@ def subsample(self, years_back, viruses_per_month, prioritize = 
None, all_priori print "Selecting " + str(viruses_per_month), "viruses per month" y = first_year for m in range(first_month,13): - filtered_viruses.extend(self.select_viruses(priority_viruses,other_viruses, + filtered_viruses.extend(select_func(priority_viruses,other_viruses, y, m, viruses_per_month, regions, all_priority=all_priority)) for y in range(first_year+1,datetime.datetime.today().year+1): for m in range(1,13): - filtered_viruses.extend(self.select_viruses(priority_viruses,other_viruses, + filtered_viruses.extend(select_func(priority_viruses,other_viruses, y, m, viruses_per_month, regions, all_priority=all_priority)) if self.outgroup is not None: filtered_viruses.append(self.outgroup) @@ -131,6 +134,21 @@ def select_viruses(self, priority_viruses,other_viruses, y, m, viruses_per_month tmp = select_set[0] + select_set[1] return tmp[:viruses_per_month] + def select_viruses_global(self, priority_viruses,other_viruses, y, m, viruses_per_month, regions, all_priority = False): + ''' + select viruses_per_month strains as evenly as possible from all regions + ''' + from random import sample + priority_viruses_flat = [] + for r in regions: priority_viruses_flat.extend(priority_viruses[(y,m,r)]) + other_viruses_flat = [] + for r in regions: other_viruses_flat.extend(other_viruses[(y,m,r)]) + + print "found",len(priority_viruses_flat)+len(other_viruses_flat), 'in year',y,'month',m + n_other = max(0,viruses_per_month-len(priority_viruses_flat)) + return sample(priority_viruses_flat, min(len(priority_viruses_flat), viruses_per_month))\ + + sample(other_viruses_flat, min(n_other, len(other_viruses_flat))) + class flu_filter(virus_filter): @@ -185,7 +203,7 @@ def filter_passage(self): def filter_geo(self): """Label viruses with geographic location based on strain name""" """Location is to the level of country of administrative division when available""" - reader = csv.DictReader(open("../source-data/geo_synonyms.tsv"), delimiter='\t') # list of dicts + reader = csv.DictReader(open("source-data/geo_synonyms.tsv"), delimiter='\t') # list of dicts label_to_country = {} for line in reader: label_to_country[line['label'].lower()] = line['country'] @@ -202,7 +220,7 @@ def filter_geo(self): except: print "couldn't parse", v['strain'] - reader = csv.DictReader(open("../source-data/geo_regions.tsv"), delimiter='\t') # list of dicts + reader = csv.DictReader(open("source-data/geo_regions.tsv"), delimiter='\t') # list of dicts country_to_region = {} for line in reader: country_to_region[line['country']] = line['region'] From 764eae82d77c9d6b9222915d57a2f3c1ddfb0c56 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Thu, 26 Feb 2015 23:51:53 +0100 Subject: [PATCH 10/48] added new processing pipeline --- augur/nextflu_config.py | 6 + augur/src/nextflu_process.py | 220 +++++++++++++++++++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 augur/nextflu_config.py create mode 100644 augur/src/nextflu_process.py diff --git a/augur/nextflu_config.py b/augur/nextflu_config.py new file mode 100644 index 00000000..0939ad9f --- /dev/null +++ b/augur/nextflu_config.py @@ -0,0 +1,6 @@ +config = { + 'virus':'H3N2', + 'alignment_file':'data/20150222_all_H3N2_HA1.fasta', + 'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, + 'outgroup':'A/Beijing/32/1992' +} diff --git a/augur/src/nextflu_process.py b/augur/src/nextflu_process.py new file mode 100644 index 00000000..f932e8b5 --- /dev/null +++ b/augur/src/nextflu_process.py @@ -0,0 +1,220 @@ +import time, argparse,os,subprocess, shutil, 
glob +from nextflu_config import config +from Bio import SeqIO +from io_util import write_json, read_json, write_fasta, read_fasta +from tree_util import dendropy_to_json, json_to_dendropy +import dendropy + +class nextflu(object): + def __init__(self): + self.viruses = None + self.tree = None + self.initial_virus_fname = 'data/virus_ingest.json' + self.clean_virus_fname = 'data/virus_clean.json' + + def load_viruses(self, aln_fname = None, years_back=3, viruses_per_month=50): + if config['virus']: + from H3N2_filter import H3N2_filter as virus_filter + fasta_fields = config['fasta_fields'] + force_include_strains = [seq.name for seq in SeqIO.parse('data/strains_with_HI.fasta', 'fasta')] + else: + from virus_filter import virus_filter as virus_filter + fasta_fields = {0:'strain'} + if aln_fname is None: aln_fname = config['alignment_file'] + + my_filter = virus_filter(aln_fname, fasta_fields) + my_filter.filter() + my_filter.subsample(years_back, viruses_per_month, prioritize = force_include_strains, + all_priority = True, region_specific=False) + + self.viruses = my_filter.virus_subsample + write_json(self.viruses, self.initial_virus_fname) + + def clean_viruses(self, virus_fname=None): + import virus_clean + print "--- Clean at " + time.strftime("%H:%M:%S") + " ---" + if virus_fname is None: + if self.viruses is None: + self.viruses = read_json(self.initial_virus_fname) + else: + self.viruses = read_json(virus_fname) + + print str(len(self.viruses)) + " initial self.viruses" + # mask extraneous columns and ambiguous bases + virus_clean.mask_from_outgroup(self.viruses) + virus_clean.clean_ambiguous(self.viruses) + + # clean gapped sequences + # self.viruses = clean_gaps(self.viruses) + # print str(len(self.viruses)) + " with complete HA" + + # clean sequences by outbreak + self.viruses = virus_clean.clean_outbreaks(self.viruses) + print str(len(self.viruses)) + " with outbreak sequences removed" + + # clean reassortant sequences + self.viruses = virus_clean.clean_reassortants(self.viruses) + print str(len(self.viruses)) + " with triple reassortants removed" + + # clean sequences by distance + self.viruses = virus_clean.clean_distances(self.viruses) + print str(len(self.viruses)) + " with clock" + + write_json(self.viruses, self.clean_virus_fname) + + def align(self): + import virus_align + write_fasta(self.viruses, 'temp_in.fasta') + os.system("mafft --nofft temp_in.fasta > temp_out.fasta") + alignment = read_fasta('temp_out.fasta') + virus_align.update_viruses(alignment, self.viruses) + out_fname = 'data/virus_align.json' + write_json(self.viruses, out_fname) + virus_align.cleanup() + + def infer_tree(self, virus_fname = None, raxml_time_limit = 1.0): + print "--- Tree infer at " + time.strftime("%H:%M:%S") + " ---" + import tree_infer + if virus_fname is not None: + self.viruses = read_json(virus_fname) + else: + if self.viruses is None: + self.viruses = read_json(self.clean_virus_fname) + + tree_infer.cleanup() + write_fasta(self.viruses, 'temp.fasta') + print "Building initial tree with FastTree" + os.system("fasttree -gtr -nt -gamma -nosupport -mlacc 2 -slownni temp.fasta > initial_tree.newick") + tree_infer.delimit_newick("initial_tree.newick", "temp.newick") + self.tree = dendropy.Tree.get_from_path("temp.newick", "newick") + self.tree.resolve_polytomies() + self.tree.write_to_path("initial_tree.newick", "newick") + + if raxml_time_limit>0: + print "RAxML tree optimization with time limit " + str(raxml_time_limit) + " hours" + os.system("seqmagick convert temp.fasta temp.phyx") + # 
using exec to be able to kill process + end_time = time.time() + int(raxml_time_limit*3600) + process = subprocess.Popen("exec raxml -f d -T 6 -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True) + while (time.time() < end_time): + if os.path.isfile('raxml_result.topology'): + break + time.sleep(10) + process.terminate() + + checkpoint_files = [file for file in glob.glob("RAxML_checkpoint*")] + if os.path.isfile('raxml_result.topology'): + checkpoint_files.append('raxml_result.topology') + if len(checkpoint_files) > 0: + last_tree_file = checkpoint_files[-1] + shutil.copy(last_tree_file, 'raxml_tree.newick') + else: + shutil.copy("initial_tree.newick", 'raxml_tree.newick') + + print "RAxML branch length optimization and rooting" + os.system("raxml -f e -T 6 -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick -o " + config["outgroup"]) + + tree_infer.delimit_newick("RAxML_result.branches", "temp.newick") + self.tree = dendropy.Tree.get_from_path("temp.newick", "newick") + self.tree.resolve_polytomies() + tree_infer.cleanup() + self.tree.write_to_path("data/tree_infer.newick", "newick") + + def infer_ancestral(self, tree_fname="data/tree_infer.newick", virus_fname = None): + from tree_ancestral import ancestral_sequences + from seq_util import json_to_Bio_alignment + from tree_util import BioPhylo_to_json + print "--- Ancestral inference at " + time.strftime("%H:%M:%S") + " ---" + if virus_fname is not None: + self.viruses = read_json(virus_fname) + else: + if self.viruses is None: + self.viruses = read_json(self.clean_virus_fname) + from Bio import Phylo + aln = json_to_Bio_alignment(self.viruses) + biotree = Phylo.read(tree_fname, 'newick') + print "--- Set-up ancestral inference at " + time.strftime("%H:%M:%S") + " ---" + anc_seq = ancestral_sequences(biotree, aln, seqtype='str') + anc_seq.calc_ancestral_sequences() + anc_seq.cleanup_tree() + out_fname = "data/tree_ancestral.json" + write_json(BioPhylo_to_json(anc_seq.T.root), out_fname) + self.tree = json_to_dendropy(read_json(out_fname)) + + def refine_tree(self): + import tree_refine + print "--- Tree refine at " + time.strftime("%H:%M:%S") + " ---" + print "Remove outgroup" + tree_refine.remove_outgroup(self.tree) + print "Remove outlier branches" + tree_refine.reduce(self.tree) + print "Collapse internal nodes" + tree_refine.collapse(self.tree) + print "Ladderize tree" + tree_refine.ladderize(self.tree) + print "Append node attributes" + tree_refine.add_virus_attributes(self.viruses, self.tree) + tree_refine.add_node_attributes(self.tree) + print "Translate nucleotide sequences" + tree_refine.translate_all(self.tree) + print "Enumerate leaves of ladderized tree and calculate unique numerical date" + tree_refine.unique_date(self.tree) + print "Define trunk" + tree_refine.define_trunk(self.tree) + out_fname = "data/self.tree_refine.json" + write_json(dendropy_to_json(self.tree.seed_node), out_fname) + return out_fname + + def streamline(self): + from tree_util import all_descendants + print "--- Streamline at " + time.strftime("%H:%M:%S") + " ---" + # Move sequence data to separate file + print "Writing sequences" + tree_json = dendropy_to_json(self.tree.seed_node) + elems = [] + for node in all_descendants(tree_json): + elem = {} + if 'clade' in node: + elem['clade'] = node['clade'] + if 'aa_seq' in node: + elem['aa_seq'] = node['aa_seq'] + elems.append(elem) + write_json(elems, "../auspice/data/sequences.json", indent=None) + + # Streamline tree for auspice + print 
"Writing streamlined tree" + for node in all_descendants(tree_json): + node.pop("seq", None) + node.pop("aa_seq", None) + node.pop("logit_freq", None) + + out_fname_tree = "../auspice/data/tree.json" + write_json(tree_json, out_fname_tree, indent=None) + try: + read_json(out_fname_tree) + except: + print "Read failed, rewriting with indents" + write_json(self.tree, out_fname_tree, indent=1) + + # Include genotype frequencies + shutil.copy2("data/genotype_frequencies.json", "../auspice/data/frequencies.json") + + + def run(self,years_back=3, viruses_per_month=50, raxml_time_limit = 1.0): + self.load_viruses(years_back=years_back, viruses_per_month=viruses_per_month) + self.clean_viruses() + self.align() + self.infer_tree(raxml_time_limit = raxml_time_limit) + self.infer_ancestral() + self.refine_tree() + self.streamline() + +if __name__=="__main__": + parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare of web visualization') + parser.add_argument('-y', '--years_back', type = int, default=3, help='number of past years to sample sequences from') + parser.add_argument('-v', '--viruses_per_month', type = int, default = 50, help='number of viruses sampled per month') + parser.add_argument('-r', '--raxml_time_limit', type = float, default = 1.0, help='number of hours raxml is run') + params = parser.parse_args() + + my_nextflu = nextflu() + my_nextflu.run(**params.__dict__) From 0e294b5576f8099a10fb2111ec58f37f3ff6f1db Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 27 Feb 2015 02:00:52 +0100 Subject: [PATCH 11/48] changed main method of most pipeline steps to accept the jsons rather than file names, jsons are returned and passed along in the processing class --- augur/src/nextflu_process.py | 183 ++++++----------------------------- augur/src/streamline.py | 14 ++- augur/src/tree_ancestral.py | 10 +- augur/src/tree_infer.py | 55 +++++------ augur/src/tree_refine.py | 9 +- augur/src/tree_util.py | 9 ++ augur/src/virus_align.py | 8 +- augur/src/virus_clean.py | 10 +- 8 files changed, 74 insertions(+), 224 deletions(-) diff --git a/augur/src/nextflu_process.py b/augur/src/nextflu_process.py index f932e8b5..f75348c8 100644 --- a/augur/src/nextflu_process.py +++ b/augur/src/nextflu_process.py @@ -1,8 +1,9 @@ -import time, argparse,os,subprocess, shutil, glob +import time, argparse,os,subprocess, shutil, glob, sys +sys.path.append('./src') from nextflu_config import config from Bio import SeqIO from io_util import write_json, read_json, write_fasta, read_fasta -from tree_util import dendropy_to_json, json_to_dendropy +from tree_util import dendropy_to_json, json_to_dendropy, delimit_newick import dendropy class nextflu(object): @@ -12,6 +13,14 @@ def __init__(self): self.initial_virus_fname = 'data/virus_ingest.json' self.clean_virus_fname = 'data/virus_clean.json' + def load_from_file(self, tree_fname=None, virus_fname = None): + if tree_fname is None: tree_fname = 'data/tree_ancestral.json' + if os.path.isfile(tree_fname): + self.tree = dendropy_to_json(read_json(tree_fname)) + if virus_fname is None: virus_fname = 'data/virus_clean.json' + if os.path.isfile(virus_fname): + self.viruses = dendropy_to_json(read_json(virus_fname)) + def load_viruses(self, aln_fname = None, years_back=3, viruses_per_month=50): if config['virus']: from H3N2_filter import H3N2_filter as virus_filter @@ -30,175 +39,37 @@ def load_viruses(self, aln_fname = None, years_back=3, viruses_per_month=50): self.viruses = my_filter.virus_subsample write_json(self.viruses, 
self.initial_virus_fname) - def clean_viruses(self, virus_fname=None): + def clean_viruses(self): import virus_clean - print "--- Clean at " + time.strftime("%H:%M:%S") + " ---" - if virus_fname is None: - if self.viruses is None: - self.viruses = read_json(self.initial_virus_fname) - else: - self.viruses = read_json(virus_fname) - - print str(len(self.viruses)) + " initial self.viruses" - # mask extraneous columns and ambiguous bases - virus_clean.mask_from_outgroup(self.viruses) - virus_clean.clean_ambiguous(self.viruses) - - # clean gapped sequences - # self.viruses = clean_gaps(self.viruses) - # print str(len(self.viruses)) + " with complete HA" - - # clean sequences by outbreak - self.viruses = virus_clean.clean_outbreaks(self.viruses) - print str(len(self.viruses)) + " with outbreak sequences removed" - - # clean reassortant sequences - self.viruses = virus_clean.clean_reassortants(self.viruses) - print str(len(self.viruses)) + " with triple reassortants removed" - - # clean sequences by distance - self.viruses = virus_clean.clean_distances(self.viruses) - print str(len(self.viruses)) + " with clock" - + self.viruses = virus_clean.main(self.viruses) write_json(self.viruses, self.clean_virus_fname) def align(self): import virus_align - write_fasta(self.viruses, 'temp_in.fasta') - os.system("mafft --nofft temp_in.fasta > temp_out.fasta") - alignment = read_fasta('temp_out.fasta') - virus_align.update_viruses(alignment, self.viruses) + self.viruses = virus_align.main(self.viruses) out_fname = 'data/virus_align.json' write_json(self.viruses, out_fname) - virus_align.cleanup() - def infer_tree(self, virus_fname = None, raxml_time_limit = 1.0): - print "--- Tree infer at " + time.strftime("%H:%M:%S") + " ---" + def infer_tree(self, raxml_time_limit = 1.0): import tree_infer - if virus_fname is not None: - self.viruses = read_json(virus_fname) - else: - if self.viruses is None: - self.viruses = read_json(self.clean_virus_fname) - - tree_infer.cleanup() - write_fasta(self.viruses, 'temp.fasta') - print "Building initial tree with FastTree" - os.system("fasttree -gtr -nt -gamma -nosupport -mlacc 2 -slownni temp.fasta > initial_tree.newick") - tree_infer.delimit_newick("initial_tree.newick", "temp.newick") + tree_fname = tree_infer.main(self.viruses, raxml_time_limit, config['outgroup']) + delimit_newick(tree_fname, "temp.newick") self.tree = dendropy.Tree.get_from_path("temp.newick", "newick") - self.tree.resolve_polytomies() - self.tree.write_to_path("initial_tree.newick", "newick") - - if raxml_time_limit>0: - print "RAxML tree optimization with time limit " + str(raxml_time_limit) + " hours" - os.system("seqmagick convert temp.fasta temp.phyx") - # using exec to be able to kill process - end_time = time.time() + int(raxml_time_limit*3600) - process = subprocess.Popen("exec raxml -f d -T 6 -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True) - while (time.time() < end_time): - if os.path.isfile('raxml_result.topology'): - break - time.sleep(10) - process.terminate() - - checkpoint_files = [file for file in glob.glob("RAxML_checkpoint*")] - if os.path.isfile('raxml_result.topology'): - checkpoint_files.append('raxml_result.topology') - if len(checkpoint_files) > 0: - last_tree_file = checkpoint_files[-1] - shutil.copy(last_tree_file, 'raxml_tree.newick') - else: - shutil.copy("initial_tree.newick", 'raxml_tree.newick') + os.remove('temp.newick') - print "RAxML branch length optimization and rooting" - os.system("raxml -f e -T 6 -s temp.phyx -n 
branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick -o " + config["outgroup"]) - - tree_infer.delimit_newick("RAxML_result.branches", "temp.newick") - self.tree = dendropy.Tree.get_from_path("temp.newick", "newick") - self.tree.resolve_polytomies() - tree_infer.cleanup() - self.tree.write_to_path("data/tree_infer.newick", "newick") - - def infer_ancestral(self, tree_fname="data/tree_infer.newick", virus_fname = None): - from tree_ancestral import ancestral_sequences - from seq_util import json_to_Bio_alignment - from tree_util import BioPhylo_to_json - print "--- Ancestral inference at " + time.strftime("%H:%M:%S") + " ---" - if virus_fname is not None: - self.viruses = read_json(virus_fname) - else: - if self.viruses is None: - self.viruses = read_json(self.clean_virus_fname) - from Bio import Phylo - aln = json_to_Bio_alignment(self.viruses) - biotree = Phylo.read(tree_fname, 'newick') - print "--- Set-up ancestral inference at " + time.strftime("%H:%M:%S") + " ---" - anc_seq = ancestral_sequences(biotree, aln, seqtype='str') - anc_seq.calc_ancestral_sequences() - anc_seq.cleanup_tree() - out_fname = "data/tree_ancestral.json" - write_json(BioPhylo_to_json(anc_seq.T.root), out_fname) - self.tree = json_to_dendropy(read_json(out_fname)) + def infer_ancestral(self, virus_fname = None): + import tree_ancestral + self.tree = tree_ancestral.main(self.tree, self.viruses) def refine_tree(self): import tree_refine - print "--- Tree refine at " + time.strftime("%H:%M:%S") + " ---" - print "Remove outgroup" - tree_refine.remove_outgroup(self.tree) - print "Remove outlier branches" - tree_refine.reduce(self.tree) - print "Collapse internal nodes" - tree_refine.collapse(self.tree) - print "Ladderize tree" - tree_refine.ladderize(self.tree) - print "Append node attributes" - tree_refine.add_virus_attributes(self.viruses, self.tree) - tree_refine.add_node_attributes(self.tree) - print "Translate nucleotide sequences" - tree_refine.translate_all(self.tree) - print "Enumerate leaves of ladderized tree and calculate unique numerical date" - tree_refine.unique_date(self.tree) - print "Define trunk" - tree_refine.define_trunk(self.tree) - out_fname = "data/self.tree_refine.json" - write_json(dendropy_to_json(self.tree.seed_node), out_fname) - return out_fname + tree_refine.main(self.tree, self.viruses) + write_json(dendropy_to_json(self.tree.seed_node), 'data/tree_refine.json') - def streamline(self): - from tree_util import all_descendants - print "--- Streamline at " + time.strftime("%H:%M:%S") + " ---" - # Move sequence data to separate file - print "Writing sequences" + def export_to_auspice(self): + import streamline tree_json = dendropy_to_json(self.tree.seed_node) - elems = [] - for node in all_descendants(tree_json): - elem = {} - if 'clade' in node: - elem['clade'] = node['clade'] - if 'aa_seq' in node: - elem['aa_seq'] = node['aa_seq'] - elems.append(elem) - write_json(elems, "../auspice/data/sequences.json", indent=None) - - # Streamline tree for auspice - print "Writing streamlined tree" - for node in all_descendants(tree_json): - node.pop("seq", None) - node.pop("aa_seq", None) - node.pop("logit_freq", None) - - out_fname_tree = "../auspice/data/tree.json" - write_json(tree_json, out_fname_tree, indent=None) - try: - read_json(out_fname_tree) - except: - print "Read failed, rewriting with indents" - write_json(self.tree, out_fname_tree, indent=1) - - # Include genotype frequencies - shutil.copy2("data/genotype_frequencies.json", "../auspice/data/frequencies.json") - + 
streamline.main(tree_json) def run(self,years_back=3, viruses_per_month=50, raxml_time_limit = 1.0): self.load_viruses(years_back=years_back, viruses_per_month=viruses_per_month) @@ -207,7 +78,7 @@ def run(self,years_back=3, viruses_per_month=50, raxml_time_limit = 1.0): self.infer_tree(raxml_time_limit = raxml_time_limit) self.infer_ancestral() self.refine_tree() - self.streamline() + self.export_to_auspice() if __name__=="__main__": parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare of web visualization') diff --git a/augur/src/streamline.py b/augur/src/streamline.py index 52474389..be76aa26 100644 --- a/augur/src/streamline.py +++ b/augur/src/streamline.py @@ -2,16 +2,15 @@ from io_util import * from tree_util import * -def main(in_fname='data/tree_frequencies.json'): +def main(tree_json): """Prep tree for auspice, stripping sequence data""" print "--- Streamline at " + time.strftime("%H:%M:%S") + " ---" # Move sequence data to separate file - print "Writing sequences" - tree = read_json(in_fname) + print "Writing sequences" elems = [] - for node in all_descendants(tree): + for node in all_descendants(tree_json): elem = {} if 'clade' in node: elem['clade'] = node['clade'] @@ -22,19 +21,18 @@ def main(in_fname='data/tree_frequencies.json'): # Streamline tree for auspice print "Writing streamlined tree" - tree = read_json(in_fname) - for node in all_descendants(tree): + for node in all_descendants(tree_json): node.pop("seq", None) node.pop("aa_seq", None) node.pop("logit_freq", None) out_fname_tree = "../auspice/data/tree.json" - write_json(tree, out_fname_tree, indent=None) + write_json(tree_json, out_fname_tree, indent=None) try: read_json(out_fname_tree) except: print "Read failed, rewriting with indents" - write_json(tree, out_fname_tree, indent=1) + write_json(tree_json, out_fname_tree, indent=1) # Include genotype frequencies shutil.copy2("data/genotype_frequencies.json", "../auspice/data/frequencies.json") diff --git a/augur/src/tree_ancestral.py b/augur/src/tree_ancestral.py index e4c5ea09..28b3bd1f 100644 --- a/augur/src/tree_ancestral.py +++ b/augur/src/tree_ancestral.py @@ -20,7 +20,7 @@ import copy, time from seq_util import json_to_Bio_alignment from io_util import write_json, read_json -from tree_util import BioPhylo_to_json +from tree_util import BioPhylo_to_json, to_Biopython, json_to_dendropy class ancestral_sequences: ''' @@ -245,19 +245,17 @@ def cleanup_tree(self, attrnames=['prob', 'down_message', 'up_message']): if hasattr(leaf, attrname): delattr(leaf, attrname) -def main(tree_fname='data/tree_infer.newick', virus_fname='data/virus_clean.json'): +def main(tree, viruses): print "--- Ancestral inference at " + time.strftime("%H:%M:%S") + " ---" from Bio import Phylo - viruses = read_json(virus_fname) aln = json_to_Bio_alignment(viruses) - tree = Phylo.read(tree_fname, 'newick') + tree = to_Biopython(tree) print "--- Set-up ancestral inference at " + time.strftime("%H:%M:%S") + " ---" anc_seq = ancestral_sequences(tree, aln, seqtype='str') anc_seq.calc_ancestral_sequences() anc_seq.cleanup_tree() out_fname = "data/tree_ancestral.json" - write_json(BioPhylo_to_json(anc_seq.T.root), out_fname) - return out_fname + return json_to_dendropy(BioPhylo_to_json(anc_seq.T.root)) def test(): from Bio import Phylo, AlignIO diff --git a/augur/src/tree_infer.py b/augur/src/tree_infer.py index ea86b01d..efacdad7 100644 --- a/augur/src/tree_infer.py +++ b/augur/src/tree_infer.py @@ -6,9 +6,7 @@ import subprocess import dendropy from io_util 
import * - -OUTGROUP = 'A/Beijing/32/1992' -RAXML_LIMIT = 1.0 # in hours +from tree_util import delimit_newick def cleanup(): for file in glob.glob("RAxML_*"): @@ -41,21 +39,12 @@ def cleanup(): except OSError: pass -def delimit_newick(infile_name, outfile_name): - with open(infile_name, 'r') as file: - newick = file.read().replace('\n', '') - newick = re.sub(r'(A/[^\:^,]+)', r"'\1'", newick) - with open(outfile_name, 'w') as file: - file.write(newick) - -def main(in_fname = 'data/virus_clean.json'): +def main(viruses, raxml_time_limit, outgroup): print "--- Tree infer at " + time.strftime("%H:%M:%S") + " ---" cleanup() - viruses = read_json(in_fname) write_fasta(viruses, 'temp.fasta') - print "Building initial tree with FastTree" os.system("fasttree -gtr -nt -gamma -nosupport -mlacc 2 -slownni temp.fasta > initial_tree.newick") delimit_newick("initial_tree.newick", "temp.newick") @@ -63,33 +52,35 @@ def main(in_fname = 'data/virus_clean.json'): tree.resolve_polytomies() tree.write_to_path("initial_tree.newick", "newick") - print "RAxML tree optimization with time limit " + str(RAXML_LIMIT) + " hours" - os.system("seqmagick convert temp.fasta temp.phyx") - # using exec to be able to kill process - end_time = time.time() + int(RAXML_LIMIT*3600) - process = subprocess.Popen("exec raxml -f d -T 6 -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True) - while (time.time() < end_time): - if os.path.isfile('raxml_result.topology'): - break - time.sleep(10) - process.terminate() + if raxml_time_limit>0: + print "RAxML tree optimization with time limit " + str(raxml_time_limit) + " hours" + os.system("seqmagick convert temp.fasta temp.phyx") + # using exec to be able to kill process + end_time = time.time() + int(raxml_time_limit*3600) + process = subprocess.Popen("exec raxml -f d -T 6 -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True) + while (time.time() < end_time): + if os.path.isfile('RAxML_result.topology'): + break + time.sleep(10) + process.terminate() - checkpoint_files = [file for file in glob.glob("RAxML_checkpoint*")] - if os.path.isfile('raxml_result.topology'): - checkpoint_files.append('raxml_result.topology') - if len(checkpoint_files) > 0: - last_tree_file = checkpoint_files[-1] - shutil.copy(last_tree_file, 'raxml_tree.newick') + checkpoint_files = [file for file in glob.glob("RAxML_checkpoint*")] + if os.path.isfile('RAxML_result.topology'): + checkpoint_files.append('RAxML_result.topology') + if len(checkpoint_files) > 0: + last_tree_file = checkpoint_files[-1] + shutil.copy(last_tree_file, 'raxml_tree.newick') + else: + shutil.copy("initial_tree.newick", 'raxml_tree.newick') else: shutil.copy("initial_tree.newick", 'raxml_tree.newick') print "RAxML branch length optimization and rooting" - os.system("raxml -f e -T 6 -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick -o " + OUTGROUP) + os.system("raxml -f e -T 6 -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick -o " + outgroup) out_fname = "data/tree_infer.newick" os.rename('RAxML_result.branches', out_fname) - cleanup() - + cleanup() return out_fname; if __name__ == "__main__": diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index 169f280c..4a99ec72 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -188,12 +188,8 @@ def define_trunk(tree): node.trunk = True; -def main(tree_fname = 'data/tree_ancestral.json', virus_fname='data/virus_clean.json'): - +def 
main(tree, viruses): print "--- Tree refine at " + time.strftime("%H:%M:%S") + " ---" - - viruses = read_json(virus_fname) - tree = json_to_dendropy(read_json(tree_fname)) print "Remove outgroup" remove_outgroup(tree) print "Remove outlier branches" @@ -211,9 +207,6 @@ def main(tree_fname = 'data/tree_ancestral.json', virus_fname='data/virus_clean. unique_date(tree) print "Define trunk" define_trunk(tree) - out_fname = "data/tree_refine.json" - write_json(dendropy_to_json(tree.seed_node), out_fname) - return out_fname if __name__ == "__main__": main() diff --git a/augur/src/tree_util.py b/augur/src/tree_util.py index 03a75f91..490634db 100644 --- a/augur/src/tree_util.py +++ b/augur/src/tree_util.py @@ -2,6 +2,15 @@ import numpy as np from io_util import * +def delimit_newick(infile_name, outfile_name): + import re + with open(infile_name, 'r') as file: + newick = file.read().replace('\n', '') + newick = re.sub(r'(A/[^\:^,]+)', r"'\1'", newick) + with open(outfile_name, 'w') as file: + file.write(newick) + + def color_BioTree_by_attribute(T,attribute, vmin=None, vmax = None, missing_val='min', transform = lambda x:x, cmap=None): ''' simple function that assigns a color to each node in a biopython tree diff --git a/augur/src/virus_align.py b/augur/src/virus_align.py index 0606d080..63aa9420 100644 --- a/augur/src/virus_align.py +++ b/augur/src/virus_align.py @@ -17,19 +17,15 @@ def cleanup(): except OSError: pass -def main(in_fname = None): +def main(viruses): print "--- Align at " + time.strftime("%H:%M:%S") + " ---" - if in_fname is None: in_fname='data/virus_filter.json' - viruses = read_json(in_fname) write_fasta(viruses, 'temp_in.fasta') os.system("mafft --nofft temp_in.fasta > temp_out.fasta") alignment = read_fasta('temp_out.fasta') update_viruses(alignment, viruses) - out_fname = 'data/virus_align.json' - write_json(viruses, out_fname) cleanup() - return out_fname + return viruses if __name__ == "__main__": main() \ No newline at end of file diff --git a/augur/src/virus_clean.py b/augur/src/virus_clean.py index 802b3e90..0aea1d2b 100644 --- a/augur/src/virus_clean.py +++ b/augur/src/virus_clean.py @@ -105,12 +105,9 @@ def clean_reassortants(viruses): return new_viruses -def main(in_fname=None): +def main(viruses): print "--- Clean at " + time.strftime("%H:%M:%S") + " ---" - - if in_fname is None: in_fname = 'data/virus_align.json' - viruses = read_json(in_fname) print str(len(viruses)) + " initial viruses" # mask extraneous columns and ambiguous bases @@ -132,10 +129,7 @@ def main(in_fname=None): # clean sequences by distance viruses = clean_distances(viruses) print str(len(viruses)) + " with clock" - - out_fname = 'data/virus_clean.json' - write_json(viruses, out_fname) - return out_fname + return viruses if __name__ == "__main__": main() \ No newline at end of file From dc281e83f260789c44e2396f26c39b62d60adf25 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 27 Feb 2015 02:11:21 +0100 Subject: [PATCH 12/48] simplified tmp file deletions --- augur/src/tree_infer.py | 26 +------------------------- augur/src/virus_align.py | 16 +++++++--------- 2 files changed, 8 insertions(+), 34 deletions(-) diff --git a/augur/src/tree_infer.py b/augur/src/tree_infer.py index efacdad7..9448f086 100644 --- a/augur/src/tree_infer.py +++ b/augur/src/tree_infer.py @@ -9,35 +9,11 @@ from tree_util import delimit_newick def cleanup(): - for file in glob.glob("RAxML_*"): + for file in glob.glob("RAxML_*") + glob.glob("temp*") + ["raxml_tree.newick", "initial_tree.newick"]: try: os.remove(file) 
except OSError: pass - try: - os.remove("temp.fasta") - except OSError: - pass - try: - os.remove("temp.newick") - except OSError: - pass - try: - os.remove("temp.phyx") - except OSError: - pass - try: - os.remove("temp.phyx.reduced") - except OSError: - pass - try: - os.remove("raxml_tree.newick") - except OSError: - pass - try: - os.remove("initial_tree.newick") - except OSError: - pass def main(viruses, raxml_time_limit, outgroup): diff --git a/augur/src/virus_align.py b/augur/src/virus_align.py index 63aa9420..7fee97de 100644 --- a/augur/src/virus_align.py +++ b/augur/src/virus_align.py @@ -4,18 +4,16 @@ from io_util import * def update_viruses(alignment, viruses): + strain_to_sequence_map = {x['strain'].lower(): x['seq'] for x in alignment} for v in viruses: - v['seq'] = next(x for x in alignment if x['strain'] == v['strain'])['seq'] + v['seq'] = strain_to_sequence_map[v['strain'].lower()] def cleanup(): - try: - os.remove('temp_in.fasta') - except OSError: - pass - try: - os.remove('temp_out.fasta') - except OSError: - pass + for tmp_file in ['temp_in.fasta', 'temp_out.fasta']: + try: + os.remove(tmp_file) + except OSError: + pass def main(viruses): From 91b47ac6dfb9474c9179de0c56cf0afc290c14a0 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 27 Feb 2015 02:24:28 +0100 Subject: [PATCH 13/48] collapse nodes based on identical sequences rather then edge length --- augur/src/tree_infer.py | 2 +- augur/src/tree_refine.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/augur/src/tree_infer.py b/augur/src/tree_infer.py index 9448f086..95778129 100644 --- a/augur/src/tree_infer.py +++ b/augur/src/tree_infer.py @@ -28,9 +28,9 @@ def main(viruses, raxml_time_limit, outgroup): tree.resolve_polytomies() tree.write_to_path("initial_tree.newick", "newick") + os.system("seqmagick convert temp.fasta temp.phyx") if raxml_time_limit>0: print "RAxML tree optimization with time limit " + str(raxml_time_limit) + " hours" - os.system("seqmagick convert temp.fasta temp.phyx") # using exec to be able to kill process end_time = time.time() + int(raxml_time_limit*3600) process = subprocess.Popen("exec raxml -f d -T 6 -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True) diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index 4a99ec72..175598a6 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -56,10 +56,11 @@ def remove_outgroup(tree): tree.prune_subtree(outgroup_node) def collapse(tree): - """Collapse short edges to polytomies""" + """Collapse edges without mutations to polytomies""" for edge in tree.postorder_edge_iter(): - if edge.length < 0.00001 and edge.is_internal(): - edge.collapse() + if edge.tail_node is not None: + if edge.is_internal() and edge.head_node.seq==edge.tail_node.seq: + edge.collapse() def reduce(tree): """Remove outlier tips""" From 2948195cae7c7b37524fa5af1d6c9e2f4aeb2165 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 27 Feb 2015 02:31:57 +0100 Subject: [PATCH 14/48] fixed problem with reload from file --- augur/src/nextflu_process.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/augur/src/nextflu_process.py b/augur/src/nextflu_process.py index f75348c8..bd743d2b 100644 --- a/augur/src/nextflu_process.py +++ b/augur/src/nextflu_process.py @@ -14,12 +14,12 @@ def __init__(self): self.clean_virus_fname = 'data/virus_clean.json' def load_from_file(self, tree_fname=None, virus_fname = None): - if tree_fname is None: tree_fname = 
'data/tree_ancestral.json' + if tree_fname is None: tree_fname = 'data/tree_refine.json' if os.path.isfile(tree_fname): - self.tree = dendropy_to_json(read_json(tree_fname)) + self.tree = json_to_dendropy(read_json(tree_fname)) if virus_fname is None: virus_fname = 'data/virus_clean.json' if os.path.isfile(virus_fname): - self.viruses = dendropy_to_json(read_json(virus_fname)) + self.viruses = read_json(virus_fname) def load_viruses(self, aln_fname = None, years_back=3, viruses_per_month=50): if config['virus']: @@ -85,7 +85,9 @@ def run(self,years_back=3, viruses_per_month=50, raxml_time_limit = 1.0): parser.add_argument('-y', '--years_back', type = int, default=3, help='number of past years to sample sequences from') parser.add_argument('-v', '--viruses_per_month', type = int, default = 50, help='number of viruses sampled per month') parser.add_argument('-r', '--raxml_time_limit', type = float, default = 1.0, help='number of hours raxml is run') + parser.add_argument('--test', default = False, action="store_true", help ="don't run the pipeline") params = parser.parse_args() my_nextflu = nextflu() - my_nextflu.run(**params.__dict__) + if not params.test: + my_nextflu.run(**params.__dict__) From 39a1ed7aabf5f95a1ceba703a345b36b1b480c91 Mon Sep 17 00:00:00 2001 From: Trevor Bedford Date: Thu, 26 Feb 2015 21:57:20 -0800 Subject: [PATCH 15/48] Size tree diagram based on container width. Progress on #31. Needs more work. At the moment, you need to reload the page to get treeplot to resize. --- auspice/css/auspice.css | 15 +++----- auspice/index.html | 76 ++++++++++++++++++++++------------------- auspice/js/auspice.js | 10 +++--- 3 files changed, 52 insertions(+), 49 deletions(-) diff --git a/auspice/css/auspice.css b/auspice/css/auspice.css index 2a612af3..8cb5a54c 100644 --- a/auspice/css/auspice.css +++ b/auspice/css/auspice.css @@ -47,18 +47,19 @@ body { } .plot-container { - width: 800px; height: 650px; top: 25px; } +.logo-container { + min-width: 300px; +} + .io-container { - width: 240px; - position: absolute; + min-width: 280px; } .subtitle-box { - width: 300px; margin-left: 5px; } @@ -128,12 +129,6 @@ a:hover { padding: 5px; } -.logobox { - height: 100px; - width: 500px; - display: table-cell; - vertical-align: middle; -} /* navbar */ .navbar-brand { diff --git a/auspice/index.html b/auspice/index.html index c862ef18..28af9d15 100644 --- a/auspice/index.html +++ b/auspice/index.html @@ -4,53 +4,59 @@ ---
-
-
-
+
+
+
nextflu
Real-time tracking of seasonal influenza H3N2 virus evolution in humans
-
- -
-
-
Legend
- +
+
+
+
+ +
+
+
Legend
+ +
+
-
-
-
- +
+ +
+ +
+ +
+
+
+
- +
+
+ +
+ +
+
+
-
- -
- -
-
- -
- -
-
- -
-
+
+
-
+
diff --git a/auspice/js/auspice.js b/auspice/js/auspice.js index fe7f8c37..96713b1f 100644 --- a/auspice/js/auspice.js +++ b/auspice/js/auspice.js @@ -214,8 +214,10 @@ function tipFillColor(col) { return d3.rgb(col).brighter([0.65]).toString(); } -var width = 800, - height = 600; +var containerWidth = parseInt(d3.select(".plot-container").style("width"), 10); + +var width = containerWidth, + height = 520 + 0.1 * containerWidth; var cladeToSeq = {} @@ -416,12 +418,12 @@ d3.json("data/tree.json", function(error, root) { var dateScale = d3.time.scale() .domain([earliestDate, globalDate]) - .range([5, 235]) + .range([5, 240]) .clamp([true]); var niceDateScale = d3.time.scale() .domain([earliestDate, globalDate]) - .range([5, 235]) + .range([5, 240]) .clamp([true]) .nice(d3.time.month); From 3e6b84f642c7c0f8ffa02b40ac7f407539626e58 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 27 Feb 2015 07:00:03 +0100 Subject: [PATCH 16/48] made regions and some frequency parameters controlable by the config fixed bug in frequency estimation of small clades, --- augur/nextflu_config.py | 7 ++++++- augur/src/bernoulli_frequency.py | 9 ++++---- augur/src/nextflu_process.py | 35 ++++++++++++++++++++++++++------ augur/src/process.py | 4 ++-- augur/src/streamline.py | 4 ++-- 5 files changed, 44 insertions(+), 15 deletions(-) diff --git a/augur/nextflu_config.py b/augur/nextflu_config.py index 0939ad9f..0b3a890a 100644 --- a/augur/nextflu_config.py +++ b/augur/nextflu_config.py @@ -2,5 +2,10 @@ 'virus':'H3N2', 'alignment_file':'data/20150222_all_H3N2_HA1.fasta', 'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, - 'outgroup':'A/Beijing/32/1992' + 'outgroup':'A/Beijing/32/1992', + 'aggregate_regions': [ ("global", None), ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), + ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], + 'frequency_stiffness':1.0, + 'time_interval':(2012.0, 2015.1), + 'pivots_per_year':6.0 } diff --git a/augur/src/bernoulli_frequency.py b/augur/src/bernoulli_frequency.py index 00555fde..154ac179 100644 --- a/augur/src/bernoulli_frequency.py +++ b/augur/src/bernoulli_frequency.py @@ -12,6 +12,7 @@ time_interval = (2012.0, 2015.1) flu_stiffness = 10.0 pivots_per_year = 12.0 +relevant_pos_cutoff = 0.1 inertia = 0.7 # fraction of previous frequency changes that is carried over window_size = 20 # smooting window tol = 1e-4 @@ -230,7 +231,7 @@ def estimate_sub_frequencies(node, all_dates, tip_to_date_index, threshold=50, r ci+=1 # if the above loop finished assign the frequency of the remaining clade to the frequency_left - if ci>0 and ci==len(node.child_nodes())-1: + if ci==len(node.child_nodes())-1 and frequency_left is not None: last_child = children_by_size[-1] last_child.freq[region_name] = frequency_left last_child.logit_freq[region_name] = logit_transform(last_child.freq[region_name]) @@ -438,7 +439,7 @@ def test(): -def all_mutations(tree, region_list, plot=False): +def all_mutations(tree, region_list, threshold = 5, plot=False): import matplotlib.pyplot as plt mutation_frequencies = {} for region_label, regions in region_list: @@ -446,7 +447,7 @@ def all_mutations(tree, region_list, plot=False): if plot: plt.figure("mutations in "+region_label, figsize = (12,7)) if regions is not None: plt.title("Region: "+", ".join(regions)) - mutation_frequencies[region_label] = determine_mutation_frequencies(tree, regions, plot=plot, threshold = 5) + mutation_frequencies[region_label] = determine_mutation_frequencies(tree, regions, plot=plot, threshold = 
threshold) if plot: plt.legend() ticloc = np.arange(time_interval[0], int(time_interval[1])+1,1) @@ -459,7 +460,7 @@ def all_mutations(tree, region_list, plot=False): relevant_pos = [] for mut, freq in mutation_frequencies["global"].iteritems(): if "pivot" not in mut: - if np.max(freq)-np.min(freq)>0.1: + if np.max(freq)-np.min(freq)>relevant_pos_cutoff: pos = int(mut.split('_')[-1][:-1])+15 relevant_pos.append(pos) relevant_pos = sorted(set(relevant_pos)) diff --git a/augur/src/nextflu_process.py b/augur/src/nextflu_process.py index bd743d2b..1bd299a9 100644 --- a/augur/src/nextflu_process.py +++ b/augur/src/nextflu_process.py @@ -10,14 +10,16 @@ class nextflu(object): def __init__(self): self.viruses = None self.tree = None + self.frequencies = {} self.initial_virus_fname = 'data/virus_ingest.json' self.clean_virus_fname = 'data/virus_clean.json' + self.intermediate_tree_fname = 'data/tree_refine.json' def load_from_file(self, tree_fname=None, virus_fname = None): - if tree_fname is None: tree_fname = 'data/tree_refine.json' + if tree_fname is None: tree_fname = self.intermediate_tree_fname if os.path.isfile(tree_fname): self.tree = json_to_dendropy(read_json(tree_fname)) - if virus_fname is None: virus_fname = 'data/virus_clean.json' + if virus_fname is None: virus_fname = self.clean_virus_fname if os.path.isfile(virus_fname): self.viruses = read_json(virus_fname) @@ -64,20 +66,41 @@ def infer_ancestral(self, virus_fname = None): def refine_tree(self): import tree_refine tree_refine.main(self.tree, self.viruses) - write_json(dendropy_to_json(self.tree.seed_node), 'data/tree_refine.json') + write_json(dendropy_to_json(self.tree.seed_node), self.intermediate_tree_fname) + + def estimate_frequencies(self, tasks = ['mutations','genotypes' 'clades', 'tree']): + import bernoulli_frequency as freq_est + plot=False + freq_est.flu_stiffness = config['frequency_stiffness'] + freq_est.time_interval = config['time_interval'] + freq_est.pivots_per_year = config['pivots_per_year'] + freq_est.relevant_pos_cutoff = 0.1 + + if 'mutations' in tasks or 'genotypes' in tasks: + self.frequencies['mutations'], relevant_pos = freq_est.all_mutations(self.tree, config['aggregate_regions'], threshold = 5, plot=plot) + if 'genotypes' in tasks: + self.frequencies['genotypes'] = freq_est.all_genotypes(self.tree, config['aggregate_regions'], relevant_pos) + if 'clades' in tasks: + self.frequencies['clades'] = freq_est.all_clades(self.tree, config['aggregate_regions'], plot) + + if 'tree' in tasks: + for region_label, regions in config['aggregate_regions']: + print "--- "+"adding frequencies to tree "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" + freq_est.estimate_tree_frequencies(self.tree, threshold = 10, regions=regions, region_name=region_label) def export_to_auspice(self): import streamline tree_json = dendropy_to_json(self.tree.seed_node) - streamline.main(tree_json) + streamline.main(tree_json, self.frequencies) - def run(self,years_back=3, viruses_per_month=50, raxml_time_limit = 1.0): + def run(self,years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwargs): self.load_viruses(years_back=years_back, viruses_per_month=viruses_per_month) - self.clean_viruses() self.align() + self.clean_viruses() self.infer_tree(raxml_time_limit = raxml_time_limit) self.infer_ancestral() self.refine_tree() + self.estimate_frequencies() self.export_to_auspice() if __name__=="__main__": diff --git a/augur/src/process.py b/augur/src/process.py index 0e50ee0c..5cb6676e 100644 --- a/augur/src/process.py +++ 
b/augur/src/process.py @@ -15,13 +15,13 @@ def main(years_back=3, viruses_per_month=50): virus_fname = 'data/gisaid_epiflu_sequence.fasta' # Filter sequences - virus_fname = virus_filter.main(virus_fname, years_back=years_back, viruses_per_month=viruses_per_month) + virus_fname = H3N2_filter.main(virus_fname, years_back=years_back, viruses_per_month=viruses_per_month) # Align sequences virus_fname = virus_align.main(virus_fname) # Clean sequences - virus_fname = H3N2_filter.main(virus_fname) + vires_fname = virus_clean.main(virus_fname) # Make tree, creates raxml files tree_fname = tree_infer.main(virus_fname) diff --git a/augur/src/streamline.py b/augur/src/streamline.py index be76aa26..c02e1c91 100644 --- a/augur/src/streamline.py +++ b/augur/src/streamline.py @@ -2,7 +2,7 @@ from io_util import * from tree_util import * -def main(tree_json): +def main(tree_json, frequencies): """Prep tree for auspice, stripping sequence data""" print "--- Streamline at " + time.strftime("%H:%M:%S") + " ---" @@ -35,7 +35,7 @@ def main(tree_json): write_json(tree_json, out_fname_tree, indent=1) # Include genotype frequencies - shutil.copy2("data/genotype_frequencies.json", "../auspice/data/frequencies.json") + write_json(frequencies, "../auspice/data/frequencies.json") if __name__ == "__main__": main() From 8c5a91d074dbe9456dfbadbc3549b4befc540122 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 27 Feb 2015 07:27:28 +0100 Subject: [PATCH 17/48] made config file optional argument, added root trimming when outgroup is missing (not quite sure why) --- augur/nextflu_config.py | 1 + augur/src/nextflu_process.py | 9 ++++++--- augur/src/tree_refine.py | 16 ++++++++++------ 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/augur/nextflu_config.py b/augur/nextflu_config.py index 0b3a890a..636db24e 100644 --- a/augur/nextflu_config.py +++ b/augur/nextflu_config.py @@ -3,6 +3,7 @@ 'alignment_file':'data/20150222_all_H3N2_HA1.fasta', 'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, 'outgroup':'A/Beijing/32/1992', + 'max_global':True, # sample as evenly as possible from different geographic regions 'aggregate_regions': [ ("global", None), ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], 'frequency_stiffness':1.0, diff --git a/augur/src/nextflu_process.py b/augur/src/nextflu_process.py index 1bd299a9..e7267947 100644 --- a/augur/src/nextflu_process.py +++ b/augur/src/nextflu_process.py @@ -1,6 +1,5 @@ import time, argparse,os,subprocess, shutil, glob, sys sys.path.append('./src') -from nextflu_config import config from Bio import SeqIO from io_util import write_json, read_json, write_fasta, read_fasta from tree_util import dendropy_to_json, json_to_dendropy, delimit_newick @@ -36,7 +35,7 @@ def load_viruses(self, aln_fname = None, years_back=3, viruses_per_month=50): my_filter = virus_filter(aln_fname, fasta_fields) my_filter.filter() my_filter.subsample(years_back, viruses_per_month, prioritize = force_include_strains, - all_priority = True, region_specific=False) + all_priority = True, region_specific=config['max_global']) self.viruses = my_filter.virus_subsample write_json(self.viruses, self.initial_virus_fname) @@ -65,7 +64,7 @@ def infer_ancestral(self, virus_fname = None): def refine_tree(self): import tree_refine - tree_refine.main(self.tree, self.viruses) + tree_refine.main(self.tree, self.viruses, config['outgroup']) write_json(dendropy_to_json(self.tree.seed_node), self.intermediate_tree_fname) def 
estimate_frequencies(self, tasks = ['mutations','genotypes' 'clades', 'tree']): @@ -108,9 +107,13 @@ def run(self,years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwar parser.add_argument('-y', '--years_back', type = int, default=3, help='number of past years to sample sequences from') parser.add_argument('-v', '--viruses_per_month', type = int, default = 50, help='number of viruses sampled per month') parser.add_argument('-r', '--raxml_time_limit', type = float, default = 1.0, help='number of hours raxml is run') + parser.add_argument('--config', default = "nextflu_config.py" , type=str, help ="config file") parser.add_argument('--test', default = False, action="store_true", help ="don't run the pipeline") params = parser.parse_args() + execfile(params.config) + print config + my_nextflu = nextflu() if not params.test: my_nextflu.run(**params.__dict__) diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index 175598a6..b174f4e7 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -8,8 +8,6 @@ from date_util import * from tree_util import * -OUTGROUP = 'A/Beijing/32/1992' - def delimit_newick(infile_name): with open(infile_name, 'r') as file: newick = file.read().replace('\n', '') @@ -46,14 +44,20 @@ def get_xvalue(node): root = node.get_tree_root() return node.get_distance(root) -def remove_outgroup(tree): +def remove_outgroup(tree, outgroup): """Reroot tree to outgroup""" outgroup_node = None for node in tree.postorder_node_iter(): - if (str(node.taxon) == OUTGROUP): + if (str(node.taxon) == outgroup): outgroup_node = node if outgroup_node: tree.prune_subtree(outgroup_node) + else: + print "outgroup",outgroup, "not found" + if len(tree.seed_node.child_nodes())==1: + tree.seed_node = tree.seed_node.child_nodes()[0] + tree.seed_node.parent_node = None + tree.seed_node.edge_length = 0.002 def collapse(tree): """Collapse edges without mutations to polytomies""" @@ -189,10 +193,10 @@ def define_trunk(tree): node.trunk = True; -def main(tree, viruses): +def main(tree, viruses, outgroup): print "--- Tree refine at " + time.strftime("%H:%M:%S") + " ---" print "Remove outgroup" - remove_outgroup(tree) + remove_outgroup(tree, outgroup) print "Remove outlier branches" reduce(tree) print "Collapse internal nodes" From 95a958d3cfcdff99ac0230a2c19e286d2ff08f76 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 27 Feb 2015 09:05:24 +0100 Subject: [PATCH 18/48] added flags to run parts of the pipeline, simplified sequences.json (why not make a dict directly) current pipeline runs for me on small data sets and auspice has all the functionality --- augur/src/bernoulli_frequency.py | 11 +++++++---- augur/src/nextflu_process.py | 23 +++++++++++++++++++++-- augur/src/streamline.py | 15 ++++++++------- augur/src/tree_refine.py | 2 +- augur/src/tree_util.py | 7 +++++-- auspice/js/auspice.js | 13 +++++++------ 6 files changed, 49 insertions(+), 22 deletions(-) diff --git a/augur/src/bernoulli_frequency.py b/augur/src/bernoulli_frequency.py index 154ac179..14da69dd 100644 --- a/augur/src/bernoulli_frequency.py +++ b/augur/src/bernoulli_frequency.py @@ -35,10 +35,13 @@ def running_average(obs, ws): obs -- observations ws -- winodw size (number of points to average) ''' - tmp_vals = np.convolve(np.ones(ws, dtype=float)/ws, obs, mode='same') - # fix the edges. 
using mode='same' assumes zeros outside the range - tmp_vals[:ws//2]*=float(ws)/np.arange(ws//2,ws) - tmp_vals[-ws//2+1:]*=float(ws)/np.arange(ws-1,ws//2,-1.0) + try: + tmp_vals = np.convolve(np.ones(ws, dtype=float)/ws, obs, mode='same') + # fix the edges. using mode='same' assumes zeros outside the range + tmp_vals[:ws//2]*=float(ws)/np.arange(ws//2,ws) + tmp_vals[-ws//2+1:]*=float(ws)/np.arange(ws-1,ws//2,-1.0) + except: + import pdb; pdb.set_trace() return tmp_vals def fix_freq(freq, pc): diff --git a/augur/src/nextflu_process.py b/augur/src/nextflu_process.py index e7267947..21892b0d 100644 --- a/augur/src/nextflu_process.py +++ b/augur/src/nextflu_process.py @@ -13,6 +13,7 @@ def __init__(self): self.initial_virus_fname = 'data/virus_ingest.json' self.clean_virus_fname = 'data/virus_clean.json' self.intermediate_tree_fname = 'data/tree_refine.json' + self.frequency_fname = 'data/frequencies.json' def load_from_file(self, tree_fname=None, virus_fname = None): if tree_fname is None: tree_fname = self.intermediate_tree_fname @@ -21,6 +22,8 @@ def load_from_file(self, tree_fname=None, virus_fname = None): if virus_fname is None: virus_fname = self.clean_virus_fname if os.path.isfile(virus_fname): self.viruses = read_json(virus_fname) + if os.path.isfile(self.frequency_fname): + self.frequencies = read_json(self.frequency_fname) def load_viruses(self, aln_fname = None, years_back=3, viruses_per_month=50): if config['virus']: @@ -67,7 +70,7 @@ def refine_tree(self): tree_refine.main(self.tree, self.viruses, config['outgroup']) write_json(dendropy_to_json(self.tree.seed_node), self.intermediate_tree_fname) - def estimate_frequencies(self, tasks = ['mutations','genotypes' 'clades', 'tree']): + def estimate_frequencies(self, tasks = ['mutations','genotypes', 'clades', 'tree']): import bernoulli_frequency as freq_est plot=False freq_est.flu_stiffness = config['frequency_stiffness'] @@ -81,6 +84,8 @@ def estimate_frequencies(self, tasks = ['mutations','genotypes' 'clades', 'tree' self.frequencies['genotypes'] = freq_est.all_genotypes(self.tree, config['aggregate_regions'], relevant_pos) if 'clades' in tasks: self.frequencies['clades'] = freq_est.all_clades(self.tree, config['aggregate_regions'], plot) + if any(x in tasks for x in ['mutations','clades', 'genotypes']): + write_json(self.frequencies, self.frequency_fname) if 'tree' in tasks: for region_label, regions in config['aggregate_regions']: @@ -109,11 +114,25 @@ def run(self,years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwar parser.add_argument('-r', '--raxml_time_limit', type = float, default = 1.0, help='number of hours raxml is run') parser.add_argument('--config', default = "nextflu_config.py" , type=str, help ="config file") parser.add_argument('--test', default = False, action="store_true", help ="don't run the pipeline") + parser.add_argument('--virus', default = False, action="store_true", help ="only select viruses") + parser.add_argument('--tree', default = False, action="store_true", help ="only build tree") + parser.add_argument('--frequencies', default = False, action="store_true", help ="only estimate frequencies") params = parser.parse_args() execfile(params.config) print config my_nextflu = nextflu() - if not params.test: + my_nextflu.load_from_file() + if params.virus: + my_nextflu.load_viruses(years_back=params.years_back, viruses_per_month = params.viruses_per_month) + my_nextflu.align() + my_nextflu.clean_viruses() + elif params.tree: + my_nextflu.infer_tree(raxml_time_limit=params.raxml_time_limit) + 
my_nextflu.infer_ancestral() + my_nextflu.refine_tree() + elif params.frequencies: + my_nextflu.estimate_frequencies() + elif not params.test: my_nextflu.run(**params.__dict__) diff --git a/augur/src/streamline.py b/augur/src/streamline.py index c02e1c91..f8b9628b 100644 --- a/augur/src/streamline.py +++ b/augur/src/streamline.py @@ -9,14 +9,15 @@ def main(tree_json, frequencies): # Move sequence data to separate file print "Writing sequences" - elems = [] + elems = {} for node in all_descendants(tree_json): - elem = {} - if 'clade' in node: - elem['clade'] = node['clade'] - if 'aa_seq' in node: - elem['aa_seq'] = node['aa_seq'] - elems.append(elem) + if "clade" in tree_json: + elems[node["clade"]] = node["aa_seq"] +# if 'clade' in node: +# elem['clade'] = node['clade'] +# if 'aa_seq' in node: +# elem['aa_seq'] = node['aa_seq'] +# elems.append(elem) write_json(elems, "../auspice/data/sequences.json", indent=None) # Streamline tree for auspice diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index b174f4e7..ad2df13e 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -48,7 +48,7 @@ def remove_outgroup(tree, outgroup): """Reroot tree to outgroup""" outgroup_node = None for node in tree.postorder_node_iter(): - if (str(node.taxon) == outgroup): + if (str(node.taxon).lower() == outgroup.lower()): outgroup_node = node if outgroup_node: tree.prune_subtree(outgroup_node) diff --git a/augur/src/tree_util.py b/augur/src/tree_util.py index 490634db..1f07db05 100644 --- a/augur/src/tree_util.py +++ b/augur/src/tree_util.py @@ -123,7 +123,7 @@ def dendropy_to_json(node, extra_attr = ['ep', 'ne', 'rb','tol', 'fitness', 'ser try: if hasattr(node, 'freq') and node.freq is not None: - json['freq'] = {reg: [round(x, 3) for x in freq] if freq is not None else "undefined" for reg, freq in node.freq.iteritems()} + json['freq'] = {reg: [round(x, 3) for x in freq] if freq is not None else "undefined" for reg, freq in node.freq.iteritems()} if hasattr(node, 'logit_freq') and node.logit_freq is not None: json['logit_freq'] = {reg: [round(x,3) for x in freq] if freq is not None else "undefined" for reg, freq in node.logit_freq.iteritems()} if hasattr(node, 'pivots'): @@ -172,7 +172,10 @@ def json_to_dendropy_sub(json, node, taxon_set): try: node.__setattr__(attr, float(val)) except: - node.__setattr__(attr, val) + if val=='undefined': + node.__setattr__(attr, None) + else: + node.__setattr__(attr, val) if len(node.child_nodes())==0: node.taxon = dendropy.Taxon(label=json['strain'].lower()) node.strain = json['strain'] diff --git a/auspice/js/auspice.js b/auspice/js/auspice.js index fe7f8c37..afde6e45 100644 --- a/auspice/js/auspice.js +++ b/auspice/js/auspice.js @@ -1022,12 +1022,13 @@ d3.json("data/meta.json", function(error, json) { d3.json("data/sequences.json", function(error, json) { if (error) return console.warn(error); - for (var key in json) { - if (json.hasOwnProperty(key)) { - var hash = json[key]; - cladeToSeq[hash['clade']] = hash['aa_seq']; - } - } + cladeToSeq=json; +// for (var key in json) { +// if (json.hasOwnProperty(key)) { +// var hash = json[key]; +// cladeToSeq[hash['clade']] = hash['aa_seq']; +// } +// } }); d3.json("data/frequencies.json", function(error, json){ From 6401b6effb77955721dfa01ac8ce9f93a7ead375 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 27 Feb 2015 17:29:13 +0100 Subject: [PATCH 19/48] added cds field to config to trim the relevant coding area. 
this way we can get rid of the all the +- 15 --- augur/nextflu_config.py | 3 ++- augur/src/nextflu_process.py | 2 +- augur/src/seq_util.py | 20 +++++++++++--------- augur/src/tree_refine.py | 8 ++++---- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/augur/nextflu_config.py b/augur/nextflu_config.py index 636db24e..b709c974 100644 --- a/augur/nextflu_config.py +++ b/augur/nextflu_config.py @@ -8,5 +8,6 @@ ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], 'frequency_stiffness':1.0, 'time_interval':(2012.0, 2015.1), - 'pivots_per_year':6.0 + 'pivots_per_year':6.0, + 'cds':[48,-1] } diff --git a/augur/src/nextflu_process.py b/augur/src/nextflu_process.py index 21892b0d..73d1366c 100644 --- a/augur/src/nextflu_process.py +++ b/augur/src/nextflu_process.py @@ -67,7 +67,7 @@ def infer_ancestral(self, virus_fname = None): def refine_tree(self): import tree_refine - tree_refine.main(self.tree, self.viruses, config['outgroup']) + tree_refine.main(self.tree, self.viruses, config['outgroup'], config['cds']) write_json(dendropy_to_json(self.tree.seed_node), self.intermediate_tree_fname) def estimate_frequencies(self, tasks = ['mutations','genotypes', 'clades', 'tree']): diff --git a/augur/src/seq_util.py b/augur/src/seq_util.py index 40159e77..5afdfcc6 100644 --- a/augur/src/seq_util.py +++ b/augur/src/seq_util.py @@ -1,6 +1,7 @@ from itertools import izip import numpy as np -epitope_mask = np.fromstring("00000000000000000000000000000000000000000000000000000000000011111011011001010011000100000001001011110011100110101000001100000100000001000110101011111101011010111110001010011111000101011011111111010010001111101110111001010001110011111111000000111110000000101010101110000000000011100100000001011011100000000000001001011000110111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", dtype='S1') +#epitope_mask = np.fromstring("00000000000000000000000000000000000000000000000000000000000011111011011001010011000100000001001011110011100110101000001100000100000001000110101011111101011010111110001010011111000101011011111111010010001111101110111001010001110011111111000000111110000000101010101110000000000011100100000001011011100000000000001001011000110111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", dtype='S1') +epitope_mask = np.fromstring("0000000000000000000000000000000000000000000011111011011001010011000100000001001011110011100110101000001100000100000001000110101011111101011010111110001010011111000101011011111111010010001111101110111001010001110011111111000000111110000000101010101110000000000011100100000001011011100000000000001001011000110111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", dtype='S1') def partition_string(string, length): return list(string[0+i:length+i] for i in range(0, len(string), length)) @@ -19,18 +20,19 @@ def nonepitope_sites(aa): return ''.join(aaa[epitope_mask[:len(aa)]=='0']) def receptor_binding_sites(aa): - """Receptor binding site mutations from Koel et al. 
2014""" - """These are (145, 155, 156, 158, 159, 189, 193) in canonical HA numbering""" - """When counting from ATG/M, need to offset by 16, giving (161, 171, 172, 174, 175, 205, 209)""" - """When indexing from 0, these are (160, 170, 171, 173, 174, 204, 208)""" - sites = [160, 170, 171, 173, 174, 204, 208] + ''' + Receptor binding site mutations from Koel et al. 2014 + These are (145, 155, 156, 158, 159, 189, 193) in canonical HA numbering + need to subtract one since python arrays start at 0 + ''' + sites = [144, 154, 155, 157, 158, 188, 192] return ''.join([aa[pos] for pos in sites]) def get_HA1(aa): ''' - return the part of the peptide corresponding to HA1, starts at pos 16, is 329 aa long + return the part of the peptide corresponding to HA1, starts is 329 aa long ''' - return aa[16:(16+329)] + return aa[:329] def epitope_distance(aaA, aaB): """Return distance of sequences aaA and aaB by comparing epitope sites""" @@ -64,7 +66,7 @@ def json_to_Bio_alignment(seq_json): def main(): """Testing with Hong Kong/68""" nuc = "ATGAAGACCATCATTGCTTTGAGCTACATTTTCTGTCTGGCTCTCGGCCAAGACCTTCCAGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCGGTGCCAAACGGAACACTAGTGAAAACAATCACAGATGATCAGATTGAAGTGACTAATGCTACTGAGCTAGTTCAGAGCTCCTCAACGGGGAAAATATGCAACAATCCTCATCGAATCCTTGATGGAATAGACTGCACACTGATAGATGCTCTATTGGGGGACCCTCATTGTGATGTTTTTCAAAATGAGACATGGGACCTTTTCGTTGAACGCAGCAAAGCTTTCAGCAACTGTTACCCTTATGATGTGCCAGATTATGCCTCCCTTAGGTCACTAGTTGCCTCGTCAGGCACTCTGGAGTTTATCACTGAGGGTTTCACTTGGACTGGGGTCACTCAGAATGGGGGAAGCAATGCTTGCAAAAGGGGACCTGGTAGCGGTTTTTTCAGTAGACTGAACTGGTTGACCAAATCAGGAAGCACATATCCAGTGCTGAACGTGACTATGCCAAACAATGACAATTTTGACAAACTATACATTTGGGGGGTTCACCACCCGAGCACGAACCAAGAACAAACCAGCCTGTATGTTCAAGCATCAGGGAGAGTCACAGTCTCTACCAGAAGAAGCCAGCAAACTATAATCCCGAATATCTGGTCCAGACCCTGGGTAAGGGGTCTGTCTAGTAGAATAAGCATCTATTGGACAATAGTTAAGCCGGGAGACGTACTGGTAATTAATAGTAATGGGAACCTAATCGCTCCTCGGGGTTATTTCAAAATGCGCACTGGGAAAAGCTCAATAATGAGGTCAGATGCACCTATTGATACCTGTATTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAGCCCTTTCAAAACGTAAACAAGATCACATATGGAGCATGCCCCAAGTATGTTAAGCAAAACACC" - aa = translate(nuc) + aa = translate(nuc[48:]) ep = epitope_sites(aa) ne = nonepitope_sites(aa) rb = receptor_binding_sites(aa) diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index ad2df13e..39f81d4a 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -149,9 +149,9 @@ def add_node_attributes(tree): node.trunk_count = 0 node.trunk = False -def translate_all(tree): +def translate_all(tree, cds): for node in tree.postorder_node_iter(): - node.aa_seq = translate(node.seq) + node.aa_seq = translate(node.seq[cds[0]:cds[1]]) def unique_date(tree): leaf_count = 0 @@ -193,7 +193,7 @@ def define_trunk(tree): node.trunk = True; -def main(tree, viruses, outgroup): +def main(tree, viruses, outgroup, cds = [0,-1]): print "--- Tree refine at " + time.strftime("%H:%M:%S") + " ---" print "Remove outgroup" remove_outgroup(tree, outgroup) @@ -207,7 +207,7 @@ def main(tree, viruses, outgroup): add_virus_attributes(viruses, tree) add_node_attributes(tree) print "Translate nucleotide sequences" - translate_all(tree) + translate_all(tree, cds) print "Enumerate leaves of ladderized tree and calculate unique numerical date" unique_date(tree) print "Define trunk" From 5366021819d67eba7f143095c5695a6447e95205 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 27 Feb 2015 19:08:50 +0100 Subject: [PATCH 20/48] added cds, changed +/- 15 in auspice to +/- 1 (generic 0/1 numbering difference) --- augur/nextflu_config.py | 2 +- auspice/js/auspice.js | 14 ++++---------- 2 files changed, 5 
insertions(+), 11 deletions(-) diff --git a/augur/nextflu_config.py b/augur/nextflu_config.py index b709c974..e3fcfba1 100644 --- a/augur/nextflu_config.py +++ b/augur/nextflu_config.py @@ -8,6 +8,6 @@ ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], 'frequency_stiffness':1.0, 'time_interval':(2012.0, 2015.1), - 'pivots_per_year':6.0, + 'pivots_per_year':12.0, 'cds':[48,-1] } diff --git a/auspice/js/auspice.js b/auspice/js/auspice.js index 3d8f13db..dacc19cb 100644 --- a/auspice/js/auspice.js +++ b/auspice/js/auspice.js @@ -929,9 +929,9 @@ d3.json("data/tree.json", function(error, root) { var positions_string = document.getElementById("gt-color").value.split(','); var positions_list = [] positions_string.map(function(d) { - val = parseInt(d)+15; + val = parseInt(d)+1; if (!isNaN(val)) { - if (val < 561) { + if (val < 551) { positions_list.push(val); } } @@ -950,7 +950,7 @@ d3.json("data/tree.json", function(error, root) { var gts = nodes.map(function (d) {var tmp = []; for (var i=0; i1){ for (freq_gt in json["genotypes"][region]){ var gt_agree = gt.map(function (d) { - var aa =freq_gt[parseInt(d.substring(0,d.length-1))+15]; + var aa =freq_gt[parseInt(d.substring(0,d.length-1))+1]; return (aa==d[d.length-1])||(aa=='.'); }); if (gt_agree.every(function (d,i,a) {return d;})) From 36ef6d4dc316460cc3af057e0f6c1680faa46bcf Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 27 Feb 2015 19:58:33 +0100 Subject: [PATCH 21/48] * moved clade_designations to config file. * fixed error in amino-acid position mapping -15 -> +1, +15 -> -1 rather than the other way around. this is because there are 16 codons prior to HA1 start. * made the frequency estimation restricted to the pivots where variation is observed. this speeds it up by a factor of 2 for the 3 y tree I tested. 
gain should be larger when covering later time intervals --- augur/nextflu_config.py | 8 ++- augur/src/bernoulli_frequency.py | 102 +++++++++++++++++-------------- augur/src/nextflu_process.py | 3 +- auspice/js/auspice.js | 6 +- 4 files changed, 69 insertions(+), 50 deletions(-) diff --git a/augur/nextflu_config.py b/augur/nextflu_config.py index e3fcfba1..5a0ef9b5 100644 --- a/augur/nextflu_config.py +++ b/augur/nextflu_config.py @@ -9,5 +9,11 @@ 'frequency_stiffness':1.0, 'time_interval':(2012.0, 2015.1), 'pivots_per_year':12.0, - 'cds':[48,-1] + 'cds':[48,-1], # define the HA1 start i n 0 numbering + # define relevant clades in canonical HA1 numbering (+1) + 'clade_designations': { "3c3.a":[(128,'A'), (142,'G'), (159,'S')], + "3c3": [(128,'A'), (142,'G'), (159,'F')], + "3c2.a":[(144,'S'), (159,'Y'), (225,'D'), (311,'H'),(489,'N')], + "3c2": [(144,'N'), (159,'F'),(225,'N'), (489,'N')] + } } diff --git a/augur/src/bernoulli_frequency.py b/augur/src/bernoulli_frequency.py index 14da69dd..5898b5fa 100644 --- a/augur/src/bernoulli_frequency.py +++ b/augur/src/bernoulli_frequency.py @@ -15,19 +15,11 @@ relevant_pos_cutoff = 0.1 inertia = 0.7 # fraction of previous frequency changes that is carried over window_size = 20 # smooting window +extra_pivots=5 tol = 1e-4 reg = 1e-6 debug = False -clade_designations = { "3c3.a":[(128,'A'), (142,'G'), (159,'S')], - "3c3": [(128,'A'), (142,'G'), (159,'F')], - "3c2.a":[(144,'S'), (159,'Y'), (225,'D'), (311,'H'),(489,'N')], - "3c2": [(144,'N'), (159,'F'),(225,'N'), (489,'N')], - } - -region_names = ['Europe', 'India', 'NorthAmerica', 'SouthAmerica', 'Africa', - 'JapanKorea', 'Oceania', 'China', 'WestAsia', 'SoutheastAsia'] - cols = np.array([(166,206,227),(31,120,180),(178,223,138),(51,160,44),(251,154,153),(227,26,28),(253,191,111),(255,127,0),(202,178,214),(106,61,154)], dtype=float)/255 def running_average(obs, ws): ''' @@ -105,8 +97,8 @@ def __init__(self, observations, pivots = None, stiffness = 20.0, logit=False, v self.verbose=verbose # make sure they are searchsorted tmp = np.argsort(self.tps) - self.tps = self.tps[tmp] - self.obs = self.obs[tmp] + self.full_tps = self.tps[tmp] + self.full_obs = self.obs[tmp] if pivots is None: self.final_pivot_tps = get_pivots(self.tps[0], self.tps[1]) @@ -161,33 +153,56 @@ def logLH(self, pivots): def learn(self): from scipy.optimize import fmin_powell as minimizer - self.final_pivot_freq = self.initial_guess(self.final_pivot_tps, ws=2*(min(50,len(self.obs))//2)) + switches = np.abs(np.diff(self.obs)).nonzero()[0] + try: + if len(switches)>5: + first_switch = self.tps[switches[0]] + last_switch = self.tps[switches[-1]] + else: + first_switch = self.tps[0] + last_switch = self.tps[-1] + if first_switch>self.final_pivot_tps[0]: + first_pivot = max(0, np.where(first_switch<=self.final_pivot_tps)[0][0] - extra_pivots) + else: + first_pivot=0 + if last_switchself.final_pivot_tps)[0][-1]+extra_pivots) + else: + last_pivot = len(self.final_pivot_tps) + tmp_pivots = self.final_pivot_tps[first_pivot:last_pivot] + if min(np.diff(tmp_pivots))<0.000001: + print pivots + self.tps = self.full_tps[(self.full_tps>=tmp_pivots[0])*(self.full_tps=tmp_pivots[0])*(self.full_tps0) # instantiate an interpolation object based on the optimal frequency pivots self.frequency_estimate = interp1d(self.pivot_tps, self.pivot_freq, kind=self.interolation_type, bounds_error=False) + if min(np.diff(self.pivot_tps))<0.000001: + print pivots if self.verbose: print "neg logLH using",len(self.pivot_tps),"pivots:", self.logLH(self.pivot_freq) - 
self.final_pivot_freq=self.pivot_freq + self.final_pivot_freq=np.zeros_like(self.final_pivot_tps) + self.final_pivot_freq[first_pivot:last_pivot]=self.pivot_freq + self.final_pivot_freq[:first_pivot] = self.final_pivot_freq[first_pivot] + self.final_pivot_freq[last_pivot:] = self.final_pivot_freq[last_pivot-1] + self.frequency_estimate = interp1d(self.final_pivot_tps, self.final_pivot_freq, kind=self.interolation_type, bounds_error=False) def estimate_sub_frequencies(node, all_dates, tip_to_date_index, threshold=50, region_name="global"): # extract time points and the subset of observations that fall in the clade. @@ -219,15 +234,12 @@ def estimate_sub_frequencies(node, all_dates, tip_to_date_index, threshold=50, r fe = frequency_estimator(zip(tps, obs), pivots=pivots, stiffness=flu_stiffness*len(all_dates)/2000.0, logit=True) fe.learn() - try: - # assign the frequency vector to the node - child.freq[region_name] = frequency_left * logit_inv(fe.pivot_freq) - child.logit_freq[region_name] = logit_transform(child.freq[region_name]) - except: - import pdb; pdb.set_trace() + # assign the frequency vector to the node + child.freq[region_name] = frequency_left * logit_inv(fe.final_pivot_freq) + child.logit_freq[region_name] = logit_transform(child.freq[region_name]) # update the frequency remaining to be explained and prune explained observations - frequency_left *= (1.0-logit_inv(fe.pivot_freq)) + frequency_left *= (1.0-logit_inv(fe.final_pivot_freq)) tps_left = np.ones_like(tps,dtype=bool) tps_left[obs]=False # prune observations from clade tps = tps[tps_left] @@ -339,7 +351,7 @@ def estimate_genotype_frequency(tree, gt, time_interval=None, regions = None, re return fe.frequency_estimate, (tps,obs) -def determine_clade_frequencies(tree, regions=None, plot=False): +def determine_clade_frequencies(tree, clades, regions=None, plot=False): ''' loop over different clades and determine their frequencies returns a dictionary with clades:frequencies @@ -349,9 +361,9 @@ def determine_clade_frequencies(tree, regions=None, plot=False): clade_frequencies = {"pivots":list(get_pivots(time_interval[0], time_interval[1])), "xpol_pivots":list(xpol_pivots)} - for ci, (clade_name, clade_gt) in enumerate(clade_designations.iteritems()): + for ci, (clade_name, clade_gt) in enumerate(clades.iteritems()): print "estimating frequency of clade", clade_name, clade_gt - freq, (tps, obs) = estimate_genotype_frequency(tree, [(pos+15, aa) for pos, aa in clade_gt], time_interval, regions) + freq, (tps, obs) = estimate_genotype_frequency(tree, [(pos-1, aa) for pos, aa in clade_gt], time_interval, regions) clade_frequencies[clade_name] = list(np.round(logit_inv(freq.y),3)) if plot: grid_tps = np.linspace(time_interval[0], time_interval[1], 100) @@ -388,12 +400,12 @@ def determine_mutation_frequencies(tree, regions=None, threshold=50, plot=False) if count>threshold and countrelevant_pos_cutoff: - pos = int(mut.split('_')[-1][:-1])+15 + pos = int(mut.split('_')[-1][:-1])-1 relevant_pos.append(pos) relevant_pos = sorted(set(relevant_pos)) @@ -491,7 +503,7 @@ def all_genotypes(tree, region_list, relevant_pos): return gt_frequencies -def all_clades(tree, region_list, plot=False): +def all_clades(tree, clades, region_list, plot=False): clade_frequencies = {} import matplotlib.pyplot as plt for region_label, regions in region_list: @@ -499,7 +511,7 @@ def all_clades(tree, region_list, plot=False): if plot: plt.figure("region "+region_label, figsize = (12,7)) if regions is not None: plt.title("Region: "+", ".join(regions)) - 
clade_frequencies[region_label] = determine_clade_frequencies(tree, clades, regions=regions, plot=plot) if plot: plt.legend() ticloc = np.arange(time_interval[0], int(time_interval[1])+1,1) @@ -510,7 +522,7 @@ plt.savefig('data/clade_frequencies_'+region_label+'.pdf') return clade_frequencies -def main(tree_fname = 'data/tree_refine.json', clades_freq = True, mutation_freq = True, tree_freq = True): +def main(tree_fname = 'data/tree_refine.json', clades=None, clades_freq = True, mutation_freq = True, tree_freq = True): # load tree from io_util import read_json plot = debug @@ -528,8 +540,8 @@ def main(tree_fname = 'data/tree_refine.json', clades_freq = True, mutation_freq gt_frequencies["genotypes"] = all_genotypes(tree, region_list, relevant_pos) write_json(gt_frequencies, out_fname, indent=None) - if clades_freq: - gt_frequencies["clades"] = all_clades(tree, region_list, plot) + if clades_freq and clades is not None: + gt_frequencies["clades"] = all_clades(tree, clades, region_list, plot) if clades_freq or mutation_freq: # round frequencies diff --git a/augur/src/nextflu_process.py b/augur/src/nextflu_process.py index 73d1366c..b76cb957 100644 --- a/augur/src/nextflu_process.py +++ b/augur/src/nextflu_process.py @@ -83,7 +83,8 @@ def estimate_frequencies(self, tasks = ['mutations','genotypes', 'clades', 'tree if 'genotypes' in tasks: self.frequencies['genotypes'] = freq_est.all_genotypes(self.tree, config['aggregate_regions'], relevant_pos) if 'clades' in tasks: - self.frequencies['clades'] = freq_est.all_clades(self.tree, config['aggregate_regions'], plot) + self.frequencies['clades'] = freq_est.all_clades(self.tree, config['clade_designations'], + config['aggregate_regions'], plot) if any(x in tasks for x in ['mutations','clades', 'genotypes']): write_json(self.frequencies, self.frequency_fname) diff --git a/auspice/js/auspice.js b/auspice/js/auspice.js index dacc19cb..a37f68bc 100644 --- a/auspice/js/auspice.js +++ b/auspice/js/auspice.js @@ -929,7 +929,7 @@ d3.json("data/tree.json", function(error, root) { var positions_string = document.getElementById("gt-color").value.split(','); var positions_list = [] positions_string.map(function(d) { - val = parseInt(d)+1; + val = parseInt(d)-1; if (!isNaN(val)) { if (val < 551) { positions_list.push(val); } } @@ -950,7 +950,7 @@ d3.json("data/tree.json", function(error, root) { var gts = nodes.map(function (d) {var tmp = []; for (var i=0; i1){ for (freq_gt in json["genotypes"][region]){ var gt_agree = gt.map(function (d) { - var aa =freq_gt[parseInt(d.substring(0,d.length-1))+1]; + var aa =freq_gt[parseInt(d.substring(0,d.length-1))-1]; return (aa==d[d.length-1])||(aa=='.'); }); if (gt_agree.every(function (d,i,a) {return d;})) From 407cfdb5a7f754ab069755335a9b29e3b3bfeb80 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 27 Feb 2015 22:03:04 +0100 Subject: [PATCH 22/48] added condition to catch cases where all observations are outside the time window specified for frequency estimation --- augur/src/bernoulli_frequency.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/src/bernoulli_frequency.py b/augur/src/bernoulli_frequency.py index 5898b5fa..5e9e02ca 100644 --- a/augur/src/bernoulli_frequency.py +++ b/augur/src/bernoulli_frequency.py @@ -161,11 +161,11 @@ def learn(self): else: first_switch = self.tps[0] last_switch = self.tps[-1] - if 
first_switch>self.final_pivot_tps[0]: + if first_switch>self.final_pivot_tps[0] and first_switch < self.final_pivot_tps[-1]: first_pivot = max(0, np.where(first_switch<=self.final_pivot_tps)[0][0] - extra_pivots) else: first_pivot=0 - if last_switchself.final_pivot_tps[0]: last_pivot = min(len(self.final_pivot_tps), np.where(last_switch>self.final_pivot_tps)[0][-1]+extra_pivots) else: last_pivot = len(self.final_pivot_tps) From 49eda848b6bd2cd203e0a23e58a5e91bbb7f2c8f Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sat, 28 Feb 2015 01:23:38 +0100 Subject: [PATCH 23/48] * added HI strain file * fixed various pitfalls with frequency estimation * made regional representation robust by randomly sampling from a global representation, rather than taking the first k viruses in the list * auspice gt_chart ticks are now dynamic, adjusting when you put in a 10y tree --- augur/nextflu_config.py | 9 +- augur/source-data/HI_strains.txt | 489 +++++++++++++++++++++++++++++++ augur/src/bernoulli_frequency.py | 95 +++--- augur/src/nextflu_process.py | 9 +- augur/src/streamline.py | 10 +- augur/src/virus_filter.py | 5 +- auspice/js/auspice.js | 17 +- 7 files changed, 578 insertions(+), 56 deletions(-) create mode 100644 augur/source-data/HI_strains.txt diff --git a/augur/nextflu_config.py b/augur/nextflu_config.py index 5a0ef9b5..d2470812 100644 --- a/augur/nextflu_config.py +++ b/augur/nextflu_config.py @@ -1,15 +1,20 @@ config = { + # data source and sequence parsing/cleaning/processing 'virus':'H3N2', 'alignment_file':'data/20150222_all_H3N2_HA1.fasta', 'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, 'outgroup':'A/Beijing/32/1992', + 'force_include':'source-data/HI_strains.txt', 'max_global':True, # sample as evenly as possible from different geographic regions + 'cds':[48,-1], # define the HA1 start i n 0 numbering + + # frequency estimation parameters 'aggregate_regions': [ ("global", None), ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], - 'frequency_stiffness':1.0, + 'frequency_stiffness':10.0, 'time_interval':(2012.0, 2015.1), 'pivots_per_year':12.0, - 'cds':[48,-1], # define the HA1 start i n 0 numbering + 'min_mutation_count':10, # define relevant clades in canonical HA1 numbering (+1) 'clade_designations': { "3c3.a":[(128,'A'), (142,'G'), (159,'S')], "3c3": [(128,'A'), (142,'G'), (159,'F')], diff --git a/augur/source-data/HI_strains.txt b/augur/source-data/HI_strains.txt new file mode 100644 index 00000000..998addd0 --- /dev/null +++ b/augur/source-data/HI_strains.txt @@ -0,0 +1,489 @@ +A/TOULON/1244/2006 +A/Serres/77/2007 +A/LYON/1313/2006 +A/Nepal/921/2006 +A/LYON/1292/2006 +A/Victoria/503/2006 +A/Wisconsin/67/2005 +A/Hiroshima/52/2005 +A/Sofia/319/2007 +A/Sarajevo/21/2012 +A/Stockholm/1/2010 +A/ALASKA/5/2010 +A/Wisconsin/13/2010 +A/GUADELOUPE/202/2010 +A/SAKAI/20/2011 +A/Iowa/19/2010 +A/Florida/43/2010 +A/Uppsala/3/2011 +A/Uvurkhangai/3970/2010 +A/Ulaanbaatar/3849/2010 +A/PERTH/10/2010 +A/Centre/1329/2014 +A/Israel/Z774/2014 +A/Lisboa/MS102/2014 +A/Norway/86/2014 +A/Jordan/30022/2014 +A/Jordan/30062/2014 +A/Jordan/30541/2013 +A/Cameroon/6468/2013 +A/Cameroon/7186/2013 +A/Cameroon/6624/2013 +A/Cameroon/5793/2013 +A/Norway/58/2014 +A/Norway/161/2014 +A/Genova/09/2014 +A/Poland/1955/2014 +A/Poland/896/2014 +A/Cyprus/F4/2014 +A/Paris/1124/2014 +A/Belgium/14G0496/2014 +A/Serbia/NS-707/2014 +A/Switzerland/10261363/2014 +A/Switzerland/10295858/2014 +A/Switzerland/9851384/2014 +A/Bratislava/111/2014 
+A/Serbia/NS-783/2014 +A/Tomsk/6/2014 +A/Algeria/277/2013 +A/Extremadura/1753/2014 +A/Hatay/367/2014 +A/Norway/184/2014 +A/Georgia/76/2014 +A/Kharkov/203/2014 +A/Yaroslavl/234/2014 +A/Ireland/21550/2014 +A/Ireland/22878/2014 +A/Lithuania/9133/2014 +A/Ireland/22041/2014 +A/NEWCASTLE/22/2014 +A/Lithuania/5009/2014 +A/Belgium/14G0510/2014 +A/Lithuania/10543/2014 +A/Lithuania/9430/2014 +A/Lithuania/11496/2014 +A/Lithuania/10373/2014 +A/Lithuania/5879/2014 +A/Levice/223/2014 +A/Norway/1020/2014 +A/Madrid/SO12318/2014 +A/Austria/790710/2014 +A/Sastin/137/2014 +A/Lithuania/13347/2014 +A/Lithuania/8237/2014 +A/Mogilev/1273/2014 +A/Lithuania/10716/2014 +A/Norway/1003/2014 +A/Ireland/22350/2014 +A/Lithuania/7452/2014 +A/Canarias/1686/2014 +A/Canarias/1687/2014 +A/Canarias/1689/2014 +A/Lisboa/MS105/2014 +A/Finland/405/2014 +A/Bremen/1/2014 +A/V.Novgorod/223/2014 +A/Vladimir/220/2014 +A/Poland/1899/2014 +A/Norway/53/2014 +A/SAPPORO/116/2014 +A/Extremadura/1749/2014 +A/Albania/4979/2014 +A/Catalonia/6095851NS/2014 +A/Austria/773236/2014 +A/Austria/784503/2014 +A/Ukraine/125/2014 +A/Extremadura/1752/2014 +A/Stockholm/12/2014 +A/Norway/507/2014 +A/Albania/4869/2014 +A/Poland/120/2014 +A/Poland/179/2014 +A/Poland/1702/2014 +A/Milano/72/2014 +A/Ukraine/77/2014 +A/Ukraine/218/2014 +A/Belgium/14G0508/2014 +A/Khmelnitsky/244/2014 +A/Ukraine/86/2014 +A/Catalonia/6623S/2013 +A/Belgrade/2668/2014 +A/Kragujevac/2378/2014 +A/Trieste/10/2014 +A/Firenze/9/2014 +A/Slovenia/87/2014 +A/Norway/277/2014 +A/Norway/347/2014 +A/Norway/850/2014 +A/Ukraine/6101/2014 +A/Dnipro/232/2014 +A/Ukraine/6080/2014 +A/Ukraine/6158/2014 +A/Lithuania/7802/2014 +A/Iceland/45/2014 +A/Hessen/4/2014 +A/Slovenia/622/2014 +A/Kragujevac/2337/2014 +A/Valjevo/2029/2014 +A/Valencia/624026S/2014 +A/Norway/389/2014 +A/Switzerland/10295823/2014 +A/Milano/58/2014 +A/Sassari/17/2014 +A/Austria/786648/2014 +A/TOKYO/31512/2013 +A/Norway/1078/2014 +A/Norway/313/2014 +A/Belgium/14S0265/2014 +A/Serbia/NS-666/2014 +A/Serbia/NS-669/2014 +A/England/256/2014 +A/Milano/84/2014 +A/Ghana/DILI-0479/2014 +A/Glasgow/4165/2014 +A/Stockholm/6/2014 +A/Palau/6759/2014 +A/Finland/438/2014 +A/Finland/439/2014 +A/Finland/440/2014 +A/Glasgow/4144/2014 +A/TASMANIA/11/2014 +A/OSAKA-C/2003/2014 +A/Norway/466/2014 +A/Switzerland/9715293/2013 +A/Finland/428/2014 +A/Finland/437/2014 +A/Ghana/DILI-0522/2014 +A/Ghana/DILI-0483/2014 +A/Ghana/DARI-0104/2014 +A/Ghana/DILI-0658/2014 +A/Ghana/DILI-0659/2014 +A/Ghana/FS-0514/2014 +A/Ghana/DARI-0101/2014 +A/Dakar/09/2014 +A/Dakar/15/2014 +A/Dakar/10/2014 +A/Dakar/13/2014 +A/Dakar/14/2014 +A/Dakar/16/2014 +A/Dakar/17/2014 +A/Ghana/DILI-0428/2014 +A/Dakar/12/2014 +A/Galicia/1786/2014 +A/Bretagne/1267/2014 +A/Salamanca/44/2014 +A/Dnipro/235/2014 +A/Catalonia/2141012NS/2013 +A/Jiangxi-Xunyang/1790/2013 +A/Jordan/30677/2013 +A/Dnipro/229/2014 +A/Ukraine/6054/2014 +A/Fujian-Yanping/2707/2013 +A/Chongqing-Yuzhong/11653/2013 +A/Slovenia/1213/2014 +A/Slovenia/712/2014 +A/Poland/1111/2014 +A/Poland/154/2014 +A/Baden-Wurttemberg/14/2014 +A/Ukraine/6069/2014 +A/Ukraine/6079/2014 +A/Ukraine/6097/2014 +A/Ukraine/6138/2014 +A/Ukraine/6141/2014 +A/Austria/788090/2014 +A/Zambia/13/093/2013 +A/Zambia/13/127/2013 +A/Ukraine/6156/2014 +A/Botosani/162690/2014 +A/Austria/772655/2014 +A/Ankara/4084/2013 +A/Segovia/81/2014 +A/Bulgaria/127/2014 +A/Zambia/13/109/2013 +A/Lamia.GR/79/2014 +A/Austria/785558/2014 +A/Samara/73/2013 +A/Zambia/13/176/2013 +A/Norway/120/2014 +A/Norway/226/2014 +A/Latvia/03-026603/2014 +A/Bulgaria/725/2014 +A/Parma/40/2014 
+A/Antalya/1083/2013 +A/Georgia/328/2014 +A/Poland/160/2014 +A/Serbia/NS-682/2014 +A/Berlin/20/2014 +A/Bulgaria/845/2014 +A/Ankara/266/2014 +A/Bulgaria/652/2014 +A/Bulgaria/147/2014 +A/Hamburg/1/2014 +A/Poland/148/2014 +A/Poland/39/2014 +A/Poland/40/2014 +A/Madagascar/00153/2014 +A/Madagascar/04228/2013 +A/Madagascar/00147/2014 +A/Madagascar/04245/2013 +A/Madagascar/00031/2014 +A/Madagascar/00041/2014 +A/Catalonia/2152420NS/2014 +A/Jordan/20564/2013 +A/Bulgaria/040/2014 +A/Prievidza/182/2014 +A/Vrancea/162893/2014 +A/Norway/272/2014 +A/Belgium/14H0010/2014 +A/Mogilev/1484/2014 +A/Stockholm/13/2014 +A/Finland/411/2014 +A/Jordan/20006/2014 +A/Jordan/30586/2013 +A/Cameroon/7010/2013 +A/Cameroon/7012/2013 +A/Roma/11/2014 +A/Iasi/162712/2014 +A/Switzerland/9884248/2014 +A/Zhitomir/286/2014 +A/Zhitomir/290/2014 +A/Mures/163025/2014 +A/Kharkov/201/2014 +A/Iceland/08202/2014 +A/Norway/2034/2014 +A/Norway/211/2014 +A/Norway/409/2014 +A/Norway/580/2014 +A/Norway/2006/2014 +A/Paris/1268/2014 +A/Bulgaria/800/2014 +A/Jordan/20053/2014 +A/Van/354/2014 +A/Calarasi/162804/2014 +A/Jordan/20539/2013 +A/Jordan/20561/2013 +A/Iceland/62/2014 +A/Ireland/23577/2014 +A/Salamanca/92/2014 +A/Greece/12/2014 +A/Stockholm/14/2014 +A/Iceland/23/2014 +A/Jordan/30714/2013 +A/Cyprus/F30/2014 +A/Serbia/NS-613/2014 +A/Serbia/NS-619/2014 +A/Belgrade/2114/2014 +A/Serbia/NS-210/2013 +A/Gambia/G0063940/2012 +A/Gambia/G0069640/2012 +A/Gambia/G0069636/2012 +A/Gambia/G0070336/2012 +A/Sarajevo/9/2013 +A/Gambia/G0065236/2012 +A/Gambia/G0065240/2012 +A/Gambia/G0065736/2012 +A/Gambia/G0071436/2012 +A/Lyon/1381/2007 +A/Victoria/210/2009 +A/Sweden/2/2009 +A/Umea/4/2009 +A/Perth/16/2009 +A/Philippines/2191/2009 +A/Wisconsin/15/2009 +A/Taiwan/760/2007 +A/Texas/37/2007 +A/Uruguay/716/2007 +A/Finland/9/2008 +A/LYON/1359/2006 +A/Brisbane/10/2007 +A/LYON/1324/2006 +A/LYON/1331/2006 +A/Brisbane/24/2008 +A/Switzerland/1397477/2008 +A/Sweden/3/2008 +A/Stockholm/26/2008 +A/Tanger/533/2009 +A/Singapore/N593/2008 +A/Poitiers/1341/2008 +A/Seoul/4436/2008 +A/Paris/458/2008 +A/Algeria/G202/2009 +A/Switzerland/1409281/2008 +A/England/669/2008 +A/England/687/2008 +A/Torino/14/2008 +A/Norway/1/2009 +A/SAKAI/72/2014 +A/Nebraska/4/2014 +A/Sarajevo/58/2013 +A/Johannesburg/99/2011 +A/Norway/1775/2011 +A/Stockholm/18/2011 +A/Johannesburg/78/2011 +A/Durban/92/2011 +A/Minnesota/17/2010 +A/Victoria/361/2011 +A/Bratislava/31/2011 +A/Texas/50/2012 +A/Centre/1497/2014 +A/Taiwan/442/2014 +A/Stockholm/2/2014 +A/Finland/410/2014 +A/Slovenia/1162/2014 +A/Poland/3464/2014 +A/Norway/1903/2014 +A/Khmelnitsky/249/2014 +A/Pozarevac/1336/2014 +A/Nis/1059/2014 +A/Sabac/1334/2014 +A/Slovenia/842/2014 +A/Bulgaria/283/2014 +A/Olt/162753/2014 +A/Moscow/206/2014 +A/England/400/2014 +A/Georgia/699/2014 +A/Samsun/4191/2013 +A/Georgia/495/2014 +A/Georgia/584/2014 +A/Dnipro/234/2014 +A/Georgia/215/2014 +A/Malatya/3872/2013 +A/Ukraine/6161/2014 +A/Latvia/03-013033/2014 +A/Malatya/3946/2013 +A/Yalova/4145/2013 +A/Belgium/14S0070/2014 +A/Norway/18/2014 +A/Lisboa/SU540/2014 +A/Sassari/12/2014 +A/Milano/113/2014 +A/Stockholm/1/2013 +A/Switzerland/9851671/2014 +A/Norway/1762/2011 +A/Bangladesh/5071/2011 +A/Johannesburg/73/2011 +A/Johannesburg/107/2011 +A/BRISBANE/11/2010 +A/Stockholm/2/2010 +A/Lyon/Cx-R/3120/2009 +A/Stockholm/89/2009 +A/Israel/26/2009 +A/Israel/22/2009 +A/Israel/24/2009 +A/Cameroon/675/2009 +A/Johannesburg/113/2011 +A/Stockholm/112/2009 +A/Johannesburg/79/2011 +A/Johannesburg/94/2011 +A/Johannesburg/153/2011 +A/Johannesburg/114/2011 +A/Victoria/208/2009 +A/Norway/3789/2009 
+A/Norway/3790/2009 +A/BRISBANE/5/2002 +A/Christchurch/28/2003 +A/INCHEON/677/2006 +A/Tennessee/6/2004 +A/CANADA/578/2004 +A/Fujian/445/2003 +A/California/7/2004 +A/Malaysia/1/2004 +A/Singapore/36/2004 +A/Singapore/37/2004 +A/BRISBANE/3/2005 +A/Wellington/1/2004 +A/Texas/6/2004 +A/Washington/1/2004 +A/Victoria/523/2004 +A/Philippines/825/2003 +A/Victoria/500/2004 +A/Victoria/110/2004 +A/Fujian/411/2002 +A/Netherlands/22/2003 +A/Daejeon/390/2002 +A/Pusan/504/2002 +A/Cheonnam/432/2002 +A/Korea/770/2002 +A/Moscow/10/1999 +A/Netherlands/300/1997 +A/Sydney/5/1997 +A/Auckland/10/1997 +A/Netherlands/427/1998 +A/Netherlands/462/1998 +A/Toulouse/878/2001 +A/Finland/170/2003 +A/Netherlands/118/2001 +A/Netherlands/1/2002 +A/Singapore/15/2001 +A/Netherlands/124/2001 +A/Netherlands/3/2000 +A/Netherlands/126/2001 +A/Panama/2007/1999 +A/Netherlands/301/1999 +A/Stockholm/20/1991 +A/Stockholm/8/1992 +A/Nijmegen/3126/1992 +A/Lyon/1182/1991 +A/Lyon/24222/1991 +A/Paris/548/1992 +A/Lyon/1149/1991 +A/Netherlands/823/1992 +A/Paris/407/1992 +A/Paris/320/1992 +A/Paris/325/1992 +A/Paris/424/1992 +A/Paris/457/1992 +A/Nijmegen/3129/1992 +A/Madrid/G12/1991 +A/Oviedo/31/1992 +A/Netherlands/816/1991 +A/Netherlands/891/1991 +A/Madrid/G58/1992 +A/Tilburg/5957/1992 +A/Stockholm/13/1992 +A/Umea/2000/1992 +A/Victoria/33/1992 +A/Stockholm/7/1992 +A/Paris/614/1992 +A/Paris/467/1992 +A/Netherlands/935/1992 +A/Paris/583/1992 +A/Netherlands/5/1998 +A/Netherlands/1/1995 +A/Johannesburg/33/1994 +A/Netherlands/938/1992 +A/Netherlands/126/1993 +A/Madrid/G102/1993 +A/Netherlands/101/1993 +A/Akita/4/1993 +A/Lyon/672/1993 +A/Netherlands/3/1993 +A/Beijing/32/1992 +A/Yamagata/56/1993 +A/Yamagata/61/1993 +A/Sendai/C273/1992 +A/Stockholm/12/1992 +A/Stockholm/20/1993 +A/Guangdong/25/1993 +A/England/7/1994 +A/Netherlands/241/1993 +A/Netherlands/18/1994 +A/Madrid/G252/1993 +A/Madrid/G122/1993 +A/Madrid/G109/1993 +A/Finland/338/1995 +A/Netherlands/271/1995 +A/Netherlands/91/1996 +A/Lyon/2279/1995 +A/Victoria/75/1995 +A/Finland/381/1995 +A/Finland/339/1995 +A/Johannesburg/10/1997 +A/Oslo/21/1997 +A/Nice/491/1997 +A/Lyon/1781/1996 +A/Brisbane/8/1996 +A/Nanchang/933/1995 +A/Wuhan/359/1995 +A/Singapore/1/1996 +A/Geneva/3958/1996 diff --git a/augur/src/bernoulli_frequency.py b/augur/src/bernoulli_frequency.py index 5e9e02ca..dfe08b90 100644 --- a/augur/src/bernoulli_frequency.py +++ b/augur/src/bernoulli_frequency.py @@ -30,8 +30,13 @@ def running_average(obs, ws): try: tmp_vals = np.convolve(np.ones(ws, dtype=float)/ws, obs, mode='same') # fix the edges. 
using mode='same' assumes zeros outside the range - tmp_vals[:ws//2]*=float(ws)/np.arange(ws//2,ws) - tmp_vals[-ws//2+1:]*=float(ws)/np.arange(ws-1,ws//2,-1.0) + if ws%2==0: + tmp_vals[:ws//2]*=float(ws)/np.arange(ws//2,ws) + if ws//2>1: + tmp_vals[-ws//2+1:]*=float(ws)/np.arange(ws-1,ws//2,-1.0) + else: + tmp_vals[:ws//2]*=float(ws)/np.arange(ws//2+1,ws) + tmp_vals[-ws//2:]*=float(ws)/np.arange(ws,ws//2,-1.0) except: import pdb; pdb.set_trace() return tmp_vals @@ -51,7 +56,7 @@ def get_extrapolation_pivots(start=None, dt=0.5): def logit_transform(freq): - return np.log(freq/(1-freq)) + return np.log(freq/np.maximum(1e-10,(1-freq))) def logit_inv(logit_freq): logit_freq[logit_freq<-20]=-20 @@ -213,10 +218,10 @@ def estimate_sub_frequencies(node, all_dates, tip_to_date_index, threshold=50, r # we estimate frequencies of subclades, they will be multiplied by the # frequency of the parent node and corrected for the frequency of sister clades # already fit - try: + if node.freq[region_name] is None: + frequency_left=None + else: frequency_left = np.array(node.freq[region_name]) - except: - import pdb; pdb.set_trace() ci=0 # need to resort, since the clade size order might differs after subsetting to regions children_by_size = sorted(node.child_nodes(), key = lambda x:len(x.tips), reverse=True) @@ -254,7 +259,6 @@ def estimate_sub_frequencies(node, all_dates, tip_to_date_index, threshold=50, r for child in children_by_size[ci:]: # assign freqs of all remaining clades to None. child.freq[region_name] = None child.logit_freq[region_name] = None - # recursively repeat for subclades for child in node.child_nodes(): estimate_sub_frequencies(child, all_dates, tip_to_date_index, threshold, region_name) @@ -343,12 +347,15 @@ def estimate_genotype_frequency(tree, gt, time_interval=None, regions = None, re tps = all_dates[leaf_order] obs = np.array(observations)[leaf_order] # define pivots and estimate - pivots = get_pivots(tps[0], tps[1]) - fe = frequency_estimator(zip(tps, obs), pivots=pivots, - stiffness=flu_stiffness*float(len(observations))/total_leaf_count, - logit=True, verbose = 0) - fe.learn() - return fe.frequency_estimate, (tps,obs) + pivots = get_pivots() + if len(tps)>10: + fe = frequency_estimator(zip(tps, obs), pivots=pivots, + stiffness=flu_stiffness*float(len(observations))/total_leaf_count, + logit=True, verbose = 0) + fe.learn() + return fe.frequency_estimate, (tps,obs) + else: + return interp1d(pivots, np.zeros_like(pivots)), (tps,obs) def determine_clade_frequencies(tree, clades, regions=None, plot=False): @@ -423,37 +430,6 @@ def add_genotype_at_pos(tree, positions): for node in tree.postorder_node_iter(): node.gt = "".join([node.aa_seq[pos] for pos in positions]) - -def test(): - import matplotlib.pyplot as plt - tps = np.sort(100 * np.random.uniform(size=100)) - freq = [0.1] - logit = True - stiffness=100 - s=-0.02 - for dt in np.diff(tps): - freq.append(freq[-1]*np.exp(-s*dt)+np.sqrt(2*np.max(0,freq[-1]*(1-freq[-1]))*dt/stiffness)*np.random.normal()) - obs = np.random.uniform(size=tps.shape) Date: Sat, 28 Feb 2015 01:39:22 +0100 Subject: [PATCH 24/48] small issue with the calculation of the epitope sites. they need to use the translated aa_seq, rather than translate again the entire nucleotide sequence, otherwise, the numbers are off. 
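
Note (editorial illustration, not part of the committed diff): with cds = [48, -1] in the config, translate_all() stores the HA1-frame peptide on every node, while a fresh translate(node.seq) still carries the 16-residue signal peptide (16 codons = 48 nt precede the HA1 start), so every index into the epitope mask is off by 16. A minimal Python sketch of the pitfall follows; the toy sequence and the local translate() helper are assumptions for demonstration only and do not appear in the repository.

from Bio.Seq import Seq  # Biopython, as used elsewhere in augur

SIGNAL_PEPTIDE_AA = 16  # 16 codons (48 nt) precede HA1, matching cds = [48, -1]

def translate(nuc):
    # toy stand-in for seq_util.translate(); assumes an in-frame sequence
    return str(Seq(nuc).translate())

toy_nuc = "ATG" + "GCT" * 15 + "CAA" * 10  # Met + 15x Ala "signal peptide", then 10x Gln as toy HA1
aa_full = translate(toy_nuc)               # what translate(node.seq) used to feed the distance functions
aa_ha1 = translate(toy_nuc[48:])           # what translate_all() now stores as node.aa_seq

# the same mask index addresses different residues in the two frames
assert aa_ha1[0] == aa_full[0 + SIGNAL_PEPTIDE_AA]

Hence epitope_distance(), nonepitope_distance() and receptor_binding_distance() must be handed the already-trimmed aa_seq, which is exactly what this commit does.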
--- augur/src/tree_refine.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index 39f81d4a..5a0adfd5 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -142,9 +142,9 @@ def add_node_attributes(tree): node.xvalue = node.distance_from_root() root = tree.seed_node for node in tree.postorder_node_iter(): - node.ep = epitope_distance(translate(node.seq), translate(root.seq)) - node.ne = nonepitope_distance(translate(node.seq), translate(root.seq)) - node.rb = receptor_binding_distance(translate(node.seq), translate(root.seq)) + node.ep = epitope_distance(node.aa_seq, root.aa_seq) + node.ne = nonepitope_distance(node.aa_seq, root.aa_seq) + node.rb = receptor_binding_distance(node.aa_seq, root.aa_seq) for node in tree.postorder_node_iter(): node.trunk_count = 0 node.trunk = False @@ -203,11 +203,11 @@ def main(tree, viruses, outgroup, cds = [0,-1]): collapse(tree) print "Ladderize tree" ladderize(tree) + print "Translate nucleotide sequences" + translate_all(tree, cds) print "Append node attributes" add_virus_attributes(viruses, tree) add_node_attributes(tree) - print "Translate nucleotide sequences" - translate_all(tree, cds) print "Enumerate leaves of ladderized tree and calculate unique numerical date" unique_date(tree) print "Define trunk" From 3e2a9454b3b2267f45a3bd7f9710c6e289440078 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sat, 28 Feb 2015 05:08:03 +0100 Subject: [PATCH 25/48] fixed geo sampling --- augur/src/virus_filter.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/augur/src/virus_filter.py b/augur/src/virus_filter.py index f2dfcf6a..0fd92742 100644 --- a/augur/src/virus_filter.py +++ b/augur/src/virus_filter.py @@ -121,19 +121,21 @@ def select_viruses(self, priority_viruses,other_viruses, y, m, viruses_per_month select viruses_per_month strains as evenly as possible from all regions ''' from itertools import izip_longest - from random import sample + from random import sample,shuffle select_set = [] for vset in [priority_viruses, other_viruses]: select_set.append([]) for representative in izip_longest(*[vset[(y,m,r)] for r in regions], fillvalue = None): - select_set[-1].extend([v for v in representative if v is not None]) + tmp = [v for v in representative if v is not None] + shuffle(tmp) + select_set[-1].extend(tmp) print "found",len(select_set[-1]), 'in year',y,'month',m if all_priority: n_other = max(0,viruses_per_month-len(select_set[0])) - return select_set[0] + sample(select_set[1], min(len(select_set[1]), n_other)) + return select_set[0] + select_set[1][:n_other] else: tmp = select_set[0] + select_set[1] - return sample(tmp, max(len(tmp), viruses_per_month)) + return tmp[:viruses_per_month] def select_viruses_global(self, priority_viruses,other_viruses, y, m, viruses_per_month, regions, all_priority = False): ''' @@ -147,7 +149,7 @@ def select_viruses_global(self, priority_viruses,other_viruses, y, m, viruses_pe print "found",len(priority_viruses_flat)+len(other_viruses_flat), 'in year',y,'month',m n_other = max(0,viruses_per_month-len(priority_viruses_flat)) - return sample(priority_viruses_flat, min(len(priority_viruses_flat), viruses_per_month))\ + return sample(priority_viruses_flat, len(priority_viruses_flat) if all_priority else min(len(priority_viruses_flat), viruses_per_month))\ + sample(other_viruses_flat, min(n_other, len(other_viruses_flat))) From 142ce588089aad3b0de322f766d28059cf509a81 Mon Sep 17 00:00:00 2001 From: 
Richard Neher Date: Sun, 1 Mar 2015 09:28:44 +0100 Subject: [PATCH 26/48] major reorganization, made all processing steps classes that contain the generic part of the processing. in addition, there is a H3N2_process.py, which contains subclasses of the former adding special features for H3N2 and could be implemented analogously for other viruses --- augur/src/{H3N2_filter.py => H3N2_process.py} | 222 ++++++++++++- augur/src/process.py | 170 +++++++--- augur/src/seq_util.py | 7 +- augur/src/tree_refine.py | 310 +++++++----------- augur/src/tree_util.py | 6 +- augur/src/virus_clean.py | 190 ++++------- augur/src/virus_filter.py | 10 +- 7 files changed, 510 insertions(+), 405 deletions(-) rename augur/src/{H3N2_filter.py => H3N2_process.py} (60%) diff --git a/augur/src/H3N2_filter.py b/augur/src/H3N2_process.py similarity index 60% rename from augur/src/H3N2_filter.py rename to augur/src/H3N2_process.py index 5a56424e..189073e5 100644 --- a/augur/src/H3N2_filter.py +++ b/augur/src/H3N2_process.py @@ -1,11 +1,47 @@ -import time -from io_util import write_json +import time, argparse,re, sys +sys.path.append('src') from virus_filter import flu_filter +from virus_clean import virus_clean +from tree_refine import tree_refine +from process import process from Bio import SeqIO +from Bio.Seq import Seq +from Bio.Align import MultipleSeqAlignment +import numpy as np +from itertools import izip + +epitope_mask = np.fromstring("0000000000000000000000000000000000000000000011111011011001010011000100000001001011110011100110101000001100000100000001000110101011111101011010111110001010011111000101011011111111010010001111101110111001010001110011111111000000111110000000101010101110000000000011100100000001011011100000000000001001011000110111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", dtype='S1') + + +virus_config = { + # data source and sequence parsing/cleaning/processing + 'virus':'H3N2', + 'alignment_file':'data/20150222_all_H3N2_HA1.fasta', + 'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, + 'outgroup':'A/Beijing/32/1992', + 'force_include':'source-data/HI_strains.txt', + 'max_global':True, # sample as evenly as possible from different geographic regions + 'cds':[48,-1], # define the HA1 start i n 0 numbering + + # frequency estimation parameters + 'aggregate_regions': [ ("global", None), ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), + ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], + 'frequency_stiffness':10.0, + 'time_interval':(2012.0, 2015.1), + 'pivots_per_year':12.0, + 'min_mutation_count':10, + # define relevant clades in canonical HA1 numbering (+1) + 'clade_designations': { "3c3.a":[(128,'A'), (142,'G'), (159,'S')], + "3c3": [(128,'A'), (142,'G'), (159,'F')], + "3c2.a":[(144,'S'), (159,'Y'), (225,'D'), (311,'H'),(489,'N')], + "3c2": [(144,'N'), (159,'F'),(225,'N'), (489,'N')] + } +} + class H3N2_filter(flu_filter): - def __init__(self, fasta_fname, fasta_header): - flu_filter.__init__(self, fasta_fname, fasta_header) + def __init__(self): + flu_filter.__init__(self, virus_config['alignment_file'], virus_config['fasta_fields']) self.vaccine_strains =[ { "strain": "A/Wisconsin/67/2005", @@ -62,19 +98,171 @@ def __init__(self, fasta_fname, fasta_header): # 'seq': 
'ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAATGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGTCCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGGCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGATGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGACTTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAATGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTATATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCAAAACACT' } +class H3N2_clean(virus_clean): + def __init__(self): + pass + def clean_outbreaks(self): + """Remove duplicate strains, where the geographic location, date of sampling and sequence are identical""" + virus_hashes = set() + new_viruses = [] + for v in self.viruses: + geo = re.search(r'A/([^/]+)/', v.strain).group(1) + if geo: + vhash = (geo, v.date, str(v.seq)) + if vhash not in virus_hashes: + new_viruses.append(v) + virus_hashes.add(vhash) -def main(in_fname='data/gisaid_epiflu_sequence.fasta', years_back=3, viruses_per_month=50): - print "--- Filter at " + time.strftime("%H:%M:%S") + " ---" - myH3N2_filter = H3N2_filter(in_fname, {0:'strain', 1:"date", 4:"passage", -1:'accession'}) - myH3N2_filter.filter() - HI_data_strains = [seq.name for seq in SeqIO.parse('data/strains_with_HI.fasta', 'fasta')] - myH3N2_filter.subsample(years_back, viruses_per_month, prioritize = HI_data_strains, - all_priority = True, region_specific=False) + self.viruses = MultipleSeqAlignment(new_viruses) + return new_viruses - out_fname = 'data/virus_filter.json' - write_json(myH3N2_filter.virus_subsample, out_fname) - return out_fname - -if __name__ == "__main__": - main() + def clean_reassortants(self): + from seq_util import hamming_distance as distance + """Remove viruses from the outbreak of triple reassortant pH1N1""" + remove_viruses = [] + + reassortant_seq = 
"ATGAAGACTATCATTGCTTTTAGCTGCATTTTATGTCTGATTTTCGCTCAAAAACTTCCCGGAAGTGACAACAGCATGGCAACGCTGTGCCTGGGACACCATGCAGTGCCAAACGGAACATTAGTGAAAACAATCACGGATGACCAAATTGAAGTGACTAATGCTACTGAGCTGGTCCAGAGTTCCTCAACAGGTGGAATATGCAACAGTCCTCACCAAATCCTTGATGGGAAAAATTGCACACTGATAGATGCTCTATTGGGGGACCCTCATTGTGATGACTTCCAAAACAAGGAATGGGACCTTTTTGTTGAACGAAGCACAGCCTACAGCAACTGTTACCCTTATTACGTGCCGGATTATGCCACCCTTAGATCATTAGTTGCCTCATCCGGCAACCTGGAATTTACCCAAGAAAGCTTCAATTGGACTGGAGTTGCTCAAGGCGGATCAAGCTATGCCTGCAGAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGTATAACTTGAATTACAAGTATCCAGAGCAGAACGTAACTATGCCAAACAATGACAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAACCAACCTATATGTCCAAGCATCAGGGAGAGTTATAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTGTCTCCAGCATAATAAGCATCTATTGGACGATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCCCCTCGGGGTTACTTCAAAATACAAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACACATTGATGAATGCAATTCTGAATGCATTACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAAGATCACATATGGAGCCTGTCCCAGATATGTTAAGCAAAACACCCTGAAATTGGCAACAGGAATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTCGGCGCAATTGCAGGTTTCATAGAAAATGGTTGGGAGGGAATGGTAGACGGTTGGTACGGTTTCAGGCATCAGAATTCTGAAGGCACAGGACAAGCAGCAGATCTTAAAAGCACTCAAGCAGCAATCAACCAAATCACCGGGAAACTAAATAGAGTAATCAAGAAAACAAACGAGAAATTCCATCAAATCGAAAAAGAATTCTCAGAAGTAGAAGGAAGAATTCAGGACCTAGAGAAATACGTTGAAGACACTAAAATAGATCTCTGGTCTTACAACGCTGAGATTCTTGTTGCCCTGGAGAACCAACATACAATTGATTTAACCGACTCAGAGATGAGCAAACTGTTCGAAAGAACAAGAAGGCAACTGCGGGAAAATGCTGAGGACATGGGCAATGGTTGCTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCATGATATATACAGAAACGAGGCATTAAACAATCGGTTCCAGATCAAAGGTGTTCAGCTAAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGCTTTTTGCTTTGTGTTGTTCTGCTGGGGTTCATTATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" + for v in self.viruses: + dist = distance(Seq(reassortant_seq), v) + if (dist < 0.02): + remove_viruses.append(v) + if self.verbose>1: + print "\t\tremoving",v.strain + + reassortant_seq = 
"ATGAAGACTATCATTGCTTTTAGCTGCATCTTATGTCAGATCTCCGCTCAAAAACTCCCCGGAAGTGACAACAGCATGGCAACGCTGTGCCTGGGGCATCACGCAGTACCAAACGGAACGTTAGTGAAAACAATAACAGATGACCAAATTGAAGTGACTAATGCTACTGAGCTGGTCCAGAGTACCTCAAAAGGTGAAATATGCAGTAGTCCTCACCAAATCCTTGATGGAAAAAATTGTACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGACTTCCAAAACAAGAAATGGGACCTTTTTGTTGAACGAAGCACAGCTTACAGCAACTGTTACCCTTATTATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACCCTGGAATTTACTCAAGAAAGCTTCAATTGGACTGGGGTTGCTCAAGACGGAGCAAGCTATTCTTGCAGAAGGGAATCTGAAAACAGTTTCTTTAGTAGATTGAATTGGTTATATAGTTTGAATTACAAATATCCAGCGCTGAACGTAACTATGCCAAACAATGACAAATTTGACAAATTGTACATTTGGGGGGTACACCACCCGGGTACGGACAAGGACCAAACCAGTCTATATATTCAAGCATCAGGGAGAGTTACAGTCTCCACCAAATGGAGCCAACAAACTGTAATCCCGAATATCGGGTCTAGACCCTGGATAAGGGGTGTCTCCAGCATAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCCCCTCGGGGTTACTTCAAAATACAAAGTGGGAAAAGCTCAATAATGAGGTCAGATGCACACATTGGCAACTGCAACTCTGAATGCATTACCCCAAATGGAAGCATTCCCAACGACAAACCTTTTCAAAATGTAAACAGAATAACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTAGCAACAGGAATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTCGGCGCAATCGCAGGTTTCATAGAAAATGGTTGGGAAGGGATGGTGGACGGTTGGTATGGTTTCAGGCATCAAAACTCTGAAGGCACAGGGCAAGCAGCAGATCTTAAAAGCACTCAAGCGGCAATCAACCAAATCACCGGGAAACTAAATAGAGTAATCAAGAAGACGAATGAAAAATTCCATCAGATCGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTAGAGAGATACGTTGAAGACACTAAAATAGACCTCTGGTCTTACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATTTAACTGACTCAGAAATGAACAAACTGTTCGAAAGGACAAGGAAGCAACTGCGGGAAAATGCTGAGGACATGGGCAATGGATGCTTTAAAATATATCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGACGAAGCAGTAAACAATCGGTTCCAGATCAAAGGTGTTCAGCTGAAGTTAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGCTTTTTGCTTTGTGCTGTTCTGCTAGGATTCATTATGTGGGCATGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" + for v in self.viruses: + dist = distance(Seq(reassortant_seq), v) + if (dist < 0.02): + remove_viruses.append(v) + if self.verbose>1: + print "\t\tremoving",v.strain + + reassortant_seq = 
"ATGAAGACTAGTAGTTCTGCTATATACATTGCAA------------------------CCGCAAATG---------CAGACACATTATGTATAGGTTATCATGCAGTACTAGAAAAGAATGTAACAGTAACACACTCTGTTAACCAAACTGAGAGGGGTAGCCCCATTGCATTTG--------------------GGTAAATGTAACATTGCTGGCTGGATCC------------------------------------TGGGAAATCCAGAGTGTGACACTCTCCACAGCAAGCTCATGGTCCTACATCGTGGAAACATCTAAGACAATGGAACGTGCTACCCAGGAGATTTCATCAATTATGAGGAGCTAAGGTCATCATTTGAAAGGTTTGAGATATTACAAGTTCATGGCCCAATCATGACTCGAACAAAGGTTCCTCAAGCTGGAGCAA---------------------------AAAGCTTCTACAAAAATTTAATATGGCTAGTTAAAAAAGGAAATTCATACCCAA------------------------------AGCTCAGCAAATCCTACATTTGGGGCATTCACCATCCATCTACTAGTGCTGACCAA-------CAAAGTCTCTATCAGAGTGCAGATGCATATGTTTTATCAAAATACAGCAAGAAGTTCAAG--CCGGAAATAGCAGTAAGACCCAAAGTGAGGGATCAAGAAGGGAGAATGAACTATTACTGGACACTAGTAGAGCCGGGAGACAAAATAACATTCGAAGCAACTGGAAATCTATTGGTACCGAGATATGCATTCGCAATGGAAA----GAAATGCTGGATTATCATTTCAGATACACCAGTCCACGATTGCAATACAACTTGTCAGACACCCAAGGGTGCTATAAACACCAGCCTCCCATTTCAGAATATACATCCGATCACAATTGGAAAATGTCCCAAATATGTAAAAAGCACAAAATTGAGACTGGCCACAGGATTGAGGAATGTCCCGTCTATTCAATCTAGAGGCCTATTTGGGGCCATTGCCGGTTTCATTGAAGGGGGGTGGACAGGGATGGTAGATGGATGGTACGGTTATCACCATCAAAATGCGCAGGGGTCAGGATATGCAGCCGACCTGAAGAGCACACAGAATGCCATTGACAAGATTACTAACAAAGTAAATTCTGTTATTGAAAAGATGAATACACAGTTCACAGCAGTAGGTAAAGAGTTCAACCACCTGGAAAAAAGAATAGAGAATTTAAATAAAAAAGTTGATGATGGTTTCCTGGACATTTGGACTTACAATGCCGAACTGTTGGTTCTATTGGAAAATGAAAGAACTTTGGACTACCACGATTCAAATGTGAAAAACTTATATGAAAAGGTAAGAAGCCAGTTAAAAAACAATGCCAAGGAAATTGGAAACGGCTGCTTTGAATTTTACCACAAATGCGATAACACGTGCATGGAAAGTGTCAAAAATGGGACTTATGACTACCCAAAATACTCAGAGGAAGCAAAATTAAACAGAGAAGAAATAGATGGGGTAAAGCTGGAATCAACAAGGATTTACCAGATTTTGGCGATCTATTCAACTGTCGCCAGTTCATTGGTACTGGTAGTCTCCCTGGGGGCAATCATCTGGATGTGCTCTAATGGGTCTCTACAGTGTAGAATATGTATTTAA" + for v in self.viruses: + dist = distance(Seq(reassortant_seq), v) + if (dist < 0.02): + remove_viruses.append(v) + if self.verbose>1: + print "\t\tremoving",v.strain + + self.viruses = MultipleSeqAlignment([v for v in self.viruses if v not in remove_viruses]) + + def clean(self): + self.clean_generic() + self.clean_outbreaks() + print "Number of viruses after outbreak filtering:",len(self.viruses) + self.clean_reassortants() + print "Number of viruses after reassortant filtering:",len(self.viruses) + + +class H3N2_refine(tree_refine): + def __init__(self, **kwargs): + tree_refine.__init__(self, **kwargs) + + def refine(self): + self.refine_generic() # -> all nodes now have aa_seq, xvalue, yvalue, trunk, and basic virus properties + self.add_H3N2_attributes() + + def epitope_sites(self, aa): + aaa = np.fromstring(aa, 'S1') + return ''.join(aaa[epitope_mask[:len(aa)]=='1']) + + def nonepitope_sites(self, aa): + aaa = np.fromstring(aa, 'S1') + return ''.join(aaa[epitope_mask[:len(aa)]=='0']) + + def receptor_binding_sites(self, aa): + ''' + Receptor binding site mutations from Koel et al. 
2014 + These are (145, 155, 156, 158, 159, 189, 193) in canonical HA numbering + need to subtract one since python arrays start at 0 + ''' + sites = [144, 154, 155, 157, 158, 188, 192] + return ''.join([aa[pos] for pos in sites]) + + def get_HA1(self, aa): + ''' + return the part of the peptide corresponding to HA1, which is 329 aa long + ''' + return aa[:329] + + def epitope_distance(self, aaA, aaB): + """Return distance of sequences aaA and aaB by comparing epitope sites""" + epA = self.epitope_sites(aaA) + epB = self.epitope_sites(aaB) + distance = sum(a != b for a, b in izip(epA, epB)) + return distance + + def nonepitope_distance(self, aaA, aaB): + """Return distance of sequences aaA and aaB by comparing non-epitope sites""" + neA = self.nonepitope_sites(aaA) + neB = self.nonepitope_sites(aaB) + distance = sum(a != b for a, b in izip(neA, neB)) + return distance + + def receptor_binding_distance(self, aaA, aaB): + """Return distance of sequences aaA and aaB by comparing receptor binding sites""" + neA = self.receptor_binding_sites(aaA) + neB = self.receptor_binding_sites(aaB) + distance = sum(a != b for a, b in izip(neA, neB)) + return distance + + def add_H3N2_attributes(self): + root = self.tree.seed_node + for node in self.tree.postorder_node_iter(): + node.ep = self.epitope_distance(node.aa_seq, root.aa_seq) + node.ne = self.nonepitope_distance(node.aa_seq, root.aa_seq) + node.rb = self.receptor_binding_distance(node.aa_seq, root.aa_seq) + + for v in self.viruses: + if v.strain in self.node_lookup: + node = self.node_lookup[v.strain] + try: + node.passage=v.passage + except: + pass + +class H3N2_process(process, H3N2_filter, H3N2_clean, H3N2_refine): + """docstring for H3N2_process, H3N2_filter""" + def __init__(self,verbose = 0, **kwargs): + process.__init__(self, **kwargs) + H3N2_filter.__init__(self,**kwargs) + H3N2_clean.__init__(self,**kwargs) + H3N2_refine.__init__(self,**kwargs) + self.verbose = verbose + + def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwargs): + print "--- Virus filtering at " + time.strftime("%H:%M:%S") + " ---" + self.filter() + self.subsample(years_back, viruses_per_month) + self.align() # -> self.viruses is an alignment object + print "--- Clean at " + time.strftime("%H:%M:%S") + " ---" + self.clean() # -> every node has a numerical date + self.dump() + print "--- Tree infer at " + time.strftime("%H:%M:%S") + " ---" + self.infer_tree(raxml_time_limit) # -> self has a tree + self.dump() + print "--- Infer ancestral sequences " + time.strftime("%H:%M:%S") + " ---" + self.infer_ancestral() # -> every node has a sequence + self.dump() + print "--- Tree refine at " + time.strftime("%H:%M:%S") + " ---" + self.refine() + + +if __name__=="__main__": + parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare for web visualization') + parser.add_argument('-y', '--years_back', type = int, default=3, help='number of past years to sample sequences from') + parser.add_argument('-v', '--viruses_per_month', type = int, default = 50, help='number of viruses sampled per month') + parser.add_argument('-r', '--raxml_time_limit', type = float, default = 1.0, help='number of hours raxml is run') + parser.add_argument('--config', default = "nextflu_config.py" , type=str, help ="config file") + parser.add_argument('--test', default = False, action="store_true", help ="don't run the pipeline") + parser.add_argument('--virus', default = False, action="store_true", help ="only select viruses") +
parser.add_argument('--tree', default = False, action="store_true", help ="only build tree") + parser.add_argument('--frequencies', default = False, action="store_true", help ="only estimate frequencies") + params = parser.parse_args() + + myH3N2 = H3N2_process() + myH3N2.load() + if not params.test: + myH3N2.run(**params.__dict__) diff --git a/augur/src/process.py b/augur/src/process.py index 5cb6676e..a3020bb6 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -1,52 +1,118 @@ -import time, os, argparse -import virus_download, H3N2_filter, virus_align, virus_clean -import tree_infer, tree_ancestral, tree_refine -import bernoulli_frequency -import streamline -from io_util import * - -def main(years_back=3, viruses_per_month=50): - """Run full pipeline""" - - print "--- Start processing at " + time.strftime("%H:%M:%S") + " ---" - - # Run pipeline -# virus_download.main() # Download from GISAID - virus_fname = 'data/gisaid_epiflu_sequence.fasta' - - # Filter sequences - virus_fname = H3N2_filter.main(virus_fname, years_back=years_back, viruses_per_month=viruses_per_month) - - # Align sequences - virus_fname = virus_align.main(virus_fname) - - # Clean sequences - vires_fname = virus_clean.main(virus_fname) - - # Make tree, creates raxml files - tree_fname = tree_infer.main(virus_fname) - - # infer ancestral states using the cleaned viruses and the raxml tree - tree_fname = tree_ancestral.main(tree_fname=tree_fname, virus_fname=virus_fname) - - # Clean tree, reads viruses in fname + raxml files - tree_fname = tree_refine.main(tree_fname=tree_fname, virus_fname = virus_fname) - - # Estimate frequencies - tree_fname = bernoulli_frequency.main(tree_fname=tree_fname) - - # Streamline tree for auspice - tree_fname = streamline.main(tree_fname) - - # Write out metadata - print "Writing out metadata" - meta = {"updated": time.strftime("X%d %b %Y").replace('X0','X').replace('X','')} - meta_fname = "../auspice/data/meta.json" - write_json(meta, meta_fname) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare of web visualization') - parser.add_argument('-y', '--years_back', type = int, default=3, help='number of past years to sample sequences from') - parser.add_argument('-v', '--viruses_per_month', type = int, default = 50, help='number of viruses sampled per month') - params = parser.parse_args() - main(**params.__dict__) +import time, os, argparse,shutil,subprocess, glob +from Bio import SeqIO, AlignIO,Phylo +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq +import dendropy +from tree_util import delimit_newick +from StringIO import StringIO +from itertools import izip + +class process(object): + """generic template class for processing virus sequences into trees""" + def __init__(self, tree_fname = 'data/tree.pkl', virus_fname = 'data/virus.pkl', + frequency_fname = 'data/frequency.pkl',**kwargs): + self.tree_fname = tree_fname + self.virus_fname = virus_fname + self.frequency_fname = frequency_fname + + def dump(self): + import cPickle + if hasattr(self, 'tree'): + with open(self.tree_fname, 'w') as outfile: + cPickle.dump(self.tree, outfile) + if hasattr(self, 'viruses'): + with open(self.virus_fname, 'w') as outfile: + cPickle.dump(self.viruses, outfile) + if hasattr(self, 'frequencies'): + with open(self.frequency_fname, 'w') as outfile: + cPickle.dump(self.frequencies, outfile) + + def load(self): + import cPickle + if os.path.isfile(self.tree_fname): + with open(self.tree_fname, 'r') as infile: 
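+ # read back the pickled tree written by dump()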
+ self.tree = cPickle.load(infile) + if os.path.isfile(self.virus_fname): + with open(self.virus_fname, 'r') as infile: + self.viruses = cPickle.load(infile) + if os.path.isfile(self.frequency_fname): + with open(self.frequency_fname, 'r') as infile: + self.frequencies = cPickle.load(infile) + + def align(self): + SeqIO.write([SeqRecord(Seq(v['seq']), id=v['strain']) for v in self.viruses], "temp_in.fasta", "fasta") + os.system("mafft --nofft temp_in.fasta > temp_out.fasta") + aln = AlignIO.read('temp_out.fasta', 'fasta') + for tmp_file in ['temp_in.fasta', 'temp_out.fasta']: + try: + os.remove(tmp_file) + except OSError: + pass + + self.sequence_lookup = {seq.id:seq for seq in aln} + # add attributes to alignment + for v in self.viruses: + self.sequence_lookup[v['strain']].__dict__.update({k:val for k,val in v.iteritems() if k!='seq'}) + self.viruses = aln + + def infer_tree(self, raxml_time_limit): + def cleanup(): + for file in glob.glob("RAxML_*") + glob.glob("temp*") + ["raxml_tree.newick", "initial_tree.newick"]: + try: + os.remove(file) + except OSError: + pass + + cleanup() + AlignIO.write(self.viruses, 'temp.fasta', 'fasta') + + print "Building initial tree with FastTree" + os.system("fasttree -gtr -nt -gamma -nosupport -mlacc 2 -slownni temp.fasta > initial_tree.newick") + self.tree = dendropy.Tree.get_from_string(delimit_newick('initial_tree.newick'),'newick', as_rooted=True) + self.tree.resolve_polytomies() + self.tree.write_to_path("initial_tree.newick", "newick") + + AlignIO.write(self.viruses,"temp.phyx", "phylip-relaxed") + if raxml_time_limit>0: + print "RAxML tree optimization with time limit " + str(raxml_time_limit) + " hours" + # using exec to be able to kill process + end_time = time.time() + int(raxml_time_limit*3600) + process = subprocess.Popen("exec raxml -f d -T 6 -j -s temp.phyx -n topology -c 25 -m GTRCAT -p 344312987 -t initial_tree.newick", shell=True) + while (time.time() < end_time): + if os.path.isfile('RAxML_result.topology'): + break + time.sleep(10) + process.terminate() + + checkpoint_files = [file for file in glob.glob("RAxML_checkpoint*")] + if os.path.isfile('RAxML_result.topology'): + checkpoint_files.append('RAxML_result.topology') + if len(checkpoint_files) > 0: + last_tree_file = checkpoint_files[-1] + shutil.copy(last_tree_file, 'raxml_tree.newick') + else: + shutil.copy("initial_tree.newick", 'raxml_tree.newick') + else: + shutil.copy("initial_tree.newick", 'raxml_tree.newick') + + print "RAxML branch length optimization and rooting" + os.system("raxml -f e -T 6 -s temp.phyx -n branches -c 25 -m GTRGAMMA -p 344312987 -t raxml_tree.newick -o " + self.outgroup['strain']) + + out_fname = "data/tree_infer.newick" + os.rename('RAxML_result.branches', out_fname) + Phylo.write(Phylo.read(out_fname, 'newick'),'temp.newick','newick') + self.tree = dendropy.Tree.get_from_string(delimit_newick(out_fname), 'newick', as_rooted=True) + cleanup() + + def infer_ancestral(self): + from tree_util import to_Biopython + from tree_ancestral import ancestral_sequences + anc_seq = ancestral_sequences(to_Biopython(self.tree), self.viruses,seqtype='str') + anc_seq.calc_ancestral_sequences() + # copy the inferred sequences into the biopython tree + for node, anc_node in izip(self.tree.postorder_internal_node_iter(), anc_seq.T.get_nonterminals(order='postorder')): + node.seq = anc_node.seq + for node, anc_node in izip(self.tree.leaf_iter(), anc_seq.T.get_terminals()): + node.seq = anc_node.seq + + diff --git a/augur/src/seq_util.py b/augur/src/seq_util.py
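The seq_util.py hunk that follows replaces the hard-coded epitope machinery with a single gap-aware Hamming distance, which the molecular clock filter in virus_clean.py then calls. The idea: mask out every column that is gapped in either sequence and report the mean mismatch over the remaining columns, so the result is a per-site fraction rather than a raw count. A minimal standalone sketch of the same computation, restated to take plain strings (the patched version operates on Biopython sequence records):

import numpy as np

def hamming_distance(seq1, seq2):
    # turn the aligned sequences into per-character arrays
    aseq1 = np.array(list(str(seq1)))
    aseq2 = np.array(list(str(seq2)))
    # keep only columns that are ungapped in both sequences
    non_gap = (aseq1 != '-') & (aseq2 != '-')
    # fraction of comparable columns that differ
    return np.mean(aseq1[non_gap] != aseq2[non_gap])

# one mismatch among four comparable columns -> 0.25
print(hamming_distance("AT-GC", "ATAGT"))

Returning a fraction instead of a count keeps sequences of slightly different gapped lengths comparable, which is what the regression against sampling dates below relies on.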
index 5afdfcc6..1a271066 100644 --- a/augur/src/seq_util.py +++ b/augur/src/seq_util.py @@ -1,7 +1,10 @@ from itertools import izip import numpy as np -#epitope_mask = np.fromstring("00000000000000000000000000000000000000000000000000000000000011111011011001010011000100000001001011110011100110101000001100000100000001000110101011111101011010111110001010011111000101011011111111010010001111101110111001010001110011111111000000111110000000101010101110000000000011100100000001011011100000000000001001011000110111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", dtype='S1') -epitope_mask = np.fromstring("0000000000000000000000000000000000000000000011111011011001010011000100000001001011110011100110101000001100000100000001000110101011111101011010111110001010011111000101011011111111010010001111101110111001010001110011111111000000111110000000101010101110000000000011100100000001011011100000000000001001011000110111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", dtype='S1') + +def hamming_distance(seq1, seq2): + aseq1, aseq2 = np.array(seq1), np.array(seq2) + non_gap = (aseq1!='-')*(aseq2!='-') + return np.mean(aseq1[non_gap]!=aseq2[non_gap]) def partition_string(string, length): return list(string[0+i:length+i] for i in range(0, len(string), length)) diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index 5a0adfd5..6c228477 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -8,210 +8,120 @@ from date_util import * from tree_util import * -def delimit_newick(infile_name): - with open(infile_name, 'r') as file: - newick = file.read().replace('\n', '') - newick = re.sub(r'(A/[^\:^,^)]+)', r"'\1'", newick) - return newick - -def crossref_import(branches_tree_file, states_tree_file, states_file): - """RAxML won't return a single NEWICK tree with both ancestral states and branch lengths""" - """This opens the necessary RAxL output files and outputs a single Dendropy tree""" - label_to_seq = {} - with open(states_file) as file: - for line in file: - (label, seq) = line.split() - label_to_seq[label] = seq - branches_tree = dendropy.Tree.get_from_string(delimit_newick(branches_tree_file), "newick", as_rooted=True) - states_tree = dendropy.Tree.get_from_string(delimit_newick(states_tree_file), "newick", as_rooted=True) - for (bn, sn) in zip(branches_tree.postorder_node_iter(), states_tree.postorder_node_iter()): - if sn.label: - bn.seq = label_to_seq[sn.label] - return branches_tree - -def get_yvalue(node): - """Return y location based on recursive mean of daughter locations""" - if hasattr(node, 'yvalue'): - return node.yvalue - if node.child_nodes(): - mean = 0 - for ch in node.child_nodes(): - mean += get_yvalue(ch) - return mean / float(len(node.child_nodes())) - -def get_xvalue(node): - """Return x location based on total distance from root""" - root = node.get_tree_root() - return node.get_distance(root) - -def remove_outgroup(tree, outgroup): - """Reroot tree to outgroup""" - outgroup_node = None - for node in tree.postorder_node_iter(): - if (str(node.taxon).lower() == outgroup.lower()): - outgroup_node = node - if outgroup_node: - tree.prune_subtree(outgroup_node) - else: - print 
"outgroup",outgroup, "not found" - if len(tree.seed_node.child_nodes())==1: - tree.seed_node = tree.seed_node.child_nodes()[0] - tree.seed_node.parent_node = None - tree.seed_node.edge_length = 0.002 - -def collapse(tree): - """Collapse edges without mutations to polytomies""" - for edge in tree.postorder_edge_iter(): - if edge.tail_node is not None: - if edge.is_internal() and edge.head_node.seq==edge.tail_node.seq: - edge.collapse() - -def reduce(tree): - """Remove outlier tips""" - """Remove internal nodes left as orphan tips""" - for node in tree.postorder_node_iter(): - if node.edge_length > 0.01 and node.is_leaf(): - parent = node.parent_node - parent.remove_child(node) - for node in tree.postorder_node_iter(): - if node.is_leaf() and not hasattr(node, 'strain'): - parent = node.parent_node - parent.remove_child(node) - -def ladderize(tree): - """Sorts child nodes in terms of the length of subtending branches each child node has""" - node_desc_counts = {} - for node in tree.postorder_node_iter(): - if len(node._child_nodes) == 0: - node_desc_counts[node] = node.edge_length +class tree_refine(object): + def __init__(self,cds = (0,None), **kwargs): + self.cds = cds + + def refine_generic(self): + self.node_lookup = {node.taxon.label:node for node in self.tree.leaf_iter()} + self.remove_outgroup() + self.ladderize() + self.collapse() + self.translate_all() + self.add_node_attributes() + self.reduce() + self.define_trunk() + + def remove_outgroup(self): + """Reroot tree to outgroup""" + if self.outgroup['strain'] in self.node_lookup: + outgroup_node = self.node_lookup[self.outgroup['strain']] + self.tree.prune_subtree(outgroup_node) + print "removed outgroup",self.outgroup['strain'] else: - total = 0 - if node.edge_length > 0: - total += node.edge_length - for child in node._child_nodes: - total += node_desc_counts[child] - node_desc_counts[node] = total - node._child_nodes.sort(key=lambda n: node_desc_counts[n], reverse=True) - -def layout(tree): - """Set yvalue of tips by post-order traversal""" - yvalue = 0 - distance_matrix = dendropy.treecalc.PatristicDistanceMatrix(tree) - tips = [node for node in tree.leaf_iter()] - tips[0].yvalue = yvalue - for (a,b) in zip(tips[:-1], tips[1:]): - d = distance_matrix(a.taxon, b.taxon) - # print str(a.taxon) + " to " + str(b.taxon) + ": " + str(d) - if b.is_leaf(): - yvalue += d - b.yvalue = yvalue - - for node in tree.postorder_node_iter(): - node.yvalue = get_yvalue(node) - -def add_virus_attributes(viruses, tree): - """Add date and loc attributes to all tips in tree""" - strain_to_date = {} - strain_to_country = {} - strain_to_region = {} - for v in viruses: - strain_to_date[v['strain'].lower()] = v['date'] - strain_to_country[v['strain'].lower()] = v['country'] - strain_to_region[v['strain'].lower()] = v['region'] - for node in tree.postorder_node_iter(): - strain = str(node.taxon).replace("'", '') - if strain_to_date.has_key(strain): - node.date = strain_to_date[strain] - if strain_to_country.has_key(strain): - node.country = strain_to_country[strain] - if strain_to_region.has_key(strain): - node.region = strain_to_region[strain] - -def add_node_attributes(tree): - """Add clade, xvalue, yvalue, mutation and trunk attributes to all nodes in tree""" - clade = 0 - yvalue = 0 - for node in tree.postorder_node_iter(): - node.clade = clade - clade += 1 - if node.is_leaf(): - node.yvalue = yvalue - yvalue += 1 - for node in tree.postorder_node_iter(): - node.yvalue = get_yvalue(node) - node.xvalue = node.distance_from_root() - root = tree.seed_node 
- for node in tree.postorder_node_iter(): - node.ep = epitope_distance(node.aa_seq, root.aa_seq) - node.ne = nonepitope_distance(node.aa_seq, root.aa_seq) - node.rb = receptor_binding_distance(node.aa_seq, root.aa_seq) - for node in tree.postorder_node_iter(): - node.trunk_count = 0 - node.trunk = False - -def translate_all(tree, cds): - for node in tree.postorder_node_iter(): - node.aa_seq = translate(node.seq[cds[0]:cds[1]]) - -def unique_date(tree): - leaf_count = 0 - for node in tree.postorder_node_iter(): - if node.is_leaf(): - # attach index to a leaf, to allow for array indexing later - node.tip_index = leaf_count - # modify date by a tiny amount << than a day to ensure uniqueness - node.num_date = numerical_date(node.date) + 1e-7*node.tip_index - leaf_count+=1 - else: # internal node preceed the oldest child - node.num_date = min([c.num_date for c in node.child_nodes()]) - -def define_trunk(tree): - """Trace current lineages backward to define trunk""" - - # Find most recent tip - dates = [] - for node in tree.postorder_node_iter(): - if node.is_leaf(): - dates.append(node.date) - most_recent_date = string_to_date(sorted(dates)[-1]) - - # Mark ancestry of recent tips - number_recent = 0 - for node in tree.postorder_node_iter(): - if node.is_leaf(): - diff = year_difference(string_to_date(node.date), most_recent_date) - if (diff < 1): + print "outgroup",self.outgroup['strain'], "not found" + if len(self.tree.seed_node.child_nodes())==1: + self.tree.seed_node = self.tree.seed_node.child_nodes()[0] + self.tree.seed_node.parent_node = None + self.tree.seed_node.edge_length = 0.002 + + def collapse(self): + """Collapse edges without mutations to polytomies""" + for edge in self.tree.postorder_edge_iter(): + if edge.tail_node is not None: + if edge.is_internal() and edge.head_node.seq==edge.tail_node.seq: + edge.collapse() + + def reduce(self, max_length =0.01): + """ + Remove outlier tips + Remove internal nodes left as orphan tips + """ + for node in self.tree.postorder_node_iter(): + if node.edge_length > max_length and node.is_leaf(): + parent = node.parent_node + parent.remove_child(node) + for node in self.tree.postorder_node_iter(): + if node.is_leaf() and not hasattr(node, 'strain'): + parent = node.parent_node + parent.remove_child(node) + + def ladderize(self): + """Sorts child nodes in terms of the length of subtending branches each child node has""" + for node in self.tree.postorder_node_iter(): + if node.is_leaf(): + node.tree_length = node.edge_length + else: + node.tree_length = node.edge_length + for child in node.child_nodes(): + node.tree_length += child.tree_length + node._child_nodes.sort(key=lambda n:n.tree_length, reverse=True) + + def translate_all(self): + for node in self.tree.postorder_node_iter(): + node.aa_seq = translate(node.seq[self.cds[0]:self.cds[1]]) + + def get_yvalue(self, node): + """Return y location based on recursive mean of daughter locations""" + if hasattr(node, 'yvalue'): + return node.yvalue + if node.child_nodes(): + return np.mean([n.yvalue for n in node.child_nodes()]) + + def add_node_attributes(self): + """Add clade, xvalue, yvalue, mutation and trunk attributes to all nodes in tree""" + clade = 0 + yvalue = 0 + for node in self.tree.postorder_node_iter(): + node.clade = clade + clade += 1 + if node.is_leaf(): + node.yvalue = yvalue + yvalue += 1 + for node in self.tree.postorder_node_iter(): + node.yvalue = self.get_yvalue(node) + node.xvalue = node.distance_from_root() + + for v in self.viruses: + if v.strain in self.node_lookup: + node 
= self.node_lookup[v.strain] + for attr in ['strain', 'date', 'accession', 'num_date', 'db', 'region', 'country']: + node.__setattr__(attr, v.__getattribute__(attr)) + + def define_trunk(self, dt = 1): + """Trace current lineages backward to define trunk""" + + # Find most recent tip + most_recent_date = -1e10 + for node in self.tree.leaf_iter(): + if node.num_date>most_recent_date: + most_recent_date=node.num_date + for node in self.tree.postorder_node_iter(): + node.trunk_count=0 + + # Mark ancestry of recent tips + number_recent = 0 + for node in self.tree.leaf_iter(): + if most_recent_date - node.num_date -3 * r_sd and r < 3 * r_sd) or v['strain'] == OUTGROUP: - new_viruses.append(v) - return new_viruses - -def clean_outbreaks(viruses): - """Remove duplicate strains, where the geographic location, date of sampling and sequence are identical""" - hash_to_count = {} - new_viruses = [] - for v in viruses: - geo = re.search(r'A/([^/]+)/', v['strain']).group(1) - if geo: - hash = geo + "_" + v['date'] + "_" + v['seq'] - if hash in hash_to_count: - hash_to_count[hash] += 1 - else: - hash_to_count[hash] = 1 +class virus_clean(object): + """docstring for virus_clean""" + def __init__(self): + pass + + def remove_insertions(self): + outgroup_ok = np.array(self.sequence_lookup[self.outgroup['strain']])!='-' + for seq in self.viruses: + seq.seq = Seq("".join(np.array(seq.seq)[outgroup_ok]).upper()) + + def clean_gaps(self): + self.viruses = filter(lambda x: '-' in x.seq, self.viruses) + + def clean_ambiguous(self): + for v in self.viruses: + v.seq = Seq(re.sub(r'[BDEFHIJKLMNOPQRSUVWXYZ]', '-',str(v.seq))) + + def unique_date(self): + from date_util import numerical_date + og = self.sequence_lookup[self.outgroup['strain']] + og.num_date = numerical_date(og.date) + for ii, v in enumerate(self.viruses): + v.num_date = numerical_date(v.date) + 1e-7*(ii+1) + + def times_from_outgroup(self): + self.unique_date() + outgroup_date = self.sequence_lookup[self.outgroup['strain']].num_date + return np.array([x.num_date-outgroup_date for x in self.viruses]) + + def distance_from_outgroup(self): + from seq_util import hamming_distance + outgroup_seq = self.sequence_lookup[self.outgroup['strain']].seq + return np.array([hamming_distance(x.seq, outgroup_seq) for x in self.viruses]) + + def clean_distances(self, n_std = 5): + """Remove viruses that don't follow a loose clock """ + times = self.times_from_outgroup() + distances = self.distance_from_outgroup() + slope, intercept, r_value, p_value, std_err = stats.linregress(times, distances) + residuals = slope*times - distances + r_sd = residuals.std() + if self.verbose: + print "\tslope: " + str(slope) + print "\tr: " + str(r_value) + print "\tresiduals sd: " + str(r_sd) + new_viruses = [] + for (v,r) in izip(self.viruses,residuals): # filter viruses more than 5 sds up or down + if np.abs(r)1: + print "\t\tresidual:", r, "\nremoved ",v.strain + self.viruses = MultipleSeqAlignment(new_viruses) + + def clean_generic(self): + print "Number of viruses before cleaning:",len(self.viruses) + self.remove_insertions() + self.clean_ambiguous() + self.clean_distances() + print "Number of viruses after outlier filtering:",len(self.viruses) diff --git a/augur/src/virus_filter.py b/augur/src/virus_filter.py index 0fd92742..f728f840 100644 --- a/augur/src/virus_filter.py +++ b/augur/src/virus_filter.py @@ -102,7 +102,7 @@ def subsample(self, years_back, viruses_per_month, prioritize = None, all_priori if self.outgroup is not None: filtered_viruses.append(self.outgroup) print 
len(filtered_viruses), "with outgroup" - self.virus_subsample = filtered_viruses + self.viruses = filtered_viruses def viruses_by_date_region(self, tmp_viruses): ''' @@ -129,7 +129,8 @@ def select_viruses(self, priority_viruses,other_viruses, y, m, viruses_per_month tmp = [v for v in representative if v is not None] shuffle(tmp) select_set[-1].extend(tmp) - print "found",len(select_set[-1]), 'in year',y,'month',m + if self.verbose>1: + print "\t\tfound",len(select_set[-1]), 'in year',y,'month',m if all_priority: n_other = max(0,viruses_per_month-len(select_set[0])) return select_set[0] + select_set[1][:n_other] @@ -147,7 +148,8 @@ def select_viruses_global(self, priority_viruses,other_viruses, y, m, viruses_pe other_viruses_flat = [] for r in regions: other_viruses_flat.extend(other_viruses[(y,m,r)]) - print "found",len(priority_viruses_flat)+len(other_viruses_flat), 'in year',y,'month',m + if self.verbose>1: + print "\t\tfound",len(priority_viruses_flat)+len(other_viruses_flat), 'in year',y,'month',m n_other = max(0,viruses_per_month-len(priority_viruses_flat)) return sample(priority_viruses_flat, len(priority_viruses_flat) if all_priority else min(len(priority_viruses_flat), viruses_per_month))\ + sample(other_viruses_flat, min(n_other, len(other_viruses_flat))) @@ -177,7 +179,7 @@ def parse_gisaid(self, fasta): words = record.description.replace(">","").replace(" ","").split('|') v = {key:words[ii] for ii, key in self.fasta_header.iteritems()} v['db']="GISAID" - v['seq']=str(record.seq).upper() + v['seq']= str(record.seq) if 'passage' not in v: v['passage']='' viruses.append(v) handle.close() From 45de46ab40481168c9e51b2133780fc8dac81264 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 1 Mar 2015 17:32:06 +0100 Subject: [PATCH 27/48] * added keyword arguments to most classes * made ancestral use dendropy instead of Bio.Phylo * added regional/temporal statistics * added nucleotide and amino acid frequencies * added variable positions and consensus sequences --- augur/src/H3N2_process.py | 15 +++---- augur/src/process.py | 62 ++++++++++++++++++++++++++--- augur/src/seq_util.py | 46 ---------------------- augur/src/tree_ancestral.py | 78 ++++++++++++++++--------------------- augur/src/tree_refine.py | 10 +++++ augur/src/virus_clean.py | 1 + augur/src/virus_filter.py | 6 +-- 7 files changed, 113 insertions(+), 105 deletions(-) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index 189073e5..2e7bed37 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -40,8 +40,8 @@ class H3N2_filter(flu_filter): - def __init__(self): - flu_filter.__init__(self, virus_config['alignment_file'], virus_config['fasta_fields']) + def __init__(self,**kwargs): + flu_filter.__init__(self, virus_config['alignment_file'], virus_config['fasta_fields'], **kwargs) self.vaccine_strains =[ { "strain": "A/Wisconsin/67/2005", @@ -99,7 +99,7 @@ def __init__(self): } class H3N2_clean(virus_clean): - def __init__(self): + def __init__(self,**kwargs): pass def clean_outbreaks(self): @@ -261,8 +261,9 @@ def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwa parser.add_argument('--tree', default = False, action="store_true", help ="only build tree") parser.add_argument('--frequencies', default = False, action="store_true", help ="only estimate frequencies") params = parser.parse_args() - - myH3N2 = H3N2_process() - myH3N2.load() - if not params.test: + params.cds = (48,None) + myH3N2 = H3N2_process(**params.__dict__) + if params.test: + myH3N2.load() + else:
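+ # not a test run: execute the full pipeline with the parsed arguments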
myH3N2.run(**params.__dict__) diff --git a/augur/src/process.py b/augur/src/process.py index a3020bb6..9bf26ded 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -6,6 +6,7 @@ from tree_util import delimit_newick from StringIO import StringIO from itertools import izip +import numpy as np class process(object): """generic template class for processing virus sequences into trees""" def __init__(self, tree_fname = 'data/tree.pkl', virus_fname = 'data/virus.pkl', - frequency_fname = 'data/frequency.pkl',**kwargs): + frequency_fname = 'data/frequency.pkl', min_freq = 0.01, **kwargs): self.tree_fname = tree_fname self.virus_fname = virus_fname self.frequency_fname = frequency_fname + self.min_freq = min_freq def dump(self): @@ -107,12 +108,63 @@ def cleanup(): def infer_ancestral(self): from tree_util import to_Biopython from tree_ancestral import ancestral_sequences - anc_seq = ancestral_sequences(to_Biopython(self.tree), self.viruses,seqtype='str') + anc_seq = ancestral_sequences(self.tree, self.viruses,seqtype='str') anc_seq.calc_ancestral_sequences() # copy the inferred sequences into the biopython tree - for node, anc_node in izip(self.tree.postorder_internal_node_iter(), anc_seq.T.get_nonterminals(order='postorder')): - node.seq = anc_node.seq - for node, anc_node in izip(self.tree.leaf_iter(), anc_seq.T.get_terminals()): - node.seq = anc_node.seq +# for node, anc_node in izip(self.tree.postorder_internal_node_iter(), anc_seq.T.get_nonterminals(order='postorder')): +# node.seq = anc_node.seq +# for node, anc_node in izip(self.tree.leaf_iter(), anc_seq.T.get_terminals()): +# node.seq = anc_node.seq + def temporal_regional_statistics(self): + ''' + produces a dictionary with (year, month) keys, each entry of which is + a dictionary that contains the isolate count in each region observed + stored as: + + self.date_region_count + self.regions + self.region_totals + ''' + from collections import defaultdict, Counter + self.date_region_count = defaultdict(lambda:defaultdict(int)) + regions = set() + # count viruses in every month and every region + for v in self.viruses: + if v.strain != self.outgroup['strain']: + year, month, day = map(int, v.date.split('-')) + self.date_region_count[(year, month)][v.region]+=1 + regions.add(v.region) + # add a sorted list of all regions to self and calculate region totals + self.regions = sorted(regions) + self.region_totals = {reg:sum(val[reg] for val in self.date_region_count.values()) for reg in self.regions} + + def determine_variable_positions(self, min_freq = 0.01): + ''' + calculates nucleoties_frequencies and aa_frequencies at each position of the alignment + also computes consensus sequences and positions at which the major allele is at frequency less than 1-min_freq + results are stored as + self.nucleoties_frequencies + self.aa_frequencies + self.variable_nucleotides + self.variable_aa + ''' + aln_array = np.array(self.viruses) + self.nuc_alphabet = 'ACGT-N' + self.nucleoties_frequencies = np.zeros((len(self.nuc_alphabet),aln_array.shape[1])) + for ni,nuc in enumerate(self.nuc_alphabet): + self.nucleoties_frequencies[ni,:]=(aln_array==nuc).mean(axis=0) + + self.variable_nucleotides = np.where(np.max(self.nucleoties_frequencies,axis=0)<1.0-min_freq)[0] + self.consensus_nucleotides = "".join(np.fromstring(self.nuc_alphabet, 'S1')[np.argmax(self.nucleoties_frequencies,axis=0)]) + + if hasattr(self, 'aa_aln'): + aln_array = np.array(self.aa_aln) + self.aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY*X' + self.aa_frequencies = np.zeros((len(self.aa_alphabet),aln_array.shape[1])) + for ai,aa in enumerate(self.aa_alphabet): + self.aa_frequencies[ai,:]=(aln_array==aa).mean(axis=0) + + self.variable_aa = np.where(np.max(self.aa_frequencies,axis=0)<1.0-min_freq)[0] + self.consensus_aa = "".join(np.fromstring(self.aa_alphabet, 'S1')[np.argmax(self.aa_frequencies,axis=0)]) diff --git
a/augur/src/seq_util.py b/augur/src/seq_util.py index 1a271066..e39314b4 100644 --- a/augur/src/seq_util.py +++ b/augur/src/seq_util.py @@ -6,57 +6,11 @@ def hamming_distance(seq1, seq2): non_gap = (aseq1!='-')*(aseq2!='-') return np.mean(aseq1[non_gap]!=aseq2[non_gap]) -def partition_string(string, length): - return list(string[0+i:length+i] for i in range(0, len(string), length)) - def translate(nuc): """Translate nucleotide sequence to amino acid""" from Bio import Seq return Seq.translate(nuc) #returns string when argument is a string, Bio.Seq otherwise -def epitope_sites(aa): - aaa = np.fromstring(aa, 'S1') - return ''.join(aaa[epitope_mask[:len(aa)]=='1']) - -def nonepitope_sites(aa): - aaa = np.fromstring(aa, 'S1') - return ''.join(aaa[epitope_mask[:len(aa)]=='0']) - -def receptor_binding_sites(aa): - ''' - Receptor binding site mutations from Koel et al. 2014 - These are (145, 155, 156, 158, 159, 189, 193) in canonical HA numbering - need to subtract one since python arrays start at 0 - ''' - sites = [144, 154, 155, 157, 158, 188, 192] - return ''.join([aa[pos] for pos in sites]) - -def get_HA1(aa): - ''' - return the part of the peptide corresponding to HA1, starts is 329 aa long - ''' - return aa[:329] - -def epitope_distance(aaA, aaB): - """Return distance of sequences aaA and aaB by comparing epitope sites""" - epA = epitope_sites(aaA) - epB = epitope_sites(aaB) - distance = sum(a != b for a, b in izip(epA, epB)) - return distance - -def nonepitope_distance(aaA, aaB): - """Return distance of sequences aaA and aaB by comparing non-epitope sites""" - neA = nonepitope_sites(aaA) - neB = nonepitope_sites(aaB) - distance = sum(a != b for a, b in izip(neA, neB)) - return distance -def receptor_binding_distance(aaA, aaB): - """Return distance of sequences aaA and aaB by comparing receptor binding sites""" - neA = receptor_binding_sites(aaA) - neB = receptor_binding_sites(aaB) - distance = sum(a != b for a, b in izip(neA, neB)) - return distance - def json_to_Bio_alignment(seq_json): from Bio.Align import MultipleSeqAlignment from Bio.SeqRecord import SeqRecord diff --git a/augur/src/tree_ancestral.py b/augur/src/tree_ancestral.py index 28b3bd1f..0d90dc9d 100644 --- a/augur/src/tree_ancestral.py +++ b/augur/src/tree_ancestral.py @@ -16,11 +16,8 @@ import numpy as np -from Bio import Phylo, Seq +from Bio import Seq import copy, time -from seq_util import json_to_Bio_alignment -from io_util import write_json, read_json -from tree_util import BioPhylo_to_json, to_Biopython, json_to_dendropy class ancestral_sequences: ''' @@ -66,11 +63,11 @@ def __init__(self, tree, aln, alphabet = 'ACGT', sub_matrix = None, self.calc_eigendecomp() names_to_seqs = {seq.id:seq for seq in aln} - for leaf in self.T.get_terminals(): - if leaf.name in names_to_seqs: + for leaf in self.T.leaf_iter(): + if leaf.taxon.label in names_to_seqs: leaf.prob = self.get_state_array() - seqtmp = names_to_seqs[leaf.name].seq + seqtmp = names_to_seqs[leaf.taxon.label].seq if self.seqtype != 'Seq': seqtmp = ''.join(seqtmp) @@ -86,17 +83,19 @@ def __init__(self, tree, aln, alphabet = 'ACGT', sub_matrix = None, for ni in xrange(self.nstates): leaf.prob[:,ni]+=missing_prob else: - print('ancestral sequences: Leaf '+leaf.name+' has no sequence') + print('ancestral sequences: Leaf '+leaf.taxon.label+' has no sequence') if self.seqtype == 'Seq': self.biopython_alphabet = leaf.seq.alphabet # dress each internal node with a probability vector for the ancestral state - for node in self.T.get_nonterminals(): + for node in 
self.T.postorder_internal_node_iter(): node.prob = self.get_state_array() + if node.edge_length is None: + node.edge_length = 0.0 # there is no parental information to the root (the root node is artificial) # hence init the message with ones - self.T.root.up_message = self.get_state_array() + self.T.seed_node.up_message = self.get_state_array() def get_state_array(self): @@ -129,7 +128,7 @@ def normalize(self, clade): clade.prob/=np.repeat(np.array([np.sum(clade.prob, axis=1)]).T, self.nstates, axis=1) if np.isnan(np.sum(clade.prob[:])): - print "encountered nan in ancestral inference in clade ", clade.name + print "encountered nan in ancestral inference in clade ", clade.taxon.label print np.isnan(clade.prob).nonzero() def log_normalize(self, clade): @@ -149,21 +148,21 @@ def calc_down_messages(self,clade): recursively calculates the messages passed on the parents of each node input: clade whose down_message is to be calculated ''' - if clade.is_terminal(): + if clade.is_leaf(): # if clade is terminal, the sequence is fix and we can emit the state probabilities - clade.down_message = self.calc_state_probabilites(clade.prob, clade.branch_length) - #print "down clade", clade.name, 'min:', np.min(clade.down_message) + clade.down_message = self.calc_state_probabilites(clade.prob, clade.edge_length) + #print "down clade", clade.taxon.label, 'min:', np.min(clade.down_message) clade.down_message[clade.down_message<1e-30] = 1e-30 else: #otherwise, multiply all down messages from children, normalize and pass down clade.prob[:]=0 - for child in clade.clades: + for child in clade.child_nodes(): self.calc_down_messages(child) clade.prob+=np.log(child.down_message) self.log_normalize(clade) - clade.down_message = self.calc_state_probabilites(clade.prob, clade.branch_length) - #print "down clade", clade.name, 'min:', np.min(clade.down_message) + clade.down_message = self.calc_state_probabilites(clade.prob, clade.edge_length) + #print "down clade", clade.taxon.label, 'min:', np.min(clade.down_message) clade.down_message[clade.down_message<1e-30] = 1e-30 def calc_up_messages(self,clade): @@ -171,26 +170,26 @@ def calc_up_messages(self,clade): calculate the messages that are passed on to the children input calde for which these are to calculated ''' - if clade.is_terminal(): + if clade.is_leaf(): #nothing to be done for terminal nodes return else: #else, loop over children and calculate the message for each of the children - for child in clade.clades: + for child in clade.child_nodes(): # initialize with the message comming from the parent clade.prob[:]=np.log(clade.up_message) - for child2 in clade.clades: + for child2 in clade.child_nodes(): if child2 != child: #multiply with the down message from each of the children, but skip child 1 clade.prob+=np.log(child2.down_message) # normalize, adjust for modifications along the branch, and save. 
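+ # (this is the standard upward/downward message passing of belief propagation on trees)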
self.log_normalize(clade) - child.up_message = self.calc_state_probabilites(clade.prob, child.branch_length) - #print "up clade", clade.name, 'min:', np.min(child.up_message) + child.up_message = self.calc_state_probabilites(clade.prob, child.edge_length) + #print "up clade", clade.taxon.label, 'min:', np.min(child.up_message) child.up_message[child.up_message<1e-30] = 1e-30 # do recursively for all children - for child in clade.clades: + for child in clade.child_nodes(): self.calc_up_messages(child) def calc_marginal_probabilities(self,clade): @@ -198,13 +197,13 @@ def calc_marginal_probabilities(self,clade): calculate the marginal probabilities by multiplying all incoming messages ''' clade.prob[:]=np.log(clade.up_message) - for child in clade.clades: + for child in clade.child_nodes(): clade.prob+=np.log(child.down_message) # normalize and continue for all children self.log_normalize(clade) - #print clade.name, np.max(1.0-np.max(clade.prob, axis=1)) - for child in clade.clades: + #print clade.taxon.label, np.max(1.0-np.max(clade.prob, axis=1)) + for child in clade.child_nodes(): self.calc_marginal_probabilities(child) def calc_most_likely_sequences(self, clade): @@ -218,7 +217,7 @@ def calc_most_likely_sequences(self, clade): setattr(clade, self.attrname, seq) # repeat for all children - for child in clade.clades: + for child in clade.child_nodes(): self.calc_most_likely_sequences(child) @@ -228,43 +227,34 @@ def calc_ancestral_sequences(self): and the marginal probabilities for each position at each internal node. ''' print "--- Forward pass at " + time.strftime("%H:%M:%S") + " ---" - self.calc_down_messages(self.T.root) + self.calc_down_messages(self.T.seed_node) print "--- Backward pass at " + time.strftime("%H:%M:%S") + " ---" - self.calc_up_messages(self.T.root) + self.calc_up_messages(self.T.seed_node) print "--- Calculating marginals at " + time.strftime("%H:%M:%S") + " ---" - self.calc_marginal_probabilities(self.T.root) + self.calc_marginal_probabilities(self.T.seed_node) print "--- Most likely nucleotides at " + time.strftime("%H:%M:%S") + " ---" - self.calc_most_likely_sequences(self.T.root) + self.calc_most_likely_sequences(self.T.seed_node) def cleanup_tree(self, attrnames=['prob', 'down_message', 'up_message']): '''Clean up pollution attributes of leaves''' - nodes = self.T.get_terminals() + self.T.get_nonterminals() + nodes = self.T.postorder_node_iter() for leaf in nodes: for attrname in attrnames: if hasattr(leaf, attrname): delattr(leaf, attrname) def main(tree, viruses): + from seq_util import json_to_Bio_alignment + from tree_util import json_to_dendropy print "--- Ancestral inference at " + time.strftime("%H:%M:%S") + " ---" - from Bio import Phylo aln = json_to_Bio_alignment(viruses) - tree = to_Biopython(tree) print "--- Set-up ancestral inference at " + time.strftime("%H:%M:%S") + " ---" anc_seq = ancestral_sequences(tree, aln, seqtype='str') anc_seq.calc_ancestral_sequences() anc_seq.cleanup_tree() out_fname = "data/tree_ancestral.json" - return json_to_dendropy(BioPhylo_to_json(anc_seq.T.root)) - -def test(): - from Bio import Phylo, AlignIO - from StringIO import StringIO - tree = Phylo.read(StringIO('((A/Tbilisi/GNCDC0557/2012:0.00877583894583227123,((A/SHIMANE/194/2011:0.00400338899817878624,A/KUMAMOTO-C/36/2011:0.00088323906079086276):0.01868163425362149091,A/Kenya/222/2012:0.00506846715254004234):0.00000042200543794419):0.05229485694422789793,A/Beijing/32/1992:0.05229485694422789793);'), format = 'newick') - aln = AlignIO.read('../scratch/test_aln.phyx', 
'phylip-relaxed') - anc_seq = ancestral_sequences(tree=tree, aln=aln, seqtype='str') - anc_seq.calc_ancestral_sequences() - return anc_seq.T + return json_to_dendropy(anc_seq.T.seed_node) if __name__=="__main__": tree = main() diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index 6c228477..34524741 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -22,6 +22,15 @@ def refine_generic(self): self.reduce() self.define_trunk() + # make an amino acid alignment + from Bio.Align import MultipleSeqAlignment + from Bio.Seq import Seq + from Bio.SeqRecord import SeqRecord + tmp_aaseqs = [SeqRecord(Seq(node.aa_seq), id=node.strain, annotations = {'num_date':node.num_date}) for node in self.tree.leaf_iter()] + tmp_aaseqs.sort(key = lambda x:x.annotations['num_date']) + self.aa_aln = MultipleSeqAlignment(tmp_aaseqs) + + def remove_outgroup(self): """Reroot tree to outgroup""" if self.outgroup['strain'] in self.node_lookup: @@ -71,6 +80,7 @@ def translate_all(self): for node in self.tree.postorder_node_iter(): node.aa_seq = translate(node.seq[self.cds[0]:self.cds[1]]) + + def get_yvalue(self, node): """Return y location based on recursive mean of daughter locations""" if hasattr(node, 'yvalue'): diff --git a/augur/src/virus_clean.py b/augur/src/virus_clean.py index a14a025c..75e82f0f 100644 --- a/augur/src/virus_clean.py +++ b/augur/src/virus_clean.py @@ -68,4 +68,5 @@ def clean_generic(self): self.remove_insertions() self.clean_ambiguous() self.clean_distances() + self.viruses.sort(key=lambda x:x.num_date) print "Number of viruses after outlier filtering:",len(self.viruses) diff --git a/augur/src/virus_filter.py b/augur/src/virus_filter.py index f728f840..5b69cb00 100644 --- a/augur/src/virus_filter.py +++ b/augur/src/virus_filter.py @@ -11,7 +11,7 @@ class virus_filter(object): - def __init__(self,viruses=None): + def __init__(self,viruses=None, **kwargs): if viruses is None: viruses=[] self.viruses = viruses self.strain_lookup = {} @@ -157,13 +157,13 @@ def select_viruses_global(self, priority_viruses,other_viruses, y, m, viruses_pe class flu_filter(virus_filter): - def __init__(self,fasta_fname, fasta_header=None): + def __init__(self,fasta_fname, fasta_header=None, **kwargs): if fasta_header is None: self.fasta_header = {0:'strain', 1:'accession', 3:'passage', 5:'date' } else: self.fasta_header = fasta_header viruses = self.parse_gisaid(fasta_fname) - virus_filter.__init__(self, viruses) + virus_filter.__init__(self, viruses, **kwargs) self.fix_strain_names() self.vaccine_strains=[] From bf8b8d99a5d6d9667be9d03736ad2dfa8e10e279 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 1 Mar 2015 18:01:44 +0100 Subject: [PATCH 28/48] added some comments and more kwargs --- augur/src/H3N2_process.py | 17 ++++++++++------- augur/src/process.py | 23 +++++++++++++---------- augur/src/tree_refine.py | 25 +++++++++++++++++++------ augur/src/virus_clean.py | 12 ++++++++---- augur/src/virus_filter.py | 33 ++++++++++++++++++------------- 5 files changed, 70 insertions(+), 40 deletions(-) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index 2e7bed37..a3fb719b 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -29,19 +29,21 @@ 'frequency_stiffness':10.0, 'time_interval':(2012.0, 2015.1), 'pivots_per_year':12.0, - 'min_mutation_count':10, + 'min_freq':10, # define relevant clades in canonical HA1 numbering (+1) 'clade_designations': { "3c3.a":[(128,'A'), (142,'G'), (159,'S')], "3c3": [(128,'A'), (142,'G'), (159,'F')],
"3c2.a":[(144,'S'), (159,'Y'), (225,'D'), (311,'H'),(489,'N')], "3c2": [(144,'N'), (159,'F'),(225,'N'), (489,'N')] - } + }, + 'verbose':1 } class H3N2_filter(flu_filter): - def __init__(self,**kwargs): - flu_filter.__init__(self, virus_config['alignment_file'], virus_config['fasta_fields'], **kwargs) + def __init__(self,min_length = 987, **kwargs): + self.min_length = min_length + flu_filter.__init__(self, **kwargs) self.vaccine_strains =[ { "strain": "A/Wisconsin/67/2005", @@ -100,7 +102,7 @@ def __init__(self,**kwargs): class H3N2_clean(virus_clean): def __init__(self,**kwargs): - pass + virus_clean.__init__(self, **kwargs) def clean_outbreaks(self): """Remove duplicate strains, where the geographic location, date of sampling and sequence are identical""" @@ -262,8 +264,9 @@ def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwa parser.add_argument('--frequencies', default = False, action="store_true", help ="only estimate frequencies") params = parser.parse_args() params.cds = (48,None) - myH3N2 = H3N2_process(**params.__dict__) + virus_config.update(params.__dict__) + myH3N2 = H3N2_process(**virus_config) if params.test: myH3N2.load() else: - myH3N2.run(**params.__dict__) + myH3N2.run(**virus_config) diff --git a/augur/src/process.py b/augur/src/process.py index 9bf26ded..d0f1b5cd 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -11,10 +11,11 @@ class process(object): """generic template class for processing virus sequences into trees""" def __init__(self, tree_fname = 'data/tree.pkl', virus_fname = 'data/virus.pkl', - frequency_fname = 'data/frequency.pkl',**kwargs): + frequency_fname = 'data/frequency.pkl', min_freq = 0.01, **kwargs): self.tree_fname = tree_fname self.virus_fname = virus_fname self.frequency_fname = frequency_fname + self.min_freq = min_freq def dump(self): import cPickle @@ -41,6 +42,10 @@ def load(self): self.frequencies = cPickle.load(infile) def align(self): + ''' + aligns viruses using mafft. produces temporary files and deletes those at the end + after this step, self.viruses is a BioPhython multiple alignment object + ''' SeqIO.write([SeqRecord(Seq(v['seq']), id=v['strain']) for v in self.viruses], "temp_in.fasta", "fasta") os.system("mafft --nofft temp_in.fasta > temp_out.fasta") aln = AlignIO.read('temp_out.fasta', 'fasta') @@ -57,6 +62,10 @@ def align(self): self.viruses = aln def infer_tree(self, raxml_time_limit): + ''' + builds a tree from the alignment using fasttree and RAxML. raxml runs for + raxml_time_limit and is terminated thereafter. raxml_time_limit can be 0. 
+ ''' + def cleanup(): for file in glob.glob("RAxML_*") + glob.glob("temp*") + ["raxml_tree.newick", "initial_tree.newick"]: try: @@ -110,12 +119,6 @@ def infer_ancestral(self): from tree_ancestral import ancestral_sequences anc_seq = ancestral_sequences(self.tree, self.viruses,seqtype='str') anc_seq.calc_ancestral_sequences() - # copy the inferred sequences into the biopython tree -# for node, anc_node in izip(self.tree.postorder_internal_node_iter(), anc_seq.T.get_nonterminals(order='postorder')): -# node.seq = anc_node.seq -# for node, anc_node in izip(self.tree.leaf_iter(), anc_seq.T.get_terminals()): -# node.seq = anc_node.seq - def temporal_regional_statistics(self): ''' @@ -140,7 +143,7 @@ def temporal_regional_statistics(self): self.regions = sorted(regions) self.region_totals = {reg:sum(val[reg] for val in self.date_region_count.values()) for reg in self.regions} - def determine_variable_positions(self, min_freq = 0.01): + def determine_variable_positions(self): ''' calculates nucleoties_frequencies and aa_frequencies at each position of the alignment also computes consensus sequences and positions at which the major allele is at frequency less than 1-min_freq results are stored as @@ -156,7 +159,7 @@ def determine_variable_positions(self): for ni,nuc in enumerate(self.nuc_alphabet): self.nucleoties_frequencies[ni,:]=(aln_array==nuc).mean(axis=0) - self.variable_nucleotides = np.where(np.max(self.nucleoties_frequencies,axis=0)<1.0-min_freq)[0] + self.variable_nucleotides = np.where(np.max(self.nucleoties_frequencies,axis=0)<1.0-self.min_freq)[0] self.consensus_nucleotides = "".join(np.fromstring(self.nuc_alphabet, 'S1')[np.argmax(self.nucleoties_frequencies,axis=0)]) if hasattr(self, 'aa_aln'): @@ -166,5 +169,5 @@ def determine_variable_positions(self, min_freq = 0.01): for ai,aa in enumerate(self.aa_alphabet): self.aa_frequencies[ai,:]=(aln_array==aa).mean(axis=0) - self.variable_aa = np.where(np.max(self.aa_frequencies,axis=0)<1.0-min_freq)[0] + self.variable_aa = np.where(np.max(self.aa_frequencies,axis=0)<1.0-self.min_freq)[0] self.consensus_aa = "".join(np.fromstring(self.aa_alphabet, 'S1')[np.argmax(self.aa_frequencies,axis=0)]) diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index 34524741..960e0444 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -9,10 +9,22 @@ from tree_util import * class tree_refine(object): - def __init__(self,cds = (0,None), **kwargs): - self.cds = cds + def __init__(self,cds = (0,None), max_length = 0.01, dt=1, **kwargs): + ''' + parameters: + cds -- coding region + max_length -- maximal length of external branches + dt -- time interval used to define the trunk of the tree + ''' + self.cds = cds + self.max_length = max_length + self.dt = dt def refine_generic(self): + ''' + run through the generic refining methods, + will add strain attributes to nodes and translate the sequences -> produces aa_aln + ''' self.node_lookup = {node.taxon.label:node for node in self.tree.leaf_iter()} self.remove_outgroup() self.ladderize() @@ -51,13 +63,13 @@ def collapse(self): if edge.is_internal() and edge.head_node.seq==edge.tail_node.seq: edge.collapse() - def reduce(self, max_length =0.01): + def reduce(self): """ Remove outlier tips Remove internal nodes left as orphan tips """ for node in self.tree.postorder_node_iter(): - if node.edge_length > max_length and node.is_leaf(): + if node.edge_length > self.max_length and node.is_leaf(): parent = node.parent_node parent.remove_child(node) for node in self.tree.postorder_node_iter(): if node.is_leaf() and not hasattr(node, 'strain'): parent = node.parent_node parent.remove_child(node) @@ -108,9
+120,10 @@ def add_node_attributes(self): for attr in ['strain', 'date', 'accession', 'num_date', 'db', 'region', 'country']: node.__setattr__(attr, v.__getattribute__(attr)) - def define_trunk(self, dt = 1): + def define_trunk(self, dt = None): """Trace current lineages backward to define trunk""" - + if dt is None: + dt = self.dt # Find most recent tip most_recent_date = -1e10 for node in self.tree.leaf_iter(): diff --git a/augur/src/virus_clean.py b/augur/src/virus_clean.py index 75e82f0f..50fdf1ca 100644 --- a/augur/src/virus_clean.py +++ b/augur/src/virus_clean.py @@ -11,8 +11,12 @@ class virus_clean(object): """docstring for virus_clean""" - def __init__(self): - pass + def __init__(self,n_std = 5, **kwargs): + ''' + parameters + n_std -- number of standard deviations accepted in molecular clock filter + ''' + self.n_std = n_std def remove_insertions(self): outgroup_ok = np.array(self.sequence_lookup[self.outgroup['strain']])!='-' @@ -43,7 +47,7 @@ def distance_from_outgroup(self): outgroup_seq = self.sequence_lookup[self.outgroup['strain']].seq return np.array([hamming_distance(x.seq, outgroup_seq) for x in self.viruses]) - def clean_distances(self, n_std = 5): + def clean_distances(self): """Remove viruses that don't follow a loose clock """ times = self.times_from_outgroup() distances = self.distance_from_outgroup() @@ -56,7 +60,7 @@ def clean_distances(self): print "\tresiduals sd: " + str(r_sd) new_viruses = [] for (v,r) in izip(self.viruses,residuals): # filter viruses more than 5 sds up or down - if np.abs(r)1: diff --git a/augur/src/virus_filter.py b/augur/src/virus_filter.py index 5b69cb00..49a43462 100644 --- a/augur/src/virus_filter.py +++ b/augur/src/virus_filter.py @@ -11,21 +11,27 @@ class virus_filter(object): - def __init__(self,viruses=None, **kwargs): + def __init__(self,viruses=None, date_spec='full', **kwargs): + ''' + parameters: + viruses -- a list of viruses.
dict structures as of now + date_spec -- if 'full', dates with day are required, if 'year', only year is accepted + ''' if viruses is None: viruses=[] self.viruses = viruses self.strain_lookup = {} self.outgroup = None + self.date_spec = date_spec - def filter_generic(self, min_length=None, date_spec = 'full', prepend_strains = None): + def filter_generic(self, prepend_strains = None): ''' filter viruses by length and accurate date, sort, add additional strains such as vaccine strains that are preferentially retained and prune to unique strains ''' print len(self.viruses), "initial viruses" - if min_length is not None: - self.filter_length(min_length) - print len(self.viruses), "after filtering by length >=", min_length + if hasattr(self, 'min_length'): + self.filter_length(self.min_length) + print len(self.viruses), "after filtering by length >=", self.min_length self.filter_date(date_spec) print len(self.viruses), "after filtering for precise dates" @@ -60,9 +66,9 @@ def filter_length(self, min_length): self.viruses = filter(lambda v: len(v['seq']) >= min_length, self.viruses) def filter_date(self, date_spec): - if date_spec=='full': + if self.date_spec=='full': self.viruses = filter(lambda v: re.match(r'\d\d\d\d-\d\d-\d\d', v['date']) != None, self.viruses) - elif date_spec=='year': + elif self.date_spec=='year': self.viruses = filter(lambda v: re.match(r'\d\d\d\d', v['date']) != None, self.viruses) def subsample(self, years_back, viruses_per_month, prioritize = None, all_priority=False, region_specific = True): @@ -157,12 +163,13 @@ def select_viruses_global(self, priority_viruses,other_viruses, y, m, viruses_pe class flu_filter(virus_filter): - def __init__(self,fasta_fname, fasta_header=None, **kwargs): + def __init__(self, alignment_file='', fasta_fields=None, **kwargs): if fasta_fields is None: - self.fasta_header = {0:'strain', 1:'accession', 3:'passage', 5:'date' } + self.fasta_fields = {0:'strain', 1:'accession', 3:'passage', 5:'date' } else: - self.fasta_header = fasta_header - viruses = self.parse_gisaid(fasta_fname) + self.fasta_fields = fasta_fields + self.alignment_file = alignment_file + viruses = self.parse_gisaid(self.alignment_file) virus_filter.__init__(self, viruses, **kwargs) self.fix_strain_names() self.vaccine_strains=[] @@ -177,7 +184,7 @@ def parse_gisaid(self, fasta): else: for record in SeqIO.parse(handle, "fasta"): words = record.description.replace(">","").replace(" ","").split('|') - v = {key:words[ii] for ii, key in self.fasta_header.iteritems()} + v = {key:words[ii] for ii, key in self.fasta_fields.iteritems()} v['db']="GISAID" v['seq']= str(record.seq) if 'passage' not in v: v['passage']='' From 6285fc1ed8c06afbbd9658bcbac52bd43f7234c6 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 1 Mar 2015 18:40:01 +0100 Subject: [PATCH 29/48] removed unused imports --- augur/src/H3N2_process.py | 10 ++++++++ augur/src/process.py | 46 ++++++++++++++++++++++++++++++++++--- augur/src/tree_ancestral.py | 3 ++- augur/src/tree_refine.py | 2 -- augur/src/tree_util.py | 24 ++++++++----------- augur/src/virus_clean.py | 1 - augur/src/virus_filter.py | 4 ++-- 7 files changed, 66 insertions(+), 24 deletions(-) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index a3fb719b..7cc3f78b 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -22,6 +22,7 @@ 'force_include':'source-data/HI_strains.txt', 'max_global':True, # sample as evenly as possible from different geographic regions 'cds':[48,-1], # define
the HA1 start i n 0 numbering + 'n_std':3, # standard deviations from clock # frequency estimation parameters 'aggregate_regions': [ ("global", None), ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), @@ -42,6 +43,10 @@ class H3N2_filter(flu_filter): def __init__(self,min_length = 987, **kwargs): + ''' + parameters + min_length -- minimal length for a sequence to be acceptable + ''' self.min_length = min_length flu_filter.__init__(self, **kwargs) self.vaccine_strains =[ @@ -251,6 +256,8 @@ def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwa print "--- Tree refine at " + time.strftime("%H:%M:%S") + " ---" self.refine() + self.export_to_auspice() + if __name__=="__main__": parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare of web visualization') @@ -264,7 +271,10 @@ def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwa parser.add_argument('--frequencies', default = False, action="store_true", help ="only estimate frequencies") params = parser.parse_args() params.cds = (48,None) + + # add all arguments to virus_config (possibly overriding) virus_config.update(params.__dict__) + # pass all these arguments to the processor: will be passed down as kwargs through all classes myH3N2 = H3N2_process(**virus_config) if params.test: myH3N2.load() diff --git a/augur/src/process.py b/augur/src/process.py index d0f1b5cd..88880213 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -4,18 +4,20 @@ from Bio.Seq import Seq import dendropy from tree_util import delimit_newick -from StringIO import StringIO -from itertools import izip import numpy as np class process(object): """generic template class for processing virus sequences into trees""" def __init__(self, tree_fname = 'data/tree.pkl', virus_fname = 'data/virus.pkl', - frequency_fname = 'data/frequency.pkl', min_freq = 0.01, **kwargs): + frequency_fname = 'data/frequency.pkl', auspice_frequency_fname ='../auspice/data/frequencies.json', + auspice_sequences_fname='../auspice/data/sequences.json', auspice_tree_fname='../auspice/data/tree.json', min_freq = 0.01, **kwargs): self.tree_fname = tree_fname self.virus_fname = virus_fname self.frequency_fname = frequency_fname self.min_freq = min_freq + self.auspice_tree_fname = auspice_tree_fname + self.auspice_sequences_fname = auspice_sequences_fname + self.auspice_frequency_fname = auspice_frequency_fname def dump(self): import cPickle @@ -41,6 +43,44 @@ def load(self): with open(self.frequency_fname, 'r') as infile: self.frequencies = cPickle.load(infile) + def export_to_auspice(self, tree_fields = [], tree_pop_list = []): + from tree_util import dendropy_to_json, all_descendants + from io_util import write_json, read_json + print "--- Streamline at " + time.strftime("%H:%M:%S") + " ---" + # Move sequence data to separate file + print "Writing sequences" + elems = {} + for node in self.tree: + if hasattr(node,"clade"): + elems[node.clade] = node.aa_seq + write_json(elems, self.auspice_sequences_fname, indent=None) + + print "writing tree" + self.tree_json = dendropy_to_json(self.tree.seed_node, tree_fields) + for node in all_descendants(self.tree_json): + for attr in tree_pop_list: + if attr in node: + node.pop(attr, None) + if "freq" in node: + for reg in node["freq"]: + try: + node["freq"][reg] = [round(x,3) for x in node["freq"][reg]] + except: + node["freq"][reg] = "undefined" + + write_json(self.tree_json, self.auspice_tree_fname, indent=None) + try: + read_json(self.auspice_tree_fname) + except: + 
print "Read failed, rewriting with indents" + write_json(self.tree_json, self.auspice_tree_fname, indent=1) + + # Include genotype frequencies + if hasattr(self, 'frequencies'): + write_json(self.frequencies, self.auspice_frequency_fname) + + + def align(self): ''' aligns viruses using mafft. produces temporary files and deletes those at the end diff --git a/augur/src/tree_ancestral.py b/augur/src/tree_ancestral.py index 0d90dc9d..410ff0bc 100644 --- a/augur/src/tree_ancestral.py +++ b/augur/src/tree_ancestral.py @@ -234,7 +234,8 @@ def calc_ancestral_sequences(self): self.calc_marginal_probabilities(self.T.seed_node) print "--- Most likely nucleotides at " + time.strftime("%H:%M:%S") + " ---" self.calc_most_likely_sequences(self.T.seed_node) - + self.cleanup_tree() + def cleanup_tree(self, attrnames=['prob', 'down_message', 'up_message']): '''Clean up pollution attributes of leaves''' diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index 960e0444..d39c655a 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -3,10 +3,8 @@ import os, re, time import dendropy -from io_util import * from seq_util import * from date_util import * -from tree_util import * class tree_refine(object): def __init__(self,cds = (0,None), max_length = 0.01, dt=1, **kwargs): diff --git a/augur/src/tree_util.py b/augur/src/tree_util.py index 86efe7cc..e17cf007 100644 --- a/augur/src/tree_util.py +++ b/augur/src/tree_util.py @@ -103,10 +103,10 @@ def get_dates(node): """Return ordered list of dates of descendants of a node""" return sorted([n['date'] for n in node.leaf_iter()]) -def dendropy_to_json(node, extra_attr = ['ep', 'ne', 'rb','tol', 'fitness', 'serum', 'dHI', 'cHI', 'HI_info']): +def dendropy_to_json(node, extra_attr = []): json = {} - str_attr = ['country','region','seq','aa_seq','clade','strain', 'date'] - num_attr = ['xvalue', 'yvalue', 'num_date', 'tip_index'] + str_attr = ['country','region','clade','strain', 'date'] + num_attr = ['xvalue', 'yvalue', 'num_date'] for prop in str_attr: if hasattr(node, prop): json[prop] = node.__getattribute__(prop) @@ -121,21 +121,15 @@ def dendropy_to_json(node, extra_attr = ['ep', 'ne', 'rb','tol', 'fitness', 'ser if hasattr(node, prop): json[prop] = node.__getattribute__(prop) - try: - if hasattr(node, 'freq') and node.freq is not None: - json['freq'] = {reg: [round(x, 3) for x in freq] if freq is not None else "undefined" for reg, freq in node.freq.iteritems()} - if hasattr(node, 'logit_freq') and node.logit_freq is not None: - json['logit_freq'] = {reg: [round(x,3) for x in freq] if freq is not None else "undefined" for reg, freq in node.logit_freq.iteritems()} - if hasattr(node, 'pivots'): - json['pivots'] = [round(x,3) for x in node.pivots] - if hasattr(node, 'virus_count'): - json['virus_count'] = {reg: [round(x,3) for x in vc[0]] if vc is not None else "undefined" for reg, vc in node.virus_count.iteritems()} - except: - import pdb; pdb.set_trace() + if hasattr(node, 'freq') and node.freq is not None: + json['freq'] = {reg: list(freq) if freq is not None else "undefined" for reg, freq in node.freq.iteritems()} + if hasattr(node, 'pivots'): + json['pivots'] = list(node.pivots) + if node.child_nodes(): json["children"] = [] for ch in node.child_nodes(): - json["children"].append(dendropy_to_json(ch)) + json["children"].append(dendropy_to_json(ch, extra_attr)) return json def json_to_dendropy(json): diff --git a/augur/src/virus_clean.py b/augur/src/virus_clean.py index 50fdf1ca..a84b199b 100644 --- a/augur/src/virus_clean.py +++ 
b/augur/src/virus_clean.py @@ -7,7 +7,6 @@ from Bio.Seq import Seq from scipy import stats import numpy as np -from io_util import * class virus_clean(object): """docstring for virus_clean""" diff --git a/augur/src/virus_filter.py b/augur/src/virus_filter.py index 49a43462..ac7b5b06 100644 --- a/augur/src/virus_filter.py +++ b/augur/src/virus_filter.py @@ -33,7 +33,7 @@ def filter_generic(self, prepend_strains = None): self.filter_length(self.min_length) print len(self.viruses), "after filtering by length >=", self.min_length - self.filter_date(date_spec) + self.filter_date() print len(self.viruses), "after filtering for precise dates" self.sort_length() if prepend_strains is not None: @@ -65,7 +65,7 @@ def filter_unique(self): def filter_length(self, min_length): self.viruses = filter(lambda v: len(v['seq']) >= min_length, self.viruses) - def filter_date(self, date_spec): + def filter_date(self): if self.date_spec=='full': self.viruses = filter(lambda v: re.match(r'\d\d\d\d-\d\d-\d\d', v['date']) != None, self.viruses) elif self.date_spec=='year': From 378669124382491967f4f8e6d48adccd981a43ad Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 1 Mar 2015 18:40:29 +0100 Subject: [PATCH 30/48] removed streamline, now part of generic process --- augur/src/streamline.py | 42 ----------------------------------------- 1 file changed, 42 deletions(-) delete mode 100644 augur/src/streamline.py diff --git a/augur/src/streamline.py b/augur/src/streamline.py deleted file mode 100644 index 0b20298d..00000000 --- a/augur/src/streamline.py +++ /dev/null @@ -1,42 +0,0 @@ -import time, os, shutil -from io_util import * -from tree_util import * - -def main(tree_json, frequencies): - """Prep tree for auspice, stripping sequence data""" - - print "--- Streamline at " + time.strftime("%H:%M:%S") + " ---" - - # Move sequence data to separate file - print "Writing sequences" - elems = {} - for node in all_descendants(tree_json): - if "clade" in tree_json: - elems[node["clade"]] = node["aa_seq"] - write_json(elems, "../auspice/data/sequences.json", indent=None) - - # Streamline tree for auspice - print "Writing streamlined tree" - for node in all_descendants(tree_json): - node.pop("seq", None) - node.pop("aa_seq", None) - node.pop("logit_freq", None) - for reg in node["freq"]: - try: - node["freq"][reg] = [round(x,3) for x in node["freq"][reg]] - except: - node["freq"][reg] = "undefined" - - out_fname_tree = "../auspice/data/tree.json" - write_json(tree_json, out_fname_tree, indent=None) - try: - read_json(out_fname_tree) - except: - print "Read failed, rewriting with indents" - write_json(tree_json, out_fname_tree, indent=1) - - # Include genotype frequencies - write_json(frequencies, "../auspice/data/frequencies.json") - -if __name__ == "__main__": - main() From 1ebef3a3d892f5b26b1184668542db31334772ef Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 1 Mar 2015 19:57:14 +0100 Subject: [PATCH 31/48] deleted nextflu_config.py, added frequencies -- preliminarily --- augur/nextflu_config.py | 24 -- augur/src/bernoulli_frequency.py | 520 +++++++++++++------------------ augur/src/process.py | 27 +- augur/src/tree_refine.py | 2 +- 4 files changed, 246 insertions(+), 327 deletions(-) delete mode 100644 augur/nextflu_config.py diff --git a/augur/nextflu_config.py b/augur/nextflu_config.py deleted file mode 100644 index d2470812..00000000 --- a/augur/nextflu_config.py +++ /dev/null @@ -1,24 +0,0 @@ -config = { - # data source and sequence parsing/cleaning/processing - 'virus':'H3N2', - 
'alignment_file':'data/20150222_all_H3N2_HA1.fasta', - 'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, - 'outgroup':'A/Beijing/32/1992', - 'force_include':'source-data/HI_strains.txt', - 'max_global':True, # sample as evenly as possible from different geographic regions - 'cds':[48,-1], # define the HA1 start i n 0 numbering - - # frequency estimation parameters - 'aggregate_regions': [ ("global", None), ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), - ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], - 'frequency_stiffness':10.0, - 'time_interval':(2012.0, 2015.1), - 'pivots_per_year':12.0, - 'min_mutation_count':10, - # define relevant clades in canonical HA1 numbering (+1) - 'clade_designations': { "3c3.a":[(128,'A'), (142,'G'), (159,'S')], - "3c3": [(128,'A'), (142,'G'), (159,'F')], - "3c2.a":[(144,'S'), (159,'Y'), (225,'D'), (311,'H'),(489,'N')], - "3c2": [(144,'N'), (159,'F'),(225,'N'), (489,'N')] - } -} diff --git a/augur/src/bernoulli_frequency.py b/augur/src/bernoulli_frequency.py index dfe08b90..d444a0a0 100644 --- a/augur/src/bernoulli_frequency.py +++ b/augur/src/bernoulli_frequency.py @@ -7,17 +7,6 @@ from date_util import * import numpy as np -pc=1e-4 -dfreq_pc = 1e-2 -time_interval = (2012.0, 2015.1) -flu_stiffness = 10.0 -pivots_per_year = 12.0 -relevant_pos_cutoff = 0.1 -inertia = 0.7 # fraction of previous frequency changes that is carried over -window_size = 20 # smooting window -extra_pivots=5 -tol = 1e-4 -reg = 1e-6 debug = False cols = np.array([(166,206,227),(31,120,180),(178,223,138),(51,160,44),(251,154,153),(227,26,28),(253,191,111),(255,127,0),(202,178,214),(106,61,154)], dtype=float)/255 @@ -48,7 +37,7 @@ def fix_freq(freq, pc): freq[np.isnan(freq)]=pc return np.minimum(1-pc, np.maximum(pc,freq)) -def get_pivots(start=None, stop=None): +def get_pivots(start=None, stop=None, pivots_per_year=6): return np.arange(np.floor(time_interval[0]*pivots_per_year), np.ceil(time_interval[1]*pivots_per_year)+0.5, 1.0)/pivots_per_year def get_extrapolation_pivots(start=None, dt=0.5): @@ -93,11 +82,12 @@ class frequency_estimator(object): genetic drift, i.e., sampling variation. ''' - def __init__(self, observations, pivots = None, stiffness = 20.0, logit=False, verbose = 0): + def __init__(self, observations, pivots = None, stiffness = 20.0, inertia = 0.0, logit=False, verbose = 0): self.tps = np.array([x[0] for x in observations]) self.obs = np.array([x[1]>0 for x in observations]) self.stiffness = stiffness - self.interolation_type = 'linear' + self.inertia = inertia + self.interpolation_type = 'linear' self.logit = logit self.verbose=verbose # make sure they are searchsorted @@ -106,7 +96,7 @@ def __init__(self, observations, pivots = None, stiffness = 20.0, logit=False, v self.full_obs = self.obs[tmp] if pivots is None: - self.final_pivot_tps = get_pivots(self.tps[0], self.tps[1]) + self.final_pivot_tps = get_pivots(self.tps[0], self.tps[-1]) elif np.isscalar(pivots): self.final_pivot_tps = np.linspace(self.tps[0], self.tps[-1], pivots) else: @@ -136,7 +126,7 @@ def stiffLH(self, pivots): tmp_freq = fix_freq(freq,dfreq_pc) # return wright fisher diffusion likelihood for frequency change. 
# return -0.25*self.stiffness*np.sum(dfreq**2/np.diff(self.pivot_tps)/pq(fix_freq(freq[:-1],dfreq_pc))) - return -0.25*self.stiffness*(np.sum((dfreq[1:] - inertia*dfreq[:-1])**2/(dt[1:]*pq(tmp_freq[1:-1]))) + return -0.25*self.stiffness*(np.sum((dfreq[1:] - self.inertia*dfreq[:-1])**2/(dt[1:]*pq(tmp_freq[1:-1]))) +dfreq[0]**2/(dt[0]*pq(tmp_freq[0]))) @@ -209,298 +199,228 @@ def learn(self): self.final_pivot_freq[last_pivot:] = self.final_pivot_freq[last_pivot-1] self.frequency_estimate = interp1d(self.final_pivot_tps, self.final_pivot_freq, kind=self.interolation_type, bounds_error=False) -def estimate_sub_frequencies(node, all_dates, tip_to_date_index, threshold=50, region_name="global"): - # extract time points and the subset of observations that fall in the clade. - tps = all_dates[tip_to_date_index[node.tips]] - start_index = max(0,np.searchsorted(tps, time_interval[0])) - stop_index = min(np.searchsorted(tps, time_interval[1]), all_dates.shape[0]-1) - tps = tps[start_index:stop_index] - # we estimate frequencies of subclades, they will be multiplied by the - # frequency of the parent node and corrected for the frequency of sister clades - # already fit - if node.freq[region_name] is None: - frequency_left=None - else: - frequency_left = np.array(node.freq[region_name]) - ci=0 - # need to resort, since the clade size order might differs after subsetting to regions - children_by_size = sorted(node.child_nodes(), key = lambda x:len(x.tips), reverse=True) - for child in children_by_size[:-1]: # clades are ordered by decreasing size - if len(child.tips)threshold: + fe = frequency_estimator(zip(tps, obs), pivots=self.pivots, + stiffness=self.stiffness*float(len(observations))/len(self.viruses), + logit=True, verbose = 0) fe.learn() - - # assign the frequency vector to the node - child.freq[region_name] = frequency_left * logit_inv(fe.final_pivot_freq) - child.logit_freq[region_name] = logit_transform(child.freq[region_name]) - - # update the frequency remaining to be explained and prune explained observations - frequency_left *= (1.0-logit_inv(fe.final_pivot_freq)) - tps_left = np.ones_like(tps,dtype=bool) - tps_left[obs]=False # prune observations from clade - tps = tps[tps_left] - ci+=1 - - # if the above loop finished assign the frequency of the remaining clade to the frequency_left - if ci==len(node.child_nodes())-1 and frequency_left is not None: - last_child = children_by_size[-1] - last_child.freq[region_name] = frequency_left - last_child.logit_freq[region_name] = logit_transform(last_child.freq[region_name]) - else: # broke out of loop because clades too small. - for child in children_by_size[ci:]: # assign freqs of all remaining clades to None. 
- child.freq[region_name] = None - child.logit_freq[region_name] = None - # recursively repeat for subclades - for child in node.child_nodes(): - estimate_sub_frequencies(child, all_dates, tip_to_date_index, threshold, region_name) - -def estimate_tree_frequencies(tree, threshold = 20, regions=None, region_name = None): - ''' - loop over nodes of the tree and estimate frequencies of all clade above a certain size - ''' - all_dates = [] - # loop over all nodes, make time ordered lists of tips, restrict to the specified regions - tip_index_region_specific = 0 - if not hasattr(tree.seed_node, "virus_count"): tree.seed_node.virus_count = {} - for node in tree.postorder_node_iter(): - tmp_tips = [] - if node.is_leaf(): - if regions is None or node.region in regions: - all_dates.append(node.num_date) - tmp_tips.append((tip_index_region_specific, node.num_date)) - tip_index_region_specific +=1 - for child in node.child_nodes(): - tmp_tips.extend(child.tips) - node.tips = np.array([x for x in sorted(tmp_tips, key = lambda x:x[1] )]) - if not hasattr(node, "freq"): node.freq = {} - if not hasattr(node, "logit_freq"): node.logit_freq = {} - - # erase the dates from the tip lists and cast to int such that they can be used for indexing - for node in tree.postorder_node_iter(): - if len(node.tips.shape)==2: - node.tips = np.array(node.tips[:,0], dtype=int) + return fe.frequency_estimate, (tps,obs) else: - node.tips = np.array([], dtype=int) - - # sort the dates and provide a reverse ordering as a mapping of tip indices to dates - all_dates = np.array(all_dates) - leaf_order = np.argsort(all_dates) - reverse_order = np.argsort(leaf_order) - all_dates = all_dates[leaf_order] - - if regions is None: - region_name="global" - elif region_name is None: - region_name = ",".join(regions) - # set the frequency of the root node to 1, the logit frequency to a large value - tree.seed_node.pivots = get_pivots(time_interval[0], time_interval[1]) - tree.seed_node.virus_count[region_name] = np.histogram(all_dates, bins = tree.seed_node.pivots) - tree.seed_node.freq[region_name] = np.ones_like(tree.seed_node.pivots) - tree.seed_node.logit_freq[region_name] = 10*np.ones_like(tree.seed_node.pivots) - - # start estimating frequencies of subclades recursively - estimate_sub_frequencies(tree.seed_node, all_dates, reverse_order, threshold = threshold, region_name = region_name) - - -def estimate_genotype_frequency(tree, gt, time_interval=None, regions = None, relevant_pos = None): - ''' - estimate the frequency of a particular genotype specified - gt -- [(position, amino acid), ....] 
- ''' - all_dates = [] - observations = [] - total_leaf_count = 0 - for node in tree.leaf_iter(): - total_leaf_count+=1 - if isinstance(gt, basestring): - if relevant_pos is None: - is_gt = gt==node.aa_seq - else: - is_gt = gt==reduce_genotype(node.aa_seq, relevant_pos) + print "too few observations" + return None, (tps, obs) + + def get_sub_alignment(self, regions=None): + from Bio.Align import MultipleSeqAlignment + sub_aln = [] + all_dates = [] + for seq in self.aa_aln: + if regions is None or seq.annotations['region'] in regions: + seq_date = seq.annotations['num_date'] + if seq_date>=self.time_interval[0] and seq_date < self.time_interval[1]: + sub_aln.append(seq) + all_dates.append(seq_date) + return MultipleSeqAlignment([sub_aln]) + + def determine_mutation_frequencies(self, regions=None, threshold=0.01): + ''' + determine the abundance of all single nucleotide variants and estimate the + frequency trajectory of the top 10, plot those optionally + ''' + sub_aln = self.get_sub_alignment(regions) + mutation_frequencies = {"pivots":list(self.pivots)} + for pos in xrange(sub_aln.get_alignment_length()): + for ai, aa in self.aa_alphabet: + if self.aa_frequencies[ai,pos]>threshold and self.aa_frequencies[ai,pos]<1.0-threshold: + print "estimating freq of ", mut, "total count:", count + freq, (tps, obs) = self.estimate_genotype_frequency(sub_aln, [(pos, aa)]) + if freq is not None: + mutation_frequencies[str(pos+1)+aa] = list(np.round(logit_inv(freq.y),3)) + + def determine_genotype_frequencies(self, regions=None, threshold=0.1): + ''' + determine the abundance of all single nucleotide variants and estimate the + frequency trajectory of the top 10, plot those optionally + ''' + sub_aln = self.get_sub_alignment(regions) + genotype_frequencies = {"pivots":list(self.pivots)} + relevant_pos = np.where(1.0 - np.aa_frequencies.max(axis=0)>threshold) + for i1,pos1 in enumerate(relevant_pos[:-1]): + for pos2 in relevant_pos[i1+1:]: + for ai1, aa1 in self.aa_alphabet: + for ai2, aa2 in self.aa_alphabet: + if self.aa_frequencies[ai1,pos1]>0.3*threshold \ + and self.aa_frequencies[ai2,pos2]<0.3*threshold: + gt = [(pos1,aa1),(pos2,aa2)] + print "estimating freq of ", gt, "total count:", count + freq, (tps, obs) = self.estimate_genotype_frequency(sub_aln, gt) + gt_label = '/'.join(str(pos+1)+aa] for pos,aa in gt) + genotype_frequencies[gt_label] = list(np.round(logit_inv(freq.y),3)) + + + def determine_clade_frequencies(self, clades, regions=None): + ''' + loop over different clades and determine their frequencies + returns a dictionary with clades:frequencies + ''' + sub_aln = self.get_sub_alignment(regions) + clade_frequencies = {"pivots":list(self.pivots)} + + for ci, (clade_name, clade_gt) in enumerate(clades.iteritems()): + print "estimating frequency of clade", clade_name, clade_gt + freq, (tps, obs) = self.estimate_genotype_frequency(sub_aln, [(pos-1, aa) for pos, aa in clade_gt]) + clade_frequencies[clade_name] = list(np.round(logit_inv(freq.y),3)) + return clade_frequencies + + def estimate_sub_frequencies(self, node, all_dates, tip_to_date_index, threshold=50, region_name="global"): + # extract time points and the subset of observations that fall in the clade. 
+ tps = all_dates[tip_to_date_index[node.tips]] + start_index = max(0,np.searchsorted(tps, time_interval[0])) + stop_index = min(np.searchsorted(tps, time_interval[1]), all_dates.shape[0]-1) + tps = tps[start_index:stop_index] + # we estimate frequencies of subclades, they will be multiplied by the + # frequency of the parent node and corrected for the frequency of sister clades + # already fit + if node.freq[region_name] is None: + frequency_left=None else: - is_gt = all([node.aa_seq[pos]==aa for pos, aa in gt]) + frequency_left = np.array(node.freq[region_name]) + ci=0 + # need to resort, since the clade size order might differs after subsetting to regions + children_by_size = sorted(node.child_nodes(), key = lambda x:len(x.tips), reverse=True) + for child in children_by_size[:-1]: # clades are ordered by decreasing size + if len(child.tips)= time_interval[0]) and (node.num_date10: - fe = frequency_estimator(zip(tps, obs), pivots=pivots, - stiffness=flu_stiffness*float(len(observations))/total_leaf_count, - logit=True, verbose = 0) - fe.learn() - return fe.frequency_estimate, (tps,obs) - else: - return interp1d(pivots, np.zeros_like(pivots)), (tps,obs) + def all_genotypes(self, threshold = 0.1): + self.gt_frequencies = {} + for region_label, regions in region_list: + print "--- "+"determining genotype frequencies "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" + self.genotype_frequencies[region_label] = self.determine_genotype_frequencies(regions, threshold=threshold) -def determine_clade_frequencies(tree, clades, regions=None, plot=False): - ''' - loop over different clades and determine their frequencies - returns a dictionary with clades:frequencies - ''' - import matplotlib.pyplot as plt - xpol_pivots = get_extrapolation_pivots(time_interval[1], dt=0.5) - clade_frequencies = {"pivots":list(get_pivots(time_interval[0], time_interval[1])), - "xpol_pivots":list(xpol_pivots)} - - for ci, (clade_name, clade_gt) in enumerate(clades.iteritems()): - print "estimating frequency of clade", clade_name, clade_gt - freq, (tps, obs) = estimate_genotype_frequency(tree, [(pos-1, aa) for pos, aa in clade_gt], time_interval, regions) - clade_frequencies[clade_name] = list(np.round(logit_inv(freq.y),3)) - if plot: - grid_tps = np.linspace(time_interval[0], time_interval[1], 100) - plt.plot(grid_tps, logit_inv(freq(grid_tps)), label=clade_name, lw=2, c=cols[ci%len(cols)]) - if debug: - r_avg = running_average(obs, window_size) - plt.plot(tps, r_avg, c=cols[ci%len(cols)]) - return clade_frequencies - - -def determine_mutation_frequencies(tree, regions=None, threshold=50, plot=False): - ''' - determine the abundance of all single nucleotide variants and estimate the - frequency trajectory of the top 10, plot those optionally - ''' - import matplotlib.pyplot as plt - from collections import defaultdict - from itertools import izip - - mut_counts = defaultdict(int) - relevant_pos = defaultdict(int) - ref_seq = tree.seed_node.aa_seq - total_leaf_count = 0 - for node in tree.leaf_iter(): - if (node.num_date>= time_interval[0]) and (node.num_datethreshold and countrelevant_pos_cutoff: - pos = int(mut.split('_')[-1][:-1])-1 - relevant_pos.append(pos) - relevant_pos = sorted(set(relevant_pos)) - - return mutation_frequencies, relevant_pos - -def all_genotypes(tree, region_list, relevant_pos): - from collections import defaultdict - total_leaf_count = len(tree.leaf_nodes()) - gt_counts = defaultdict(int) - gt_frequencies = {} - # TODO: determine the number of relevant 2,3..mutation genotypes as in 
159F/225D - # TODO: for each such genotype above a cut-off, calculate frequency trajectories - # TODO: this is completely analoguous to mutations, just - for node in tree.leaf_iter(): - gt_counts[reduce_genotype(node.aa_seq, relevant_pos)]+=1 - for region_label, regions in region_list: - tc_in_region, sum_gts = 0,0 - gt_frequencies[region_label]={"pivots":list(get_pivots(time_interval[0], time_interval[1]))} - print "--- "+"determining genotype frequencies "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" - for gt, c in gt_counts.iteritems(): - print gt, c - tmp_freq , (tps, obs) = estimate_genotype_frequency(tree, gt, regions=regions, relevant_pos = relevant_pos) - gt_frequencies[region_label][gt] = list(logit_inv(tmp_freq.y)) - sum_gts+=np.sum(obs) - tc_in_region = len(tps) - print "region: ", sum_gts, "out of", tc_in_region, "(",total_leaf_count," in total)" - return gt_frequencies - - -def all_clades(tree, clades, region_list, plot=False): - clade_frequencies = {} - import matplotlib.pyplot as plt - for region_label, regions in region_list: - print "--- "+"determining clade frequencies "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" - if plot: - plt.figure("region "+region_label, figsize = (12,7)) - if regions is not None: plt.title("Region: "+", ".join(regions)) - clade_frequencies[region_label] = determine_clade_frequencies(tree, clades, regions=regions, plot=plot) - if plot: - plt.legend() - ticloc = np.arange(time_interval[0], int(time_interval[1])+1,1) - plt.xticks(ticloc, map(str, ticloc)) - plt.xlim([time_interval[0], time_interval[1]+1]) - plt.ylim([-0.05, 1.05]) - plt.grid() - plt.savefig('data/clade_frequencies_'+region_label+'.pdf') - return clade_frequencies + def all_clades(self): + clade_frequencies = {} + for region_label, regions in region_list: + print "--- "+"determining clade frequencies "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" + clade_frequencies[region_label] = self.determine_clade_frequencies(clades, regions=regions) def test(): import matplotlib.pyplot as plt diff --git a/augur/src/process.py b/augur/src/process.py index 88880213..cccb7289 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -79,8 +79,6 @@ def export_to_auspice(self, tree_fields = [], tree_pop_list = []): if hasattr(self, 'frequencies'): write_json(self.frequencies, self.auspice_frequency_fname) - - def align(self): ''' aligns viruses using mafft. 
produces temporary files and deletes those at the end @@ -211,3 +209,28 @@ def determine_variable_positions(self): self.variable_aa = np.where(np.max(self.aa_frequencies,axis=0)<1.0-self.min_freq)[0] self.consensus_aa = "".join(np.fromstring(self.aa_alphabet, 'S1')[np.argmax(self.aa_frequencies,axis=0)]) + + def estimate_frequencies(self, tasks = ['mutations','genotypes', 'clades', 'tree']): + import bernoulli_frequency as freq_est + plot=False + freq_est.flu_stiffness = config['frequency_stiffness'] + freq_est.time_interval = config['time_interval'] + freq_est.pivots_per_year = config['pivots_per_year'] + freq_est.relevant_pos_cutoff = 0.1 + + if 'mutations' in tasks or 'genotypes' in tasks: + self.frequencies['mutations'], relevant_pos = freq_est.all_mutations(self.tree, config['aggregate_regions'], + threshold = config['min_mutation_count'], plot=plot) + if 'genotypes' in tasks: + self.frequencies['genotypes'] = freq_est.all_genotypes(self.tree, config['aggregate_regions'], relevant_pos) + if 'clades' in tasks: + self.frequencies['clades'] = freq_est.all_clades(self.tree, config['clade_designations'], + config['aggregate_regions'], plot) + if any(x in tasks for x in ['mutations','clades', 'genotypes']): + write_json(self.frequencies, self.frequency_fname) + + if 'tree' in tasks: + for region_label, regions in config['aggregate_regions']: + print "--- "+"adding frequencies to tree "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" + freq_est.estimate_tree_frequencies(self.tree, threshold = 10, regions=regions, region_name=region_label) + diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index d39c655a..73588201 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -36,7 +36,7 @@ def refine_generic(self): from Bio.Align import MultipleSeqAlignment from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord - tmp_aaseqs = [SeqRecord(Seq(node.aa_seq), id=node.strain, annotations = {'num_date':node.num_date}) for node in self.tree.leaf_iter()] + tmp_aaseqs = [SeqRecord(Seq(node.aa_seq), id=node.strain, annotations = {'num_date':node.num_date, 'region':node.region}) for node in self.tree.leaf_iter()] tmp_aaseqs.sort(key = lambda x:x.annotations['num_date']) self.aa_aln = MultipleSeqAlignment(tmp_aaseqs) From 31e6015fbb3e267e197857bdda982a4f5fc7627a Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Sun, 1 Mar 2015 23:24:01 +0100 Subject: [PATCH 32/48] * corrected various little mistakes in the adoption of the frequency estimation * removed a few unneeded imports * added virus_stats to meta.json --- augur/src/H3N2_process.py | 15 ++- augur/src/bernoulli_frequency.py | 211 +++++++++++++++---------------- augur/src/process.py | 40 +++--- augur/src/tree_refine.py | 2 +- augur/src/virus_clean.py | 18 ++- augur/src/virus_filter.py | 3 +- 6 files changed, 147 insertions(+), 142 deletions(-) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index 7cc3f78b..258719aa 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -22,7 +22,7 @@ 'force_include':'source-data/HI_strains.txt', 'max_global':True, # sample as evenly as possible from different geographic regions 'cds':[48,-1], # define the HA1 start i n 0 numbering - 'n_std':3, # standard deviations from clock + 'n_std':5, # standard deviations from clock # frequency estimation parameters 'aggregate_regions': [ ("global", None), ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), @@ -37,7 +37,11 @@ "3c2.a":[(144,'S'), (159,'Y'), (225,'D'), (311,'H'),(489,'N')], "3c2": [(144,'N'), 
(159,'F'),(225,'N'), (489,'N')] }, - 'verbose':1 + 'verbose':2, + 'tol':1e-3, #tolerance for frequency optimization + 'pc':1e-3, #pseudocount for frequencies + 'extra_pivots': 6, # number of pivot point for or after the last observations of a mutations + 'inertia':0.7, # fraction of frequency change carry over in the stiffness term } @@ -252,12 +256,15 @@ def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwa self.dump() print "--- Infer ancestral sequences " + time.strftime("%H:%M:%S") + " ---" self.infer_ancestral() # -> every node has a sequence - self.dump() print "--- Tree refine at " + time.strftime("%H:%M:%S") + " ---" self.refine() + self.dump() - self.export_to_auspice() + print "--- Estimating frequencies at " + time.strftime("%H:%M:%S") + " ---" + self.estimate_frequencies() + self.dump() + self.export_to_auspice(tree_fields = ['ep', 'ne', 'rb']) if __name__=="__main__": parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare of web visualization') diff --git a/augur/src/bernoulli_frequency.py b/augur/src/bernoulli_frequency.py index d444a0a0..2b724af6 100644 --- a/augur/src/bernoulli_frequency.py +++ b/augur/src/bernoulli_frequency.py @@ -1,10 +1,6 @@ -# estimates clade frequencies using SMC +# estimates clade frequencies from scipy.interpolate import interp1d import time -from io_util import * -from tree_util import * -from seq_util import * -from date_util import * import numpy as np debug = False @@ -38,10 +34,10 @@ def fix_freq(freq, pc): return np.minimum(1-pc, np.maximum(pc,freq)) def get_pivots(start=None, stop=None, pivots_per_year=6): - return np.arange(np.floor(time_interval[0]*pivots_per_year), np.ceil(time_interval[1]*pivots_per_year)+0.5, 1.0)/pivots_per_year + return np.arange(np.floor(start*pivots_per_year), np.ceil(stop*pivots_per_year)+0.5, 1.0)/pivots_per_year def get_extrapolation_pivots(start=None, dt=0.5): - return np.arange(np.floor(time_interval[1]*pivots_per_year), np.ceil((dt+time_interval[1])*pivots_per_year)+0.5, 1.0)/pivots_per_year + return np.arange(np.floor(start*pivots_per_year), np.ceil((dt+start)*pivots_per_year)+0.5, 1.0)/pivots_per_year def logit_transform(freq): @@ -56,7 +52,7 @@ def logit_inv(logit_freq): def pq(p): return p*(1-p) -def logit_regularizer(logit_freqs): +def logit_regularizer(logit_freqs, reg): return reg*np.mean(np.abs(8-np.abs(logit_freqs))) # penalize too large or too small pivots def extrapolation(freq_interp,x): @@ -82,7 +78,8 @@ class frequency_estimator(object): genetic drift, i.e., sampling variation. 
''' - def __init__(self, observations, pivots = None, stiffness = 20.0, inertia = 0.0, logit=False, verbose = 0): + def __init__(self, observations, pivots = None, extra_pivots = 5, stiffness = 20.0, + inertia = 0.0, logit=False, verbose = 0, dfreq_pc = 1e-2, pc=1e-3, tol=1e-3, **kwarks): self.tps = np.array([x[0] for x in observations]) self.obs = np.array([x[1]>0 for x in observations]) self.stiffness = stiffness @@ -90,6 +87,11 @@ def __init__(self, observations, pivots = None, stiffness = 20.0, inertia = 0.0, self.interpolation_type = 'linear' self.logit = logit self.verbose=verbose + self.extra_pivots = extra_pivots + self.tol = tol + self.pc = pc + self.dfreq_pc = dfreq_pc + self.reg = 1e-6 # make sure they are searchsorted tmp = np.argsort(self.tps) self.full_tps = self.tps[tmp] @@ -109,7 +111,7 @@ def initial_guess(self, pivots, ws=50): pivot_freq = tmp_interpolator(pivots) pivot_freq[pivots<=tmp_interpolator.x[0]] = tmp_vals[0] pivot_freq[pivots>=tmp_interpolator.x[-1]] = tmp_vals[-1] - pivot_freq = fix_freq(pivot_freq, pc) + pivot_freq = fix_freq(pivot_freq, self.pc) if self.logit: self.pivot_freq = logit_transform(pivot_freq) @@ -123,25 +125,25 @@ def stiffLH(self, pivots): freq = pivots dfreq = np.diff(freq) dt = np.diff(self.pivot_tps) - tmp_freq = fix_freq(freq,dfreq_pc) + tmp_freq = fix_freq(freq,self.dfreq_pc) # return wright fisher diffusion likelihood for frequency change. - # return -0.25*self.stiffness*np.sum(dfreq**2/np.diff(self.pivot_tps)/pq(fix_freq(freq[:-1],dfreq_pc))) + # return -0.25*self.stiffness*np.sum(dfreq**2/np.diff(self.pivot_tps)/pq(fix_freq(freq[:-1],self.dfreq_pc))) return -0.25*self.stiffness*(np.sum((dfreq[1:] - self.inertia*dfreq[:-1])**2/(dt[1:]*pq(tmp_freq[1:-1]))) +dfreq[0]**2/(dt[0]*pq(tmp_freq[0]))) def logLH(self, pivots): - freq = interp1d(self.pivot_tps, pivots, kind=self.interolation_type) + freq = interp1d(self.pivot_tps, pivots, kind=self.interpolation_type) if self.logit: # if logit, convert to frequencies - estfreq = fix_freq(logit_inv(freq(self.tps)), pc) + estfreq = fix_freq(logit_inv(freq(self.tps)), self.pc) else: - estfreq = fix_freq(freq(self.tps), pc) + estfreq = fix_freq(freq(self.tps), self.pc) stiffness_LH = self.stiffLH(pivots) bernoulli_LH = np.sum(np.log(estfreq[self.obs])) + np.sum(np.log((1-estfreq[~self.obs]))) LH = stiffness_LH + bernoulli_LH if self.verbose>2: print "LH:",bernoulli_LH,stiffness_LH if self.logit: - return -LH/len(self.obs) + logit_regularizer(pivots) + return -LH/len(self.obs) + logit_regularizer(pivots, self.reg) else: return -LH/len(self.obs)+100000*(np.sum((pivots<0)*np.abs(pivots))+np.sum((pivots>1)*np.abs(pivots-1))) @@ -157,11 +159,11 @@ def learn(self): first_switch = self.tps[0] last_switch = self.tps[-1] if first_switch>self.final_pivot_tps[0] and first_switch < self.final_pivot_tps[-1]: - first_pivot = max(0, np.where(first_switch<=self.final_pivot_tps)[0][0] - extra_pivots) + first_pivot = max(0, np.where(first_switch<=self.final_pivot_tps)[0][0] - self.extra_pivots) else: first_pivot=0 if last_switchself.final_pivot_tps[0]: - last_pivot = min(len(self.final_pivot_tps), np.where(last_switch>self.final_pivot_tps)[0][-1]+extra_pivots) + last_pivot = min(len(self.final_pivot_tps), np.where(last_switch>self.final_pivot_tps)[0][-1]+self.extra_pivots) else: last_pivot = len(self.final_pivot_tps) tmp_pivots = self.final_pivot_tps[first_pivot:last_pivot] @@ -173,7 +175,7 @@ def learn(self): import pdb; pdb.set_trace() self.pivot_freq = self.initial_guess(tmp_pivots, ws=2*(min(50,len(self.obs))//2)) - 
self.frequency_estimate = interp1d(tmp_pivots, self.pivot_freq, kind=self.interolation_type, bounds_error=False) + self.frequency_estimate = interp1d(tmp_pivots, self.pivot_freq, kind=self.interpolation_type, bounds_error=False) if self.verbose: print "Initial pivots:", tmp_pivots steps= [4,2,1] @@ -186,9 +188,9 @@ def learn(self): self.pivot_freq = self.frequency_estimate(self.pivot_tps) # determine the optimal pivot freqquencies - self.pivot_freq = minimizer(self.logLH, self.pivot_freq, ftol = tol, xtol = tol, disp = self.verbose>0) + self.pivot_freq = minimizer(self.logLH, self.pivot_freq, ftol = self.tol, xtol = self.tol, disp = self.verbose>0) # instantiate an interpolation object based on the optimal frequency pivots - self.frequency_estimate = interp1d(self.pivot_tps, self.pivot_freq, kind=self.interolation_type, bounds_error=False) + self.frequency_estimate = interp1d(self.pivot_tps, self.pivot_freq, kind=self.interpolation_type, bounds_error=False) if min(np.diff(self.pivot_tps))<0.000001: print pivots if self.verbose: print "neg logLH using",len(self.pivot_tps),"pivots:", self.logLH(self.pivot_freq) @@ -197,30 +199,32 @@ def learn(self): self.final_pivot_freq[first_pivot:last_pivot]=self.pivot_freq self.final_pivot_freq[:first_pivot] = self.final_pivot_freq[first_pivot] self.final_pivot_freq[last_pivot:] = self.final_pivot_freq[last_pivot-1] - self.frequency_estimate = interp1d(self.final_pivot_tps, self.final_pivot_freq, kind=self.interolation_type, bounds_error=False) + self.frequency_estimate = interp1d(self.final_pivot_tps, self.final_pivot_freq, kind=self.interpolation_type, bounds_error=False) class virus_frequencies(object): - def __init__(self, pc=1e-4 ,dfreq_pc = 1e-2 ,time_interval = (2012.0, 2015.1) , - stiffness = 10.0 ,pivots_per_year = 12.0, inertia = 0.7, **kwarks): - self.pc = pc - self.dfreq_pc = dfreq_pc + def __init__(self, time_interval = (2012.0, 2015.1) , + stiffness = 10.0, pivots_per_year = 12.0, + clade_designations={}, aggregate_regions = None, + extra_pivots = 5, **kwarks): self.time_interval = time_interval self.stiffness = stiffness self.pivots_per_year = pivots_per_year - self.inertia = inertia + self.clade_designations=clade_designations + self.aggregate_regions = aggregate_regions self.pivots = get_pivots(self.time_interval[0], self.time_interval[1], self.pivots_per_year) - if not hasattr(self, 'nucleotide_frequencies'): - self.determine_variable_positions() + self.kwarks = kwarks + if not hasattr(self, "frequencies"): + self.frequencies = {} - def estimate_genotype_frequency(self, aln, gt, threshold = 10): + def estimate_genotype_frequency(self, aln, gt, threshold = 10, min_observations = -1): ''' estimate the frequency of a particular genotype specified gt -- [(position, amino acid), ....] 
''' all_dates = [seq.annotations['num_date'] for seq in aln] reduced_gt = tuple(aa for pos,aa in gt) - gts = zip(aln[:,pos] for pos, aa in gt) + gts = zip(*[list(aln[:,pos]) for pos, aa in gt]) observations = [x==reduced_gt for x in gts] all_dates = np.array(all_dates) @@ -228,14 +232,16 @@ def estimate_genotype_frequency(self, aln, gt, threshold = 10): tps = all_dates[leaf_order] obs = np.array(observations)[leaf_order] - if len(tps)>threshold: - fe = frequency_estimator(zip(tps, obs), pivots=self.pivots, + if len(tps)>threshold and np.sum(obs)>min_observations: + if self.verbose: + print "# of time points",len(tps), "# observations",sum(obs) + fe = frequency_estimator(zip(tps, obs), pivots=self.pivots, stiffness=self.stiffness*float(len(observations))/len(self.viruses), - logit=True, verbose = 0) + logit=True, **self.kwarks) fe.learn() return fe.frequency_estimate, (tps,obs) else: - print "too few observations" + if self.verbose: print "too few observations" return None, (tps, obs) def get_sub_alignment(self, regions=None): @@ -248,7 +254,7 @@ def get_sub_alignment(self, regions=None): if seq_date>=self.time_interval[0] and seq_date < self.time_interval[1]: sub_aln.append(seq) all_dates.append(seq_date) - return MultipleSeqAlignment([sub_aln]) + return MultipleSeqAlignment(sub_aln) def determine_mutation_frequencies(self, regions=None, threshold=0.01): ''' @@ -258,12 +264,14 @@ def determine_mutation_frequencies(self, regions=None, threshold=0.01): sub_aln = self.get_sub_alignment(regions) mutation_frequencies = {"pivots":list(self.pivots)} for pos in xrange(sub_aln.get_alignment_length()): - for ai, aa in self.aa_alphabet: + for ai, aa in enumerate(self.aa_alphabet): if self.aa_frequencies[ai,pos]>threshold and self.aa_frequencies[ai,pos]<1.0-threshold: - print "estimating freq of ", mut, "total count:", count + mut = str(pos+1)+aa + print "estimating freq of ", mut, "total frequency:", self.aa_frequencies[ai,pos] freq, (tps, obs) = self.estimate_genotype_frequency(sub_aln, [(pos, aa)]) if freq is not None: - mutation_frequencies[str(pos+1)+aa] = list(np.round(logit_inv(freq.y),3)) + mutation_frequencies[mut] = list(np.round(logit_inv(freq.y),3)) + return mutation_frequencies def determine_genotype_frequencies(self, regions=None, threshold=0.1): ''' @@ -272,19 +280,20 @@ def determine_genotype_frequencies(self, regions=None, threshold=0.1): ''' sub_aln = self.get_sub_alignment(regions) genotype_frequencies = {"pivots":list(self.pivots)} - relevant_pos = np.where(1.0 - np.aa_frequencies.max(axis=0)>threshold) + relevant_pos = np.where(1.0 - self.aa_frequencies.max(axis=0)>threshold)[0] for i1,pos1 in enumerate(relevant_pos[:-1]): for pos2 in relevant_pos[i1+1:]: - for ai1, aa1 in self.aa_alphabet: - for ai2, aa2 in self.aa_alphabet: + for ai1, aa1 in enumerate(self.aa_alphabet): + for ai2, aa2 in enumerate(self.aa_alphabet): if self.aa_frequencies[ai1,pos1]>0.3*threshold \ - and self.aa_frequencies[ai2,pos2]<0.3*threshold: + and self.aa_frequencies[ai2,pos2]>0.3*threshold: gt = [(pos1,aa1),(pos2,aa2)] - print "estimating freq of ", gt, "total count:", count - freq, (tps, obs) = self.estimate_genotype_frequency(sub_aln, gt) - gt_label = '/'.join(str(pos+1)+aa] for pos,aa in gt) - genotype_frequencies[gt_label] = list(np.round(logit_inv(freq.y),3)) - + if self.verbose: print "estimating freq of ", gt + freq, (tps, obs) = self.estimate_genotype_frequency(sub_aln, gt, min_observations = 10) + if freq is not None: + gt_label = '/'.join(str(pos+1)+aa for pos,aa in gt) + 
genotype_frequencies[gt_label] = list(np.round(logit_inv(freq.y),3)) + return genotype_frequencies def determine_clade_frequencies(self, clades, regions=None): ''' @@ -297,14 +306,15 @@ def determine_clade_frequencies(self, clades, regions=None): for ci, (clade_name, clade_gt) in enumerate(clades.iteritems()): print "estimating frequency of clade", clade_name, clade_gt freq, (tps, obs) = self.estimate_genotype_frequency(sub_aln, [(pos-1, aa) for pos, aa in clade_gt]) - clade_frequencies[clade_name] = list(np.round(logit_inv(freq.y),3)) + if freq is not None: + clade_frequencies[clade_name] = list(np.round(logit_inv(freq.y),3)) return clade_frequencies def estimate_sub_frequencies(self, node, all_dates, tip_to_date_index, threshold=50, region_name="global"): # extract time points and the subset of observations that fall in the clade. tps = all_dates[tip_to_date_index[node.tips]] - start_index = max(0,np.searchsorted(tps, time_interval[0])) - stop_index = min(np.searchsorted(tps, time_interval[1]), all_dates.shape[0]-1) + start_index = max(0,np.searchsorted(tps, self.time_interval[0])) + stop_index = min(np.searchsorted(tps, self.time_interval[1]), all_dates.shape[0]-1) tps = tps[start_index:stop_index] # we estimate frequencies of subclades, they will be multiplied by the # frequency of the parent node and corrected for the frequency of sister clades @@ -326,8 +336,8 @@ def estimate_sub_frequencies(self, node, all_dates, tip_to_date_index, threshold # make n pivots a year, interpolate frequencies # FIXME: adjust stiffness to total number of observations in a more robust manner - pivots = get_pivots(tps[0], tps[1]) - fe = frequency_estimator(zip(tps, obs), pivots=pivots, stiffness=flu_stiffness*len(all_dates)/2000.0, logit=True) + fe = frequency_estimator(zip(tps, obs), pivots=self.pivots, stiffness=self.stiffness*len(all_dates)/2000.0, + logit=True, **self.kwarks) fe.learn() # assign the frequency vector to the node @@ -352,7 +362,7 @@ def estimate_sub_frequencies(self, node, all_dates, tip_to_date_index, threshold child.logit_freq[region_name] = None # recursively repeat for subclades for child in node.child_nodes(): - estimate_sub_frequencies(child, all_dates, tip_to_date_index, threshold, region_name) + self.estimate_sub_frequencies(child, all_dates, tip_to_date_index, threshold, region_name) def estimate_tree_frequencies(self, threshold = 20, regions=None, region_name = None): ''' @@ -394,33 +404,49 @@ def estimate_tree_frequencies(self, threshold = 20, regions=None, region_name = elif region_name is None: region_name = ",".join(regions) # set the frequency of the root node to 1, the logit frequency to a large value - rootnode.pivots = get_pivots(time_interval[0], time_interval[1]) - rootnode.virus_count[region_name] = np.histogram(all_dates, bins = rootnode.pivots) - rootnode.freq[region_name] = np.ones_like(rootnode.pivots) - rootnode.logit_freq[region_name] = 10*np.ones_like(rootnode.pivots) + rootnode.pivots = self.pivots + rootnode.virus_count[region_name] = np.histogram(all_dates, bins = self.pivots)[0] + rootnode.freq[region_name] = np.ones_like(self.pivots) + rootnode.logit_freq[region_name] = 10*np.ones_like(self.pivots) # start estimating frequencies of subclades recursively - estimate_sub_frequencies(self, rootnode, all_dates, reverse_order, threshold = threshold, region_name = region_name) + self.estimate_sub_frequencies(rootnode, all_dates, reverse_order, threshold = threshold, region_name = region_name) - def all_mutations(self, threshold = 0.01): - 
self.mutation_frequencies = {} - for region_label, regions in region_list: + def all_mutation_frequencies(self, threshold = 0.01): + if not hasattr(self, 'nucleotide_frequencies'): + self.determine_variable_positions() + self.frequencies["mutations"]={} + for region_label, regions in self.aggregate_regions: print "--- "+"determining mutation frequencies in "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" - self.mutation_frequencies[region_label] = self.determine_mutation_frequencies(regions, threshold = threshold) + self.frequencies["mutations"][region_label] = self.determine_mutation_frequencies(regions, threshold = threshold) - def all_genotypes(self, threshold = 0.1): - self.gt_frequencies = {} - for region_label, regions in region_list: + def all_genotypes_frequencies(self, threshold = 0.1): + if not hasattr(self, 'nucleotide_frequencies'): + self.determine_variable_positions() + self.frequencies["genotypes"]={} + for region_label, regions in self.aggregate_regions: print "--- "+"determining genotype frequencies "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" - self.genotype_frequencies[region_label] = self.determine_genotype_frequencies(regions, threshold=threshold) + self.frequencies["genotypes"][region_label] = self.determine_genotype_frequencies(regions, threshold=threshold) - def all_clades(self): - clade_frequencies = {} - for region_label, regions in region_list: + def all_clade_frequencies(self, clades = None): + if not hasattr(self, 'nucleotide_frequencies'): + self.determine_variable_positions() + if clades is None: + if hasattr(self, "clade_designations"): + clades = self.clade_designations + else: + return + self.frequencies["clades"] = {} + for region_label, regions in self.aggregate_regions: print "--- "+"determining clade frequencies "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" - clade_frequencies[region_label] = self.determine_clade_frequencies(clades, regions=regions) + self.frequencies["clades"][region_label] = self.determine_clade_frequencies(clades, regions=regions) + + def all_tree_frequencies(self, threshold = 20): + for region_label, regions in self.aggregate_regions: + print "--- "+"determining tree frequencies "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" + self.estimate_tree_frequencies(threshold = threshold,regions=regions, region_name=region_label) def test(): import matplotlib.pyplot as plt @@ -449,47 +475,8 @@ def test(): plt.plot(fe.tps, r_avg, 'k', label = 'running avg') plt.legend(loc=2) -def main(tree_fname = 'data/tree_refine.json', clades=None, clades_freq = True, mutation_freq = True, tree_freq = True): - # load tree - from io_util import read_json - plot = debug - tree = json_to_dendropy(read_json(tree_fname)) - region_list = [("global", None), ("NA", ["NorthAmerica"]) , ("EU", ["Europe"]), - ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ] - - out_fname = 'data/genotype_frequencies.json' - gt_frequencies = {} - - if mutation_freq: - gt_frequencies["mutations"], relevant_pos = all_mutations(tree, region_list, plot) - write_json(gt_frequencies, out_fname, indent=None) - - gt_frequencies["genotypes"] = all_genotypes(tree, region_list, relevant_pos) - write_json(gt_frequencies, out_fname, indent=None) - - if clades_freq and clades is not None: - gt_frequencies["clades"] = all_clades(tree, clades, region_list, plot) - - if clades_freq or mutation_freq: - # round frequencies - for gt_type in gt_frequencies: - for reg in region_list: - for gt in gt_frequencies[gt_type][reg[0]]: - tmp = 
gt_frequencies[gt_type][reg[0]][gt] - gt_frequencies[gt_type][reg[0]][gt] = [round(x,3) for x in tmp] - - write_json(gt_frequencies, out_fname, indent=None) - - if tree_freq: - tree_out_fname = 'data/tree_frequencies.json' - for region_label, regions in region_list: - print "--- "+"adding frequencies to tree "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" - estimate_tree_frequencies(tree, threshold = 10, regions=regions, region_name=region_label) - write_json(dendropy_to_json(tree.seed_node), tree_out_fname, indent=None) - return tree_out_fname - if __name__=="__main__": - #test() - main() + test() + diff --git a/augur/src/process.py b/augur/src/process.py index cccb7289..51f05ae4 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -3,14 +3,16 @@ from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq import dendropy +from bernoulli_frequency import virus_frequencies from tree_util import delimit_newick import numpy as np -class process(object): +class process(virus_frequencies): """generic template class for processing virus sequences into trees""" def __init__(self, tree_fname = 'data/tree.pkl', virus_fname = 'data/virus.pkl', frequency_fname = 'data/frequency.pkl', auspice_frequency_fname ='../auspice/data/frequencies.json', auspice_sequences_fname='../auspice/data/sequences.json', auspice_tree_fname='../auspice/data/tree.json', min_freq = 0.01, **kwargs): + virus_frequencies.__init__(self, **kwargs) self.tree_fname = tree_fname self.virus_fname = virus_fname self.frequency_fname = frequency_fname @@ -79,6 +81,17 @@ def export_to_auspice(self, tree_fields = [], tree_pop_list = []): if hasattr(self, 'frequencies'): write_json(self.frequencies, self.auspice_frequency_fname) + # Write out metadata + print "Writing out metadata" + meta = {"updated": time.strftime("X%d %b %Y").replace('X0','X').replace('X','')} + if hasattr(self,"viruses_by_date_and_region"): + meta["regions"] = self.regions + meta["virus_stats"] = [ [str(y)+'-'+str(m)] + [row[reg] for reg in self.regions] + for y,m in sorted(self.viruses_by_date_and_region) ] + meta_fname = "../auspice/data/meta.json" + write_json(meta, meta_fname, indent=1) + + def align(self): ''' aligns viruses using mafft. 
produces temporary files and deletes those at the end @@ -211,26 +224,11 @@ def determine_variable_positions(self): self.consensus_aa = "".join(np.fromstring(self.aa_alphabet, 'S1')[np.argmax(self.aa_frequencies,axis=0)]) def estimate_frequencies(self, tasks = ['mutations','genotypes', 'clades', 'tree']): - import bernoulli_frequency as freq_est - plot=False - freq_est.flu_stiffness = config['frequency_stiffness'] - freq_est.time_interval = config['time_interval'] - freq_est.pivots_per_year = config['pivots_per_year'] - freq_est.relevant_pos_cutoff = 0.1 - - if 'mutations' in tasks or 'genotypes' in tasks: - self.frequencies['mutations'], relevant_pos = freq_est.all_mutations(self.tree, config['aggregate_regions'], - threshold = config['min_mutation_count'], plot=plot) + if 'mutations' in tasks: + self.all_mutation_frequencies() if 'genotypes' in tasks: - self.frequencies['genotypes'] = freq_est.all_genotypes(self.tree, config['aggregate_regions'], relevant_pos) + self.all_genotypes_frequencies() if 'clades' in tasks: - self.frequencies['clades'] = freq_est.all_clades(self.tree, config['clade_designations'], - config['aggregate_regions'], plot) - if any(x in tasks for x in ['mutations','clades', 'genotypes']): - write_json(self.frequencies, self.frequency_fname) - + self.all_clade_frequencies() if 'tree' in tasks: - for region_label, regions in config['aggregate_regions']: - print "--- "+"adding frequencies to tree "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" - freq_est.estimate_tree_frequencies(self.tree, threshold = 10, regions=regions, region_name=region_label) - + self.all_tree_frequencies() diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index 73588201..ffdc71cd 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -11,7 +11,7 @@ def __init__(self,cds = (0,None), max_length = 0.01, dt=1, **kwargs): ''' parameters: cds -- coding region - max_length -- maximal lenght of external branches + max_length -- maximal length of external branches dt -- time interval used to define the trunk of the tree ''' self.cds = cds diff --git a/augur/src/virus_clean.py b/augur/src/virus_clean.py index a84b199b..770f96b3 100644 --- a/augur/src/virus_clean.py +++ b/augur/src/virus_clean.py @@ -18,18 +18,31 @@ def __init__(self,n_std = 5, **kwargs): self.n_std = n_std def remove_insertions(self): + ''' + remove all columns from the alignment in which the outgroup is gapped + ''' outgroup_ok = np.array(self.sequence_lookup[self.outgroup['strain']])!='-' for seq in self.viruses: seq.seq = Seq("".join(np.array(seq.seq)[outgroup_ok]).upper()) def clean_gaps(self): + ''' + remove viruses with gaps -- not part of the standard pipeline + ''' self.viruses = filter(lambda x: '-' in x.seq, self.viruses) def clean_ambiguous(self): + ''' + substitute all ambiguous characters with '-', + ancestral inference will interpret this as missing data + ''' for v in self.viruses: v.seq = Seq(re.sub(r'[BDEFHIJKLMNOPQRSUVWXYZ]', '-',str(v.seq))) def unique_date(self): + ''' + add a unique numerical date to each leaf. 
uniqueness is achieved adding a small number + ''' from date_util import numerical_date og = self.sequence_lookup[self.outgroup['strain']] og.num_date = numerical_date(og.date) @@ -51,14 +64,15 @@ def clean_distances(self): times = self.times_from_outgroup() distances = self.distance_from_outgroup() slope, intercept, r_value, p_value, std_err = stats.linregress(times, distances) - residuals = slope*times - distances + residuals = slope*times + intercept - distances r_sd = residuals.std() if self.verbose: print "\tslope: " + str(slope) print "\tr: " + str(r_value) print "\tresiduals sd: " + str(r_sd) new_viruses = [] - for (v,r) in izip(self.viruses,residuals): # filter viruses more than 5 sds up or down + for (v,r) in izip(self.viruses,residuals): + # filter viruses more than n_std standard devitations up or down if np.abs(r) Date: Sun, 1 Mar 2015 23:57:04 +0100 Subject: [PATCH 33/48] implemented the preferential inclusion of HI strains. --- augur/src/H3N2_process.py | 21 +++-- augur/src/nextflu_process.py | 144 ----------------------------------- 2 files changed, 16 insertions(+), 149 deletions(-) delete mode 100644 augur/src/nextflu_process.py diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index 258719aa..d20af755 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -1,4 +1,4 @@ -import time, argparse,re, sys +import time, argparse,re, sys,os sys.path.append('src') from virus_filter import flu_filter from virus_clean import virus_clean @@ -16,10 +16,12 @@ virus_config = { # data source and sequence parsing/cleaning/processing 'virus':'H3N2', - 'alignment_file':'data/20150222_all_H3N2_HA1.fasta', + 'alignment_file':'data/gisaid_epiflu_sequence.fasta', 'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, + #'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, 'outgroup':'A/Beijing/32/1992', - 'force_include':'source-data/HI_strains.txt', + #'force_include':'source-data/HI_strains.txt', + 'force_include_all':False, 'max_global':True, # sample as evenly as possible from different geographic regions 'cds':[48,-1], # define the HA1 start i n 0 numbering 'n_std':5, # standard deviations from clock @@ -236,7 +238,11 @@ def add_H3N2_attributes(self): class H3N2_process(process, H3N2_filter, H3N2_clean, H3N2_refine): """docstring for H3N2_process, H3N2_filter""" - def __init__(self,verbose = 0, **kwargs): + def __init__(self,verbose = 0, force_include = None, + force_include_all = False, max_global= True, **kwargs): + self.force_include = force_include + self.force_include_all = force_include_all + self.max_global = max_global process.__init__(self, **kwargs) H3N2_filter.__init__(self,**kwargs) H3N2_clean.__init__(self,**kwargs) @@ -246,7 +252,12 @@ def __init__(self,verbose = 0, **kwargs): def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwargs): print "--- Virus filtering at " + time.strftime("%H:%M:%S") + " ---" self.filter() - self.subsample(years_back, viruses_per_month) + if self.force_include is not None and os.path.isfile(self.force_include): + with open(self.force_include) as infile: + forced_strains = [line.strip().lower() for line in infile] + self.subsample(years_back, viruses_per_month, + prioritize=forced_strains, all_priority=self.force_include_all, + region_specific = self.max_global) self.align() # -> self.viruses is an alignment object print "--- Clean at " + time.strftime("%H:%M:%S") + " ---" self.clean() # -> every node as a numerical date diff --git a/augur/src/nextflu_process.py 
b/augur/src/nextflu_process.py deleted file mode 100644 index 18577304..00000000 --- a/augur/src/nextflu_process.py +++ /dev/null @@ -1,144 +0,0 @@ -import time, argparse,os,subprocess, shutil, glob, sys -sys.path.append('./src') -from Bio import SeqIO -from io_util import write_json, read_json, write_fasta, read_fasta -from tree_util import dendropy_to_json, json_to_dendropy, delimit_newick -import dendropy - -class nextflu(object): - def __init__(self): - self.viruses = None - self.tree = None - self.frequencies = {} - self.initial_virus_fname = 'data/virus_ingest.json' - self.clean_virus_fname = 'data/virus_clean.json' - self.intermediate_tree_fname = 'data/tree_refine.json' - self.frequency_fname = 'data/frequencies.json' - - def load_from_file(self, tree_fname=None, virus_fname = None): - if tree_fname is None: tree_fname = self.intermediate_tree_fname - if os.path.isfile(tree_fname): - self.tree = json_to_dendropy(read_json(tree_fname)) - if virus_fname is None: virus_fname = self.clean_virus_fname - if os.path.isfile(virus_fname): - self.viruses = read_json(virus_fname) - if os.path.isfile(self.frequency_fname): - self.frequencies = read_json(self.frequency_fname) - - def load_viruses(self, aln_fname = None, years_back=3, viruses_per_month=50): - if config['virus']: - from H3N2_filter import H3N2_filter as virus_filter - fasta_fields = config['fasta_fields'] - if 'force_include' in config and os.path.isfile(config['force_include']): - with open(config['force_include']) as force_include_file: - force_include_strains = [line.strip() for line in force_include_file] - else: - force_include_strains = [] - else: - from virus_filter import virus_filter as virus_filter - fasta_fields = {0:'strain'} - if aln_fname is None: aln_fname = config['alignment_file'] - - my_filter = virus_filter(aln_fname, fasta_fields) - my_filter.filter() - my_filter.subsample(years_back, viruses_per_month, prioritize = force_include_strains, - all_priority = True, region_specific=config['max_global']) - - self.viruses = my_filter.virus_subsample - write_json(self.viruses, self.initial_virus_fname) - - def clean_viruses(self): - import virus_clean - self.viruses = virus_clean.main(self.viruses) - write_json(self.viruses, self.clean_virus_fname) - - def align(self): - import virus_align - self.viruses = virus_align.main(self.viruses) - out_fname = 'data/virus_align.json' - write_json(self.viruses, out_fname) - - def infer_tree(self, raxml_time_limit = 1.0): - import tree_infer - tree_fname = tree_infer.main(self.viruses, raxml_time_limit, config['outgroup']) - delimit_newick(tree_fname, "temp.newick") - self.tree = dendropy.Tree.get_from_path("temp.newick", "newick") - os.remove('temp.newick') - - def infer_ancestral(self, virus_fname = None): - import tree_ancestral - self.tree = tree_ancestral.main(self.tree, self.viruses) - - def refine_tree(self): - import tree_refine - tree_refine.main(self.tree, self.viruses, config['outgroup'], config['cds']) - write_json(dendropy_to_json(self.tree.seed_node), self.intermediate_tree_fname) - - def estimate_frequencies(self, tasks = ['mutations','genotypes', 'clades', 'tree']): - import bernoulli_frequency as freq_est - plot=False - freq_est.flu_stiffness = config['frequency_stiffness'] - freq_est.time_interval = config['time_interval'] - freq_est.pivots_per_year = config['pivots_per_year'] - freq_est.relevant_pos_cutoff = 0.1 - - if 'mutations' in tasks or 'genotypes' in tasks: - self.frequencies['mutations'], relevant_pos = freq_est.all_mutations(self.tree, 
config['aggregate_regions'], - threshold = config['min_mutation_count'], plot=plot) - if 'genotypes' in tasks: - self.frequencies['genotypes'] = freq_est.all_genotypes(self.tree, config['aggregate_regions'], relevant_pos) - if 'clades' in tasks: - self.frequencies['clades'] = freq_est.all_clades(self.tree, config['clade_designations'], - config['aggregate_regions'], plot) - if any(x in tasks for x in ['mutations','clades', 'genotypes']): - write_json(self.frequencies, self.frequency_fname) - - if 'tree' in tasks: - for region_label, regions in config['aggregate_regions']: - print "--- "+"adding frequencies to tree "+region_label+ " " + time.strftime("%H:%M:%S") + " ---" - freq_est.estimate_tree_frequencies(self.tree, threshold = 10, regions=regions, region_name=region_label) - - def export_to_auspice(self): - import streamline - tree_json = dendropy_to_json(self.tree.seed_node) - streamline.main(tree_json, self.frequencies) - - def run(self,years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwargs): - self.load_viruses(years_back=years_back, viruses_per_month=viruses_per_month) - self.align() - self.clean_viruses() - self.infer_tree(raxml_time_limit = raxml_time_limit) - self.infer_ancestral() - self.refine_tree() - self.estimate_frequencies() - self.export_to_auspice() - -if __name__=="__main__": - parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare of web visualization') - parser.add_argument('-y', '--years_back', type = int, default=3, help='number of past years to sample sequences from') - parser.add_argument('-v', '--viruses_per_month', type = int, default = 50, help='number of viruses sampled per month') - parser.add_argument('-r', '--raxml_time_limit', type = float, default = 1.0, help='number of hours raxml is run') - parser.add_argument('--config', default = "nextflu_config.py" , type=str, help ="config file") - parser.add_argument('--test', default = False, action="store_true", help ="don't run the pipeline") - parser.add_argument('--virus', default = False, action="store_true", help ="only select viruses") - parser.add_argument('--tree', default = False, action="store_true", help ="only build tree") - parser.add_argument('--frequencies', default = False, action="store_true", help ="only estimate frequencies") - params = parser.parse_args() - - execfile(params.config) - print config - - my_nextflu = nextflu() - my_nextflu.load_from_file() - if params.virus: - my_nextflu.load_viruses(years_back=params.years_back, viruses_per_month = params.viruses_per_month) - my_nextflu.align() - my_nextflu.clean_viruses() - elif params.tree: - my_nextflu.infer_tree(raxml_time_limit=params.raxml_time_limit) - my_nextflu.infer_ancestral() - my_nextflu.refine_tree() - elif params.frequencies: - my_nextflu.estimate_frequencies() - elif not params.test: - my_nextflu.run(**params.__dict__) From 37a39addee508508f04efced92ae89f73c71bd1a Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 2 Mar 2015 00:09:46 +0100 Subject: [PATCH 34/48] export of virus statistics to meta.json --- augur/src/H3N2_process.py | 2 ++ augur/src/process.py | 9 ++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index d20af755..597ec343 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -272,9 +272,11 @@ def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwa self.dump() print "--- Estimating frequencies at " + time.strftime("%H:%M:%S") + " ---" + 
self.determine_variable_positions() self.estimate_frequencies() self.dump() + self.temporal_regional_statistics() self.export_to_auspice(tree_fields = ['ep', 'ne', 'rb']) if __name__=="__main__": diff --git a/augur/src/process.py b/augur/src/process.py index 51f05ae4..f2156c18 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -84,13 +84,12 @@ def export_to_auspice(self, tree_fields = [], tree_pop_list = []): # Write out metadata print "Writing out metadata" meta = {"updated": time.strftime("X%d %b %Y").replace('X0','X').replace('X','')} - if hasattr(self,"viruses_by_date_and_region"): + if hasattr(self,"date_region_count"): meta["regions"] = self.regions - meta["virus_stats"] = [ [str(y)+'-'+str(m)] + [row[reg] for reg in self.regions] - for y,m in sorted(self.viruses_by_date_and_region) ] + meta["virus_stats"] = [ [str(y)+'-'+str(m)] + [self.date_region_count[(y,m)][reg] for reg in self.regions] + for y,m in sorted(self.date_region_count.keys()) ] meta_fname = "../auspice/data/meta.json" - write_json(meta, meta_fname, indent=1) - + write_json(meta, meta_fname, indent=0) def align(self): ''' From dc49e192bbfe9df752f11dc3fa19d6c477f8be26 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 2 Mar 2015 00:14:50 +0100 Subject: [PATCH 35/48] set forced_strains = [] if not desired --- augur/src/H3N2_process.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index 597ec343..ae2c2d91 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -255,6 +255,8 @@ def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwa if self.force_include is not None and os.path.isfile(self.force_include): with open(self.force_include) as infile: forced_strains = [line.strip().lower() for line in infile] + else: + forced_strains = [] self.subsample(years_back, viruses_per_month, prioritize=forced_strains, all_priority=self.force_include_all, region_specific = self.max_global) From ee1d3d9ba279a7fc990d2ff4afa910b753db5213 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 2 Mar 2015 06:50:34 +0100 Subject: [PATCH 36/48] * added start and stop flags to run parts of the pipeline. * fixed an issue with the clock-cleaning. some times the std deviations is very large due to extreme outliers. using interquartile distance now instead. 
* put layout in a separate function from attribute adding --- augur/src/H3N2_process.py | 78 ++++++++++++++++++++++----------------- augur/src/tree_refine.py | 12 +++--- augur/src/virus_clean.py | 16 ++++---- 3 files changed, 59 insertions(+), 47 deletions(-) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index ae2c2d91..6991e7f0 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -24,7 +24,7 @@ 'force_include_all':False, 'max_global':True, # sample as evenly as possible from different geographic regions 'cds':[48,-1], # define the HA1 start i n 0 numbering - 'n_std':5, # standard deviations from clock + 'n_iqd':3, # standard deviations from clock # frequency estimation parameters 'aggregate_regions': [ ("global", None), ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), @@ -53,8 +53,8 @@ def __init__(self,min_length = 987, **kwargs): parameters min_length -- minimal length for a sequence to be acceptable ''' - self.min_length = min_length flu_filter.__init__(self, **kwargs) + self.min_length = min_length self.vaccine_strains =[ { "strain": "A/Wisconsin/67/2005", @@ -250,36 +250,47 @@ def __init__(self,verbose = 0, force_include = None, self.verbose = verbose def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwargs): - print "--- Virus filtering at " + time.strftime("%H:%M:%S") + " ---" - self.filter() - if self.force_include is not None and os.path.isfile(self.force_include): - with open(self.force_include) as infile: - forced_strains = [line.strip().lower() for line in infile] + all_steps = ['filter', 'align', 'clean', 'tree', 'ancestral', 'refine', 'frequencies', 'export'] + steps = all_steps[all_steps.index(kwargs['start']):(all_steps.index(kwargs['stop'])+1)] + if 'filter' in steps: + print "--- Virus filtering at " + time.strftime("%H:%M:%S") + " ---" + self.filter() + if self.force_include is not None and os.path.isfile(self.force_include): + with open(self.force_include) as infile: + forced_strains = [line.strip().lower() for line in infile] + else: + forced_strains = [] + self.subsample(years_back, viruses_per_month, + prioritize=forced_strains, all_priority=self.force_include_all, + region_specific = self.max_global) + self.dump() else: - forced_strains = [] - self.subsample(years_back, viruses_per_month, - prioritize=forced_strains, all_priority=self.force_include_all, - region_specific = self.max_global) - self.align() # -> self.viruses is an alignment object - print "--- Clean at " + time.strftime("%H:%M:%S") + " ---" - self.clean() # -> every node as a numerical date - self.dump() - print "--- Tree infer at " + time.strftime("%H:%M:%S") + " ---" - self.infer_tree(raxml_time_limit) # -> self has a tree - self.dump() - print "--- Infer ancestral sequences " + time.strftime("%H:%M:%S") + " ---" - self.infer_ancestral() # -> every node has a sequence - print "--- Tree refine at " + time.strftime("%H:%M:%S") + " ---" - self.refine() - self.dump() - - print "--- Estimating frequencies at " + time.strftime("%H:%M:%S") + " ---" - self.determine_variable_positions() - self.estimate_frequencies() - self.dump() - - self.temporal_regional_statistics() - self.export_to_auspice(tree_fields = ['ep', 'ne', 'rb']) + self.load() + if 'align' in steps: + self.align() # -> self.viruses is an alignment object + if 'clean' in steps: + print "--- Clean at " + time.strftime("%H:%M:%S") + " ---" + self.clean() # -> every node as a numerical date + self.dump() + if 'tree' in steps: + print "--- Tree infer at " + time.strftime("%H:%M:%S") + 
" ---" + self.infer_tree(raxml_time_limit) # -> self has a tree + self.dump() + if 'ancestral' in steps: + print "--- Infer ancestral sequences " + time.strftime("%H:%M:%S") + " ---" + self.infer_ancestral() # -> every node has a sequence + if 'refine' in steps: + print "--- Tree refine at " + time.strftime("%H:%M:%S") + " ---" + self.refine() + self.dump() + if 'frequencies' in steps: + print "--- Estimating frequencies at " + time.strftime("%H:%M:%S") + " ---" + self.determine_variable_positions() + self.estimate_frequencies() + self.dump() + if 'export' in steps: + self.temporal_regional_statistics() + self.export_to_auspice(tree_fields = ['ep', 'ne', 'rb']) if __name__=="__main__": parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare of web visualization') @@ -288,9 +299,8 @@ def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwa parser.add_argument('-r', '--raxml_time_limit', type = float, default = 1.0, help='number of hours raxml is run') parser.add_argument('--config', default = "nextflu_config.py" , type=str, help ="config file") parser.add_argument('--test', default = False, action="store_true", help ="don't run the pipeline") - parser.add_argument('--virus', default = False, action="store_true", help ="only select viruses") - parser.add_argument('--tree', default = False, action="store_true", help ="only build tree") - parser.add_argument('--frequencies', default = False, action="store_true", help ="only estimate frequencies") + parser.add_argument('--start', default = 'filter', type = str, help ="start pipeline at virus selection") + parser.add_argument('--stop', default = 'export', type=str, help ="run to end") params = parser.parse_args() params.cds = (48,None) diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index ffdc71cd..fd8bdcff 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -30,6 +30,7 @@ def refine_generic(self): self.translate_all() self.add_node_attributes() self.reduce() + self.layout() self.define_trunk() # make an amino acid aligment @@ -49,10 +50,10 @@ def remove_outgroup(self): print "removed outgroup",self.outgroup['strain'] else: print "outgroup",self.outgroup['strain'], "not found" - if len(self.tree.seed_node.child_nodes())==1: - self.tree.seed_node = self.tree.seed_node.child_nodes()[0] - self.tree.seed_node.parent_node = None - self.tree.seed_node.edge_length = 0.002 +# if len(self.tree.seed_node.child_nodes())==1: +# self.tree.seed_node = self.tree.seed_node.child_nodes()[0] +# self.tree.seed_node.parent_node = None + self.tree.seed_node.edge_length = 0.001 def collapse(self): """Collapse edges without mutations to polytomies""" @@ -98,7 +99,7 @@ def get_yvalue(self, node): if node.child_nodes(): return np.mean([n.yvalue for n in node.child_nodes()]) - def add_node_attributes(self): + def layout(self): """Add clade, xvalue, yvalue, mutation and trunk attributes to all nodes in tree""" clade = 0 yvalue = 0 @@ -112,6 +113,7 @@ def add_node_attributes(self): node.yvalue = self.get_yvalue(node) node.xvalue = node.distance_from_root() + def add_node_attributes(self): for v in self.viruses: if v.strain in self.node_lookup: node = self.node_lookup[v.strain] diff --git a/augur/src/virus_clean.py b/augur/src/virus_clean.py index 770f96b3..9c32c882 100644 --- a/augur/src/virus_clean.py +++ b/augur/src/virus_clean.py @@ -10,12 +10,12 @@ class virus_clean(object): """docstring for virus_clean""" - def __init__(self,n_std = 5, **kwargs): + def 
__init__(self,n_iqd = 5, **kwargs): ''' parameters - n_std -- number of standard deviations accepted in molecular clock filter + n_std -- number of interquartile distances accepted in molecular clock filter ''' - self.n_std = n_std + self.n_iqd = n_iqd def remove_insertions(self): ''' @@ -52,12 +52,12 @@ def unique_date(self): def times_from_outgroup(self): self.unique_date() outgroup_date = self.sequence_lookup[self.outgroup['strain']].num_date - return np.array([x.num_date-outgroup_date for x in self.viruses]) + return np.array([x.num_date-outgroup_date for x in self.viruses if x.strain]) def distance_from_outgroup(self): from seq_util import hamming_distance outgroup_seq = self.sequence_lookup[self.outgroup['strain']].seq - return np.array([hamming_distance(x.seq, outgroup_seq) for x in self.viruses]) + return np.array([hamming_distance(x.seq, outgroup_seq) for x in self.viruses if x.strain]) def clean_distances(self): """Remove viruses that don't follow a loose clock """ @@ -65,15 +65,15 @@ def clean_distances(self): distances = self.distance_from_outgroup() slope, intercept, r_value, p_value, std_err = stats.linregress(times, distances) residuals = slope*times + intercept - distances - r_sd = residuals.std() + r_iqd = stats.scoreatpercentile(residuals,75) - stats.scoreatpercentile(residuals,25) if self.verbose: print "\tslope: " + str(slope) print "\tr: " + str(r_value) - print "\tresiduals sd: " + str(r_sd) + print "\tresiduals iqd: " + str(r_iqd) new_viruses = [] for (v,r) in izip(self.viruses,residuals): # filter viruses more than n_std standard devitations up or down - if np.abs(r)1: From 5b783fb5f60c4e89408795564d6275d09d164bb1 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 2 Mar 2015 07:52:00 +0100 Subject: [PATCH 37/48] added aa alignment file dump --- augur/src/H3N2_process.py | 6 +++--- augur/src/process.py | 9 ++++++++- augur/src/tree_refine.py | 2 ++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index 6991e7f0..db5150b8 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -27,11 +27,11 @@ 'n_iqd':3, # standard deviations from clock # frequency estimation parameters - 'aggregate_regions': [ ("global", None), ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), - ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], + 'aggregate_regions': [ ("global", None)],# ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), +# ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], 'frequency_stiffness':10.0, 'time_interval':(2012.0, 2015.1), - 'pivots_per_year':12.0, + 'pivots_per_year':6.0, 'min_freq':10, # define relevant clades in canonical HA1 numbering (+1) 'clade_designations': { "3c3.a":[(128,'A'), (142,'G'), (159,'S')], diff --git a/augur/src/process.py b/augur/src/process.py index f2156c18..4241e883 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -9,13 +9,14 @@ class process(virus_frequencies): """generic template class for processing virus sequences into trees""" - def __init__(self, tree_fname = 'data/tree.pkl', virus_fname = 'data/virus.pkl', + def __init__(self, tree_fname = 'data/tree.pkl', virus_fname = 'data/virus.pkl', aa_seq_fname = 'data/aa_seq.pkl', frequency_fname = 'data/frequency.pkl', auspice_frequency_fname ='../auspice/data/frequencies.json', auspice_sequences_fname='../auspice/data/sequences.json', auspice_tree_fname='../auspice/data/tree.json', min_freq = 0.01, **kwargs): virus_frequencies.__init__(self, **kwargs) 
self.tree_fname = tree_fname self.virus_fname = virus_fname self.frequency_fname = frequency_fname + self.aa_seq_fname = aa_seq_fname self.min_freq = min_freq self.auspice_tree_fname = auspice_tree_fname self.auspice_sequences_fname = auspice_sequences_fname @@ -32,6 +33,9 @@ def dump(self): if hasattr(self, 'frequencies'): with open(self.frequency_fname, 'w') as outfile: cPickle.dump(self.frequencies, outfile) + if hasattr(self, 'aa_aln'): + with open(self.aa_seq_fname, 'w') as outfile: + cPickle.dump(self.aa_aln, outfile) def load(self): import cPickle @@ -44,6 +48,9 @@ def load(self): if os.path.isfile(self.frequency_fname): with open(self.frequency_fname, 'r') as infile: self.frequencies = cPickle.load(infile) + if os.path.isfile(self.aa_seq_fname): + with open(self.aa_seq_fname, 'r') as infile: + self.aa_aln = cPickle.load(infile) def export_to_auspice(self, tree_fields = [], tree_pop_list = []): from tree_util import dendropy_to_json, all_descendants diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index fd8bdcff..59529245 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -109,6 +109,8 @@ def layout(self): if node.is_leaf(): node.yvalue = yvalue yvalue += 1 + else: + node.yvalue = 0 for node in self.tree.postorder_node_iter(): node.yvalue = self.get_yvalue(node) node.xvalue = node.distance_from_root() From ccb123edbf3398c596f374149799ee2749b17f35 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 2 Mar 2015 07:57:33 +0100 Subject: [PATCH 38/48] fixed issue with yvalues --- augur/src/tree_refine.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index 59529245..ec6d1e48 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -94,10 +94,11 @@ def translate_all(self): def get_yvalue(self, node): """Return y location based on recursive mean of daughter locations""" - if hasattr(node, 'yvalue'): + if node.is_leaf(): return node.yvalue - if node.child_nodes(): - return np.mean([n.yvalue for n in node.child_nodes()]) + else: + if node.child_nodes(): + return np.mean([n.yvalue for n in node.child_nodes()]) def layout(self): """Add clade, xvalue, yvalue, mutation and trunk attributes to all nodes in tree""" @@ -109,8 +110,6 @@ def layout(self): if node.is_leaf(): node.yvalue = yvalue yvalue += 1 - else: - node.yvalue = 0 for node in self.tree.postorder_node_iter(): node.yvalue = self.get_yvalue(node) node.xvalue = node.distance_from_root() From 9520fa998f7dab3487482c87150eaefc0ef2658d Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 2 Mar 2015 18:15:06 +0100 Subject: [PATCH 39/48] changed file name for dumps to accect a common prefix for all files. 
added a steps argument to run which is fed in from main --- augur/src/H3N2_process.py | 12 +++++++----- augur/src/process.py | 11 +++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index db5150b8..acfc3f8b 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -249,9 +249,7 @@ def __init__(self,verbose = 0, force_include = None, H3N2_refine.__init__(self,**kwargs) self.verbose = verbose - def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwargs): - all_steps = ['filter', 'align', 'clean', 'tree', 'ancestral', 'refine', 'frequencies', 'export'] - steps = all_steps[all_steps.index(kwargs['start']):(all_steps.index(kwargs['stop'])+1)] + def run(self, steps, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0): if 'filter' in steps: print "--- Virus filtering at " + time.strftime("%H:%M:%S") + " ---" self.filter() @@ -293,17 +291,19 @@ def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwa self.export_to_auspice(tree_fields = ['ep', 'ne', 'rb']) if __name__=="__main__": + all_steps = ['filter', 'align', 'clean', 'tree', 'ancestral', 'refine', 'frequencies', 'export'] parser = argparse.ArgumentParser(description='Process virus sequences, build tree, and prepare of web visualization') parser.add_argument('-y', '--years_back', type = int, default=3, help='number of past years to sample sequences from') parser.add_argument('-v', '--viruses_per_month', type = int, default = 50, help='number of viruses sampled per month') parser.add_argument('-r', '--raxml_time_limit', type = float, default = 1.0, help='number of hours raxml is run') - parser.add_argument('--config', default = "nextflu_config.py" , type=str, help ="config file") + parser.add_argument('--prefix', type = str, default = 'data/', help='path+prefix of file dumps') parser.add_argument('--test', default = False, action="store_true", help ="don't run the pipeline") parser.add_argument('--start', default = 'filter', type = str, help ="start pipeline at virus selection") parser.add_argument('--stop', default = 'export', type=str, help ="run to end") params = parser.parse_args() params.cds = (48,None) + steps = all_steps[all_steps.index(params.start):(all_steps.index(params.stop)+1)] # add all arguments to virus_config (possibly overriding) virus_config.update(params.__dict__) # pass all these arguments to the processor: will be passed down as kwargs through all classes @@ -311,4 +311,6 @@ def run(self, years_back=3, viruses_per_month=50, raxml_time_limit = 1.0, **kwa if params.test: myH3N2.load() else: - myH3N2.run(**virus_config) + myH3N2.run(steps, years_back=virus_config['years_back'], + viruses_per_month = virus_config['viruses_per_month'], + raxml_time_limit = virus_config['raxml_time_limit']) diff --git a/augur/src/process.py b/augur/src/process.py index 4241e883..76971c0d 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -9,14 +9,13 @@ class process(virus_frequencies): """generic template class for processing virus sequences into trees""" - def __init__(self, tree_fname = 'data/tree.pkl', virus_fname = 'data/virus.pkl', aa_seq_fname = 'data/aa_seq.pkl', - frequency_fname = 'data/frequency.pkl', auspice_frequency_fname ='../auspice/data/frequencies.json', + def __init__(self, prefix = 'data/', auspice_frequency_fname ='../auspice/data/frequencies.json', auspice_sequences_fname='../auspice/data/sequences.json', auspice_tree_fname='../auspice/data/tree.json', min_freq = 
0.01, **kwargs): virus_frequencies.__init__(self, **kwargs) - self.tree_fname = tree_fname - self.virus_fname = virus_fname - self.frequency_fname = frequency_fname - self.aa_seq_fname = aa_seq_fname + self.tree_fname = prefix+'tree.pkl' + self.virus_fname = prefix+'virus.pkl' + self.frequency_fname = prefix+'frequencies.pkl' + self.aa_seq_fname = prefix+'aa_seq.pkl' self.min_freq = min_freq self.auspice_tree_fname = auspice_tree_fname self.auspice_sequences_fname = auspice_sequences_fname From af481250ae06ec044a47d4dbe8fabe9f821907ba Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 2 Mar 2015 21:46:16 +0100 Subject: [PATCH 40/48] added node_lookup and sequence_lookup to the loading --- augur/src/process.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/augur/src/process.py b/augur/src/process.py index 76971c0d..71234947 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -41,9 +41,12 @@ def load(self): if os.path.isfile(self.tree_fname): with open(self.tree_fname, 'r') as infile: self.tree = cPickle.load(infile) + self.node_lookup = {l.strain:l for l in self.tree.leaf_iter()} + self.node_lookup.update({node.strain.lower():node for node in self.tree.leaf_iter()}) if os.path.isfile(self.virus_fname): with open(self.virus_fname, 'r') as infile: self.viruses = cPickle.load(infile) + self.sequence_lookup = {v.strain:v for v in self.viruses} if os.path.isfile(self.frequency_fname): with open(self.frequency_fname, 'r') as infile: self.frequencies = cPickle.load(infile) From 1a1e741cbbc6f4fc7b3116abded8a7196b876e71 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 3 Mar 2015 01:33:04 +0100 Subject: [PATCH 41/48] caught unfinished tree in loading -- only refined trees have strain attributes used to construct the node look-up --- augur/src/process.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/augur/src/process.py b/augur/src/process.py index 71234947..ab2a8da3 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -41,12 +41,18 @@ def load(self): if os.path.isfile(self.tree_fname): with open(self.tree_fname, 'r') as infile: self.tree = cPickle.load(infile) - self.node_lookup = {l.strain:l for l in self.tree.leaf_iter()} - self.node_lookup.update({node.strain.lower():node for node in self.tree.leaf_iter()}) + try: + self.node_lookup = {l.strain:l for l in self.tree.leaf_iter()} + self.node_lookup.update({node.strain.lower():node for node in self.tree.leaf_iter()}) + except: + pass if os.path.isfile(self.virus_fname): with open(self.virus_fname, 'r') as infile: self.viruses = cPickle.load(infile) - self.sequence_lookup = {v.strain:v for v in self.viruses} + try: + self.sequence_lookup = {v.strain:v for v in self.viruses} + except: + pass if os.path.isfile(self.frequency_fname): with open(self.frequency_fname, 'r') as infile: self.frequencies = cPickle.load(infile) From d923d8498600273ded156a4a2295b9e8621af277 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 3 Mar 2015 03:25:29 +0100 Subject: [PATCH 42/48] increased recursion limit --- augur/src/H3N2_process.py | 1 + 1 file changed, 1 insertion(+) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index acfc3f8b..2a0dcec7 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -1,5 +1,6 @@ import time, argparse,re, sys,os sys.path.append('src') +sys.setrecursionlimit(10000) # needed since we are dealing with large trees from virus_filter import flu_filter from virus_clean import virus_clean from tree_refine import tree_refine From 
feae7f5737ab2acdcbb3e86f5310a34ae36dfeb6 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 3 Mar 2015 07:20:22 +0100 Subject: [PATCH 43/48] defined alphabets early, checked outgroup removal, seems ok --- augur/src/process.py | 4 ++-- augur/src/tree_refine.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/augur/src/process.py b/augur/src/process.py index ab2a8da3..e8fe427b 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -20,6 +20,8 @@ def __init__(self, prefix = 'data/', auspice_frequency_fname ='../auspice/data/f self.auspice_tree_fname = auspice_tree_fname self.auspice_sequences_fname = auspice_sequences_fname self.auspice_frequency_fname = auspice_frequency_fname + self.nuc_alphabet = 'ACGT-N' + self.aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY*X' def dump(self): import cPickle @@ -219,7 +221,6 @@ def determine_variable_positions(self): self.variable_aa ''' aln_array = np.array(self.viruses) - self.nuc_alphabet = 'ACGT-N' self.nucleoties_frequencies = np.zeros((len(self.nuc_alphabet),aln_array.shape[1])) for ni,nuc in enumerate(self.nuc_alphabet): self.nucleoties_frequencies[ni,:]=(aln_array==nuc).mean(axis=0) @@ -229,7 +230,6 @@ def determine_variable_positions(self): if hasattr(self, 'aa_aln'): aln_array = np.array(self.aa_aln) - self.aa_alphabet = 'ACDEFGHIKLMNPQRSTVWY*X' self.aa_frequencies = np.zeros((len(self.aa_alphabet),aln_array.shape[1])) for ai,aa in enumerate(self.aa_alphabet): self.aa_frequencies[ai,:]=(aln_array==aa).mean(axis=0) diff --git a/augur/src/tree_refine.py b/augur/src/tree_refine.py index ec6d1e48..5913189b 100644 --- a/augur/src/tree_refine.py +++ b/augur/src/tree_refine.py @@ -50,9 +50,10 @@ def remove_outgroup(self): print "removed outgroup",self.outgroup['strain'] else: print "outgroup",self.outgroup['strain'], "not found" -# if len(self.tree.seed_node.child_nodes())==1: -# self.tree.seed_node = self.tree.seed_node.child_nodes()[0] -# self.tree.seed_node.parent_node = None + if len(self.tree.seed_node.child_nodes())==1: + print "ROOT had one child only, moving root up!" 
+ self.tree.seed_node = self.tree.seed_node.child_nodes()[0] + self.tree.seed_node.parent_node = None self.tree.seed_node.edge_length = 0.001 def collapse(self): From 3f1100e42f9132d18b931d408bbbb3d05b72adb1 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Tue, 3 Mar 2015 23:20:47 +0100 Subject: [PATCH 44/48] fixed some default arguments, moved path addition (only necessary for interactive sessions) and the higher recursion limit to process.py --- augur/src/H3N2_process.py | 20 +++++++++----------- augur/src/process.py | 4 +++- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index 2a0dcec7..2441619b 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -1,6 +1,4 @@ -import time, argparse,re, sys,os -sys.path.append('src') -sys.setrecursionlimit(10000) # needed since we are dealing with large trees +import time, argparse,re,os from virus_filter import flu_filter from virus_clean import virus_clean from tree_refine import tree_refine @@ -12,28 +10,27 @@ from itertools import izip epitope_mask = np.fromstring("0000000000000000000000000000000000000000000011111011011001010011000100000001001011110011100110101000001100000100000001000110101011111101011010111110001010011111000101011011111111010010001111101110111001010001110011111111000000111110000000101010101110000000000011100100000001011011100000000000001001011000110111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000", dtype='S1') - virus_config = { # data source and sequence parsing/cleaning/processing 'virus':'H3N2', 'alignment_file':'data/gisaid_epiflu_sequence.fasta', - 'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, + 'fasta_fields':{0:'strain', 1:'accession', 3:'passage', 5:'date' }, #'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, 'outgroup':'A/Beijing/32/1992', #'force_include':'source-data/HI_strains.txt', 'force_include_all':False, 'max_global':True, # sample as evenly as possible from different geographic regions - 'cds':[48,-1], # define the HA1 start i n 0 numbering + 'cds':[48,None], # define the HA1 start i n 0 numbering 'n_iqd':3, # standard deviations from clock # frequency estimation parameters - 'aggregate_regions': [ ("global", None)],# ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), -# ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], + 'aggregate_regions': [ ("global", None)], ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), + ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], 'frequency_stiffness':10.0, 'time_interval':(2012.0, 2015.1), - 'pivots_per_year':6.0, - 'min_freq':10, + 'pivots_per_year':12.0, + 'min_freq':0.01, # define relevant clades in canonical HA1 numbering (+1) 'clade_designations': { "3c3.a":[(128,'A'), (142,'G'), (159,'S')], "3c3": [(128,'A'), (142,'G'), (159,'F')], @@ -41,7 +38,7 @@ "3c2": [(144,'N'), (159,'F'),(225,'N'), (489,'N')] }, 'verbose':2, - 'tol':1e-3, #tolerance for frequency optimization + 'tol':1e-4, #tolerance for frequency optimization 'pc':1e-3, #pseudocount for frequencies 'extra_pivots': 6, # number of pivot point for or after the last observations of a mutations 'inertia':0.7, # fraction of frequency change carry over in the stiffness term @@ -289,6 +286,7 @@ def run(self, steps, years_back=3, viruses_per_month=50, 
raxml_time_limit = 1.0) self.dump() if 'export' in steps: self.temporal_regional_statistics() + # exporting to json, including the H3N2 specific fields self.export_to_auspice(tree_fields = ['ep', 'ne', 'rb']) if __name__=="__main__": diff --git a/augur/src/process.py b/augur/src/process.py index e8fe427b..33e2a03a 100644 --- a/augur/src/process.py +++ b/augur/src/process.py @@ -1,4 +1,6 @@ -import time, os, argparse,shutil,subprocess, glob +import sys, time, os, argparse,shutil,subprocess, glob +sys.path.append('src') +sys.setrecursionlimit(10000) # needed since we are dealing with large trees from Bio import SeqIO, AlignIO,Phylo from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq From 0de67fd0940c82d492071e0e340ea98fe3b6370a Mon Sep 17 00:00:00 2001 From: Trevor Bedford Date: Tue, 3 Mar 2015 15:25:16 -0800 Subject: [PATCH 45/48] Change default fasta_fields to match default GISAID output. @rneher: please revert if you disagree --- augur/src/H3N2_process.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index 6991e7f0..ab120a55 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -17,8 +17,8 @@ # data source and sequence parsing/cleaning/processing 'virus':'H3N2', 'alignment_file':'data/gisaid_epiflu_sequence.fasta', - 'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, #'fasta_fields':{0:'strain', 1:"date", 4:"passage", -1:'accession'}, + 'fasta_fields':{0:'strain', 1:'accession', 3:"passage", 5:"date"}, 'outgroup':'A/Beijing/32/1992', #'force_include':'source-data/HI_strains.txt', 'force_include_all':False, @@ -62,42 +62,36 @@ def __init__(self,min_length = 987, **kwargs): "accession": "CY163984", "date": "2005-08-31", "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCAATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAGAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" - # "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" }, { "strain": "A/Brisbane/10/2007", "db": "IRD", "accession": "CY113005", "date": "2007-02-06", "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACCAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACAATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" - # "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" }, { "strain": "A/Perth/16/2009", "db": "IRD", "accession": "GQ293081", "date": "2009-04-07", "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTTCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA" - # "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACT" }, { "strain": "A/Victoria/361/2011", "db": "IRD", "accession": "GQ293081", "date": "2011-10-24", "seq": "ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTAAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA" - # "seq": 
"ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT" }, { "strain": "A/Texas/50/2012", "db": "GISAID", "accession": "EPI_ISL_129858", "date": "2012-04-15", "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA", - # "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT", }, { "strain": "A/Switzerland/9715293/2013", "db": "GISAID", "accession": "EPI_ISL_162149", "date": "2013-12-06", "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAGACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGCTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACAAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA", - # "seq": 
"ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACT", } ] self.outgroup = { @@ -108,7 +102,6 @@ def __init__(self,min_length = 987, **kwargs): 'country': 'China', 'region': 'China', 'seq': 'ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAATGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGTCCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGGCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGATGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGACTTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAATGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTATATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTCGGCGCAATCGCAGGTTTCATAGAAAATGGTTGGGAGGGAATGGTAGACGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGCACAGGACAAGCAGCAGATCTTAAAAGCACTCAAGCAGCAATCGACCAAATCAACGGGAAACTGAATAGGTTAATCGAGAAAACGAACGAGAAATTCCATCAAATCGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAAGACACTAAAATAGATCTCTGGTCTTACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTTACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAGGAAGCAACTGAGGGAAAATGCTGAGGACATGGGCAATGGTTGCTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGGTCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGACGAAGCATTAAACAACCGGTTCCAGATCAAAGGTGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTGTGGATTTCCTTTGCCATATCATGCTTTTTGCTTTGTGTTGTTTTGCTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGTAACATTTGCATTTGA' - # 'seq': 
'ATGAAGACTATCATTGCTTTGAGCTACATTTTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACAGCAACGCTGTGCCTGGGACATCATGCAGTGCCAAACGGAACGCTAGTGAAAACAATCACGAATGATCAAATTGAAGTGACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTAGAATATGCGACAGTCCTCACCGAATCCTTGATGGAAAAAACTGCACACTGATAGATGCTCTATTGGGAGACCCTCATTGTGATGGCTTCCAAAATAAGGAATGGGACCTTTTTGTTGAACGCAGCAAAGCTTACAGCAACTGTTACCCTTATGATGTACCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCAGGCACCCTGGAGTTTATCAATGAAGACTTCAATTGGACTGGAGTCGCTCAGGATGGGGGAAGCTATGCTTGCAAAAGGGGATCTGTTAACAGTTTCTTTAGTAGATTGAATTGGTTGCACAAATCAGAATACAAATATCCAGCGCTGAACGTGACTATGCCAAACAATGGCAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGAGCACGGACAGAGACCAAACCAGCCTATATGTTCGAGCATCAGGGAGAGTCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAACCCCGAATATCGGGTCTAGACCCTGGGTAAGGGGTCAGTCCAGTAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAATAGCACAGGGAATCTAATTGCTCCTCGGGGTTACTTCAAAATACGAAATGGGAAAAGCTCAATAATGAGGTCAGATGCACCCATTGGCACCTGCAGTTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCTTTTCAAAATGTAAACAGGATCACATATGGGGCCTGCCCCAGATATGTTAAGCAAAACACT' } class H3N2_clean(virus_clean): From 877b5e15c0cbde709d32fcd61c922aaae27561bf Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 4 Mar 2015 00:59:34 +0100 Subject: [PATCH 46/48] deleted extra ] --- augur/src/H3N2_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/src/H3N2_process.py b/augur/src/H3N2_process.py index 1e606532..7abca953 100644 --- a/augur/src/H3N2_process.py +++ b/augur/src/H3N2_process.py @@ -24,7 +24,7 @@ 'n_iqd':3, # standard deviations from clock # frequency estimation parameters - 'aggregate_regions': [ ("global", None)], ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), + 'aggregate_regions': [ ("global", None), ("NA", ["NorthAmerica"]), ("EU", ["Europe"]), ("AS", ["China", "SoutheastAsia", "JapanKorea"]), ("OC", ["Oceania"]) ], 'frequency_stiffness':10.0, 'time_interval':(2012.0, 2015.1), From 369a47c5deb758a29cc36744f76c66e44c73ae80 Mon Sep 17 00:00:00 2001 From: Trevor Bedford Date: Tue, 3 Mar 2015 16:24:15 -0800 Subject: [PATCH 47/48] Fix auspice genotype frequency display. --- auspice/js/auspice.js | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/auspice/js/auspice.js b/auspice/js/auspice.js index 3af1df30..0cb8e41c 100644 --- a/auspice/js/auspice.js +++ b/auspice/js/auspice.js @@ -1073,18 +1073,9 @@ d3.json("data/frequencies.json", function(error, json){ console.log("calculating frequencies for :"+gt); var freq = []; for (var pi=0; pi1){ - for (freq_gt in json["genotypes"][region]){ - var gt_agree = gt.map(function (d) { - var aa =freq_gt[parseInt(d.substring(0,d.length-1))-1]; - return (aa==d[d.length-1])||(aa=='.'); - }); - if (gt_agree.every(function (d,i,a) {return d;})) - { - for (var pi=0; pi Date: Tue, 3 Mar 2015 16:48:49 -0800 Subject: [PATCH 48/48] Adjust color scale. 
--- auspice/js/auspice.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/auspice/js/auspice.js b/auspice/js/auspice.js index 0cb8e41c..fe02ca35 100644 --- a/auspice/js/auspice.js +++ b/auspice/js/auspice.js @@ -451,15 +451,15 @@ d3.json("data/tree.json", function(error, root) { var colorBy = document.getElementById("coloring").value; var epitopeColorScale = d3.scale.linear().clamp([true]) - .domain([0,1,2,3,4,5,6,7,8,9]) + .domain([4,5,6,7,8,9,10,11,12,13]) .range(colors); var nonepitopeColorScale = d3.scale.linear().clamp([true]) - .domain([0,1,2,3,4,5,6,7,8,9]) + .domain([2,3,4,5,6,7,8,9,10,11]) .range(colors); var receptorBindingColorScale = d3.scale.linear().clamp([true]) - .domain([0,1,2, 3, 4,]) + .domain([0,1,2,3,4]) .range(colors.filter( function(d,i){return i%2;})); var lbiColorScale = d3.scale.linear()