Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update snakefile to fully working version
Previously I had stubbed out a Snakefile with just the barebones of what I needed to think about in terms of expansion through all the sequence and trial combinations, as well as some file concatenation I needed to do. I have since gotten the Snakefile to fully specify the workflow as it is running on AWS batch, and that is what is being committed here.
- Loading branch information
Showing
1 changed file
with
171 additions
and
6 deletions.
There are no files selected for viewing
177 changes: 171 additions & 6 deletions
177
supplemental-analysis/rarefaction-curves/mexico/Snakefile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -1,29 +1,194 @@ | |||
NUM_SEQS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50] | NUM_SEQS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50] | ||
#NUM_SEQS = [1, 2] | |||
TRIAL_NUM = [1, 2, 3, 4, 5] | TRIAL_NUM = [1, 2, 3, 4, 5] | ||
#TRIAL_NUM = [1, 2] | |||
|
|
||
rule all: | rule all: | ||
input: | input: | ||
auspice_tree = expand("auspice/mex_seqs_{num_seqs}_trial_{trial_num}_tree.json", num_seqs=NUM_SEQS), | auspice_tree = expand("auspice/auspice_{num_seqs}_{trial_num}/mex-seqs-{num_seqs}-{trial_num}_tree.json", num_seqs=NUM_SEQS, trial_num=TRIAL_NUM), | ||
auspice_meta = expand("auspice/mex_seqs_{num_seqs}_trial_{trial_num}_meta.json", trial_num=TRIAL_NUM) | auspice_meta = expand("auspice/auspice_{num_seqs}_{trial_num}/mex-seqs-{num_seqs}-{trial_num}_meta.json", num_seqs=NUM_SEQS, trial_num=TRIAL_NUM) | ||
|
|
||
rule files: | rule files: | ||
params: | params: | ||
mexican_seqs_fasta = "data/mexico_{num_seqs}_seqs/mex_{num_seqs}_seqs_trial_{trial_num}.fasta", | mexican_seqs_fasta = "data/mexico_{num_seqs}_seqs/mex_{num_seqs}_seqs_trial_{trial_num}.fasta", | ||
background_seqs_fasta = MAKE THIS FILE! | background_seqs_fasta = "data/background_seqs_no_mexico.fasta", | ||
dropped_strains = "config/dropped_strains.txt", | dropped_strains = "config/dropped_strains.txt", | ||
included_strains = "config/include_strains.txt", | included_strains = "config/include_strains.txt", | ||
reference = "config/zika_outgroup.gb", | reference = "config/zika_outgroup.gb", | ||
colors = "config/colors.tsv", | colors = "config/colors.tsv", | ||
auspice_config = "config/auspice_config.json" | auspice_config = "config/auspice_config.json" | ||
|
|
||
files = rules.files.params | |||
|
|||
rule concatenate_fastas: | rule concatenate_fastas: | ||
message: "concatenating subsampled mexican sequences with Americas-wide background sequences." | message: "concatenating subsampled mexican sequences with Americas-wide background sequences." | ||
input: | input: | ||
mexican_sequences = files.mexican_seqs_fasta | mexican_sequences = files.mexican_seqs_fasta, | ||
background_sequences = files.background_seqs_fasta | background_sequences = files.background_seqs_fasta | ||
output: | output: | ||
full_fasta = "results/mex_and_background_seqs.fasta" | full_fasta = "results/mex-and-background-seqs_{num_seqs}_{trial_num}.fasta" | ||
shell: | |||
""" | |||
cat {input.mexican_sequences} {input.background_sequences} > {output.full_fasta} | |||
""" | |||
|
|||
rule parse: | |||
message: "Parsing fasta into separate sequences and metadata files" | |||
input: | |||
sequences = rules.concatenate_fastas.output.full_fasta | |||
output: | |||
sequences = "results/sequences_{num_seqs}_{trial_num}.fasta", | |||
metadata = "results/metadata_{num_seqs}_{trial_num}.tsv" | |||
params: | |||
fasta_fields = "strain virus accession date region country division city db segment authors url title journal paper_url" | |||
shell: | |||
""" | |||
augur parse \ | |||
--sequences {input.sequences} \ | |||
--output-sequences {output.sequences} \ | |||
--output-metadata {output.metadata} \ | |||
--fields {params.fasta_fields} | |||
""" | |||
|
|||
rule filter: | |||
message: | |||
""" | |||
Filtering to | |||
- {params.sequences_per_group} sequence(s) per {params.group_by!s} | |||
- from {params.min_date} onwards | |||
- excluding strains in {input.exclude} | |||
""" | |||
input: | |||
sequences = rules.parse.output.sequences, | |||
metadata = rules.parse.output.metadata, | |||
exclude = files.dropped_strains | |||
output: | |||
sequences = "results/filtered_{num_seqs}_{trial_num}.fasta" | |||
params: | |||
group_by = "country year month", | |||
sequences_per_group = 500, | |||
min_date = 2013 | |||
shell: | |||
""" | |||
augur filter \ | |||
--sequences {input.sequences} \ | |||
--metadata {input.metadata} \ | |||
--exclude {input.exclude} \ | |||
--output {output.sequences} \ | |||
--group-by {params.group_by} \ | |||
--sequences-per-group {params.sequences_per_group} \ | |||
--min-date {params.min_date} | |||
""" | |||
|
|||
rule align: | |||
message: | |||
""" | |||
Aligning sequences to {input.reference} | |||
- filling gaps with N | |||
""" | |||
input: | |||
sequences = rules.filter.output.sequences, | |||
reference = files.reference | |||
output: | |||
alignment = "results/aligned_{num_seqs}_{trial_num}.fasta" | |||
shell: | |||
""" | |||
augur align \ | |||
--sequences {input.sequences} \ | |||
--reference-sequence {input.reference} \ | |||
--output {output.alignment} \ | |||
--fill-gaps | |||
""" | |||
|
|||
rule tree: | |||
message: "Building maximum likelihood tree" | |||
input: | |||
alignment = rules.align.output.alignment | |||
output: | |||
tree = "results/tree-raw_{num_seqs}_{trial_num}.nwk" | |||
shell: | |||
""" | |||
augur tree \ | |||
--alignment {input.alignment} \ | |||
--output {output.tree} | |||
""" | |||
|
|||
rule refine: | |||
message: | |||
""" | |||
Refining tree | |||
- estimate timetree | |||
- use {params.coalescent} coalescent timescale | |||
- estimate {params.date_inference} node dates | |||
- filter tips more than {params.clock_filter_iqd} IQDs from clock expectation | |||
""" | |||
input: | |||
tree = rules.tree.output.tree, | |||
alignment = rules.align.output, | |||
metadata = rules.parse.output.metadata | |||
output: | |||
tree = "results/tree_{num_seqs}_{trial_num}.nwk", | |||
node_data = "results/branch-lengths_{num_seqs}_{trial_num}.json" | |||
params: | |||
coalescent = "opt", | |||
date_inference = "marginal", | |||
clock_filter_iqd = 4, | |||
root = "1_0049_PF" | |||
shell: | |||
""" | |||
augur refine \ | |||
--tree {input.tree} \ | |||
--alignment {input.alignment} \ | |||
--metadata {input.metadata} \ | |||
--output-tree {output.tree} \ | |||
--output-node-data {output.node_data} \ | |||
--timetree \ | |||
--coalescent {params.coalescent} \ | |||
--date-confidence \ | |||
--date-inference {params.date_inference} \ | |||
--clock-filter-iqd {params.clock_filter_iqd} \ | |||
--root {params.root} | |||
""" | |||
|
|||
rule traits: | |||
message: "Inferring ancestral traits for {params.columns!s}" | |||
input: | |||
tree = rules.refine.output.tree, | |||
metadata = rules.parse.output.metadata | |||
output: | |||
node_data = "results/traits_{num_seqs}_{trial_num}.json", | |||
params: | |||
columns = "region country" | |||
shell: | |||
""" | |||
augur traits \ | |||
--tree {input.tree} \ | |||
--metadata {input.metadata} \ | |||
--output {output.node_data} \ | |||
--columns {params.columns} \ | |||
--confidence | |||
""" | |||
|
|||
rule export: | |||
message: "Exporting data files for for auspice" | |||
input: | |||
tree = rules.refine.output.tree, | |||
metadata = rules.parse.output.metadata, | |||
branch_lengths = rules.refine.output.node_data, | |||
traits = rules.traits.output.node_data, | |||
colors = files.colors, | |||
auspice_config = files.auspice_config | |||
output: | |||
auspice_tree = "auspice/auspice_{num_seqs}_{trial_num}/mex-seqs-{num_seqs}-{trial_num}_tree.json", | |||
auspice_meta = "auspice/auspice_{num_seqs}_{trial_num}/mex-seqs-{num_seqs}-{trial_num}_meta.json" | |||
shell: | shell: | ||
""" | """ | ||
cat input.mexican_sequences input.background_sequences > output.full_fasta | augur export \ | ||
--tree {input.tree} \ | |||
--metadata {input.metadata} \ | |||
--node-data {input.branch_lengths} {input.traits} \ | |||
--colors {input.colors} \ | |||
--auspice-config {input.auspice_config} \ | |||
--output-tree {output.auspice_tree} \ | |||
--output-meta {output.auspice_meta} | |||
""" | """ |