Skip to content

Commit

Permalink
Update snakefile to fully working version
Browse files Browse the repository at this point in the history
Previously I had stubbed out a Snakefile with just the barebones of what I needed to think about in terms of expansion through all the sequence and trial combinations, as well as some file concatenation I needed to do. I have since gotten the Snakefile to fully specify the workflow as it is running on AWS batch, and that is what is being committed here.
  • Loading branch information
alliblk committed Jul 27, 2019
1 parent dc48195 commit da59312
Showing 1 changed file with 171 additions and 6 deletions.
177 changes: 171 additions & 6 deletions supplemental-analysis/rarefaction-curves/mexico/Snakefile
@@ -1,29 +1,194 @@
NUM_SEQS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50] NUM_SEQS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]
#NUM_SEQS = [1, 2]
TRIAL_NUM = [1, 2, 3, 4, 5] TRIAL_NUM = [1, 2, 3, 4, 5]
#TRIAL_NUM = [1, 2]


rule all: rule all:
input: input:
auspice_tree = expand("auspice/mex_seqs_{num_seqs}_trial_{trial_num}_tree.json", num_seqs=NUM_SEQS), auspice_tree = expand("auspice/auspice_{num_seqs}_{trial_num}/mex-seqs-{num_seqs}-{trial_num}_tree.json", num_seqs=NUM_SEQS, trial_num=TRIAL_NUM),
auspice_meta = expand("auspice/mex_seqs_{num_seqs}_trial_{trial_num}_meta.json", trial_num=TRIAL_NUM) auspice_meta = expand("auspice/auspice_{num_seqs}_{trial_num}/mex-seqs-{num_seqs}-{trial_num}_meta.json", num_seqs=NUM_SEQS, trial_num=TRIAL_NUM)


rule files: rule files:
params: params:
mexican_seqs_fasta = "data/mexico_{num_seqs}_seqs/mex_{num_seqs}_seqs_trial_{trial_num}.fasta", mexican_seqs_fasta = "data/mexico_{num_seqs}_seqs/mex_{num_seqs}_seqs_trial_{trial_num}.fasta",
background_seqs_fasta = MAKE THIS FILE! background_seqs_fasta = "data/background_seqs_no_mexico.fasta",
dropped_strains = "config/dropped_strains.txt", dropped_strains = "config/dropped_strains.txt",
included_strains = "config/include_strains.txt", included_strains = "config/include_strains.txt",
reference = "config/zika_outgroup.gb", reference = "config/zika_outgroup.gb",
colors = "config/colors.tsv", colors = "config/colors.tsv",
auspice_config = "config/auspice_config.json" auspice_config = "config/auspice_config.json"


files = rules.files.params

rule concatenate_fastas: rule concatenate_fastas:
message: "concatenating subsampled mexican sequences with Americas-wide background sequences." message: "concatenating subsampled mexican sequences with Americas-wide background sequences."
input: input:
mexican_sequences = files.mexican_seqs_fasta mexican_sequences = files.mexican_seqs_fasta,
background_sequences = files.background_seqs_fasta background_sequences = files.background_seqs_fasta
output: output:
full_fasta = "results/mex_and_background_seqs.fasta" full_fasta = "results/mex-and-background-seqs_{num_seqs}_{trial_num}.fasta"
shell:
"""
cat {input.mexican_sequences} {input.background_sequences} > {output.full_fasta}
"""

rule parse:
message: "Parsing fasta into separate sequences and metadata files"
input:
sequences = rules.concatenate_fastas.output.full_fasta
output:
sequences = "results/sequences_{num_seqs}_{trial_num}.fasta",
metadata = "results/metadata_{num_seqs}_{trial_num}.tsv"
params:
fasta_fields = "strain virus accession date region country division city db segment authors url title journal paper_url"
shell:
"""
augur parse \
--sequences {input.sequences} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
--fields {params.fasta_fields}
"""

rule filter:
message:
"""
Filtering to
- {params.sequences_per_group} sequence(s) per {params.group_by!s}
- from {params.min_date} onwards
- excluding strains in {input.exclude}
"""
input:
sequences = rules.parse.output.sequences,
metadata = rules.parse.output.metadata,
exclude = files.dropped_strains
output:
sequences = "results/filtered_{num_seqs}_{trial_num}.fasta"
params:
group_by = "country year month",
sequences_per_group = 500,
min_date = 2013
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--exclude {input.exclude} \
--output {output.sequences} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
--min-date {params.min_date}
"""

rule align:
message:
"""
Aligning sequences to {input.reference}
- filling gaps with N
"""
input:
sequences = rules.filter.output.sequences,
reference = files.reference
output:
alignment = "results/aligned_{num_seqs}_{trial_num}.fasta"
shell:
"""
augur align \
--sequences {input.sequences} \
--reference-sequence {input.reference} \
--output {output.alignment} \
--fill-gaps
"""

rule tree:
message: "Building maximum likelihood tree"
input:
alignment = rules.align.output.alignment
output:
tree = "results/tree-raw_{num_seqs}_{trial_num}.nwk"
shell:
"""
augur tree \
--alignment {input.alignment} \
--output {output.tree}
"""

rule refine:
message:
"""
Refining tree
- estimate timetree
- use {params.coalescent} coalescent timescale
- estimate {params.date_inference} node dates
- filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
"""
input:
tree = rules.tree.output.tree,
alignment = rules.align.output,
metadata = rules.parse.output.metadata
output:
tree = "results/tree_{num_seqs}_{trial_num}.nwk",
node_data = "results/branch-lengths_{num_seqs}_{trial_num}.json"
params:
coalescent = "opt",
date_inference = "marginal",
clock_filter_iqd = 4,
root = "1_0049_PF"
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--output-tree {output.tree} \
--output-node-data {output.node_data} \
--timetree \
--coalescent {params.coalescent} \
--date-confidence \
--date-inference {params.date_inference} \
--clock-filter-iqd {params.clock_filter_iqd} \
--root {params.root}
"""

rule traits:
message: "Inferring ancestral traits for {params.columns!s}"
input:
tree = rules.refine.output.tree,
metadata = rules.parse.output.metadata
output:
node_data = "results/traits_{num_seqs}_{trial_num}.json",
params:
columns = "region country"
shell:
"""
augur traits \
--tree {input.tree} \
--metadata {input.metadata} \
--output {output.node_data} \
--columns {params.columns} \
--confidence
"""

rule export:
message: "Exporting data files for for auspice"
input:
tree = rules.refine.output.tree,
metadata = rules.parse.output.metadata,
branch_lengths = rules.refine.output.node_data,
traits = rules.traits.output.node_data,
colors = files.colors,
auspice_config = files.auspice_config
output:
auspice_tree = "auspice/auspice_{num_seqs}_{trial_num}/mex-seqs-{num_seqs}-{trial_num}_tree.json",
auspice_meta = "auspice/auspice_{num_seqs}_{trial_num}/mex-seqs-{num_seqs}-{trial_num}_meta.json"
shell: shell:
""" """
cat input.mexican_sequences input.background_sequences > output.full_fasta augur export \
--tree {input.tree} \
--metadata {input.metadata} \
--node-data {input.branch_lengths} {input.traits} \
--colors {input.colors} \
--auspice-config {input.auspice_config} \
--output-tree {output.auspice_tree} \
--output-meta {output.auspice_meta}
""" """

0 comments on commit da59312

Please sign in to comment.