Permalink
Fetching contributors…
Cannot retrieve contributors at this time
591 lines (575 sloc) 33.6 KB
# Copyright 2009 by Cymon J. Cox. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment program MUSCLE."""
from __future__ import print_function
from Bio.Application import _Option, _Switch, AbstractCommandline
class MuscleCommandline(AbstractCommandline):
r"""Command line wrapper for the multiple alignment program MUSCLE.
http://www.drive5.com/muscle/
Notes
-----
Last checked against version: 3.7, briefly against 3.8
References
----------
Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high
accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97.
Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with
reduced time and space complexity. BMC Bioinformatics 5(1): 113.
Examples
--------
>>> from Bio.Align.Applications import MuscleCommandline
>>> muscle_exe = r"C:\Program Files\Aligments\muscle3.8.31_i86win32.exe"
>>> in_file = r"C:\My Documents\unaligned.fasta"
>>> out_file = r"C:\My Documents\aligned.fasta"
>>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
>>> print(muscle_cline)
"C:\Program Files\Aligments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta"
You would typically run the command line with muscle_cline() or via
the Python subprocess module, as described in the Biopython tutorial.
"""
def __init__(self, cmd="muscle", **kwargs):
"""Initialize the class."""
CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
DISTANCE_MEASURES_ITER1 = ["kmer6_6", "kmer20_3", "kmer20_4",
"kbit20_3", "kmer4_6"]
DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + \
["pctid_kimura", "pctid_log"]
OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]
SEQUENCE_TYPES = ["protein", "nucleo", "auto"]
WEIGHTING_SCHEMES = ["none", "clustalw", "henikoff", "henikoffpb",
"gsc", "threeway"]
self.parameters = [
# Can't use "in" as the final alias as this
# is a reserved word in python:
_Option(["-in", "in", "input"],
"Input filename",
filename=True,
equate=False),
_Option(["-out", "out"],
"Output filename",
filename=True,
equate=False),
_Switch(["-diags", "diags"],
"Find diagonals (faster for similar sequences)"),
_Switch(["-profile", "profile"],
"Perform a profile alignment"),
_Option(["-in1", "in1"],
"First input filename for profile alignment",
filename=True,
equate=False),
_Option(["-in2", "in2"],
"Second input filename for a profile alignment",
filename=True,
equate=False),
# anchorspacing Integer 32 Minimum spacing
# between anchor cols
_Option(["-anchorspacing", "anchorspacing"],
"Minimum spacing between anchor columns",
checker_function=lambda x: isinstance(x, int),
equate=False),
# center Floating point [1] Center parameter.
# Should be negative.
_Option(["-center", "center"],
"Center parameter - should be negative",
checker_function=lambda x: isinstance(x, float),
equate=False),
# cluster1 upgma upgmb Clustering method.
_Option(["-cluster1", "cluster1"],
"Clustering method used in iteration 1",
checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
equate=False),
# cluster2 upgmb cluster1 is used
# neighborjoining in iteration 1 and
# 2, cluster2 in
# later iterations.
_Option(["-cluster2", "cluster2"],
"Clustering method used in iteration 2",
checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
equate=False),
# diaglength Integer 24 Minimum length of
# diagonal.
_Option(["-diaglength", "diaglength"],
"Minimum length of diagonal",
checker_function=lambda x: isinstance(x, int),
equate=True),
# diagmargin Integer 5 Discard this many
# positions at ends
# of diagonal.
_Option(["-diagmargin", "diagmargin"],
"Discard this many positions at ends of diagonal",
checker_function=lambda x: isinstance(x, int),
equate=False),
# distance1 kmer6_6 Kmer6_6(amino) or Distance measure
# kmer20_3 Kmer4_6(nucleo) for iteration 1
# kmer20_4
# kbit20_3
# kmer4_6
_Option(["-distance1", "distance1"],
"Distance measure for iteration 1",
checker_function=lambda x: x in DISTANCE_MEASURES_ITER1,
equate=False),
# distance2 kmer6_6 pctid_kimura Distance measure
# kmer20_3 for iterations
# kmer20_4 2, 3 ...
# kbit20_3
# pctid_kimura
# pctid_log
_Option(["-distance2", "distance2"],
"Distance measure for iteration 2",
checker_function=lambda x: x in DISTANCE_MEASURES_ITER2,
equate=False),
# gapextend Floating point [1] The gap extend score
_Option(["-gapextend", "gapextend"],
"Gap extension penalty",
checker_function=lambda x: isinstance(x, float),
equate=False),
# gapopen Floating point [1] The gap open score
# Must be negative.
_Option(["-gapopen", "gapopen"],
"Gap open score - negative number",
checker_function=lambda x: isinstance(x, float),
equate=False),
# hydro Integer 5 Window size for
# determining whether
# a region is
# hydrophobic.
_Option(["-hydro", "hydro"],
"Window size for hydrophobic region",
checker_function=lambda x: isinstance(x, int),
equate=False),
# hydrofactor Floating point 1.2 Multiplier for gap
# open/close
# penalties in
# hydrophobic regions
_Option(["-hydrofactor", "hydrofactor"],
"Multiplier for gap penalties in hydrophobic regions",
checker_function=lambda x: isinstance(x, float),
equate=False),
# log File name None. Log file name
# (delete existing
# file).
_Option(["-log", "log"],
"Log file name",
filename=True,
equate=False),
# loga File name None. Log file name
# (append to existing
# file).
_Option(["-loga", "loga"],
"Log file name (append to existing file)",
filename=True,
equate=False),
# matrix File name None. File name for
# substitution matrix
# in NCBI or WU-BLAST
# format. If you
# specify your own
# matrix, you should
# also specify:
# -gapopen <g>
# -gapextend <e>
# -center 0.0
_Option(["-matrix", "matrix"],
"path to NCBI or WU-BLAST format protein substitution "
"matrix - also set -gapopen, -gapextend and -center",
filename=True,
equate=False),
# diagbreak Integer 1 Maximum distance
# between two
# diagonals that
# allows them to
# merge into one
# diagonal.
_Option(["-diagbreak", "diagbreak"],
"Maximum distance between two diagonals that allows "
"them to merge into one diagonal",
checker_function=lambda x: isinstance(x, int),
equate=False),
_Option(["-maxdiagbreak", "maxdiagbreak"], # deprecated 3.8
"Deprecated in v3.8, use -diagbreak instead.",
checker_function=lambda x: isinstance(x, int),
equate=False),
# maxhours Floating point None. Maximum time to
# run in hours. The
# actual time may
# exceed requested
# limit by a few
# minutes. Decimals
# are allowed, so 1.5
# means one hour and
# 30 minutes.
_Option(["-maxhours", "maxhours"],
"Maximum time to run in hours",
checker_function=lambda x: isinstance(x, float),
equate=False),
# maxiters Integer 1, 2 ... 16 Maximum number of
# iterations.
_Option(["-maxiters", "maxiters"],
"Maximum number of iterations",
checker_function=lambda x: isinstance(x, int),
equate=False),
# maxtrees Integer 1 Maximum number of
# new trees to build
# in iteration 2.
_Option(["-maxtrees", "maxtrees"],
"Maximum number of trees to build in iteration 2",
checker_function=lambda x: isinstance(x, int),
equate=False),
# minbestcolscore Floating point [1] Minimum score a
# column must have to
# be an anchor.
_Option(["-minbestcolscore", "minbestcolscore"],
"Minimum score a column must have to be an anchor",
checker_function=lambda x: isinstance(x, float),
equate=False),
# minsmoothscore Floating point [1] Minimum smoothed
# score a column must
# have to be an
# anchor.
_Option(["-minsmoothscore", "minsmoothscore"],
"Minimum smoothed score a column must have to "
"be an anchor",
checker_function=lambda x: isinstance(x, float),
equate=False),
# objscore sp spm Objective score
# ps used by tree
# dp dependent
# xp refinement.
# spf sp=sum-of-pairs
# spm score. (dimer
# approximation)
# spm=sp for < 100
# seqs, otherwise spf
# dp=dynamic
# programming score.
# ps=average profile-
# sequence score.
# xp=cross profile
# score.
_Option(["-objscore", "objscore"],
"Objective score used by tree dependent refinement",
checker_function=lambda x: x in OBJECTIVE_SCORES,
equate=False),
# refinewindow Integer 200 Length of window
# for -refinew.
_Option(["-refinewindow", "refinewindow"],
"Length of window for -refinew",
checker_function=lambda x: isinstance(x, int),
equate=False),
# root1 pseudo pseudo Method used to root
_Option(["-root1", "root1"],
"Method used to root tree in iteration 1",
checker_function=lambda x: x in TREE_ROOT_METHODS,
equate=False),
# root2 midlongestspan tree; root1 is
# minavgleafdist used in iteration 1
# and 2, root2 in
# later iterations.
_Option(["-root2", "root2"],
"Method used to root tree in iteration 2",
checker_function=lambda x: x in TREE_ROOT_METHODS,
equate=False),
# scorefile File name None File name where to
# write a score file.
# This contains one
# line for each column
# in the alignment.
# The line contains
# the letters in the
# column followed by
# the average BLOSUM62
# score over pairs of
# letters in the
# column.
_Option(["-scorefile", "scorefile"],
"Score file name, contains one line for each column"
" in the alignment with average BLOSUM62 score",
filename=True,
equate=False),
# seqtype protein auto Sequence type.
# nucleo
# auto
_Option(["-seqtype", "seqtype"],
"Sequence type",
checker_function=lambda x: x in SEQUENCE_TYPES,
equate=False),
# smoothscoreceil Floating point [1] Maximum value of
# column score for
# smoothing purposes.
_Option(["-smoothscoreceil", "smoothscoreceil"],
"Maximum value of column score for smoothing",
checker_function=lambda x: isinstance(x, float),
equate=False),
# smoothwindow Integer 7 Window used for
# anchor column
# smoothing.
_Option(["-smoothwindow", "smoothwindow"],
"Window used for anchor column smoothing",
checker_function=lambda x: isinstance(x, int),
equate=False),
# spscore File name Compute SP
# objective score of
# multiple alignment.
_Option(["-spscore", "spscore"],
"Compute SP objective score of multiple alignment",
filename=True,
equate=False),
# SUEFF Floating point value 0.1 Constant used in
# between 0 and 1. UPGMB clustering.
# Determines the
# relative fraction
# of average linkage
# (SUEFF) vs. nearest
# neighbor linkage
# (1 SUEFF).
_Option(["-sueff", "sueff"],
"Constant used in UPGMB clustering",
checker_function=lambda x: isinstance(x, float),
equate=False),
# tree1 File name None Save tree
_Option(["-tree1", "tree1"],
"Save Newick tree from iteration 1",
equate=False),
# tree2 first or second
# iteration to given
# file in Newick
# (Phylip-compatible)
# format.
_Option(["-tree2", "tree2"],
"Save Newick tree from iteration 2",
equate=False),
# usetree File name None Use given tree as
# guide tree. Must by
# in Newick
# (Phyip-compatible)
# format.
_Option(["-usetree", "usetree"],
"Use given Newick tree as guide tree",
filename=True,
equate=False),
# weight1 none clustalw Sequence weighting
_Option(["-weight1", "weight1"],
"Weighting scheme used in iteration 1",
checker_function=lambda x: x in WEIGHTING_SCHEMES,
equate=False),
# weight2 henikoff scheme.
# henikoffpb weight1 is used in
# gsc iterations 1 and 2.
# clustalw weight2 is used for
# threeway tree-dependent
# refinement.
# none=all sequences
# have equal weight.
# henikoff=Henikoff &
# Henikoff weighting
# scheme.
# henikoffpb=Modified
# Henikoff scheme as
# used in PSI-BLAST.
# clustalw=CLUSTALW
# method.
# threeway=Gotoh
# three-way method.
_Option(["-weight2", "weight2"],
"Weighting scheme used in iteration 2",
checker_function=lambda x: x in WEIGHTING_SCHEMES,
equate=False),
# ################### FORMATS ####################################
# Multiple formats can be specified on the command line
# If -msf appears it will be used regardless of other formats
# specified. If -clw appears (and not -msf), clustalw format will
# be used regardless of other formats specified. If both -clw and
# -clwstrict are specified -clwstrict will be used regardless of
# other formats specified. If -fasta is specified and not -msf,
# -clw, or clwstrict, fasta will be used. If -fasta and -html are
# specified -fasta will be used. Only if -html is specified alone
# will html be used. I kid ye not.
# clw no Write output in CLUSTALW format
# (default is FASTA).
_Switch(["-clw", "clw"],
"Write output in CLUSTALW format (with a MUSCLE header)"),
# clwstrict no Write output in CLUSTALW format with
# the "CLUSTAL W (1.81)" header rather
# than the MUSCLE version. This is
# useful when a post-processing step is
# picky about the file header.
_Switch(["-clwstrict", "clwstrict"],
"Write output in CLUSTALW format with version"
"1.81 header"),
# fasta yes Write output in FASTA format.
# Alternatives include clw,
# clwstrict, msf and html.
_Switch(["-fasta", "fasta"],
"Write output in FASTA format"),
# html no Write output in HTML format (default
# is FASTA).
_Switch(["-html", "html"],
"Write output in HTML format"),
# msf no Write output in MSF format (default
# is FASTA).
_Switch(["-msf", "msf"],
"Write output in MSF format"),
# Phylip interleaved - undocumented as of 3.7
_Switch(["-phyi", "phyi"],
"Write output in PHYLIP interleaved format"),
# Phylip sequential - undocumented as of 3.7
_Switch(["-phys", "phys"],
"Write output in PHYLIP sequential format"),
# ################# Additional specified output files #########
_Option(["-phyiout", "phyiout"],
"Write PHYLIP interleaved output to specified filename",
filename=True,
equate=False),
_Option(["-physout", "physout"],
"Write PHYLIP sequential format to specified filename",
filename=True,
equate=False),
_Option(["-htmlout", "htmlout"],
"Write HTML output to specified filename",
filename=True,
equate=False),
_Option(["-clwout", "clwout"],
"Write CLUSTALW output (with MUSCLE header) to specified "
"filename",
filename=True,
equate=False),
_Option(["-clwstrictout", "clwstrictout"],
"Write CLUSTALW output (with version 1.81 header) to "
"specified filename",
filename=True,
equate=False),
_Option(["-msfout", "msfout"],
"Write MSF format output to specified filename",
filename=True,
equate=False),
_Option(["-fastaout", "fastaout"],
"Write FASTA format output to specified filename",
filename=True,
equate=False),
# ############# END FORMATS ###################################
# anchors yes Use anchor optimization in tree
# dependent refinement iterations.
_Switch(["-anchors", "anchors"],
"Use anchor optimisation in tree dependent "
"refinement iterations"),
# noanchors no Disable anchor optimization. Default
# is anchors.
_Switch(["-noanchors", "noanchors"],
"Do not use anchor optimisation in tree dependent "
"refinement iterations"),
# brenner no Use Steven Brenner's method for
# computing the root alignment.
_Switch(["-brenner", "brenner"],
"Use Steve Brenner's root alignment method"),
# cluster no Perform fast clustering of input
# sequences. Use the tree1 option to
# save the tree.
_Switch(["-cluster", "cluster"],
"Perform fast clustering of input sequences, "
"use -tree1 to save tree"),
# dimer no Use dimer approximation for the
# SP score (faster, less accurate).
_Switch(["-dimer", "dimer"],
"Use faster (slightly less accurate) dimer approximation"
"for the SP score"),
# group yes Group similar sequences together
# in the output. This is the default.
# See also stable.
_Switch(["-group", "group"],
"Group similar sequences in output"),
# ############# log-expectation profile score ####################
# One of either -le, -sp, or -sv
#
# According to the doc, spn is default and the only option for
# nucleotides: this doesn't appear to be true. -le, -sp, and -sv
# can be used and produce numerically different logs
# (what is going on?)
#
# spn fails on proteins
# le maybe Use log-expectation profile score
# (VTML240). Alternatives are to use sp
# or sv. This is the default for amino
# acid sequences.
_Switch(["-le", "le"],
"Use log-expectation profile score (VTML240)"),
# sv no Use sum-of-pairs profile score
# (VTML240). Default is le.
_Switch(["-sv", "sv"],
"Use sum-of-pairs profile score (VTML240)"),
# sp no Use sum-of-pairs protein profile
# score (PAM200). Default is le.
_Switch(["-sp", "sp"],
"Use sum-of-pairs protein profile score (PAM200)"),
# spn maybe Use sum-of-pairs nucleotide profile
# score (BLASTZ parameters). This is
# the only option for nucleotides,
# and is therefore the default.
_Switch(["-spn", "spn"],
"Use sum-of-pairs protein nucleotide profile score"),
# ########## END log-expectation profile score ###################
# quiet no Do not display progress messages.
_Switch(["-quiet", "quiet"],
"Do not display progress messages"),
# refine no Input file is already aligned, skip
# first two iterations and begin tree
# dependent refinement.
_Switch(["-refine", "refine"],
"Only do tree dependent refinement"),
# refinew no Refine an alignment by dividing it
# into non-overlapping windows and
# re-aligning each window. Typically
# used for whole-genome nucleotide
# alignments.
_Switch(["-refinew", "refinew"],
"Only do tree dependent refinement using "
"sliding window approach"),
# core yes in muscle, Do not catch exceptions.
# no in muscled.
_Switch(["-core", "core"],
"Do not catch exceptions"),
# nocore no in muscle, Catch exceptions and give an
# yes in muscled. error message if possible.
_Switch(["-nocore", "nocore"],
"Catch exceptions"),
# stable no Preserve input order of sequences
# in output file. Default is to group
# sequences by similarity (group).
_Switch(["-stable", "stable"],
"Do not group similar sequences in output "
"(not supported in v3.8)"),
# termgaps4 yes Use 4-way test for treatment of
# terminal gaps.
# (Cannot be disabled in this version).
#
# termgapsfull no Terminal gaps penalized with
# full penalty. [1] Not fully
# supported in this version
#
# termgapshalf yes Terminal gaps penalized with
# half penalty. [1] Not fully
# supported in this version
#
# termgapshalflonger no Terminal gaps penalized with
# half penalty if gap relative
# to longer sequence, otherwise with
# full penalty. [1] Not fully
# supported in this version
#
# verbose no Write parameter settings and
# progress messages to log file.
_Switch(["-verbose", "verbose"],
"Write parameter settings and progress"),
# version no Write version string to
# stdout and exit
_Switch(["-version", "version"],
"Write version string to stdout and exit"),
]
AbstractCommandline.__init__(self, cmd, **kwargs)
if __name__ == "__main__":
from Bio._utils import run_doctest
run_doctest()