# Bio/Align/Applications/_Muscle.py

# Copyright 2009 by Cymon J. Cox.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Command line wrapper for the multiple alignment program MUSCLE."""

from Bio.Application import _Option
from Bio.Application import _Switch
from Bio.Application import AbstractCommandline


class MuscleCommandline(AbstractCommandline):
    r"""Command line wrapper for the multiple alignment program MUSCLE.

    http://www.drive5.com/muscle/

    Notes
    -----
    Last checked against version: 3.7, briefly against 3.8

    Options that MUSCLE documents as floating point (for example
    ``gapopen``, ``center`` or ``maxhours``) accept either a Python float
    or a Python int; booleans are rejected for those options.

    References
    ----------
    Edgar, Robert C. (2004), MUSCLE: multiple sequence alignment with high
    accuracy and high throughput, Nucleic Acids Research 32(5), 1792-97.

    Edgar, R.C. (2004) MUSCLE: a multiple sequence alignment method with
    reduced time and space complexity. BMC Bioinformatics 5(1): 113.

    Examples
    --------
    >>> from Bio.Align.Applications import MuscleCommandline
    >>> muscle_exe = r"C:\Program Files\Alignments\muscle3.8.31_i86win32.exe"
    >>> in_file = r"C:\My Documents\unaligned.fasta"
    >>> out_file = r"C:\My Documents\aligned.fasta"
    >>> muscle_cline = MuscleCommandline(muscle_exe, input=in_file, out=out_file)
    >>> print(muscle_cline)
    "C:\Program Files\Alignments\muscle3.8.31_i86win32.exe" -in "C:\My Documents\unaligned.fasta" -out "C:\My Documents\aligned.fasta"

    You would typically run the command line with muscle_cline() or via
    the Python subprocess module, as described in the Biopython tutorial.

    """

    def __init__(self, cmd="muscle", **kwargs):
        """Initialize the class.

        Builds the list of supported MUSCLE options in ``self.parameters``
        and then hands over to ``AbstractCommandline.__init__``.

        Parameters
        ----------
        cmd : str
            Name or path of the MUSCLE executable (default ``"muscle"``).
        **kwargs
            Option values keyed by the Python-side option name (the last
            alias of each entry below, e.g. ``input``, ``out``,
            ``maxiters``). They are passed unchanged to
            ``AbstractCommandline.__init__``.

        Notes
        -----
        Every valued option is declared with ``equate=False`` so that it is
        written as ``-name value`` (see the class docstring example), and
        every option taking a file name is declared with ``filename=True``
        so that paths containing spaces get quoted.
        """
        CLUSTERING_ALGORITHMS = ["upgma", "upgmb", "neighborjoining"]
        DISTANCE_MEASURES_ITER1 = [
            "kmer6_6",
            "kmer20_3",
            "kmer20_4",
            "kbit20_3",
            "kmer4_6",
        ]
        DISTANCE_MEASURES_ITER2 = DISTANCE_MEASURES_ITER1 + [
            "pctid_kimura",
            "pctid_log",
        ]
        OBJECTIVE_SCORES = ["sp", "ps", "dp", "xp", "spf", "spm"]
        TREE_ROOT_METHODS = ["pseudo", "midlongestspan", "minavgleafdist"]

        # The nucleotide arguments for the sequence type parameter in MUSCLE
        # (-seqtype) were updated at some point in MUSCLE version 3.8. Prior to
        # the update 'nucleo' was used for nucleotide. This has been updated to
        # 'rna' and 'dna'. 'nucleo' kept for backwards compatibility with older
        # MUSCLE versions.
        SEQUENCE_TYPES = ["protein", "rna", "dna", "nucleo", "auto"]
        WEIGHTING_SCHEMES = [
            "none",
            "clustalw",
            "henikoff",
            "henikoffpb",
            "gsc",
            "threeway",
        ]

        def is_int(value):
            """Return True if value is acceptable for an integer option."""
            return isinstance(value, int)

        def is_number(value):
            """Return True if value is acceptable for a floating point option.

            Plain ints are accepted as well as floats (e.g. maxhours=2);
            bool is excluded even though it is an int subclass.
            """
            return isinstance(value, (int, float)) and not isinstance(value, bool)

        self.parameters = [
            # Can't use "in" as the final alias as this
            # is a reserved word in python:
            _Option(
                ["-in", "in", "input"], "Input filename", filename=True, equate=False
            ),
            _Option(["-out", "out"], "Output filename", filename=True, equate=False),
            _Switch(
                ["-diags", "diags"], "Find diagonals (faster for similar sequences)"
            ),
            _Switch(["-profile", "profile"], "Perform a profile alignment"),
            _Option(
                ["-in1", "in1"],
                "First input filename for profile alignment",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-in2", "in2"],
                "Second input filename for a profile alignment",
                filename=True,
                equate=False,
            ),
            # anchorspacing   Integer              32       Minimum spacing
            #                                              between anchor cols
            _Option(
                ["-anchorspacing", "anchorspacing"],
                "Minimum spacing between anchor columns",
                checker_function=is_int,
                equate=False,
            ),
            # center          Floating point       [1]      Center parameter.
            #                                              Should be negative.
            _Option(
                ["-center", "center"],
                "Center parameter - should be negative",
                checker_function=is_number,
                equate=False,
            ),
            # cluster1        upgma                upgmb    Clustering method.
            _Option(
                ["-cluster1", "cluster1"],
                "Clustering method used in iteration 1",
                checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
                equate=False,
            ),
            # cluster2        upgmb                         cluster1 is used
            #                neighborjoining               in iteration 1 and
            #                                              2, cluster2 in
            #                                              later iterations.
            _Option(
                ["-cluster2", "cluster2"],
                "Clustering method used in iteration 2",
                checker_function=lambda x: x in CLUSTERING_ALGORITHMS,
                equate=False,
            ),
            # diaglength      Integer              24       Minimum length of
            #                                              diagonal.
            # equate=False like every other valued option here, so the value
            # is passed as a separate argument rather than as "-diaglength=N".
            _Option(
                ["-diaglength", "diaglength"],
                "Minimum length of diagonal",
                checker_function=is_int,
                equate=False,
            ),
            # diagmargin      Integer              5        Discard this many
            #                                              positions at ends
            #                                              of diagonal.
            _Option(
                ["-diagmargin", "diagmargin"],
                "Discard this many positions at ends of diagonal",
                checker_function=is_int,
                equate=False,
            ),
            # distance1       kmer6_6       Kmer6_6(amino) or Distance measure
            #                kmer20_3       Kmer4_6(nucleo)   for iteration 1
            #                kmer20_4
            #                kbit20_3
            #                kmer4_6
            _Option(
                ["-distance1", "distance1"],
                "Distance measure for iteration 1",
                checker_function=lambda x: x in DISTANCE_MEASURES_ITER1,
                equate=False,
            ),
            # distance2       kmer6_6       pctid_kimura    Distance measure
            #                kmer20_3                      for iterations
            #                kmer20_4                      2, 3 ...
            #                kbit20_3
            #                pctid_kimura
            #                pctid_log
            _Option(
                ["-distance2", "distance2"],
                "Distance measure for iteration 2",
                checker_function=lambda x: x in DISTANCE_MEASURES_ITER2,
                equate=False,
            ),
            # gapextend       Floating point       [1]    The gap extend score
            _Option(
                ["-gapextend", "gapextend"],
                "Gap extension penalty",
                checker_function=is_number,
                equate=False,
            ),
            # gapopen         Floating point       [1]      The gap open score
            #                                              Must be negative.
            _Option(
                ["-gapopen", "gapopen"],
                "Gap open score - negative number",
                checker_function=is_number,
                equate=False,
            ),
            # hydro           Integer              5        Window size for
            #                                              determining whether
            #                                              a region is
            #                                              hydrophobic.
            _Option(
                ["-hydro", "hydro"],
                "Window size for hydrophobic region",
                checker_function=is_int,
                equate=False,
            ),
            # hydrofactor     Floating point       1.2      Multiplier for gap
            #                                              open/close
            #                                              penalties in
            #                                              hydrophobic regions
            _Option(
                ["-hydrofactor", "hydrofactor"],
                "Multiplier for gap penalties in hydrophobic regions",
                checker_function=is_number,
                equate=False,
            ),
            # log             File name            None.    Log file name
            #                                              (delete existing
            #                                              file).
            _Option(["-log", "log"], "Log file name", filename=True, equate=False),
            # loga            File name            None.    Log file name
            #                                              (append to existing
            #                                              file).
            _Option(
                ["-loga", "loga"],
                "Log file name (append to existing file)",
                filename=True,
                equate=False,
            ),
            # matrix          File name            None.    File name for
            #                                              substitution matrix
            #                                              in NCBI or WU-BLAST
            #                                              format. If you
            #                                              specify your own
            #                                              matrix, you should
            #                                              also specify:
            #                                                -gapopen <g>
            #                                                -gapextend <e>
            #                                                -center 0.0
            _Option(
                ["-matrix", "matrix"],
                "path to NCBI or WU-BLAST format protein substitution "
                "matrix - also set -gapopen, -gapextend and -center",
                filename=True,
                equate=False,
            ),
            # diagbreak    Integer              1           Maximum distance
            #                                              between two
            #                                              diagonals that
            #                                              allows them to
            #                                              merge into one
            #                                              diagonal.
            _Option(
                ["-diagbreak", "diagbreak"],
                "Maximum distance between two diagonals that allows "
                "them to merge into one diagonal",
                checker_function=is_int,
                equate=False,
            ),
            _Option(
                ["-maxdiagbreak", "maxdiagbreak"],  # deprecated 3.8
                "Deprecated in v3.8, use -diagbreak instead.",
                checker_function=is_int,
                equate=False,
            ),
            # maxhours        Floating point       None.    Maximum time to
            #                                              run in hours. The
            #                                              actual time may
            #                                              exceed requested
            #                                              limit by a few
            #                                              minutes. Decimals
            #                                              are allowed, so 1.5
            #                                              means one hour and
            #                                              30 minutes.
            _Option(
                ["-maxhours", "maxhours"],
                "Maximum time to run in hours",
                checker_function=is_number,
                equate=False,
            ),
            # maxiters        Integer 1, 2 ...     16       Maximum number of
            #                                              iterations.
            _Option(
                ["-maxiters", "maxiters"],
                "Maximum number of iterations",
                checker_function=is_int,
                equate=False,
            ),
            # maxtrees        Integer              1        Maximum number of
            #                                              new trees to build
            #                                              in iteration 2.
            _Option(
                ["-maxtrees", "maxtrees"],
                "Maximum number of trees to build in iteration 2",
                checker_function=is_int,
                equate=False,
            ),
            # minbestcolscore Floating point       [1]      Minimum score a
            #                                              column must have to
            #                                              be an anchor.
            _Option(
                ["-minbestcolscore", "minbestcolscore"],
                "Minimum score a column must have to be an anchor",
                checker_function=is_number,
                equate=False,
            ),
            # minsmoothscore  Floating point       [1]      Minimum smoothed
            #                                              score a column must
            #                                              have to be an
            #                                              anchor.
            _Option(
                ["-minsmoothscore", "minsmoothscore"],
                "Minimum smoothed score a column must have to be an anchor",
                checker_function=is_number,
                equate=False,
            ),
            # objscore        sp                   spm      Objective score
            #                ps                            used by tree
            #                dp                            dependent
            #                xp                            refinement.
            #                spf                           sp=sum-of-pairs
            #                spm                           score. (dimer
            #                                              approximation)
            #                                              spm=sp for < 100
            #                                              seqs, otherwise spf
            #                                              dp=dynamic
            #                                              programming score.
            #                                              ps=average profile-
            #                                              sequence score.
            #                                              xp=cross profile
            #                                              score.
            _Option(
                ["-objscore", "objscore"],
                "Objective score used by tree dependent refinement",
                checker_function=lambda x: x in OBJECTIVE_SCORES,
                equate=False,
            ),
            # refinewindow    Integer              200      Length of window
            #                                              for -refinew.
            _Option(
                ["-refinewindow", "refinewindow"],
                "Length of window for -refinew",
                checker_function=is_int,
                equate=False,
            ),
            # root1           pseudo               pseudo  Method used to root
            _Option(
                ["-root1", "root1"],
                "Method used to root tree in iteration 1",
                checker_function=lambda x: x in TREE_ROOT_METHODS,
                equate=False,
            ),
            # root2           midlongestspan                tree; root1 is
            #                minavgleafdist                used in iteration 1
            #                                              and 2, root2 in
            #                                              later iterations.
            _Option(
                ["-root2", "root2"],
                "Method used to root tree in iteration 2",
                checker_function=lambda x: x in TREE_ROOT_METHODS,
                equate=False,
            ),
            # scorefile       File name            None    File name where to
            #                                             write a score file.
            #                                             This contains one
            #                                             line for each column
            #                                             in the alignment.
            #                                             The line contains
            #                                             the letters in the
            #                                             column followed by
            #                                             the average BLOSUM62
            #                                             score over pairs of
            #                                             letters in the
            #                                             column.
            _Option(
                ["-scorefile", "scorefile"],
                "Score file name, contains one line for each column"
                " in the alignment with average BLOSUM62 score",
                filename=True,
                equate=False,
            ),
            # seqtype         protein              auto     Sequence type.
            #                 dna (MUSCLE version > 3.8)
            #                 rna (MUSCLE version > 3.8)
            #                 auto
            #                 nucleo (only valid for MUSCLE versions < 3.8)
            _Option(
                ["-seqtype", "seqtype"],
                "Sequence type",
                checker_function=lambda x: x in SEQUENCE_TYPES,
                equate=False,
            ),
            # smoothscoreceil Floating point       [1]      Maximum value of
            #                                              column score for
            #                                              smoothing purposes.
            _Option(
                ["-smoothscoreceil", "smoothscoreceil"],
                "Maximum value of column score for smoothing",
                checker_function=is_number,
                equate=False,
            ),
            # smoothwindow    Integer              7        Window used for
            #                                              anchor column
            #                                              smoothing.
            _Option(
                ["-smoothwindow", "smoothwindow"],
                "Window used for anchor column smoothing",
                checker_function=is_int,
                equate=False,
            ),
            # spscore         File name                     Compute SP
            #                                              objective score of
            #                                              multiple alignment.
            _Option(
                ["-spscore", "spscore"],
                "Compute SP objective score of multiple alignment",
                filename=True,
                equate=False,
            ),
            # SUEFF           Floating point value 0.1      Constant used in
            #                between 0 and 1.              UPGMB clustering.
            #                                              Determines the
            #                                              relative fraction
            #                                              of average linkage
            #                                              (SUEFF) vs. nearest
            #                                              neighbor linkage
            #                                              (1 - SUEFF).
            _Option(
                ["-sueff", "sueff"],
                "Constant used in UPGMB clustering",
                checker_function=is_number,
                equate=False,
            ),
            # tree1           File name            None     Save tree
            # filename=True so that a tree path containing spaces is quoted,
            # as for the other file name options.
            _Option(
                ["-tree1", "tree1"],
                "Save Newick tree from iteration 1",
                filename=True,
                equate=False,
            ),
            # tree2                                         first or second
            #                                              iteration to given
            #                                              file in Newick
            #                                              (Phylip-compatible)
            #                                              format.
            _Option(
                ["-tree2", "tree2"],
                "Save Newick tree from iteration 2",
                filename=True,
                equate=False,
            ),
            # usetree         File name     None            Use given tree as
            #                                              guide tree. Must be
            #                                              in Newick
            #                                              (Phylip-compatible)
            #                                              format.
            _Option(
                ["-usetree", "usetree"],
                "Use given Newick tree as guide tree",
                filename=True,
                equate=False,
            ),
            # weight1         none          clustalw        Sequence weighting
            _Option(
                ["-weight1", "weight1"],
                "Weighting scheme used in iteration 1",
                checker_function=lambda x: x in WEIGHTING_SCHEMES,
                equate=False,
            ),
            # weight2         henikoff                      scheme.
            #                henikoffpb                    weight1 is used in
            #                gsc                           iterations 1 and 2.
            #                clustalw                      weight2 is used for
            #                threeway                      tree-dependent
            #                                              refinement.
            #                                              none=all sequences
            #                                              have equal weight.
            #                                              henikoff=Henikoff &
            #                                              Henikoff weighting
            #                                              scheme.
            #                                              henikoffpb=Modified
            #                                              Henikoff scheme as
            #                                              used in PSI-BLAST.
            #                                              clustalw=CLUSTALW
            #                                              method.
            #                                              threeway=Gotoh
            #                                              three-way method.
            _Option(
                ["-weight2", "weight2"],
                "Weighting scheme used in iteration 2",
                checker_function=lambda x: x in WEIGHTING_SCHEMES,
                equate=False,
            ),
            # ################### FORMATS ####################################
            # Multiple formats can be specified on the command line
            # If -msf appears it will be used regardless of other formats
            # specified. If -clw appears (and not -msf), clustalw format will
            # be used regardless of other formats specified. If both -clw and
            # -clwstrict are specified -clwstrict will be used regardless of
            # other formats specified. If -fasta is specified and not -msf,
            # -clw, or clwstrict, fasta will be used. If -fasta and -html are
            # specified -fasta will be used. Only if -html is specified alone
            # will html be used. I kid ye not.
            # clw                no       Write output in CLUSTALW format
            #                            (default is FASTA).
            _Switch(
                ["-clw", "clw"],
                "Write output in CLUSTALW format (with a MUSCLE header)",
            ),
            # clwstrict          no       Write output in CLUSTALW format with
            #                            the "CLUSTAL W (1.81)" header rather
            #                            than the MUSCLE version. This is
            #                            useful when a post-processing step is
            #                            picky about the file header.
            _Switch(
                ["-clwstrict", "clwstrict"],
                "Write output in CLUSTALW format with version 1.81 header",
            ),
            # fasta              yes             Write output in FASTA format.
            #                                   Alternatives include clw,
            #                                   clwstrict, msf and html.
            _Switch(["-fasta", "fasta"], "Write output in FASTA format"),
            # html               no       Write output in HTML format (default
            #                            is FASTA).
            _Switch(["-html", "html"], "Write output in HTML format"),
            # msf                no       Write output in MSF format (default
            #                            is FASTA).
            _Switch(["-msf", "msf"], "Write output in MSF format"),
            # Phylip interleaved - undocumented as of 3.7
            _Switch(["-phyi", "phyi"], "Write output in PHYLIP interleaved format"),
            # Phylip sequential - undocumented as of 3.7
            _Switch(["-phys", "phys"], "Write output in PHYLIP sequential format"),
            # ################# Additional specified output files #########
            _Option(
                ["-phyiout", "phyiout"],
                "Write PHYLIP interleaved output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-physout", "physout"],
                "Write PHYLIP sequential format to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-htmlout", "htmlout"],
                "Write HTML output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-clwout", "clwout"],
                "Write CLUSTALW output (with MUSCLE header) to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-clwstrictout", "clwstrictout"],
                "Write CLUSTALW output (with version 1.81 header) to "
                "specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-msfout", "msfout"],
                "Write MSF format output to specified filename",
                filename=True,
                equate=False,
            ),
            _Option(
                ["-fastaout", "fastaout"],
                "Write FASTA format output to specified filename",
                filename=True,
                equate=False,
            ),
            # ############# END FORMATS ###################################
            # anchors            yes      Use anchor optimization in tree
            #                            dependent refinement iterations.
            _Switch(
                ["-anchors", "anchors"],
                "Use anchor optimisation in tree dependent refinement iterations",
            ),
            # noanchors          no       Disable anchor optimization. Default
            #                            is anchors.
            _Switch(
                ["-noanchors", "noanchors"],
                "Do not use anchor optimisation in tree dependent "
                "refinement iterations",
            ),
            # brenner            no       Use Steven Brenner's method for
            #                            computing the root alignment.
            _Switch(
                ["-brenner", "brenner"], "Use Steve Brenner's root alignment method"
            ),
            # cluster            no       Perform fast clustering of input
            #                            sequences. Use the tree1 option to
            #                            save the tree.
            _Switch(
                ["-cluster", "cluster"],
                "Perform fast clustering of input sequences, "
                "use -tree1 to save tree",
            ),
            # dimer              no       Use dimer approximation for the
            #                            SP score (faster, less accurate).
            _Switch(
                ["-dimer", "dimer"],
                "Use faster (slightly less accurate) dimer approximation "
                "for the SP score",
            ),
            # group              yes      Group similar sequences together
            #                            in the output. This is the default.
            #                            See also stable.
            _Switch(["-group", "group"], "Group similar sequences in output"),
            # ############# log-expectation profile score ####################
            # One of either -le, -sp, or -sv
            #
            # According to the doc, spn is default and the only option for
            # nucleotides: this doesn't appear to be true. -le, -sp, and -sv
            # can be used and produce numerically different logs
            # (what is going on?)
            #
            # spn fails on proteins
            # le                 maybe    Use log-expectation profile score
            #                            (VTML240). Alternatives are to use sp
            #                            or sv. This is the default for amino
            #                            acid sequences.
            _Switch(["-le", "le"], "Use log-expectation profile score (VTML240)"),
            # sv                 no       Use sum-of-pairs profile score
            #                            (VTML240). Default is le.
            _Switch(["-sv", "sv"], "Use sum-of-pairs profile score (VTML240)"),
            # sp                 no       Use sum-of-pairs protein profile
            #                            score (PAM200). Default is le.
            _Switch(["-sp", "sp"], "Use sum-of-pairs protein profile score (PAM200)"),
            # spn                maybe    Use sum-of-pairs nucleotide profile
            #                            score (BLASTZ parameters). This is
            #                            the only option for nucleotides,
            #                            and is therefore the default.
            _Switch(["-spn", "spn"], "Use sum-of-pairs nucleotide profile score"),
            # ########## END log-expectation profile score ###################
            # quiet              no      Do not display progress messages.
            _Switch(["-quiet", "quiet"], "Do not display progress messages"),
            # refine             no       Input file is already aligned, skip
            #                            first two iterations and begin tree
            #                            dependent refinement.
            _Switch(["-refine", "refine"], "Only do tree dependent refinement"),
            # refinew            no      Refine an alignment by dividing it
            #                           into non-overlapping windows and
            #                           re-aligning each window. Typically
            #                           used for whole-genome nucleotide
            #                           alignments.
            _Switch(
                ["-refinew", "refinew"],
                "Only do tree dependent refinement using sliding window approach",
            ),
            # core           yes in muscle,       Do not catch exceptions.
            #                no in muscled.
            _Switch(["-core", "core"], "Do not catch exceptions"),
            # nocore         no in muscle,        Catch exceptions and give an
            #                yes in muscled.     error message if possible.
            _Switch(["-nocore", "nocore"], "Catch exceptions"),
            # stable             no      Preserve input order of sequences
            #                           in output file. Default is to group
            #                           sequences by similarity (group).
            _Switch(
                ["-stable", "stable"],
                "Do not group similar sequences in output (not supported in v3.8)",
            ),
            # termgaps4          yes     Use 4-way test for treatment of
            #                           terminal gaps.
            #                           (Cannot be disabled in this version).
            #
            # termgapsfull       no      Terminal gaps penalized with
            #                           full penalty. [1] Not fully
            #                           supported in this version
            #
            # termgapshalf       yes     Terminal gaps penalized with
            #                           half penalty. [1] Not fully
            #                           supported in this version
            #
            # termgapshalflonger no      Terminal gaps penalized with
            #                           half penalty if gap relative
            #                           to longer sequence, otherwise with
            #                           full penalty. [1] Not fully
            #                           supported in this version
            #
            # verbose            no      Write parameter settings and
            #                           progress messages to log file.
            _Switch(["-verbose", "verbose"], "Write parameter settings and progress"),
            # version            no      Write version string to
            #                           stdout and exit
            _Switch(["-version", "version"], "Write version string to stdout and exit"),
        ]
        AbstractCommandline.__init__(self, cmd, **kwargs)


if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module's
    # docstrings via Biopython's private doctest helper.
    from Bio import _utils as _bio_utils

    _bio_utils.run_doctest()