Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge branch 'master' of github.com:biopython/biopython

  • Loading branch information...
commit b3031cbdac01f6a8ef09cde632b41c5f95ec5f0e 2 parents af7ae41 + 06eeb0a
Michiel de Hoon authored
View
173 Bio/Motif/Applications/_XXmotif.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+# Copyright 2012 by Christian Brueffer. All rights reserved.
+#
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Command line wrapper for the motif finding program XXmotif."""
+
+import os
+from Bio.Application import AbstractCommandline, _Option, _Switch, _Argument
+
+class XXmotifCommandline(AbstractCommandline):
+ """Command line wrapper for XXmotif.
+
+ http://xxmotif.genzentrum.lmu.de/
+
+ Example:
+
+ >>> from Bio.Motif.Applications import XXmotifCommandline
+ >>> out_dir = "results"
+ >>> in_file = "sequences.fasta"
+ >>> xxmotif_cline = XXmotifCommandline(outdir=out_dir, seqfile=in_file, revcomp=True)
+ >>> print xxmotif_cline
+ XXmotif results sequences.fasta --revcomp
+
+ You would typically run the command line with xxmotif_cline() or via
+ the Python subprocess module, as described in the Biopython tutorial.
+
+ Citations:
+
+ Luehr S, Hartmann H, and Söding J. The XXmotif web server for eXhaustive,
+ weight matriX-based motif discovery in nucleotide sequences,
+ Nucleic Acids Res. 40: W104-W109 (2012).
+
+ Hartmann H, Guthoehrlein EW, Siebert M., Luehr S, and Söding J. P-value
+ based regulatory motif discovery using positional weight matrices
+ (to be published)
+
+ Last checked against version: 1.3
+ """
+
+ def __init__(self, cmd="XXmotif", **kwargs):
+ # order of parameters is the same as in XXmotif --help
+ _valid_alphabet = set("ACGTNX")
+
+ self.parameters = \
+ [
+ _Argument(["outdir", "OUTDIR"],
+ "output directory for all results",
+ filename = True,
+ is_required = True,
+ # XXmotif currently does not accept spaces in the outdir name
+ checker_function = lambda x: " " not in x),
+ _Argument(["seqfile", "SEQFILE"],
+ "file name with sequences from positive set in FASTA format",
+ filename = True,
+ is_required = True,
+ # XXmotif currently only accepts a pure filename
+ checker_function = lambda x: os.path.split(x)[0] == ""),
+
+ # Options
+ _Option(["--negSet", "negSet", "negset", "NEGSET"],
+ "sequence set which has to be used as a reference set",
+ filename = True,
+ equate = False),
+ _Switch(["--zoops", "zoops", "ZOOPS"],
+ "use zero-or-one occurrence per sequence model (DEFAULT)"),
+ _Switch(["--mops", "mops", "MOPS"],
+ "use multiple occurrence per sequence model"),
+ _Switch(["--oops", "oops", "OOPS"],
+ "use one occurrence per sequence model"),
+ _Switch(["--revcomp", "revcomp", "REVCOMP"],
+ "search in reverse complement of sequences as well (DEFAULT: NO)"),
+ _Option(["--background-model-order", "background-model-order", "BACKGROUND-MODEL-ORDER"],
+ "order of background distribution (DEFAULT: 2, 8(--negset) )",
+ checker_function = lambda x: isinstance(x, int),
+ equate = False),
+ _Option(["--pseudo", "pseudo", "PSEUDO"],
+ "percentage of pseudocounts used (DEFAULT: 10)",
+ checker_function = lambda x: isinstance(x, int),
+ equate = False),
+ _Option(["-g", "--gaps", "gaps", "GAPS"],
+ "maximum number of gaps used for start seeds [0-3] (DEFAULT: 0)",
+ checker_function = lambda x: x in [0-3],
+ equate = False),
+ _Option(["--type", "type", "TYPE"],
+ "defines what kind of start seeds are used (DEFAULT: ALL)"
+ "possible types: ALL, FIVEMERS, PALINDROME, TANDEM, NOPALINDROME, NOTANDEM",
+ checker_function = lambda x: x in ["ALL", "all",
+ "FIVEMERS", "fivemers",
+ "PALINDROME", "palindrome",
+ "TANDEM", "tandem",
+ "NOPALINDROME", "nopalindrome",
+ "NOTANDEM", "notandem"],
+ equate = False),
+ _Option(["--merge-motif-threshold", "merge-motif-threshold", "MERGE-MOTIF-THRESHOLD"],
+ "defines the similarity threshold for merging motifs (DEFAULT: HIGH)"
+ "possible modes: LOW, MEDIUM, HIGH",
+ checker_function = lambda x: x in ["LOW", "low",
+ "MEDIUM", "medium",
+ "HIGH", "high"],
+ equate = False),
+ _Switch(["--no-pwm-length-optimization", "no-pwm-length-optimization", "NO-PWM-LENGTH-OPTIMIZATION"],
+ "do not optimize length during iterations (runtime advantages)"),
+ _Option(["--max-match-positions", "max-match-positions", "MAX-MATCH-POSITIONS"],
+ "max number of positions per motif (DEFAULT: 17, higher values will lead to very long runtimes)",
+ checker_function = lambda x: isinstance(x, int),
+ equate = False),
+ _Switch(["--batch", "batch", "BATCH"],
+ "suppress progress bars (reduce output size for batch jobs)"),
+ _Option(["--maxPosSetSize", "maxPosSetSize", "maxpossetsize", "MAXPOSSETSIZE"],
+ "maximum number of sequences from the positive set used [DEFAULT: all]",
+ checker_function = lambda x: isinstance(x, int),
+ equate = False),
+ # does not make sense in biopython
+ #_Switch(["--help", "help", "HELP"],
+ # "print this help page"),
+ _Option(["--trackedMotif", "trackedMotif", "trackedmotif", "TRACKEDMOTIF"],
+ "inspect extensions and refinement of a given seed (DEFAULT: not used)",
+ checker_function = lambda x: any((c in _valid_alphabet) for c in x),
+ equate = False),
+
+ # Using conservation information
+ _Option(["--format", "format", "FORMAT"],
+ "defines what kind of format the input sequences have (DEFAULT: FASTA)",
+ checker_function = lambda x: x in ["FASTA", "fasta",
+ "MFASTA", "mfasta"],
+ equate = False),
+ _Option(["--maxMultipleSequences", "maxMultipleSequences", "maxmultiplesequences", "MAXMULTIPLESEQUENCES"],
+ "maximum number of sequences used in an alignment [DEFAULT: all]",
+ checker_function = lambda x: isinstance(x, int),
+ equate = False),
+
+ # Using localization information
+ _Switch(["--localization", "localization", "LOCALIZATION"],
+ "use localization information to calculate combined P-values"
+ "(sequences should have all the same length)"),
+ _Option(["--downstream", "downstream", "DOWNSTREAM"],
+ "number of residues in positive set downstream of anchor point (DEFAULT: 0)",
+ checker_function = lambda x: isinstance(x, int),
+ equate = False),
+
+ # Start with self defined motif
+ _Option(["-m", "--startMotif", "startMotif", "startmotif", "STARTMOTIF"],
+ "Start motif (IUPAC characters)",
+ checker_function = lambda x: any((c in _valid_alphabet) for c in x),
+ equate = False),
+ _Option(["-p", "--profileFile", "profileFile", "profilefile", "PROFILEFILE"],
+ "profile file",
+ filename = True,
+ equate = False),
+ _Option(["--startRegion", "startRegion", "startregion", "STARTREGION"],
+ "expected start position for motif occurrences relative to anchor point (--localization)",
+ checker_function = lambda x: isinstance(x, int),
+ equate = False),
+ _Option(["--endRegion", "endRegion", "endregion", "ENDREGION"],
+ "expected end position for motif occurrences relative to anchor point (--localization)",
+ checker_function = lambda x: isinstance(x, int),
+ equate = False),
+ ]
+ AbstractCommandline.__init__(self, cmd, **kwargs)
+
+
+def _test():
+ """Run the module's doctests (PRIVATE)."""
+ print "Running XXmotif doctests..."
+ import doctest
+ doctest.testmod()
+ print "Done"
+
+
+# Run the module's doctests when this file is executed directly as a script.
+if __name__ == "__main__":
+ _test()
View
1  Bio/Motif/Applications/__init__.py
@@ -6,3 +6,4 @@
"""Motif command line tool wrappers."""
from _AlignAce import AlignAceCommandline
from _AlignAce import CompareAceCommandline
+from _XXmotif import XXmotifCommandline
View
1  Tests/run_tests.py
@@ -83,6 +83,7 @@ def is_numpy():
"Bio.KEGG.Compound",
"Bio.KEGG.Enzyme",
"Bio.Motif",
+ "Bio.Motif.Applications._XXmotif",
"Bio.pairwise2",
"Bio.Phylo.Applications._Raxml",
"Bio.Seq",
View
190 Tests/test_XXmotif_tool.py
@@ -0,0 +1,190 @@
+# Copyright 2012 by Christian Brueffer. All rights reserved.
+#
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+import glob
+import os
+import shutil
+import sys
+import unittest
+from Bio import MissingExternalDependencyError
+from Bio import Motif
+from Bio import SeqIO
+from Bio.Application import ApplicationError
+from Bio.Motif.Applications import XXmotifCommandline
+
+
+# Try to avoid problems when the OS is in another language
+os.environ['LANG'] = 'C'
+
+xxmotif_exe = None
+if sys.platform == "win32":
+ # TODO
+ raise MissingExternalDependencyError("Testing this on Windows is not implemented yet")
+else:
+ import commands
+ output = commands.getoutput("XXmotif")
+ if output.find("== XXmotif version") != -1:
+ xxmotif_exe = "XXmotif"
+
+if not xxmotif_exe:
+ raise MissingExternalDependencyError(\
+ "Install XXmotif if you want to use XXmotif from Biopython.")
+
+
+class XXmotifTestCase(unittest.TestCase):
+
+ def setUp(self):
+ self.out_dir = "xxmotif-temp"
+ self.files_to_clean = set()
+
+ def tearDown(self):
+ for filename in self.files_to_clean:
+ if os.path.isfile(filename):
+ os.remove(filename)
+
+ if os.path.isdir(self.out_dir):
+ shutil.rmtree(self.out_dir)
+
+ def standard_test_procedure(self, cline):
+ """Standard testing procedure used by all tests."""
+ output, error = cline()
+
+ self.assertTrue(os.path.isdir(self.out_dir))
+ self.assertTrue(glob.glob(os.path.join(self.out_dir, "*.meme")))
+ self.assertTrue(glob.glob(os.path.join(self.out_dir, "*_MotifFile.txt")))
+ self.assertTrue(glob.glob(os.path.join(self.out_dir, "*_Pvals.txt")))
+ self.assertTrue(glob.glob(os.path.join(self.out_dir, "*.pwm")))
+ self.assertTrue(glob.glob(os.path.join(self.out_dir, "*_sequence.txt")))
+
+ # TODO
+ # Parsing the MEME file would be nice, but unfortunately the
+ # MEME parser does not like what XXmotif produces yet.
+
+ def copy_and_mark_for_cleanup(self, path):
+ """
+ XXmotif currently only handles a canonical filename as input, no paths.
+ This method copies the specified file in the specified path to the
+ current working directory and marks it for removal.
+ """
+ filename = os.path.split(path)[1]
+
+ shutil.copyfile(path, filename)
+ self.add_file_to_clean(filename)
+
+ return filename
+
+ def add_file_to_clean(self, filename):
+ """Adds a file for deferred removal by the tearDown routine."""
+ self.files_to_clean.add(filename)
+
+
+class XXmotifTestErrorConditions(XXmotifTestCase):
+
+ def test_empty_file(self):
+ """Test a non-existing input file."""
+ input_file = "does_not_exist.fasta"
+ self.assertFalse(os.path.isfile(input_file))
+
+ cline = XXmotifCommandline(outdir = self.out_dir,
+ seqfile = input_file)
+
+ try:
+ stdout, stderr = cline()
+ except ApplicationError, err:
+ self.assertEqual(err.returncode, 255)
+ else:
+ self.fail("Should have failed, returned:\n%s\n%s" % (stdout, stderr))
+
+ def test_invalid_format(self):
+ """Test an input file in an invalid format."""
+ input_file = self.copy_and_mark_for_cleanup("Medline/pubmed_result1.txt")
+
+ cline = XXmotifCommandline(outdir = self.out_dir,
+ seqfile = input_file)
+
+ try:
+ stdout, stderr = cline()
+ except ApplicationError, err:
+ self.assertEqual(err.returncode, 255)
+ else:
+ self.fail("Should have failed, returned:\n%s\n%s" % (stdout, stderr))
+
+ def test_output_directory_with_space(self):
+ """Test an output directory containing a space."""
+ temp_out_dir = "xxmotif test"
+ input_file = self.copy_and_mark_for_cleanup("Fasta/f002")
+
+ try:
+ cline = XXmotifCommandline(outdir = temp_out_dir,
+ seqfile = input_file)
+ except ValueError:
+ pass
+ else:
+ self.fail("expected ValueError")
+
+
+class XXmotifTestNormalConditions(XXmotifTestCase):
+
+ def test_fasta_one_sequence(self):
+ """Test a fasta input file containing only one sequence."""
+ input_file = "seq.fasta"
+ handle = open(input_file, "w")
+ record = list(SeqIO.parse("Registry/seqs.fasta", "fasta"))[0]
+ SeqIO.write(record, handle, "fasta")
+ handle.close()
+ del handle, record
+
+ cline = XXmotifCommandline(outdir = self.out_dir,
+ seqfile = input_file)
+
+ self.add_file_to_clean(input_file)
+ self.standard_test_procedure(cline)
+
+ def test_properties(self):
+ """Test setting options via properties."""
+ input_file = self.copy_and_mark_for_cleanup("Fasta/f002")
+
+ cline = XXmotifCommandline(outdir = self.out_dir,
+ seqfile = input_file)
+
+ cline.revcomp = True
+ cline.pseudo = 20
+ cline.startmotif = "ACGGGT"
+
+ self.standard_test_procedure(cline)
+
+ def test_large_fasta_file(self):
+ """Test a large fasta input file."""
+ input_file = "temp_b_nuc.fasta"
+ handle = open(input_file, "w")
+ records = list(SeqIO.parse("NBRF/B_nuc.pir", "pir"))
+ SeqIO.write(records, handle, "fasta")
+ handle.close()
+ del handle, records
+
+ cline = XXmotifCommandline(outdir = self.out_dir,
+ seqfile = input_file)
+
+ self.add_file_to_clean(input_file)
+ self.standard_test_procedure(cline)
+
+ def test_input_filename_with_space(self):
+ """Test an input filename containing a space."""
+ input_file = "temp horses.fasta"
+ handle = open(input_file, "w")
+ SeqIO.write(SeqIO.parse("Phylip/hennigian.phy", "phylip"), handle, "fasta")
+ handle.close()
+
+ cline = XXmotifCommandline(outdir = self.out_dir,
+ seqfile = input_file)
+
+ self.add_file_to_clean(input_file)
+ self.standard_test_procedure(cline)
+
+
+# When executed directly, run this module's test cases with verbose output.
+if __name__ == "__main__":
+ runner = unittest.TextTestRunner(verbosity = 2)
+ unittest.main(testRunner = runner)
Please sign in to comment.
Something went wrong with that request. Please try again.