Added some documentation on MEME file parsing

biopython · Jan 3, 2013 · 5c1c321 · 5c1c321
1 parent 4e68a7e
commit 5c1c321
Show file tree

Hide file tree

Showing 2 changed files with 135 additions and 17 deletions.
diff --git a/Doc/Tutorial.tex b/Doc/Tutorial.tex
@@ -11028,6 +11028,126 @@ \subsubsection*{JASPAR}
 
 \subsubsection*{MEME}
 
+MEME \cite{bailey1994} is a tool for discovering motifs in a group of related DNA or protein sequences. It takes as input a group of DNA or protein sequences and outputs as many motifs as requested. Therefore, in contrast to JASPAR files, MEME output files typically contain multiple motifs. This is an example
+
+At the top of an output file generated by MEME shows some background information about the MEME and the version of MEME used:
+\begin{verbatim}
+********************************************************************************
+MEME - Motif discovery tool
+********************************************************************************
+MEME version 3.0 (Release date: 2004/08/18 09:07:01)
+...
+\end{verbatim}
+Further down, the input set of training sequences is recapitulated:
+\begin{verbatim}
+********************************************************************************
+TRAINING SET
+********************************************************************************
+DATAFILE= INO_up800.s
+ALPHABET= ACGT
+Sequence name            Weight Length  Sequence name            Weight Length
+-------------            ------ ------  -------------            ------ ------
+CHO1                     1.0000    800  CHO2                     1.0000    800
+FAS1                     1.0000    800  FAS2                     1.0000    800
+ACC1                     1.0000    800  INO1                     1.0000    800
+OPI3                     1.0000    800
+********************************************************************************
+\end{verbatim}
+and the exact command line that was used:
+\begin{verbatim}
+********************************************************************************
+COMMAND LINE SUMMARY
+********************************************************************************
+This information can also be useful in the event you wish to report a
+problem with the MEME software.
+
+command: meme -mod oops -dna -revcomp -nmotifs 2 -bfile yeast.nc.6.freq INO_up800.s
+...
+\end{verbatim}
+Next is detailed information on each motif that was found:
+\begin{verbatim}
+********************************************************************************
+MOTIF  1        width =   12   sites =   7   llr = 95   E-value = 2.0e-001
+********************************************************************************
+--------------------------------------------------------------------------------
+        Motif 1 Description
+--------------------------------------------------------------------------------
+Simplified        A  :::9:a::::3:
+pos.-specific     C  ::a:9:11691a
+probability       G  ::::1::94:4:
+matrix            T  aa:1::9::11:
+\end{verbatim}
+To parse this file (stored as \verb+meme.dna.oops.txt+), use
+%cont-doctest
+\begin{verbatim}
+>>> handle = open("meme.dna.oops.txt")
+>>> record = Motif.parse(handle, "MEME")
+>>> handle.close()
+\end{verbatim}
+The \verb+Motif.parse+ command reads the complete file directly, so you can
+close the file after calling \verb+Motif.parse+.
+The header information is stored in attributes:
+%cont-doctest
+\begin{verbatim}
+>>> record.version
+'3.0'
+>>> record.datafile
+'INO_up800.s'
+>>> record.command
+'meme -mod oops -dna -revcomp -nmotifs 2 -bfile yeast.nc.6.freq INO_up800.s'
+>>> record.alphabet
+IUPACUnambiguousDNA()
+>>> record.sequences
+['CHO1', 'CHO2', 'FAS1', 'FAS2', 'ACC1', 'INO1', 'OPI3']
+\end{verbatim}
+The record is an object of the \verb+Bio.Motif.MEME.Record+ class.
+The class inherits from list, and you can think of \verb+record+ as a list of Motif objects:
+%cont-doctest
+\begin{verbatim}
+>>> len(record)
+2
+>>> motif = record[0]
+>>> print motif.consensus
+TTCACATGCCGC
+>>> print motif.degenerate_consensus
+TTCACATGSCNC
+\end{verbatim}
+In addition to these generic motif attributes, each motif also stores its
+specific information as calculated by MEME. For example,
+%cont-doctest
+\begin{verbatim}
+>>> motif.num_occurrences
+7
+>>> motif.name
+'Motif 1'
+>>> motif.length
+12
+>>> evalue = motif.evalue
+>>> print "%3.1g" % evalue
+0.2
+\end{verbatim}
+Each motif has an attribute \verb+.instances+ with the sequence instances in which the motif was found, providing some information on each instance
+%cont-doctest
+\begin{verbatim}
+>>> len(motif.instances)
+7
+>>> motif.instances[0]
+Instance('TTCACATGCCGC', IUPACUnambiguousDNA())
+>>> motif.instances[0].motif_name
+'Motif 1'
+>>> motif.instances[0].sequence_name
+'INO1'
+>>> motif.instances[0].start
+620
+>>> motif.instances[0].strand
+'-'
+>>> motif.instances[0].length
+12
+>>> pvalue = motif.instances[0].pvalue
+>>> print "%5.3g" % pvalue
+1.85e-08
+\end{verbatim}
+
 \subsubsection*{TRANSFAC}
 
 \subsection{Writing motifs}
@@ -16586,6 +16706,8 @@ \subsection{Creating a handle from a string}
 Athel Cornish-Bowden: ``Nomenclature for incompletely specified bases in nucleic acid sequences: Recommendations 1984.'' \textit{Nucleic Acids Research} {\bf 13} (9): 3021--3030 (1985). \href{http://dx.doi.org/10.1093/nar/13.9.3021}{doi:10.1093/nar/13.9.3021}
 \bibitem{cavener1987}
 Douglas R. Cavener: ``Comparison of the consensus sequence flanking translational start sites in Drosophila and vertebrates.'' \textit{Nucleic Acids Research} {\bf 15} (4): 1353--1361 (1987). \href{http://dx.doi.org/10.1093/nar/15.4.1353}{doi:10.1093/nar/15.4.1353}
+\bibitem{bailey1994}
+Timothy L. Bailey and Charles Elkan: ``Fitting a mixture model by expectation maximization to discover motifs in biopolymers'', \textit{Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology} 28--36. AAAI Press, Menlo Park, California (1994).
 \bibitem{dehoon2004}
 Michiel J. L. de Hoon, Seiya Imoto, John Nolan, Satoru Miyano: ``Open source clustering software''. \textit{Bioinformatics} {\bf 20} (9): 1453--1454 (2004). \href{http://dx.doi.org/10.1093/bioinformatics/bth078}{doi:10.1093/bioinformatics/bth078}
 \bibitem{eisen1998}

diff --git a/Tests/test_Motif.py b/Tests/test_Motif.py
@@ -443,9 +443,8 @@ def test_meme_parser_1(self):
         """Test if Motif can parse MEME output files (first test)
         """
         from Bio.Alphabet import IUPAC
-        from Bio.Motif import MEME
         handle = open("Motif/meme.out")
-        record = MEME.read(handle)
+        record = Motif.parse(handle, 'MEME')
         self.assertEqual(record.version, '3.5.7')
         self.assertEqual(record.datafile, 'test.fa')
         self.assertEqual(record.alphabet, IUPAC.unambiguous_dna)
@@ -554,9 +553,8 @@ def test_meme_parser_2(self):
         """Test if Motif can parse MEME output files (second test)
         """
         from Bio.Alphabet import IUPAC
-        from Bio.Motif import MEME
         handle = open("Motif/meme.dna.oops.txt")
-        record = MEME.read(handle)
+        record = Motif.parse(handle, 'MEME')
         self.assertEqual(record.version, '3.0')
         self.assertEqual(record.datafile, 'INO_up800.s')
         self.assertEqual(record.alphabet, IUPAC.unambiguous_dna)
@@ -569,8 +567,8 @@ def test_meme_parser_2(self):
         self.assertEqual(record.sequences[5], 'INO1')
         self.assertEqual(record.sequences[6], 'OPI3')
         self.assertEqual(record.command, 'meme -mod oops -dna -revcomp -nmotifs 2 -bfile yeast.nc.6.freq INO_up800.s')
-        self.assertEqual(len(record.motifs), 2)
+        self.assertEqual(len(record), 2)
-        motif = record.motifs[0]
+        motif = record[0]
         self.assertEqual(motif.num_occurrences, 7)
         self.assertAlmostEqual(motif.evalue, 0.2)
         self.assertEqual(motif.alphabet, IUPAC.unambiguous_dna)
@@ -618,7 +616,7 @@ def test_meme_parser_2(self):
         self.assertEqual(str(motif.instances[4]), "TTCACACGGCAC")
         self.assertEqual(str(motif.instances[5]), "TTCACATGCTAC")
         self.assertEqual(str(motif.instances[6]), "TTCAGATCGCTC")
-        motif = record.motifs[1]
+        motif = record[1]
         self.assertEqual(motif.num_occurrences, 7)
         self.assertAlmostEqual(motif.evalue, 110)
         self.assertEqual(motif.alphabet, IUPAC.unambiguous_dna)
@@ -672,9 +670,8 @@ def test_meme_parser_3(self):
         """Test if Motif can parse MEME output files (third test)
         """
         from Bio.Alphabet import IUPAC
-        from Bio.Motif import MEME
         handle = open("Motif/meme.protein.oops.txt")
-        record = MEME.read(handle)
+        record = Motif.parse(handle, 'MEME')
         self.assertEqual(record.version, '3.0')
         self.assertEqual(record.datafile, 'adh.s')
         self.assertEqual(record.alphabet, IUPAC.protein)
@@ -713,8 +710,8 @@ def test_meme_parser_3(self):
         self.assertEqual(record.sequences[31], "RFBB_NEIGO")
         self.assertEqual(record.sequences[32], "YURA_MYXXA")
         self.assertEqual(record.command, 'meme adh.s -mod oops -protein -nmotifs 2')
-        self.assertEqual(len(record.motifs), 2)
+        self.assertEqual(len(record), 2)
-        motif = record.motifs[0]
+        motif = record[0]
         self.assertEqual(motif.num_occurrences, 33)
         self.assertAlmostEqual(motif.evalue, 3.6e-165)
         self.assertEqual(motif.alphabet, IUPAC.protein)
@@ -918,7 +915,7 @@ def test_meme_parser_3(self):
         self.assertEqual(str(motif.instances[30]), "YGVTKIGVTVLSRIHARKLSEQRKGDKIL")
         self.assertEqual(str(motif.instances[31]), "KDSTLFGVSSLSDSLKGDFTSSALRCKEL")
         self.assertEqual(str(motif.instances[32]), "YINCVAPLRMTELCLPHLYETGSGRIVNI")
-        motif = record.motifs[1]
+        motif = record[1]
         self.assertEqual(motif.num_occurrences, 33)
         self.assertAlmostEqual(motif.evalue, 2.3e-159)
         self.assertEqual(motif.alphabet, IUPAC.protein)
@@ -1062,9 +1059,8 @@ def test_meme_parser_4(self):
         """Test if Motif can parse MEME output files (fourth test)
         """
         from Bio.Alphabet import IUPAC
-        from Bio.Motif import MEME
         handle = open("Motif/meme.protein.tcm.txt")
-        record = MEME.read(handle)
+        record = Motif.parse(handle, 'MEME')
         self.assertEqual(record.version, '3.0')
         self.assertEqual(record.datafile, 'farntrans5.s')
         self.assertEqual(record.alphabet, IUPAC.protein)
@@ -1075,8 +1071,8 @@ def test_meme_parser_4(self):
         self.assertEqual(record.sequences[3], "RATRABGERB")
         self.assertEqual(record.sequences[4], "CAL1_YEAST")
         self.assertEqual(record.command, 'meme farntrans5.s -mod tcm -protein -nmotifs 2')
-        self.assertEqual(len(record.motifs), 2)
+        self.assertEqual(len(record), 2)
-        motif = record.motifs[0]
+        motif = record[0]
         self.assertEqual(motif.num_occurrences, 24)
         self.assertAlmostEqual(motif.evalue, 2.2e-94)
         self.assertEqual(motif.alphabet, IUPAC.protein)
@@ -1226,7 +1222,7 @@ def test_meme_parser_4(self):
         self.assertEqual(str(motif.instances[21]), "PGLRDKPGAHSDFYHTNYCLLGLAVAESSY")
         self.assertEqual(str(motif.instances[22]), "GGFSKNDEEDADLYHSCLGSAALALIEGKF")
         self.assertEqual(str(motif.instances[23]), "HNFEYWLTEHLRLNGIYWGLTALCVLDSPE")
-        motif = record.motifs[1]
+        motif = record[1]
         self.assertEqual(motif.num_occurrences, 21)
         self.assertAlmostEqual(motif.evalue, 3.1e-19)
         self.assertEqual(motif.alphabet, IUPAC.protein)