Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Added some documentation on MEME file parsing

  • Loading branch information...
commit 5c1c32124f16d9188addff72aad7e99ca2eb47e0 1 parent 4e68a7e
Michiel de Hoon authored
Showing with 135 additions and 17 deletions.
  1. +122 −0 Doc/Tutorial.tex
  2. +13 −17 Tests/test_Motif.py
View
122 Doc/Tutorial.tex
@@ -11028,6 +11028,126 @@ \subsubsection*{JASPAR}
\subsubsection*{MEME}
+MEME \cite{bailey1994} is a tool for discovering motifs in a group of related DNA or protein sequences. It takes as input a group of DNA or protein sequences and outputs as many motifs as requested. Therefore, in contrast to JASPAR files, MEME output files typically contain multiple motifs. An example is shown below.
+
+The top of an output file generated by MEME shows some background information about MEME and the version of MEME used:
+\begin{verbatim}
+********************************************************************************
+MEME - Motif discovery tool
+********************************************************************************
+MEME version 3.0 (Release date: 2004/08/18 09:07:01)
+...
+\end{verbatim}
+Further down, the input set of training sequences is recapitulated:
+\begin{verbatim}
+********************************************************************************
+TRAINING SET
+********************************************************************************
+DATAFILE= INO_up800.s
+ALPHABET= ACGT
+Sequence name Weight Length Sequence name Weight Length
+------------- ------ ------ ------------- ------ ------
+CHO1 1.0000 800 CHO2 1.0000 800
+FAS1 1.0000 800 FAS2 1.0000 800
+ACC1 1.0000 800 INO1 1.0000 800
+OPI3 1.0000 800
+********************************************************************************
+\end{verbatim}
+and the exact command line that was used:
+\begin{verbatim}
+********************************************************************************
+COMMAND LINE SUMMARY
+********************************************************************************
+This information can also be useful in the event you wish to report a
+problem with the MEME software.
+
+command: meme -mod oops -dna -revcomp -nmotifs 2 -bfile yeast.nc.6.freq INO_up800.s
+...
+\end{verbatim}
+Next is detailed information on each motif that was found:
+\begin{verbatim}
+********************************************************************************
+MOTIF 1 width = 12 sites = 7 llr = 95 E-value = 2.0e-001
+********************************************************************************
+--------------------------------------------------------------------------------
+ Motif 1 Description
+--------------------------------------------------------------------------------
+Simplified A :::9:a::::3:
+pos.-specific C ::a:9:11691a
+probability G ::::1::94:4:
+matrix T aa:1::9::11:
+\end{verbatim}
+To parse this file (stored as \verb+meme.dna.oops.txt+), use
+%cont-doctest
+\begin{verbatim}
+>>> handle = open("meme.dna.oops.txt")
+>>> record = Motif.parse(handle, "MEME")
+>>> handle.close()
+\end{verbatim}
+The \verb+Motif.parse+ command reads the complete file directly, so you can
+close the file after calling \verb+Motif.parse+.
+The header information is stored in attributes:
+%cont-doctest
+\begin{verbatim}
+>>> record.version
+'3.0'
+>>> record.datafile
+'INO_up800.s'
+>>> record.command
+'meme -mod oops -dna -revcomp -nmotifs 2 -bfile yeast.nc.6.freq INO_up800.s'
+>>> record.alphabet
+IUPACUnambiguousDNA()
+>>> record.sequences
+['CHO1', 'CHO2', 'FAS1', 'FAS2', 'ACC1', 'INO1', 'OPI3']
+\end{verbatim}
+The record is an object of the \verb+Bio.Motif.MEME.Record+ class.
+The class inherits from list, and you can think of \verb+record+ as a list of Motif objects:
+%cont-doctest
+\begin{verbatim}
+>>> len(record)
+2
+>>> motif = record[0]
+>>> print motif.consensus
+TTCACATGCCGC
+>>> print motif.degenerate_consensus
+TTCACATGSCNC
+\end{verbatim}
+In addition to these generic motif attributes, each motif also stores its
+specific information as calculated by MEME. For example,
+%cont-doctest
+\begin{verbatim}
+>>> motif.num_occurrences
+7
+>>> motif.name
+'Motif 1'
+>>> motif.length
+12
+>>> evalue = motif.evalue
+>>> print "%3.1g" % evalue
+0.2
+\end{verbatim}
+Each motif has an attribute \verb+.instances+ with the sequence instances in which the motif was found, providing some information on each instance:
+%cont-doctest
+\begin{verbatim}
+>>> len(motif.instances)
+7
+>>> motif.instances[0]
+Instance('TTCACATGCCGC', IUPACUnambiguousDNA())
+>>> motif.instances[0].motif_name
+'Motif 1'
+>>> motif.instances[0].sequence_name
+'INO1'
+>>> motif.instances[0].start
+620
+>>> motif.instances[0].strand
+'-'
+>>> motif.instances[0].length
+12
+>>> pvalue = motif.instances[0].pvalue
+>>> print "%5.3g" % pvalue
+1.85e-08
+\end{verbatim}
+
\subsubsection*{TRANSFAC}
\subsection{Writing motifs}
@@ -16586,6 +16706,8 @@ \subsection{Creating a handle from a string}
Athel Cornish-Bowden: ``Nomenclature for incompletely specified bases in nucleic acid sequences: Recommendations 1984.'' \textit{Nucleic Acids Research} {\bf 13} (9): 3021--3030 (1985). \href{http://dx.doi.org/10.1093/nar/13.9.3021}{doi:10.1093/nar/13.9.3021}
\bibitem{cavener1987}
Douglas R. Cavener: ``Comparison of the consensus sequence flanking translational start sites in Drosophila and vertebrates.'' \textit{Nucleic Acids Research} {\bf 15} (4): 1353--1361 (1987). \href{http://dx.doi.org/10.1093/nar/15.4.1353}{doi:10.1093/nar/15.4.1353}
+\bibitem{bailey1994}
+Timothy L. Bailey and Charles Elkan: ``Fitting a mixture model by expectation maximization to discover motifs in biopolymers'', \textit{Proceedings of the Second International Conference on Intelligent Systems for Molecular Biology} 28--36. AAAI Press, Menlo Park, California (1994).
\bibitem{dehoon2004}
Michiel J. L. de Hoon, Seiya Imoto, John Nolan, Satoru Miyano: ``Open source clustering software''. \textit{Bioinformatics} {\bf 20} (9): 1453--1454 (2004). \href{http://dx.doi.org/10.1093/bioinformatics/bth078}{doi:10.1093/bioinformatics/bth078}
\bibitem{eisen1998}
View
30 Tests/test_Motif.py
@@ -443,9 +443,8 @@ def test_meme_parser_1(self):
"""Test if Motif can parse MEME output files (first test)
"""
from Bio.Alphabet import IUPAC
- from Bio.Motif import MEME
handle = open("Motif/meme.out")
- record = MEME.read(handle)
+ record = Motif.parse(handle, 'MEME')
self.assertEqual(record.version, '3.5.7')
self.assertEqual(record.datafile, 'test.fa')
self.assertEqual(record.alphabet, IUPAC.unambiguous_dna)
@@ -554,9 +553,8 @@ def test_meme_parser_2(self):
"""Test if Motif can parse MEME output files (second test)
"""
from Bio.Alphabet import IUPAC
- from Bio.Motif import MEME
handle = open("Motif/meme.dna.oops.txt")
- record = MEME.read(handle)
+ record = Motif.parse(handle, 'MEME')
self.assertEqual(record.version, '3.0')
self.assertEqual(record.datafile, 'INO_up800.s')
self.assertEqual(record.alphabet, IUPAC.unambiguous_dna)
@@ -569,8 +567,8 @@ def test_meme_parser_2(self):
self.assertEqual(record.sequences[5], 'INO1')
self.assertEqual(record.sequences[6], 'OPI3')
self.assertEqual(record.command, 'meme -mod oops -dna -revcomp -nmotifs 2 -bfile yeast.nc.6.freq INO_up800.s')
- self.assertEqual(len(record.motifs), 2)
- motif = record.motifs[0]
+ self.assertEqual(len(record), 2)
+ motif = record[0]
self.assertEqual(motif.num_occurrences, 7)
self.assertAlmostEqual(motif.evalue, 0.2)
self.assertEqual(motif.alphabet, IUPAC.unambiguous_dna)
@@ -618,7 +616,7 @@ def test_meme_parser_2(self):
self.assertEqual(str(motif.instances[4]), "TTCACACGGCAC")
self.assertEqual(str(motif.instances[5]), "TTCACATGCTAC")
self.assertEqual(str(motif.instances[6]), "TTCAGATCGCTC")
- motif = record.motifs[1]
+ motif = record[1]
self.assertEqual(motif.num_occurrences, 7)
self.assertAlmostEqual(motif.evalue, 110)
self.assertEqual(motif.alphabet, IUPAC.unambiguous_dna)
@@ -672,9 +670,8 @@ def test_meme_parser_3(self):
"""Test if Motif can parse MEME output files (third test)
"""
from Bio.Alphabet import IUPAC
- from Bio.Motif import MEME
handle = open("Motif/meme.protein.oops.txt")
- record = MEME.read(handle)
+ record = Motif.parse(handle, 'MEME')
self.assertEqual(record.version, '3.0')
self.assertEqual(record.datafile, 'adh.s')
self.assertEqual(record.alphabet, IUPAC.protein)
@@ -713,8 +710,8 @@ def test_meme_parser_3(self):
self.assertEqual(record.sequences[31], "RFBB_NEIGO")
self.assertEqual(record.sequences[32], "YURA_MYXXA")
self.assertEqual(record.command, 'meme adh.s -mod oops -protein -nmotifs 2')
- self.assertEqual(len(record.motifs), 2)
- motif = record.motifs[0]
+ self.assertEqual(len(record), 2)
+ motif = record[0]
self.assertEqual(motif.num_occurrences, 33)
self.assertAlmostEqual(motif.evalue, 3.6e-165)
self.assertEqual(motif.alphabet, IUPAC.protein)
@@ -918,7 +915,7 @@ def test_meme_parser_3(self):
self.assertEqual(str(motif.instances[30]), "YGVTKIGVTVLSRIHARKLSEQRKGDKIL")
self.assertEqual(str(motif.instances[31]), "KDSTLFGVSSLSDSLKGDFTSSALRCKEL")
self.assertEqual(str(motif.instances[32]), "YINCVAPLRMTELCLPHLYETGSGRIVNI")
- motif = record.motifs[1]
+ motif = record[1]
self.assertEqual(motif.num_occurrences, 33)
self.assertAlmostEqual(motif.evalue, 2.3e-159)
self.assertEqual(motif.alphabet, IUPAC.protein)
@@ -1062,9 +1059,8 @@ def test_meme_parser_4(self):
"""Test if Motif can parse MEME output files (fourth test)
"""
from Bio.Alphabet import IUPAC
- from Bio.Motif import MEME
handle = open("Motif/meme.protein.tcm.txt")
- record = MEME.read(handle)
+ record = Motif.parse(handle, 'MEME')
self.assertEqual(record.version, '3.0')
self.assertEqual(record.datafile, 'farntrans5.s')
self.assertEqual(record.alphabet, IUPAC.protein)
@@ -1075,8 +1071,8 @@ def test_meme_parser_4(self):
self.assertEqual(record.sequences[3], "RATRABGERB")
self.assertEqual(record.sequences[4], "CAL1_YEAST")
self.assertEqual(record.command, 'meme farntrans5.s -mod tcm -protein -nmotifs 2')
- self.assertEqual(len(record.motifs), 2)
- motif = record.motifs[0]
+ self.assertEqual(len(record), 2)
+ motif = record[0]
self.assertEqual(motif.num_occurrences, 24)
self.assertAlmostEqual(motif.evalue, 2.2e-94)
self.assertEqual(motif.alphabet, IUPAC.protein)
@@ -1226,7 +1222,7 @@ def test_meme_parser_4(self):
self.assertEqual(str(motif.instances[21]), "PGLRDKPGAHSDFYHTNYCLLGLAVAESSY")
self.assertEqual(str(motif.instances[22]), "GGFSKNDEEDADLYHSCLGSAALALIEGKF")
self.assertEqual(str(motif.instances[23]), "HNFEYWLTEHLRLNGIYWGLTALCVLDSPE")
- motif = record.motifs[1]
+ motif = record[1]
self.assertEqual(motif.num_occurrences, 21)
self.assertAlmostEqual(motif.evalue, 3.1e-19)
self.assertEqual(motif.alphabet, IUPAC.protein)
Please sign in to comment.
Something went wrong with that request. Please try again.