Permalink
Browse files

Making the different parsers work uniformly

  • Loading branch information...
1 parent 3aedeba commit 4e68a7ea8d8c572eaa48d4018b4d7b8e14d80afa Michiel de Hoon committed Jan 3, 2013
Showing with 433 additions and 409 deletions.
  1. +12 −3 Bio/Motif/AlignAce.py
  2. +4 −4 Bio/Motif/MAST.py
  3. +23 −14 Bio/Motif/MEME.py
  4. +15 −6 Bio/Motif/TRANSFAC.py
  5. +42 −49 Bio/Motif/__init__.py
  6. +5 −1 Doc/Tutorial.tex
  7. +332 −332 Tests/test_Motif.py
View
@@ -11,11 +11,20 @@
from Bio.Seq import Seq
-class Record(object):
+class Record(list):
def __init__(self):
- self.motifs=[]
self.parameters = None
+ @property
+ def motifs(self):
+ import warnings
+ warnings.warn("""\
+The .motifs attribute is now obsolete, and will be deprecated and removed
+in a future release of Biopython. This class now inherits from list, so
+instead of record.motifs[i], please use record[i].
+""", PendingDeprecationWarning)
+ return self
+
def read(handle):
"""read(handle)"""
@@ -50,7 +59,7 @@ def read(handle):
motif.score = float(line.split()[-1])
motif.number = number
motif.mask = mask
- record.motifs.append(motif)
+ record.append(motif)
elif len(line.split("\t"))==4:
seq = Seq(line.split("\t")[0],IUPAC.unambiguous_dna)
instances.append(seq)
View
@@ -5,17 +5,17 @@
# as part of this package.
from Bio.Alphabet import IUPAC
-from Bio.Motif.MEME import MEMEMotif
+from Bio.Motif import MEME
class Record(object):
"""The class for holding the results from a MAST run.
A MAST.Record holds data about matches between motifs and sequences.
- The motifs held by the Record are objects of the class MEMEMotif.
+ The motifs held by the Record are objects of the class MEME.Motif.
Methods:
- get_motif_by_name (motif_name): returns a MEMEMotif with the given
+ get_motif_by_name (motif_name): returns a MEME.Motif with the given
name.
"""
@@ -82,7 +82,7 @@ def __read_database_and_motifs(record, handle):
if not line.strip():
break
words = line.strip().split()
- motif = MEMEMotif(record.alphabet)
+ motif = MEME.Motif(record.alphabet)
motif.name = words[0]
motif.length = int(words[1])
# words[2] contains the best possible match
View
@@ -6,7 +6,7 @@
from Bio.Alphabet import IUPAC
from Bio import Seq
-from Bio.Motif import Motif
+from Bio.Motif import Motif as BaseMotif
def read(handle):
@@ -17,12 +17,12 @@ def read(handle):
>>> f = open("meme.output.txt")
>>> from Bio.Motif import MEME
>>> record = MEME.read(f)
- >>> for motif in record.motifs:
+ >>> for motif in record:
... for instance in motif.instances:
... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
"""
- record = MEMERecord()
+ record = Record()
__read_version(record, handle)
__read_datafile(record, handle)
__read_alphabet(record, handle)
@@ -39,12 +39,12 @@ def read(handle):
length, num_occurrences, evalue = __read_motif_statistics(line)
name = __read_motif_name(handle)
instances = __read_motif_sequences(handle, name, alphabet, length, revcomp)
- motif = MEMEMotif(alphabet, instances)
+ motif = Motif(alphabet, instances)
motif.length = length
motif.num_occurrences = num_occurrences
motif.evalue = evalue
motif.name = name
- record.motifs.append(motif)
+ record.append(motif)
__skip_unused_lines(handle)
try:
line = handle.next()
@@ -57,7 +57,7 @@ def read(handle):
return record
-class MEMEMotif(Motif):
+class Motif(BaseMotif):
"""A subclass of Motif used in parsing MEME (and MAST) output.
This subclass defines functions and data specific to MEME motifs.
@@ -68,7 +68,7 @@ class MEMEMotif(Motif):
(DEPRECATION PENDING)
"""
def __init__(self, alphabet=None, instances=None):
- Motif.__init__(self, alphabet, instances)
+ BaseMotif.__init__(self, alphabet, instances)
self.evalue = 0.0
def _numoccurrences(self, number):
@@ -91,7 +91,7 @@ def add_instance_from_values(self, name = 'default', pvalue = 1, sequence = 'ATA
import warnings
warnings.warn("This function is now obsolete, and will be deprecated and removed "
"in a future release of Biopython.", PendingDeprecationWarning)
- inst = MEMEInstance(sequence,self.alphabet)
+ inst = Instance(sequence,self.alphabet)
inst._pvalue(pvalue)
inst._seqname(name)
inst._start(start)
@@ -115,7 +115,7 @@ def _evalue(self, evalue):
self.evalue = evalue
-class MEMEInstance(Seq.Seq):
+class Instance(Seq.Seq):
"""A class describing the instances of a MEME motif, and the data thereof.
"""
def __init__(self,*args,**kwds):
@@ -173,23 +173,32 @@ def _length(self, length):
self.length = length
-class MEMERecord(object):
+class Record(list):
"""A class for holding the results of a MEME run.
- A MEMERecord is an object that holds the results from running
+ A Record is an object that holds the results from running
MEME. It implements no methods of its own.
"""
def __init__(self):
"""__init__ (self)"""
- self.motifs = []
self.version = ""
self.datafile = ""
self.command = ""
self.alphabet = None
self.sequences = []
+ @property
+ def motifs(self):
+ import warnings
+ warnings.warn("""\
+The .motifs attribute is now obsolete, and will be deprecated and removed
+in a future release of Biopython. This class now inherits from list, so
+instead of record.motifs[i], please use record[i].
+""", PendingDeprecationWarning)
+ return self
+
def get_motif_by_name(self, name):
- for m in self.motifs:
+ for m in self:
if m.name == name:
return m
@@ -335,7 +344,7 @@ def __read_motif_sequences(handle, motif_name, alphabet, length, revcomp):
strand = '+'
sequence = words[4]
assert len(sequence)==length
- instance = MEMEInstance(sequence, alphabet)
+ instance = Instance(sequence, alphabet)
instance.motif_name = motif_name
instance.sequence_name = words[0]
instance.start = int(words[1])
View
@@ -81,18 +81,27 @@ def __str__(self):
return format(self, "transfac")
-class Record(object):
+class Record(list):
"""A Bio.Motif.TRANSFAC.Record stores the information in a TRANSFAC
-matrix table.
+matrix table. The record inherits from a list containing the individual
+motifs.
Attributes:
o version: The version number, corresponding to the 'VV' field
in the TRANSFAC file;
- o motifs: The list of motifs.
"""
def __init__(self):
self.version = None
- self.motifs = []
+
+ @property
+ def motifs(self):
+ import warnings
+ warnings.warn("""\
+The .motifs attribute is now obsolete, and will be deprecated and removed
+in a future release of Biopython. This class now inherits from list, so
+instead of record.motifs[i], please use record[i].
+""", PendingDeprecationWarning)
+ return self
def __str__(self):
blocks = []
@@ -103,7 +112,7 @@ def __str__(self):
//
""" % self.version
blocks.append(block)
- for motif in self.motifs:
+ for motif in self:
block = str(motif)
blocks.append(block)
text = "".join(blocks)
@@ -119,7 +128,7 @@ def read(handle):
line = line.strip()
if line=='//':
if motif is not None:
- record.motifs.append(motif)
+ record.append(motif)
motif = None
status = None
elif line=='XX':
View
@@ -10,23 +10,11 @@
It also includes functionality for parsing AlignACE and MEME programs.
"""
from Bio.Motif._Motif import Motif
-from Bio.Motif.AlignAce import read as _AlignAce_read
-from Bio.Motif.MEME import read as _MEME_read
-from Bio.Motif import Jaspar
-from Bio.Motif.Thresholds import ScoreDistribution
-from Bio.Alphabet import IUPAC
-from Bio.Seq import Seq
-
-
-def _from_pfm(handle):
- return Motif()._from_jaspar_pfm(handle)
-
-
-def _from_sites(handle):
- return Motif()._from_jaspar_sites(handle)
def create(instances, alphabet=None):
+ from Bio.Alphabet import IUPAC
+ from Bio.Seq import Seq
for instance in instances:
try:
a = instance.alphabet
@@ -52,13 +40,16 @@ def parse(handle,format):
"""Parses an output file of motif finding programs.
Currently supported formats:
- - AlignAce
- - MEME
-
- You can also use single-motif formats, although the Bio.Motif.read()
- function is simpler to use in this situation.
- - jaspar-pfm
- - jaspar-sites
+ - AlignAce: AlignAce output file format
+ - MEME: MEME output file motif
+ - TRANSFAC: TRANSFAC database file format
+ - pfm: JASPAR-style position-frequency matrix
+ - sites: JASPAR-style sites file
+ - jaspar-pfm: JASPAR-style position-frequency matrix [DEPRECATED]
+ - jaspar-sites: JASPAR-style sites file [DEPRECATED]
+ As files in the pfm and sites formats contain only a single motif,
+ it is easier to use Bio.Motif.read() instead of Bio.Motif.parse()
+ for those.
For example:
@@ -82,30 +73,38 @@ def parse(handle,format):
GAGGCCGGGGAT
CGACTCGTGCTTAGAAGG
"""
- if format in ('pfm', 'sites'):
- yield Jaspar.read(handle, format)
- elif format=="AlignAce":
- record = _AlignAce_read(handle)
- for m in record.motifs:
- yield m
+ if format=="AlignAce":
+ from Bio.Motif import AlignAce
+ record = AlignAce.read(handle)
+ return record
elif format=="MEME":
- record = _MEME_read(handle)
- for m in record.motifs:
- yield m
+ from Bio.Motif import MEME
+ record = MEME.read(handle)
+ return record
+ elif format=="TRANSFAC":
+ from Bio.Motif import TRANSFAC
+ record = TRANSFAC.read(handle)
+ return record
+ elif format in ('pfm', 'sites'):
+ from Bio.Motif import Jaspar
+ motif = Jaspar.read(handle, format)
elif format=="jaspar-pfm":
- yield _from_pfm(handle)
+ motif = Motif()._from_jaspar_pfm(handle)
elif format=="jaspar-sites":
- yield _from_sites(handle)
+ motif = Motif()._from_jaspar_sites(handle)
else:
raise ValueError("Unknown format %s" % format)
+ # Treat the single-motif formats
+ motifs = [motif]
+ return motifs
def read(handle,format):
"""Reads a motif from a handle using a specified file-format.
This supports the same formats as Bio.Motif.parse(), but
- only for files containing exactly one record. For example,
- reading a pfm file:
+ only for files containing exactly one motif. For example,
+ reading a JASPAR-style pfm file:
>>> from Bio import Motif
>>> motif = Motif.read(open("Motif/SRF.pfm"), "pfm")
@@ -128,32 +127,26 @@ def read(handle,format):
...
ValueError: More than one motif found in handle
- If however you want the first record from a file containing
- multiple records this function would raise an exception (as
+ If however you want the first motif from a file containing
+ multiple motifs this function would raise an exception (as
shown in the example above). Instead use:
>>> from Bio import Motif
- >>> motif = Motif.parse(open("Motif/alignace.out"),"AlignAce").next()
+ >>> motifs = Motif.parse(open("Motif/alignace.out"),"AlignAce")
+ >>> motif = motifs[0]
>>> motif.consensus
Seq('TCTACGATTGAG', IUPACUnambiguousDNA())
Use the Bio.Motif.parse(handle, format) function if you want
to read multiple records from the handle.
"""
- iterator = parse(handle, format)
- try:
- first = iterator.next()
- except StopIteration:
- first = None
- if first is None:
+ motifs = parse(handle, format)
+ if len(motifs)==0:
raise ValueError("No motifs found in handle")
- try:
- second = iterator.next()
- except StopIteration:
- second = None
- if second is not None:
+ if len(motifs) > 1:
raise ValueError("More than one motif found in handle")
- return first
+ motif = motifs[0]
+ return motif
if __name__ == "__main__":
View
@@ -80,7 +80,7 @@
\author{Jeff Chang, Brad Chapman, Iddo Friedberg, Thomas Hamelryck, \\
Michiel de Hoon, Peter Cock, Tiago Antao, Eric Talevich, Bartek Wilczy\'{n}ski}
-\date{Last Update -- 29 December 2012 (Biopython 1.60+)}
+\date{Last Update -- 3 January 2013 (Biopython 1.60+)}
%Hack to get the logo at the start of the HTML front page:
%(hopefully this isn't going to be too wide for most people)
@@ -11026,6 +11026,10 @@ \subsubsection*{JASPAR}
GCCCATATATGG
\end{verbatim}
+\subsubsection*{MEME}
+
+\subsubsection*{TRANSFAC}
+
\subsection{Writing motifs}
Speaking of exporting, let's look at export functions. We can export to fasta:
%cont-doctest
Oops, something went wrong.

0 comments on commit 4e68a7e

Please sign in to comment.