Making the different parsers work uniformly

biopython · Jan 3, 2013 · 4e68a7e · 4e68a7e
1 parent 3aedeba
commit 4e68a7e
Show file tree

Hide file tree

Showing 7 changed files with 433 additions and 409 deletions.
diff --git a/Bio/Motif/AlignAce.py b/Bio/Motif/AlignAce.py
@@ -11,11 +11,20 @@
 from Bio.Seq import Seq
 
 
-class Record(object):
+class Record(list):
     def __init__(self):
-        self.motifs=[]
         self.parameters = None
 
+    @property
+    def motifs(self):
+        import warnings
+        warnings.warn("""\
+The .motifs attribute is now obsolete, and will be deprecated and removed
+in a future release of Biopython. This class now inherits from list, so
+instead of record.motifs[i], please use record[i].
+""", PendingDeprecationWarning)
+        return self
+
 
 def read(handle):
     """read(handle)"""
@@ -50,7 +59,7 @@ def read(handle):
             motif.score = float(line.split()[-1])
             motif.number = number
             motif.mask = mask
-            record.motifs.append(motif)
+            record.append(motif)
         elif len(line.split("\t"))==4:
             seq = Seq(line.split("\t")[0],IUPAC.unambiguous_dna)
             instances.append(seq)

diff --git a/Bio/Motif/MAST.py b/Bio/Motif/MAST.py
@@ -5,17 +5,17 @@
 # as part of this package.
 
 from Bio.Alphabet import IUPAC
-from Bio.Motif.MEME import MEMEMotif
+from Bio.Motif import MEME
 
 
 class Record(object):
     """The class for holding the results from a MAST run.
 
     A MAST.Record holds data about matches between motifs and sequences.
-    The motifs held by the Record are objects of the class MEMEMotif.
+    The motifs held by the Record are objects of the class MEME.Motif.
 
     Methods:
-    get_motif_by_name (motif_name): returns a MEMEMotif with the given
+    get_motif_by_name (motif_name): returns a MEME.Motif with the given
     name.
     """
 
@@ -82,7 +82,7 @@ def __read_database_and_motifs(record, handle):
         if not line.strip():
             break
         words = line.strip().split()
-        motif = MEMEMotif(record.alphabet)
+        motif = MEME.Motif(record.alphabet)
         motif.name = words[0]
         motif.length = int(words[1])
         # words[2] contains the best possible match

diff --git a/Bio/Motif/MEME.py b/Bio/Motif/MEME.py
@@ -6,7 +6,7 @@
 
 from Bio.Alphabet import IUPAC
 from Bio import Seq
-from Bio.Motif import Motif
+from Bio.Motif import Motif as BaseMotif
 
 
 def read(handle):
@@ -17,12 +17,12 @@ def read(handle):
     >>> f = open("meme.output.txt")
     >>> from Bio.Motif import MEME
     >>> record = MEME.read(f)
-    >>> for motif in record.motifs:
+    >>> for motif in record:
     ...     for instance in motif.instances:
     ...         print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
 
     """
-    record = MEMERecord()
+    record = Record()
     __read_version(record, handle)
     __read_datafile(record, handle)
     __read_alphabet(record, handle)
@@ -39,12 +39,12 @@ def read(handle):
         length, num_occurrences, evalue = __read_motif_statistics(line)
         name = __read_motif_name(handle)
         instances = __read_motif_sequences(handle, name, alphabet, length, revcomp)
-        motif = MEMEMotif(alphabet, instances)
+        motif = Motif(alphabet, instances)
         motif.length = length
         motif.num_occurrences = num_occurrences
         motif.evalue = evalue
         motif.name = name
-        record.motifs.append(motif)
+        record.append(motif)
         __skip_unused_lines(handle)
         try:
             line = handle.next()
@@ -57,7 +57,7 @@ def read(handle):
     return record
 
 
-class MEMEMotif(Motif):
+class Motif(BaseMotif):
     """A subclass of Motif used in parsing MEME (and MAST) output.
 
     This subclass defines functions and data specific to MEME motifs.
@@ -68,7 +68,7 @@ class MEMEMotif(Motif):
        (DEPRECATION PENDING)
     """
     def __init__(self, alphabet=None, instances=None):
-        Motif.__init__(self, alphabet, instances)
+        BaseMotif.__init__(self, alphabet, instances)
         self.evalue = 0.0
 
     def _numoccurrences(self, number):
@@ -91,7 +91,7 @@ def add_instance_from_values(self, name = 'default', pvalue = 1, sequence = 'ATA
         import warnings
         warnings.warn("This function is now obsolete, and will be deprecated and removed "
                       "in a future release of Biopython.", PendingDeprecationWarning)
-        inst = MEMEInstance(sequence,self.alphabet)
+        inst = Instance(sequence,self.alphabet)
         inst._pvalue(pvalue)
         inst._seqname(name)
         inst._start(start)
@@ -115,7 +115,7 @@ def _evalue(self, evalue):
             self.evalue = evalue
 
 
-class MEMEInstance(Seq.Seq):
+class Instance(Seq.Seq):
     """A class describing the instances of a MEME motif, and the data thereof.
     """
     def __init__(self,*args,**kwds):
@@ -173,23 +173,32 @@ def _length(self, length):
         self.length = length
 
 
-class MEMERecord(object):
+class Record(list):
     """A class for holding the results of a MEME run.
 
-    A MEMERecord is an object that holds the results from running
+    A Record is an object that holds the results from running
     MEME. It implements no methods of its own.
     """
     def __init__(self):
         """__init__ (self)"""
-        self.motifs = []
         self.version = ""
         self.datafile = ""
         self.command = ""
         self.alphabet = None
         self.sequences = []
 
+    @property
+    def motifs(self):
+        import warnings
+        warnings.warn("""\
+The .motifs attribute is now obsolete, and will be deprecated and removed
+in a future release of Biopython. This class now inherits from list, so
+instead of record.motifs[i], please use record[i].
+""", PendingDeprecationWarning)
+        return self
+
     def get_motif_by_name(self, name):
-        for m in self.motifs:
+        for m in self:
             if m.name == name:
                 return m
 
@@ -335,7 +344,7 @@ def __read_motif_sequences(handle, motif_name, alphabet, length, revcomp):
             strand = '+'
         sequence = words[4]
         assert len(sequence)==length
-        instance = MEMEInstance(sequence, alphabet)
+        instance = Instance(sequence, alphabet)
         instance.motif_name = motif_name
         instance.sequence_name = words[0]
         instance.start = int(words[1])

diff --git a/Bio/Motif/TRANSFAC.py b/Bio/Motif/TRANSFAC.py
@@ -81,18 +81,27 @@ def __str__(self):
         return format(self, "transfac")
 
 
-class Record(object):
+class Record(list):
     """A Bio.Motif.TRANSFAC.Record stores the information in a TRANSFAC
-matrix table.
+matrix table. The record inherits from a list containing the individual
+motifs.
 
 Attributes:
     o version:   The version number, corresponding to the 'VV' field
                  in the TRANSFAC file;
-    o motifs:    The list of motifs.
 """
     def __init__(self):
         self.version = None
-        self.motifs = []
+
+    @property
+    def motifs(self):
+        import warnings
+        warnings.warn("""\
+The .motifs attribute is now obsolete, and will be deprecated and removed
+in a future release of Biopython. This class now inherits from list, so
+instead of record.motifs[i], please use record[i].
+""", PendingDeprecationWarning)
+        return self
 
     def __str__(self):
         blocks = []
@@ -103,7 +112,7 @@ def __str__(self):
 //
 """ % self.version
             blocks.append(block)
-        for motif in self.motifs:
+        for motif in self:
             block = str(motif)
             blocks.append(block)
         text = "".join(blocks)
@@ -119,7 +128,7 @@ def read(handle):
         line = line.strip()
         if line=='//':
             if motif is not None:
-                record.motifs.append(motif)
+                record.append(motif)
             motif = None
             status = None
         elif line=='XX':

diff --git a/Bio/Motif/__init__.py b/Bio/Motif/__init__.py
@@ -10,23 +10,11 @@
 It also includes functionality for parsing AlignACE and MEME programs.
 """
 from Bio.Motif._Motif import Motif
-from Bio.Motif.AlignAce import read as _AlignAce_read
-from Bio.Motif.MEME import read as _MEME_read
-from Bio.Motif import Jaspar
-from Bio.Motif.Thresholds import ScoreDistribution
-from Bio.Alphabet import IUPAC
-from Bio.Seq import Seq
-
-
-def _from_pfm(handle):
-    return Motif()._from_jaspar_pfm(handle)
-
-
-def _from_sites(handle):
-    return Motif()._from_jaspar_sites(handle)
 
 
 def create(instances, alphabet=None):
+    from Bio.Alphabet import IUPAC
+    from Bio.Seq import Seq
     for instance in instances:
         try:
             a = instance.alphabet
@@ -52,13 +40,16 @@ def parse(handle,format):
     """Parses an output file of motif finding programs.
 
     Currently supported formats:
-     - AlignAce
-     - MEME
-
-    You can also use single-motif formats, although the Bio.Motif.read()
-    function is simpler to use in this situation.
-     - jaspar-pfm
-     - jaspar-sites
+     - AlignAce:      AlignAce output file format
+     - MEME:          MEME output file motif
+     - TRANSFAC:      TRANSFAC database file format
+     - pfm:           JASPAR-style position-frequency matrix
+     - sites:         JASPAR-style sites file
+     - jaspar-pfm:    JASPAR-style position-frequency matrix [DEPRECATED]
+     - jaspar-sites:  JASPAR-style sites file [DEPRECATED]
+    As files in the pfm and sites formats contain only a single motif,
+    it is easier to use Bio.Motif.read() instead of Bio.Motif.parse()
+    for those.
 
     For example:
 
@@ -82,30 +73,38 @@ def parse(handle,format):
     GAGGCCGGGGAT
     CGACTCGTGCTTAGAAGG
     """
-    if format in ('pfm', 'sites'):
-        yield Jaspar.read(handle, format)
-    elif format=="AlignAce":
-        record = _AlignAce_read(handle)
-        for m in record.motifs:
-            yield m
+    if format=="AlignAce":
+        from Bio.Motif import AlignAce
+        record = AlignAce.read(handle)
+        return record
     elif format=="MEME":
-        record = _MEME_read(handle)
-        for m in record.motifs:
-            yield m
+        from Bio.Motif import MEME
+        record = MEME.read(handle)
+        return record
+    elif format=="TRANSFAC":
+        from Bio.Motif import TRANSFAC
+        record = TRANSFAC.read(handle)
+        return record
+    elif format in ('pfm', 'sites'):
+        from Bio.Motif import Jaspar
+        motif = Jaspar.read(handle, format)
     elif format=="jaspar-pfm":
-        yield _from_pfm(handle)
+        motif = Motif()._from_jaspar_pfm(handle)
     elif format=="jaspar-sites":
-        yield _from_sites(handle)
+        motif = Motif()._from_jaspar_sites(handle)
     else:
         raise ValueError("Unknown format %s" % format)
+    # Treat the single-motif formats
+    motifs = [motif]
+    return motifs
 
 
 def read(handle,format):
     """Reads a motif from a handle using a specified file-format.
 
     This supports the same formats as Bio.Motif.parse(), but
-    only for files containing exactly one record.  For example,
-    reading a pfm file:
+    only for files containing exactly one motif.  For example,
+    reading a JASPAR-style pfm file:
 
     >>> from Bio import Motif
     >>> motif = Motif.read(open("Motif/SRF.pfm"), "pfm")
@@ -128,32 +127,26 @@ def read(handle,format):
         ...
     ValueError: More than one motif found in handle
 
-    If however you want the first record from a file containing
-    multiple records this function would raise an exception (as
+    If however you want the first motif from a file containing
+    multiple motifs this function would raise an exception (as
     shown in the example above).  Instead use:
 
     >>> from Bio import Motif
-    >>> motif = Motif.parse(open("Motif/alignace.out"),"AlignAce").next()
+    >>> motifs = Motif.parse(open("Motif/alignace.out"),"AlignAce")
+    >>> motif = motifs[0]
     >>> motif.consensus
     Seq('TCTACGATTGAG', IUPACUnambiguousDNA())
 
     Use the Bio.Motif.parse(handle, format) function if you want
     to read multiple records from the handle.
     """
-    iterator = parse(handle, format)
-    try:
-        first = iterator.next()
-    except StopIteration:
-        first = None
-    if first is None:
+    motifs = parse(handle, format)
+    if len(motifs)==0:
         raise ValueError("No motifs found in handle")
-    try:
-        second = iterator.next()
-    except StopIteration:
-        second = None
-    if second is not None:
+    if len(motifs) > 1:
         raise ValueError("More than one motif found in handle")
-    return first
+    motif = motifs[0]
+    return motif
 
 
 if __name__ == "__main__":

diff --git a/Doc/Tutorial.tex b/Doc/Tutorial.tex
@@ -80,7 +80,7 @@
 
 \author{Jeff Chang, Brad Chapman, Iddo Friedberg, Thomas Hamelryck, \\
 Michiel de Hoon, Peter Cock, Tiago Antao, Eric Talevich, Bartek Wilczy\'{n}ski}
-\date{Last Update -- 29 December 2012 (Biopython 1.60+)}
+\date{Last Update -- 3 January 2013 (Biopython 1.60+)}
 
 %Hack to get the logo at the start of the HTML front page:
 %(hopefully this isn't going to be too wide for most people)
@@ -11026,6 +11026,10 @@ \subsubsection*{JASPAR}
 GCCCATATATGG
 \end{verbatim}
 
+\subsubsection*{MEME}
+
+\subsubsection*{TRANSFAC}
+
 \subsection{Writing motifs}
 Speaking of exporting, let's look at export functions. We can export to fasta:
 %cont-doctest