Skip to content

Commit

Permalink
Making the different parsers work uniformly
Browse files Browse the repository at this point in the history
  • Loading branch information
Michiel de Hoon authored and Michiel de Hoon committed Jan 3, 2013
1 parent 3aedeba commit 4e68a7e
Show file tree
Hide file tree
Showing 7 changed files with 433 additions and 409 deletions.
15 changes: 12 additions & 3 deletions Bio/Motif/AlignAce.py
Expand Up @@ -11,11 +11,20 @@
from Bio.Seq import Seq from Bio.Seq import Seq




class Record(object): class Record(list):
def __init__(self): def __init__(self):
self.motifs=[]
self.parameters = None self.parameters = None


@property
def motifs(self):
import warnings
warnings.warn("""\
The .motifs attribute is now obsolete, and will be deprecated and removed
in a future release of Biopython. This class now inherits from list, so
instead of record.motifs[i], please use record[i].
""", PendingDeprecationWarning)
return self



def read(handle): def read(handle):
"""read(handle)""" """read(handle)"""
Expand Down Expand Up @@ -50,7 +59,7 @@ def read(handle):
motif.score = float(line.split()[-1]) motif.score = float(line.split()[-1])
motif.number = number motif.number = number
motif.mask = mask motif.mask = mask
record.motifs.append(motif) record.append(motif)
elif len(line.split("\t"))==4: elif len(line.split("\t"))==4:
seq = Seq(line.split("\t")[0],IUPAC.unambiguous_dna) seq = Seq(line.split("\t")[0],IUPAC.unambiguous_dna)
instances.append(seq) instances.append(seq)
Expand Down
8 changes: 4 additions & 4 deletions Bio/Motif/MAST.py
Expand Up @@ -5,17 +5,17 @@
# as part of this package. # as part of this package.


from Bio.Alphabet import IUPAC from Bio.Alphabet import IUPAC
from Bio.Motif.MEME import MEMEMotif from Bio.Motif import MEME




class Record(object): class Record(object):
"""The class for holding the results from a MAST run. """The class for holding the results from a MAST run.
A MAST.Record holds data about matches between motifs and sequences. A MAST.Record holds data about matches between motifs and sequences.
The motifs held by the Record are objects of the class MEMEMotif. The motifs held by the Record are objects of the class MEME.Motif.
Methods: Methods:
get_motif_by_name (motif_name): returns a MEMEMotif with the given get_motif_by_name (motif_name): returns a MEME.Motif with the given
name. name.
""" """


Expand Down Expand Up @@ -82,7 +82,7 @@ def __read_database_and_motifs(record, handle):
if not line.strip(): if not line.strip():
break break
words = line.strip().split() words = line.strip().split()
motif = MEMEMotif(record.alphabet) motif = MEME.Motif(record.alphabet)
motif.name = words[0] motif.name = words[0]
motif.length = int(words[1]) motif.length = int(words[1])
# words[2] contains the best possible match # words[2] contains the best possible match
Expand Down
37 changes: 23 additions & 14 deletions Bio/Motif/MEME.py
Expand Up @@ -6,7 +6,7 @@


from Bio.Alphabet import IUPAC from Bio.Alphabet import IUPAC
from Bio import Seq from Bio import Seq
from Bio.Motif import Motif from Bio.Motif import Motif as BaseMotif




def read(handle): def read(handle):
Expand All @@ -17,12 +17,12 @@ def read(handle):
>>> f = open("meme.output.txt") >>> f = open("meme.output.txt")
>>> from Bio.Motif import MEME >>> from Bio.Motif import MEME
>>> record = MEME.read(f) >>> record = MEME.read(f)
>>> for motif in record.motifs: >>> for motif in record:
... for instance in motif.instances: ... for instance in motif.instances:
... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue ... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
""" """
record = MEMERecord() record = Record()
__read_version(record, handle) __read_version(record, handle)
__read_datafile(record, handle) __read_datafile(record, handle)
__read_alphabet(record, handle) __read_alphabet(record, handle)
Expand All @@ -39,12 +39,12 @@ def read(handle):
length, num_occurrences, evalue = __read_motif_statistics(line) length, num_occurrences, evalue = __read_motif_statistics(line)
name = __read_motif_name(handle) name = __read_motif_name(handle)
instances = __read_motif_sequences(handle, name, alphabet, length, revcomp) instances = __read_motif_sequences(handle, name, alphabet, length, revcomp)
motif = MEMEMotif(alphabet, instances) motif = Motif(alphabet, instances)
motif.length = length motif.length = length
motif.num_occurrences = num_occurrences motif.num_occurrences = num_occurrences
motif.evalue = evalue motif.evalue = evalue
motif.name = name motif.name = name
record.motifs.append(motif) record.append(motif)
__skip_unused_lines(handle) __skip_unused_lines(handle)
try: try:
line = handle.next() line = handle.next()
Expand All @@ -57,7 +57,7 @@ def read(handle):
return record return record




class MEMEMotif(Motif): class Motif(BaseMotif):
"""A subclass of Motif used in parsing MEME (and MAST) output. """A subclass of Motif used in parsing MEME (and MAST) output.
This subclass defines functions and data specific to MEME motifs. This subclass defines functions and data specific to MEME motifs.
Expand All @@ -68,7 +68,7 @@ class MEMEMotif(Motif):
(DEPRECATION PENDING) (DEPRECATION PENDING)
""" """
def __init__(self, alphabet=None, instances=None): def __init__(self, alphabet=None, instances=None):
Motif.__init__(self, alphabet, instances) BaseMotif.__init__(self, alphabet, instances)
self.evalue = 0.0 self.evalue = 0.0


def _numoccurrences(self, number): def _numoccurrences(self, number):
Expand All @@ -91,7 +91,7 @@ def add_instance_from_values(self, name = 'default', pvalue = 1, sequence = 'ATA
import warnings import warnings
warnings.warn("This function is now obsolete, and will be deprecated and removed " warnings.warn("This function is now obsolete, and will be deprecated and removed "
"in a future release of Biopython.", PendingDeprecationWarning) "in a future release of Biopython.", PendingDeprecationWarning)
inst = MEMEInstance(sequence,self.alphabet) inst = Instance(sequence,self.alphabet)
inst._pvalue(pvalue) inst._pvalue(pvalue)
inst._seqname(name) inst._seqname(name)
inst._start(start) inst._start(start)
Expand All @@ -115,7 +115,7 @@ def _evalue(self, evalue):
self.evalue = evalue self.evalue = evalue




class MEMEInstance(Seq.Seq): class Instance(Seq.Seq):
"""A class describing the instances of a MEME motif, and the data thereof. """A class describing the instances of a MEME motif, and the data thereof.
""" """
def __init__(self,*args,**kwds): def __init__(self,*args,**kwds):
Expand Down Expand Up @@ -173,23 +173,32 @@ def _length(self, length):
self.length = length self.length = length




class MEMERecord(object): class Record(list):
"""A class for holding the results of a MEME run. """A class for holding the results of a MEME run.
A MEMERecord is an object that holds the results from running A Record is an object that holds the results from running
MEME. It implements no methods of its own. MEME. It implements no methods of its own.
""" """
def __init__(self): def __init__(self):
"""__init__ (self)""" """__init__ (self)"""
self.motifs = []
self.version = "" self.version = ""
self.datafile = "" self.datafile = ""
self.command = "" self.command = ""
self.alphabet = None self.alphabet = None
self.sequences = [] self.sequences = []


@property
def motifs(self):
import warnings
warnings.warn("""\
The .motifs attribute is now obsolete, and will be deprecated and removed
in a future release of Biopython. This class now inherits from list, so
instead of record.motifs[i], please use record[i].
""", PendingDeprecationWarning)
return self

def get_motif_by_name(self, name): def get_motif_by_name(self, name):
for m in self.motifs: for m in self:
if m.name == name: if m.name == name:
return m return m


Expand Down Expand Up @@ -335,7 +344,7 @@ def __read_motif_sequences(handle, motif_name, alphabet, length, revcomp):
strand = '+' strand = '+'
sequence = words[4] sequence = words[4]
assert len(sequence)==length assert len(sequence)==length
instance = MEMEInstance(sequence, alphabet) instance = Instance(sequence, alphabet)
instance.motif_name = motif_name instance.motif_name = motif_name
instance.sequence_name = words[0] instance.sequence_name = words[0]
instance.start = int(words[1]) instance.start = int(words[1])
Expand Down
21 changes: 15 additions & 6 deletions Bio/Motif/TRANSFAC.py
Expand Up @@ -81,18 +81,27 @@ def __str__(self):
return format(self, "transfac") return format(self, "transfac")




class Record(object): class Record(list):
"""A Bio.Motif.TRANSFAC.Record stores the information in a TRANSFAC """A Bio.Motif.TRANSFAC.Record stores the information in a TRANSFAC
matrix table. matrix table. The record inherits from a list containing the individual
motifs.
Attributes: Attributes:
o version: The version number, corresponding to the 'VV' field o version: The version number, corresponding to the 'VV' field
in the TRANSFAC file; in the TRANSFAC file;
o motifs: The list of motifs.
""" """
def __init__(self): def __init__(self):
self.version = None self.version = None
self.motifs = []
@property
def motifs(self):
import warnings
warnings.warn("""\
The .motifs attribute is now obsolete, and will be deprecated and removed
in a future release of Biopython. This class now inherits from list, so
instead of record.motifs[i], please use record[i].
""", PendingDeprecationWarning)
return self


def __str__(self): def __str__(self):
blocks = [] blocks = []
Expand All @@ -103,7 +112,7 @@ def __str__(self):
// //
""" % self.version """ % self.version
blocks.append(block) blocks.append(block)
for motif in self.motifs: for motif in self:
block = str(motif) block = str(motif)
blocks.append(block) blocks.append(block)
text = "".join(blocks) text = "".join(blocks)
Expand All @@ -119,7 +128,7 @@ def read(handle):
line = line.strip() line = line.strip()
if line=='//': if line=='//':
if motif is not None: if motif is not None:
record.motifs.append(motif) record.append(motif)
motif = None motif = None
status = None status = None
elif line=='XX': elif line=='XX':
Expand Down
91 changes: 42 additions & 49 deletions Bio/Motif/__init__.py
Expand Up @@ -10,23 +10,11 @@
It also includes functionality for parsing AlignACE and MEME programs. It also includes functionality for parsing AlignACE and MEME programs.
""" """
from Bio.Motif._Motif import Motif from Bio.Motif._Motif import Motif
from Bio.Motif.AlignAce import read as _AlignAce_read
from Bio.Motif.MEME import read as _MEME_read
from Bio.Motif import Jaspar
from Bio.Motif.Thresholds import ScoreDistribution
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq


def _from_pfm(handle):
return Motif()._from_jaspar_pfm(handle)


def _from_sites(handle):
return Motif()._from_jaspar_sites(handle)




def create(instances, alphabet=None): def create(instances, alphabet=None):
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
for instance in instances: for instance in instances:
try: try:
a = instance.alphabet a = instance.alphabet
Expand All @@ -52,13 +40,16 @@ def parse(handle,format):
"""Parses an output file of motif finding programs. """Parses an output file of motif finding programs.
Currently supported formats: Currently supported formats:
- AlignAce - AlignAce: AlignAce output file format
- MEME - MEME: MEME output file motif
- TRANSFAC: TRANSFAC database file format
You can also use single-motif formats, although the Bio.Motif.read() - pfm: JASPAR-style position-frequency matrix
function is simpler to use in this situation. - sites: JASPAR-style sites file
- jaspar-pfm - jaspar-pfm: JASPAR-style position-frequency matrix [DEPRECATED]
- jaspar-sites - jaspar-sites: JASPAR-style sites file [DEPRECATED]
As files in the pfm and sites formats contain only a single motif,
it is easier to use Bio.Motif.read() instead of Bio.Motif.parse()
for those.
For example: For example:
Expand All @@ -82,30 +73,38 @@ def parse(handle,format):
GAGGCCGGGGAT GAGGCCGGGGAT
CGACTCGTGCTTAGAAGG CGACTCGTGCTTAGAAGG
""" """
if format in ('pfm', 'sites'): if format=="AlignAce":
yield Jaspar.read(handle, format) from Bio.Motif import AlignAce
elif format=="AlignAce": record = AlignAce.read(handle)
record = _AlignAce_read(handle) return record
for m in record.motifs:
yield m
elif format=="MEME": elif format=="MEME":
record = _MEME_read(handle) from Bio.Motif import MEME
for m in record.motifs: record = MEME.read(handle)
yield m return record
elif format=="TRANSFAC":
from Bio.Motif import TRANSFAC
record = TRANSFAC.read(handle)
return record
elif format in ('pfm', 'sites'):
from Bio.Motif import Jaspar
motif = Jaspar.read(handle, format)
elif format=="jaspar-pfm": elif format=="jaspar-pfm":
yield _from_pfm(handle) motif = Motif()._from_jaspar_pfm(handle)
elif format=="jaspar-sites": elif format=="jaspar-sites":
yield _from_sites(handle) motif = Motif()._from_jaspar_sites(handle)
else: else:
raise ValueError("Unknown format %s" % format) raise ValueError("Unknown format %s" % format)
# Treat the single-motif formats
motifs = [motif]
return motifs




def read(handle,format): def read(handle,format):
"""Reads a motif from a handle using a specified file-format. """Reads a motif from a handle using a specified file-format.
This supports the same formats as Bio.Motif.parse(), but This supports the same formats as Bio.Motif.parse(), but
only for files containing exactly one record. For example, only for files containing exactly one motif. For example,
reading a pfm file: reading a JASPAR-style pfm file:
>>> from Bio import Motif >>> from Bio import Motif
>>> motif = Motif.read(open("Motif/SRF.pfm"), "pfm") >>> motif = Motif.read(open("Motif/SRF.pfm"), "pfm")
Expand All @@ -128,32 +127,26 @@ def read(handle,format):
... ...
ValueError: More than one motif found in handle ValueError: More than one motif found in handle
If however you want the first record from a file containing If however you want the first motif from a file containing
multiple records this function would raise an exception (as multiple motifs this function would raise an exception (as
shown in the example above). Instead use: shown in the example above). Instead use:
>>> from Bio import Motif >>> from Bio import Motif
>>> motif = Motif.parse(open("Motif/alignace.out"),"AlignAce").next() >>> motifs = Motif.parse(open("Motif/alignace.out"),"AlignAce")
>>> motif = motifs[0]
>>> motif.consensus >>> motif.consensus
Seq('TCTACGATTGAG', IUPACUnambiguousDNA()) Seq('TCTACGATTGAG', IUPACUnambiguousDNA())
Use the Bio.Motif.parse(handle, format) function if you want Use the Bio.Motif.parse(handle, format) function if you want
to read multiple records from the handle. to read multiple records from the handle.
""" """
iterator = parse(handle, format) motifs = parse(handle, format)
try: if len(motifs)==0:
first = iterator.next()
except StopIteration:
first = None
if first is None:
raise ValueError("No motifs found in handle") raise ValueError("No motifs found in handle")
try: if len(motifs) > 1:
second = iterator.next()
except StopIteration:
second = None
if second is not None:
raise ValueError("More than one motif found in handle") raise ValueError("More than one motif found in handle")
return first motif = motifs[0]
return motif




if __name__ == "__main__": if __name__ == "__main__":
Expand Down
6 changes: 5 additions & 1 deletion Doc/Tutorial.tex
Expand Up @@ -80,7 +80,7 @@


\author{Jeff Chang, Brad Chapman, Iddo Friedberg, Thomas Hamelryck, \\ \author{Jeff Chang, Brad Chapman, Iddo Friedberg, Thomas Hamelryck, \\
Michiel de Hoon, Peter Cock, Tiago Antao, Eric Talevich, Bartek Wilczy\'{n}ski} Michiel de Hoon, Peter Cock, Tiago Antao, Eric Talevich, Bartek Wilczy\'{n}ski}
\date{Last Update -- 29 December 2012 (Biopython 1.60+)} \date{Last Update -- 3 January 2013 (Biopython 1.60+)}


%Hack to get the logo at the start of the HTML front page: %Hack to get the logo at the start of the HTML front page:
%(hopefully this isn't going to be too wide for most people) %(hopefully this isn't going to be too wide for most people)
Expand Down Expand Up @@ -11026,6 +11026,10 @@ \subsubsection*{JASPAR}
GCCCATATATGG GCCCATATATGG
\end{verbatim} \end{verbatim}


\subsubsection*{MEME}

\subsubsection*{TRANSFAC}

\subsection{Writing motifs} \subsection{Writing motifs}
Speaking of exporting, let's look at export functions. We can export to fasta: Speaking of exporting, let's look at export functions. We can export to fasta:
%cont-doctest %cont-doctest
Expand Down

0 comments on commit 4e68a7e

Please sign in to comment.