Permalink
Browse files

putting the Jaspar parser in its own submodule

  • Loading branch information...
1 parent f690cbc commit ad1692eacf56a7cbc7f991255effbcbfbef68615 Michiel de Hoon committed Nov 26, 2012
Showing with 122 additions and 31 deletions.
  1. +37 −0 Bio/Motif/Jaspar.py
  2. +56 −15 Bio/Motif/_Motif.py
  3. +17 −13 Bio/Motif/__init__.py
  4. +12 −3 Tests/test_Motif.py
View
@@ -0,0 +1,37 @@
+from Bio.Motif import Motif
+from Bio.Alphabet import IUPAC
+from Bio.Seq import Seq
+
def read(handle, format):
    """Read a single motif from a JASPAR file.

    Arguments:
     - handle - an iterable of text lines (e.g. an open file).
     - format - "pfm" for a JASPAR count matrix file, or "sites" for a
       JASPAR sites file.

    For "pfm", the first four lines are taken as the count rows for
    A, C, G and T (in that order). A row may optionally start with its
    own letter, which is dropped; all remaining fields are converted
    to float.

    For "sites", the file is read as pairs of lines: a header line
    starting with ">" followed by a sequence line. Reading stops at the
    first line in header position that does not start with ">". Only
    characters that are equal to their upper-case form are kept from
    each sequence line (presumably this drops lower-case flanking
    sequence -- confirm against the JASPAR format description).

    Returns a Motif over the unambiguous DNA alphabet whose mask has
    been set to "*" for every position.

    Raises ValueError if the format is unknown, if a "pfm" file has
    fewer than four rows, or if a "sites" header has no sequence line.
    """
    # Fail fast on a bad format, before any line is consumed.
    if format not in ("pfm", "sites"):
        raise ValueError("Unknown format %s" % format)
    # iter() lets plain lists of lines work as well as file objects,
    # and gives us one shared iterator for the paired reads below.
    lines = iter(handle)
    if format == "pfm":
        # reads the motif from Jaspar .pfm file
        letters = "ACGT"
        counts = {}
        for letter, line in zip(letters, lines):
            words = line.split()
            # if there is a letter in the beginning, ignore it
            # (the "words and" guard avoids an IndexError on blank rows)
            if words and words[0] == letter:
                words = words[1:]
            # A real list (not a lazy map object) is needed because the
            # Motif constructor takes len() of every row.
            counts[letter] = [float(word) for word in words]
        if len(counts) < len(letters):
            # zip() stops silently when the file runs out of lines.
            raise ValueError("Expected %d rows in pfm file, found %d"
                             % (len(letters), len(counts)))
        alphabet = IUPAC.unambiguous_dna
        motif = Motif(alphabet, counts=counts)
    else:
        # reads the motif from Jaspar .sites file
        sequences = []
        for line in lines:
            if not line.startswith(">"):
                break
            # line contains the header ">...."
            # now read the actual sequence
            sequence_line = next(lines, None)
            if sequence_line is None:
                raise ValueError("Header %r is not followed by a sequence"
                                 % line.strip())
            sequences.append("".join(c for c in sequence_line.strip()
                                     if c == c.upper()))
        alphabet = IUPAC.unambiguous_dna
        instances = [Seq(sequence, alphabet) for sequence in sequences]
        motif = Motif(alphabet, instances=instances)
    motif.set_mask("*" * motif.length)
    return motif
View
@@ -9,12 +9,15 @@
from Bio.Alphabet import IUPAC
import math
+import warnings
+from Bio import BiopythonExperimentalWarning
+
class Motif(object):
"""
A class representing sequence motifs.
"""
- def __init__(self, alphabet=IUPAC.unambiguous_dna, instances=None):
- self.counts = None
+ def __init__(self, alphabet=IUPAC.unambiguous_dna,
+ instances=None, counts=None):
self.mask = []
self._pwm_is_current = False
self._pwm = []
@@ -27,11 +30,20 @@ def __init__(self, alphabet=IUPAC.unambiguous_dna, instances=None):
self.beta=1.0
self.info=None
self.name=""
- if instances==None:
+ if counts!=None and instances!=None:
+ raise Exception(ValueError,
+ "Specify either instances or counts, don't specify both")
+ elif counts!=None:
+ warnings.warn("This is experimental code, and may change in future versions", BiopythonExperimentalWarning)
+ for letter in counts:
+ length = len(counts[letter])
+ if self.length==None:
+ self.length = length
+ elif self.length!=length:
+ raise Exception("counts matrix has inconsistent lengths")
self.instances = None
- else:
- import warnings
- from Bio import BiopythonExperimentalWarning
+ self.counts = counts
+ elif instances!=None:
warnings.warn("This is experimental code, and may change in future versions", BiopythonExperimentalWarning)
self.instances = []
for instance in instances:
@@ -45,11 +57,19 @@ def __init__(self, alphabet=IUPAC.unambiguous_dna, instances=None):
message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length)
raise ValueError(message)
self.instances.append(instance)
+ self.counts = {}
+ for letter in self.alphabet.letters:
+ self.counts[letter] = [0] * self.length
+ for instance in self.instances:
+ for position, letter in enumerate(instance):
+ self.counts[letter][position] += 1
+ else:
+ self.counts = None
+ self.instances = None
@property
def has_instances(self):
"""Legacy property, check if m.instances is None instead (DEPRECATED)."""
- import warnings
from Bio import BiopythonDeprecationWarning
warnings.warn("Instead of 'm.has_instances' use 'm.instances is not None'",
BiopythonDeprecationWarning)
@@ -58,7 +78,6 @@ def has_instances(self):
@property
def has_counts(self):
"""Legacy property, check if m.counts is None instead (DEPRECATED)."""
- import warnings
from Bio import BiopythonDeprecationWarning
warnings.warn("Instead of 'm.has_counts' use 'm.counts is not None'",
BiopythonDeprecationWarning)
@@ -385,7 +404,6 @@ def _read(self,stream):
the self.alphabet variable must be set beforehand.
If the last line contains asterisks it is used for setting mask
"""
- import warnings
warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython. As a replacement, please use Bio.Motif.parse instead.", PendingDeprecationWarning)
while 1:
@@ -436,11 +454,30 @@ def _to_fasta(self):
FASTA representation of motif
"""
if self.instances==None:
- self.make_instances_from_counts()
+ alpha="".join(self.alphabet.letters)
+ #col[i] is a column taken from aligned motif instances
+ col=[]
+ instances=[]
+ s = sum(self.counts[nuc][0] for nuc in self.alphabet.letters)
+ for i in range(self.length):
+ col.append("")
+ for n in self.alphabet.letters:
+ col[i] += n * self.counts[n][i]
+ if len(col[i])<s:
+ warnings.warn(UserWarning, "WARNING, column too short (%d; expected %d)" % (len(col[i]), s))
+ col[i]+=(alpha*s)[:(s-len(col[i]))]
+ #iterate over instances
+ for i in range(s):
+ instance="" #start with empty seq
+ for j in range(self.length): #iterate over positions
+ instance+=col[j][i]
+ instance = Seq(instance, self.alphabet)
+ instances.append(instance)
+ else:
+ instances = self.instances
string = ""
- for i,inst in enumerate(self.instances):
- string += ">instance%d\n"%i + str(inst) + "\n"
-
+ for i, instance in enumerate(instances):
+ string += ">instance%d\n%s\n "% (i, instance)
return string
def reverse_complement(self):
@@ -476,12 +513,13 @@ def _from_jaspar_pfm(self,stream,make_instances=False):
The instances are fake, but the pwm is accurate.
"""
+ warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython. Please use the 'pfm' format instead of the 'jaspar-pfm' format", PendingDeprecationWarning)
return self._from_horiz_matrix(stream,letters="ACGT",make_instances=make_instances)
def _from_vert_matrix(self,stream,letters=None,make_instances=False):
"""reads a vertical count matrix from stream and fill in the counts.
"""
-
+ warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython.", PendingDeprecationWarning)
self.counts = {}
if letters==None:
letters=self.alphabet.letters
@@ -501,6 +539,7 @@ def _from_vert_matrix(self,stream,letters=None,make_instances=False):
def _from_horiz_matrix(self,stream,letters=None,make_instances=False):
"""reads a horizontal count matrix from stream and fill in the counts.
"""
+ warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython.", PendingDeprecationWarning)
if letters==None:
letters=self.alphabet.letters
self.counts = {}
@@ -529,9 +568,10 @@ def _from_horiz_matrix(self,stream,letters=None,make_instances=False):
def make_instances_from_counts(self):
"""Creates "fake" instances for a motif created from a count matrix.
- In case the sums of counts are different for different columnes, the
+ In case the sums of counts are different for different columns, the
shorter columns are padded with background.
"""
+ warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython.", PendingDeprecationWarning)
alpha="".join(self.alphabet.letters)
#col[i] is a column taken from aligned motif instances
col=[]
@@ -580,6 +620,7 @@ def _from_jaspar_sites(self,stream):
The instances and pwm are OK.
"""
+ warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython. Please use the 'sites' format instead of the 'jaspar-sites' format", PendingDeprecationWarning)
# Probably this should be in a separate submodule of Bio.Motif
self.instances = []
for line in stream:
View
@@ -12,6 +12,7 @@
from Bio.Motif._Motif import Motif
from Bio.Motif.AlignAce import read as _AlignAce_read
from Bio.Motif.MEME import read as _MEME_read
+from Bio.Motif import Jaspar
from Bio.Motif.Thresholds import ScoreDistribution
_parsers={"AlignAce" : _AlignAce_read,
@@ -25,7 +26,7 @@ def _from_sites(handle):
return Motif()._from_jaspar_sites(handle)
_readers={"jaspar-pfm": _from_pfm,
- "jaspar-sites": _from_sites
+ "jaspar-sites": _from_sites,
}
@@ -64,19 +65,22 @@ def parse(handle,format):
GACGCCGGGGAT
CGACTCGCGCTTACAAGG
"""
- try:
- parser=_parsers[format]
+ if format in ('pfm', 'sites'):
+ yield Jaspar.read(handle, format)
+ else:
+ try:
+ parser=_parsers[format]
- except KeyError:
- try: #not a true parser, try reader formats
- reader=_readers[format]
- except:
- raise ValueError("Wrong parser format")
- else: #we have a proper reader
- yield reader(handle)
- else: # we have a proper reader
- for m in parser(handle).motifs:
- yield m
+ except KeyError:
+ try: #not a true parser, try reader formats
+ reader=_readers[format]
+ except:
+ raise ValueError("Wrong parser format")
+ else: #we have a proper reader
+ yield reader(handle)
+ else: # we have a proper reader
+ for m in parser(handle).motifs:
+ yield m
def read(handle,format):
"""Reads a motif from a handle using a specified file-format.
View
@@ -401,13 +401,19 @@ def test_alignace_parsing(self):
def test_pfm_parsing(self):
"""Test to be sure that Motif can parse pfm files.
"""
- m = Motif.read(self.PFMin,"jaspar-pfm")
+ import warnings
+ from Bio import BiopythonExperimentalWarning
+ warnings.simplefilter('ignore', BiopythonExperimentalWarning)
+ m = Motif.read(self.PFMin,"pfm")
self.assertEqual(m.length, 12)
def test_sites_parsing(self):
"""Test to be sure that Motif can parse sites files.
"""
- m = Motif.read(self.SITESin,"jaspar-sites")
+ import warnings
+ from Bio import BiopythonExperimentalWarning
+ warnings.simplefilter('ignore', BiopythonExperimentalWarning)
+ m = Motif.read(self.SITESin,"sites")
self.assertEqual(m.length, 6)
def test_FAoutput(self):
@@ -1511,8 +1517,11 @@ def test_mast_parser_3(self):
class MotifTestPWM(unittest.TestCase):
def setUp(self):
+ import warnings
+ from Bio import BiopythonExperimentalWarning
+ warnings.simplefilter('ignore', BiopythonExperimentalWarning)
handle = open("Motif/SRF.pfm")
- self.m = Motif.read(handle, "jaspar-pfm")
+ self.m = Motif.read(handle, "pfm")
handle.close()
self.s = Seq("ACGTGTGCGTAGTGCGT", self.m.alphabet)

0 comments on commit ad1692e

Please sign in to comment.