Permalink
Browse files

Make a true FrequencyPositionMatrix class

  • Loading branch information...
mdehoon committed Dec 13, 2012
1 parent f5e8e1c commit bf128451affa5334595bb7d122f435a680b40eb3
Showing with 67 additions and 63 deletions.
  1. +64 −61 Bio/Motif/_Motif.py
  2. +1 −1 Doc/Tutorial.tex
  3. +2 −1 Tests/test_Motif.py
View
@@ -191,6 +191,37 @@ def reverse_complement(self):
return self.__class__(alphabet, values)
+class FrequencyPositionMatrix(GenericPositionMatrix):
+
+ def normalize(self, pseudocounts=None):
+ """
+ create and return a position-weight matrix by normalizing the counts matrix.
+
+ If pseudocounts is None (default), no pseudocounts are added
+ to the counts.
+ If pseudocounts is a number, it is added to the counts before
+ calculating the position-weight matrix.
+ Alternatively, the pseudocounts can be a dictionary with a key
+ for each letter in the alphabet associated with the motif.
+ """
+
+ counts = {}
+ if pseudocounts is None:
+ for letter in self.alphabet.letters:
+ counts[letter] = [0.0] * self.length
+ elif isinstance(pseudocounts, dict):
+ for letter in self.alphabet.letters:
+ counts[letter] = [float(pseudocounts[letter])] * self.length
+ else:
+ for letter in self.alphabet.letters:
+ counts[letter] = [float(pseudocounts)] * self.length
+ for i in xrange(self.length):
+ for letter in self.alphabet.letters:
+ counts[letter][i] += self[letter][i]
+ # Actual normalization is done in the PositionWeightMatrix initializer
+ return PositionWeightMatrix(self.alphabet, counts)
+
+
class PositionWeightMatrix(GenericPositionMatrix):
def __init__(self, alphabet, counts):
@@ -458,7 +489,7 @@ def __init__(self, alphabet=None, instances=None, counts=None):
elif self.length!=length:
raise Exception("counts matrix has inconsistent lengths")
self.instances = None
- self.counts = GenericPositionMatrix(alphabet, counts)
+ self.counts = FrequencyPositionMatrix(alphabet, counts)
elif instances is not None:
warnings.warn("This is experimental code, and may change in future versions", BiopythonExperimentalWarning)
self.instances = []
@@ -483,7 +514,7 @@ def __init__(self, alphabet=None, instances=None, counts=None):
for instance in self.instances:
for position, letter in enumerate(instance):
counts[letter][position] += 1
- self.counts = GenericPositionMatrix(alphabet, counts)
+ self.counts = FrequencyPositionMatrix(alphabet, counts)
else:
self.counts = None
self.instances = None
@@ -620,9 +651,9 @@ def pwm(self,laplace=True):
in a future release of Biopython. As a replacement, instead of
>>> motif.pwm()
use
->>> pwm = motif.make_pwm()
-See the documentation of motif.make_pwm and pwm.make_pssm for details
-on treatment of pseudocounts and background probabilities.
+>>> pwm = motif.counts.normalize()
+See the documentation of motif.counts.normalize and pwm.make_pssm for
+details on treatment of pseudocounts and background probabilities.
""", PendingDeprecationWarning)
if self._pwm_is_current:
return self._pwm
@@ -652,34 +683,6 @@ def pwm(self,laplace=True):
self._pwm_is_current=1
return self._pwm
- def make_pwm(self, pseudocounts=None):
- """
- return the position-weight matrix (calculated from the counts
- matrix).
-
- If pseudocounts is None (default), no pseudocounts are added
- to the counts.
- If pseudocounts is a number, it is added to the counts before
- calculating the position-weight matrix.
- Alternatively, the pseudocounts can be a dictionary with a key
- for each letter in the alphabet associated with the motif.
- """
-
- counts = {}
- if pseudocounts is None:
- for letter in self.alphabet.letters:
- counts[letter] = [0.0] * self.length
- elif isinstance(pseudocounts, dict):
- for letter in self.alphabet.letters:
- counts[letter] = [float(pseudocounts[letter])] * self.length
- else:
- for letter in self.alphabet.letters:
- counts[letter] = [float(pseudocounts)] * self.length
- for i in xrange(self.length):
- for letter in self.alphabet.letters:
- counts[letter][i] += self.counts[letter][i]
- return PositionWeightMatrix(self.alphabet, counts)
-
def log_odds(self,laplace=True):
"""
returns the log odds matrix computed for the set of instances
@@ -689,10 +692,10 @@ def log_odds(self,laplace=True):
in a future release of Biopython. As a replacement, instead of
>>> motif.log_odds()
use
->>> pwm = motif.make_pwm()
+>>> pwm = motif.counts.normalize()
>>> pssm = pwm.make_pssm()
-See the documentation of motif.make_pwm and pwm.make_pssm for details
-on treatment of pseudocounts and background probabilities.
+See the documentation of motif.counts.normalize and pwm.make_pssm for
+details on treatment of pseudocounts and background probabilities.
""", PendingDeprecationWarning)
if self._log_odds_is_current:
return self._log_odds
@@ -715,11 +718,11 @@ def ic(self):
in a future release of Biopython. As a replacement, instead of
>>> motif.ic()
please use
->>> pwm = motif.make_pwm()
+>>> pwm = motif.counts.normalize()
>>> pwm.ic()
-Please be aware though that by default, motif.make_pwm() does not
-use psuedocounts, while motif.ic() does. See the documentation of
-motif.make_pwm for more details.
+Please be aware though that by default, motif.counts.normalize()
+does not use pseudocounts, while motif.ic() does. See the documentation
+of motif.counts.normalize for more details.
""", PendingDeprecationWarning)
res=0
pwm=self.pwm()
@@ -739,9 +742,9 @@ def exp_score(self,st_dev=False):
in a future release of Biopython. As a replacement, instead of
>>> motif.exp_score()
please use
->>> pwm = motif.make_pwm()
+>>> pwm = motif.counts.normalize()
>>> pwm.exp_score()
-See the documentation of motif.make_pwm for details on treatment of
+See the documentation of motif.counts.normalize for details on treatment of
pseudocounts.
""", PendingDeprecationWarning)
exs=0.0
@@ -782,12 +785,12 @@ def score_hit(self,sequence,position,normalized=0,masked=0):
in a future release of Biopython. As a replacement, instead of
>>> motif.score_hit(sequence, position)
please use
->>> pwm = motif.make_pwm()
+>>> pwm = motif.counts.normalize()
>>> pssm = pwm.make_pssm()
>>> s = sequence[position:position+len(pssm)]
>>> pssm.calculate(s)
-See the documentation of motif.make_pwm and pwm.make_pssm for details
-on treatment of pseudocounts and background probabilities.
+See the documentation of motif.counts.normalize() and pwm.make_pssm
+for details on the treatment of pseudocounts and background probabilities.
""", PendingDeprecationWarning)
lo=self.log_odds()
score = 0.0
@@ -814,11 +817,11 @@ def search_pwm(self,sequence,normalized=0,masked=0,threshold=0.0,both=True):
in a future release of Biopython. As a replacement, instead of
>>> motif.search_pwm(sequence)
please use
->>> pwm = motif.make_pwm()
+>>> pwm = motif.counts.normalize()
>>> pssm = pwm.make_pssm()
>>> pssm.search(sequence)
-See the documentation of motif.make_pwm and pwm.make_pssm for details
-on treatment of pseudocounts and background probabilities.
+See the documentation of motif.counts.normalize() and pwm.make_pssm
+for details on treatment of pseudocounts and background probabilities.
""", PendingDeprecationWarning)
raise Exception
if both:
@@ -846,11 +849,11 @@ def dist_pearson(self, motif, masked = 0):
in a future release of Biopython. As a replacement, instead of
>>> motif1.dist_pearson(motif2)
please use
->>> pwm1 = motif1.make_pwm()
->>> pwm2 = motif2.make_pwm()
+>>> pwm1 = motif1.counts.normalize()
+>>> pwm2 = motif2.counts.normalize()
>>> pwm1.dist_pearson(pwm2)
-Please see the documentation of motif.make_pwm and pwm.dist_pearson
-for more details.
+Please see the documentation of motif.counts.normalize and
+pwm.dist_pearson for more details.
""", PendingDeprecationWarning)
if self.alphabet != motif.alphabet:
@@ -1248,7 +1251,7 @@ def __getitem__(self,index):
in a future release of Biopython. Instead of
>>> motif[i]
please use
->>> pwm = motif.make_pwm()
+>>> pwm = motif.counts.normalize()
>>> pwm[:,i]
""", PendingDeprecationWarning)
if index in range(self.length):
@@ -1302,7 +1305,7 @@ def max_score(self):
This function is now deprecated. Instead of
>>> motif.max_score()
please use
->>> pwm = motif.make_pwm()
+>>> pwm = motif.counts.normalize()
>>> pssm = pwm.make_pssm()
>>> pssm.max_score()
""",
@@ -1318,7 +1321,7 @@ def min_score(self):
This function is now deprecated. Instead of
>>> motif.min_score()
please use
->>> pwm = motif.make_pwm()
+>>> pwm = motif.counts.normalize()
>>> pssm = pwm.make_pssm()
>>> pssm.min_score()
""",
@@ -1693,11 +1696,11 @@ def scanPWM(self,seq):
in a future release of Biopython. As a replacement, instead of
>>> motif.scanPWM(sequence)
use
->>> pwm = motif.make_pwm()
+>>> pwm = motif.counts.normalize()
>>> pssm = pwm.make_pssm()
>>> pssm.calculate(sequence)
-See the documentation of motif.make_pwm, pwm.make_pssm, and pssm.calculate
-for details.
+See the documentation of motif.counts.normalize, pwm.make_pssm, and
+pssm.calculate for details.
""", PendingDeprecationWarning)
if self.alphabet!=IUPAC.unambiguous_dna:
raise ValueError("Wrong alphabet! Use only with DNA motifs")
@@ -1725,11 +1728,11 @@ def _pwm_calculate(self, sequence):
in a future release of Biopython. As a replacement, instead of
>>> motif._pwm_calculate(sequence)
use
->>> pwm = motif.make_pwm()
+>>> pwm = motif.counts.normalize()
>>> pssm = pwm.make_pssm()
>>> pssm.calculate(sequence)
-See the documentation of motif.make_pwm, pwm.make_pssm, and pssm.calculate
-for details.
+See the documentation of motif.counts.normalize, pwm.make_pssm, and
+pssm.calculate for details.
""", PendingDeprecationWarning)
logodds = self.log_odds()
m = len(logodds)
View
@@ -10813,7 +10813,7 @@ \section{Motif objects}
We can also calculate the information content of a motif with a simple call:
%cont-doctest
\begin{verbatim}
->>> pwm = m.make_pwm(pseudocounts=0.25)
+>>> pwm = m.counts.normalize(pseudocounts=0.25)
>>> print "%0.2f" % pwm.ic()
5.27
\end{verbatim}
View
@@ -1526,7 +1526,8 @@ def setUp(self):
def test_simple(self):
"""Test if Motif PWM scoring works."""
- pwm = self.m.make_pwm(pseudocounts=0.25)
+ counts = self.m.counts
+ pwm = counts.normalize(pseudocounts=0.25)
pssm = pwm.make_pssm()
result = pssm.calculate(self.s)
self.assertEqual(6, len(result))

0 comments on commit bf12845

Please sign in to comment.