Permalink
Browse files

Updating the documentation. Also need to find some solution for calcu…

…lating the correlation
  • Loading branch information...
1 parent 0769bc7 commit ac9f5041df9ebd8d2384119777565fd1f2207851 Michiel de Hoon committed Dec 23, 2012
Showing with 100 additions and 105 deletions.
  1. +20 −63 Bio/Motif/_Motif.py
  2. +79 −41 Doc/Tutorial.tex
  3. +1 −1 Tests/test_Motif.py
View
@@ -286,12 +286,12 @@ def exp_score(self, background=None):
background = dict(background)
total = sum(background.values())
for letter in self.alphabet.letters:
- background[a] /= total
+ background[letter] /= total
for i in range(self.length):
ex1 = 0.0
ex2 = 0.0
for letter in self.alphabet.letters:
- p = self[i][letter]
+ p = self[letter][i]
b = background[letter]
if p!=0:
term = p*(math.log(p,2)-math.log(b,2))
@@ -301,7 +301,7 @@ def exp_score(self, background=None):
var += ex2-ex1**2
return exs, math.sqrt(var)
- def make_pssm(self, background=None):
+ def log_odds(self, background=None):
"""
returns the Position-Specific Scoring Matrix.
@@ -348,51 +348,6 @@ def make_pssm(self, background=None):
values[letter].append(logodds)
return PositionSpecificScoringMatrix(alphabet, values)
- def dist_pearson(self, pwm):
- """
- return the similarity score based on pearson correlation for the given motif against self.
-
- We use the Pearson's correlation of the respective probabilities.
- """
-
- if self.alphabet != pwm.alphabet:
- raise ValueError("Cannot compare motifs with different alphabets")
-
- max_p=-2
- for offset in range(-self.length+1, pwm.length):
- if offset<0:
- p = self.dist_pearson_at(pwm,-offset)
- else: # offset>=0
- p = pwm.dist_pearson_at(self,offset)
-
- if max_p<p:
- max_p=p
- max_o=-offset
- return 1-max_p,max_o
-
- def dist_pearson_at(self, pwm, offset):
- sxx = 0 # \sum x^2
- sxy = 0 # \sum x \cdot y
- sx = 0 # \sum x
- sy = 0 # \sum y
- syy = 0 # \sum y^2
- norm=max(self.length,offset+pwm.length)
-
- for pos in range(max(self.length,offset+pwm.length)):
- for l in self.alphabet.letters:
- xi = self[l][pos]
- yi = pwm[l][pos-offset]
- sx = sx + xi
- sy = sy + yi
- sxx = sxx + xi * xi
- syy = syy + yi * yi
- sxy = sxy + xi * yi
-
- norm *= len(self.alphabet.letters)
- s1 = (sxy - sx*sy*1.0/norm)
- s2 = (norm*sxx - sx*sx*1.0)*(norm*syy- sy*sy*1.0)
- return s1/math.sqrt(s2)
-
class PositionSpecificScoringMatrix(GenericPositionMatrix):
@@ -456,24 +411,26 @@ def search(self, sequence, threshold=0.0, both=True):
if score > threshold:
yield (position-n, score)
+ @property
def max_score(self):
"""Maximal possible score for this motif.
returns the score computed for the consensus sequence.
"""
score = 0.0
- letters = self.alphabet
+ letters = self.alphabet.letters
for position in xrange(0,self.length):
score += max([self[letter][position] for letter in letters])
return score
+ @property
def min_score(self):
"""Minimal possible score for this motif.
returns the score computed for the anticonsensus sequence.
"""
score = 0.0
- letters = self.alphabet
+ letters = self.alphabet.letters
for position in xrange(0,self.length):
score += min([self[letter][position] for letter in letters])
return score
@@ -668,7 +625,7 @@ def pwm(self,laplace=True):
>>> motif.pwm()
use
>>> pwm = motif.counts.normalize()
-See the documentation of motif.counts.normalize and pwm.make_pssm for
+See the documentation of motif.counts.normalize and pwm.log_odds for
details on treatment of pseudocounts and background probabilities.
""", PendingDeprecationWarning)
if self._pwm_is_current:
@@ -709,8 +666,8 @@ def log_odds(self,laplace=True):
>>> motif.log_odds()
use
>>> pwm = motif.counts.normalize()
->>> pssm = pwm.make_pssm()
-See the documentation of motif.counts.normalize and pwm.make_pssm for
+>>> pssm = pwm.log_odds()
+See the documentation of motif.counts.normalize and pwm.log_odds for
details on treatment of pseudocounts and background probabilities.
""", PendingDeprecationWarning)
if self._log_odds_is_current:
@@ -802,10 +759,10 @@ def score_hit(self,sequence,position,normalized=0,masked=0):
>>> motif.score_hit(sequence, position)
please use
>>> pwm = motif.counts.normalize()
->>> pssm = pwm.make_pssm()
+>>> pssm = pwm.log_odds()
>>> s = sequence[position:positon+len(pssm)]
>>> pssm.calculate(s)
-See the documentation of motif.counts.normalize() and pwm.make_pssm
+See the documentation of motif.counts.normalize() and pwm.log_odds
for details on the treatment of pseudocounts and background probabilities.
""", PendingDeprecationWarning)
lo=self.log_odds()
@@ -834,9 +791,9 @@ def search_pwm(self,sequence,normalized=0,masked=0,threshold=0.0,both=True):
>>> motif.score_hit(sequence, position)
please use
>>> pwm = motif.counts.normalize()
->>> pssm = pwm.make_pssm()
+>>> pssm = pwm.log_odds()
>>> pssm.search(sequence)
-See the documentation of motif.counts.normalize() and pwm.make_pssm
+See the documentation of motif.counts.normalize() and pwm.log_odds
for details on treatment of pseudocounts and background probabilities.
""", PendingDeprecationWarning)
raise Exception
@@ -1322,7 +1279,7 @@ def max_score(self):
>>> motif.max_score()
please use
>>> pwm = motif.counts.normalize()
->>> pssm = pwm.make_pssm()
+>>> pssm = pwm.log_odds()
>>> pssm.max_score()
""",
PendingDeprecationWarning)
@@ -1338,7 +1295,7 @@ def min_score(self):
>>> motif.min_score()
please use
>>> pwm = motif.counts.normalize()
->>> pssm = pwm.make_pssm()
+>>> pssm = pwm.log_odds()
>>> pssm.min_score()
""",
PendingDeprecationWarning)
@@ -1713,9 +1670,9 @@ def scanPWM(self,seq):
>>> motif.scanPWM(sequence)
use
>>> pwm = motif.counts.normalize()
->>> pssm = pwm.make_pssm()
+>>> pssm = pwm.log_odds()
>>> pssm.calculate(sequence)
-See the documentation of motif.counts.normalize, pwm.make_pssm, and
+See the documentation of motif.counts.normalize, pwm.log_odds, and
pssm.calculate for details.
""", PendingDeprecationWarning)
if self.alphabet!=IUPAC.unambiguous_dna:
@@ -1745,9 +1702,9 @@ def _pwm_calculate(self, sequence):
>>> motif._pwm_calculate(sequence)
use
>>> pwm = motif.counts.normalize()
->>> pssm = pwm.make_pssm()
+>>> pssm = pwm.log_odds()
>>> pssm.calculate(sequence)
-See the documentation of motif.counts.normalize, pwm.make_pssm, and
+See the documentation of motif.counts.normalize, pwm.log_odds, and
pssm.calculate for details.
""", PendingDeprecationWarning)
logodds = self.log_odds()
View
@@ -11006,64 +11006,89 @@ \section{Position-Weight Matrices}
\begin{verbatim}
>>> pwm = m.counts.normalize(pseudocounts={'A':0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6})
\end{verbatim}
-We can also calculate the information content of a motif with a simple call:
+The position-weight matrix has its own methods to calculate the consensus, anticonsensus, and degenerate consensus sequences:
%cont-doctest
\begin{verbatim}
->>> print "%0.2f" % pwm.ic()
-3.22
+>>> pwm.consensus
+Seq('TACGC', IUPACUnambiguousDNA())
+>>> pwm.anticonsensus
+Seq('GGGTG', IUPACUnambiguousDNA())
+>>> pwm.degenerate_consensus
+Seq('WACNC', IUPACAmbiguousDNA())
\end{verbatim}
-This gives us a number of bits of information provided by the motif,
-which tells us how much the motif differs from background.
-
-By default, the information content is calculated with respect to a uniform
-background, in which all letters in the alphabet of the motif are assigend an
-equal probability. To specify a non-uniform background, pass a dictionary with
-the background probabilities when calling this method. To calculate the
-information content of a motif with respect to a background corresponding
-to a GC content of 40\%, use
+Note that due to the pseudocounts, the degenerate consensus sequence calculated from the position-weight matrix is slightly different from the degenerate consensus sequence calculated from the instances in the motif:
%cont-doctest
\begin{verbatim}
->>> print "%0.2f" % pwm.ic(background={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
-3.08
+>>> m.degenerate_consensus
+Seq('WACVC', IUPACAmbiguousDNA())
\end{verbatim}
+The reverse complement of the position-weight matrix can be calculated directly from the \verb+pwm+.
\section{Position-Specific Scoring Matrices}
Using the background distribution and PWM with pseudo-counts added,
it's easy to compute the log-odds ratios, telling us what are the log
odds of a particular symbol to be coming from a motif against the
-background. We can use the \verb|.log_odds()| method:
-
-\begin{verbatim}
- >>> m.log_odds()
-[{'A': -2.3219280948873622,
- 'C': -2.3219280948873622,
- 'T': 1.7655347463629771,
- 'G': -2.3219280948873622},
- {'A': 1.7655347463629771,
- 'C': -2.3219280948873622,
- 'T': -2.3219280948873622,
- 'G': -2.3219280948873622},
- {'A': -2.3219280948873622,
- 'C': -2.3219280948873622,
- 'T': 1.7655347463629771,
- 'G': -2.3219280948873622},
- {'A': 1.3785116232537298,
- 'C': -2.3219280948873622,
- 'T': 0.0,
- 'G': -2.3219280948873622},
- {'A': 1.7655347463629771,
- 'C': -2.3219280948873622,
- 'T': -2.3219280948873622,
- 'G': -2.3219280948873622}
-]
+background. We can use the \verb|.log_odds()| method on the position-weight
+matrix:
+
+\begin{verbatim}
+>>> pwm.log_odds()
+{'A': [0.6780719051126378, 1.7560744171139109, -1.9068905956085187, 0.2085866218114176, -0.4918530963296747], 'C': [-2.4918530963296748, -2.4918530963296748, 1.2630344058337941, 0.09310940439148147, 1.5081469036703254], 'T': [1.031708859727338, -1.9068905956085187, 0.2085866218114176, -1.9068905956085187, -1.9068905956085187], 'G': [-2.4918530963296748, -2.4918530963296748, -2.4918530963296748, 0.5956097449206647, -2.4918530963296748]}
\end{verbatim}
Here we can see positive values for symbols more frequent in the motif
than in the background and negative for symbols more frequent in the
background. $0.0$ means that it's equally likely to see a symbol in the
background and in the motif (e.g. `T' in the second-last position).
-\subsection{Searching for instances}
+This assumes that A, C, G, and T are equally likely in the background. To
+calculate the position-specific scoring matrix against a background with
+unequal probabilities for A, C, G, T, use the \verb+background+ argument.
+For example, against a background with a 40\% GC content, use
+\begin{verbatim}
+>>> pwm.log_odds(background={'A':0.3,'C':0.2,'G':0.2,'T':0.3})
+{'A': [0.415037499278844, 1.493040011280117, -2.169925001442312, -0.054447784022376135, -0.7548875021634684], 'C': [-2.1699250014423126, -2.1699250014423126, 1.5849625007211563, 0.4150374992788437, 1.8300749985576878], 'T': [0.7686744538935444, -2.169925001442312, -0.054447784022376135, -2.169925001442312, -2.169925001442312], 'G': [-2.1699250014423126, -2.1699250014423126, -2.1699250014423126, 0.9175378398080271, -2.1699250014423126]}
+\end{verbatim}
+
+The maximum and minimum score that can be obtained from the \verb+max_score+
+and \verb+min_score+ properties:
+%cont-doctest
+\begin{verbatim}
+>>> pssm = pwm.log_odds()
+>>> print "%4.2f" % pssm.max_score
+6.15
+>>> print "%4.2f" % pssm.min_score
+-11.87
+\end{verbatim}
+
+To calculate the information content of this position-specific scoring matrix,
+use the \verb+.ic+ method on the position-weight matrix. For the uniform
+background, we find
+%cont-doctest
+\begin{verbatim}
+>>> print "%0.2f" % pwm.ic()
+3.22
+\end{verbatim}
+This gives us a number of bits of information provided by the motif,
+which tells us how much the motif differs from background.
+The information content of the position-specific scoring matrix against a
+background with a GC content of 40\% can be calculated by using the
+\verb+background+ argument:
+%cont-doctest
+\begin{verbatim}
+>>> print "%0.2f" % pwm.ic(background={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
+3.08
+\end{verbatim}
+
+The mean and standard deviation of the PSSM scores can be calculated similarly:
+%cont-doctest
+\begin{verbatim}
+>>> mean, std = pwm.exp_score(background={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
+>>> print "mean = %0.2f, standard deviation = %0.2f" % (mean, std)
+mean = 3.21, standard deviation = 1.50
+\end{verbatim}
+
+\section{Searching for instances}
\label{sec:search}
The most frequent use for a motif is to find its instances in some
@@ -11098,7 +11123,7 @@ \subsection{Searching for instances}
It's just as easy to look for positions, giving rise to high log-odds scores against our motif:
%cont-doctest
\begin{verbatim}
->>> pssm = pwm.make_pssm()
+>>> pssm = pwm.log_odds()
>>> for position, score in pssm.search(test_seq, threshold=4.0):
... print "Position %i gives score %0.2f" % (position, score)
...
@@ -11119,6 +11144,19 @@ \subsection{Searching for instances}
background. The default threshold is $0.0$, which selects everything
that looks more like the motif than the background.
+You can also calculate the scores at all positions along the sequence:
+%cont-doctest
+\begin{verbatim}
+>>> pssm.calculate(test_seq)
+array([ 5.76755142, -5.53445292, -3.87148786, 0.49856207,
+ -7.2893405 , -1.89541376, -10.70437813, -2.92593575,
+ 0.69650143, -3.18081594, 3.76755118, -2.0039382 ,
+ -1.04141295, 5.29843712, -0.94949049, -4.0039382 ,
+ -9.17386341, -0.53891265, -0.45645046, -2.24905086,
+ -10.70437813, -2.92593575], dtype=float32)
+\end{verbatim}
+In general, this is the fastest way to calculate PSSM scores.
+
If you want to use a less arbitrary way of selecting thresholds, you
can explore the \verb|Motif.score_distribution| class implementing a
distribution of scores for a given motif. Since the space for a score
View
@@ -1528,7 +1528,7 @@ def test_simple(self):
"""Test if Motif PWM scoring works."""
counts = self.m.counts
pwm = counts.normalize(pseudocounts=0.25)
- pssm = pwm.make_pssm()
+ pssm = pwm.log_odds()
result = pssm.calculate(self.s)
self.assertEqual(6, len(result))
# The fast C-code in Bio/Motif/_pwm.c stores all results as 32-bit

0 comments on commit ac9f504

Please sign in to comment.