Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Implementing and documenting information content

  • Loading branch information...
commit 3d92676606de318bc7b4d64c9bd79b18a2d2f529 1 parent ac9f504
Michiel de Hoon authored
Showing with 51 additions and 93 deletions.
  1. +37 −67 Bio/Motif/_Motif.py
  2. +14 −26 Doc/Tutorial.tex
View
104 Bio/Motif/_Motif.py
@@ -243,64 +243,6 @@ def __init__(self, alphabet, counts):
for letter in alphabet.letters:
self[letter] = tuple(self[letter])
- def ic(self, background=None):
- """\
-Returns the information content of a motif.
-
-By default, a uniform background is used. To specify a non-uniform
-background, use the 'background' argument to pass a dictionary containing
-the probability of each letter in the alphabet associated with the motif
-under the background distribution.
- """
- result=0
- if background is None:
- background = {}
- for a in self.alphabet.letters:
- background[a] = 1.0
- else:
- background = dict(background)
- total = sum(background.values())
- for a in self.alphabet.letters:
- background[a] /= total
- for a in self.alphabet.letters:
- if background[a]!=0:
- result-=background[a]*math.log(background[a],2)
- result *= self.length
- for i in range(self.length):
- for a in self.alphabet.letters:
- if self[a][i]!=0:
- result+=self[a][i]*math.log(self[a][i],2)
- return result
-
- def exp_score(self, background=None):
- """
- Computes expected score of motif's instance and its standard deviation
- """
- exs = 0.0
- var = 0.0
- if background is None:
- background = {}
- for letter in self.alphabet.letters:
- background[letter] = 1.0
- else:
- background = dict(background)
- total = sum(background.values())
- for letter in self.alphabet.letters:
- background[letter] /= total
- for i in range(self.length):
- ex1 = 0.0
- ex2 = 0.0
- for letter in self.alphabet.letters:
- p = self[letter][i]
- b = background[letter]
- if p!=0:
- term = p*(math.log(p,2)-math.log(b,2))
- ex1 += term
- ex2 += term**2
- exs += ex1
- var += ex2-ex1**2
- return exs, math.sqrt(var)
-
def log_odds(self, background=None):
"""
returns the Position-Specific Scoring Matrix.
@@ -321,14 +263,20 @@ def log_odds(self, background=None):
total = sum(background.values())
for letter in alphabet.letters:
background[letter] /= total
- for letter in alphabet.letters:
values[letter] = []
- b = background[letter]
- if b > 0:
- for i in range(self.length):
+ mean = 0.0
+ variance = 0.0
+ for i in range(self.length):
+ sx = 0.0
+ sxx = 0.0
+ for letter in alphabet.letters:
+ b = background[letter]
+ if b > 0:
p = self[letter][i]
if p > 0:
logodds = math.log(p/b, 2)
+ sx += p*logodds
+ sxx += p*logodds*logodds
else:
#TODO - Ensure this has unittest coverage!
try:
@@ -338,15 +286,23 @@ def log_odds(self, background=None):
# and failed on Windows XP 32bit
logodds = - 1E400
values[letter].append(logodds)
- else:
- for i in range(self.length):
+ else:
p = self[letter][i]
if p > 0:
logodds = float("inf")
+ mean = float("inf")
+ variance = float("inf")
else:
logodds = float("nan")
values[letter].append(logodds)
- return PositionSpecificScoringMatrix(alphabet, values)
+ sxx -= sx*sx
+ mean += sx
+ variance += sxx
+ pssm = PositionSpecificScoringMatrix(alphabet, values)
+ variance = max(variance, 0) # to avoid roundoff problems
+ pssm._mean = mean
+ pssm._std = math.sqrt(variance)
+ return pssm
class PositionSpecificScoringMatrix(GenericPositionMatrix):
@@ -412,7 +368,7 @@ def search(self, sequence, threshold=0.0, both=True):
yield (position-n, score)
@property
- def max_score(self):
+ def max(self):
"""Maximal possible score for this motif.
returns the score computed for the consensus sequence.
@@ -424,7 +380,7 @@ def max_score(self):
return score
@property
- def min_score(self):
+ def min(self):
"""Minimal possible score for this motif.
returns the score computed for the anticonsensus sequence.
@@ -435,6 +391,20 @@ def min_score(self):
score += min([self[letter][position] for letter in letters])
return score
+ @property
+ def mean(self):
+ """Expected value of the score of a motif.
+
+ returns None if the expected value is undefined"""
+ return self._mean
+
+ @property
+ def std(self):
+ """Standard deviation of the score of a motif.
+
+ returns None if the standard deviation is undefined"""
+ return self._std
+
class Motif(object):
"""
View
40 Doc/Tutorial.tex
@@ -11050,43 +11050,31 @@ \section{Position-Specific Scoring Matrices}
{'A': [0.415037499278844, 1.493040011280117, -2.169925001442312, -0.054447784022376135, -0.7548875021634684], 'C': [-2.1699250014423126, -2.1699250014423126, 1.5849625007211563, 0.4150374992788437, 1.8300749985576878], 'T': [0.7686744538935444, -2.169925001442312, -0.054447784022376135, -2.169925001442312, -2.169925001442312], 'G': [-2.1699250014423126, -2.1699250014423126, -2.1699250014423126, 0.9175378398080271, -2.1699250014423126]}
\end{verbatim}
-The maximum and minimum score that can be obtained from the \verb+max_score+
-and \verb+min_score+ properties:
+The maximum and minimum score obtainable from the PSSM are stores in the
+\verb+.max+ and \verb+.min+ properties:
%cont-doctest
\begin{verbatim}
>>> pssm = pwm.log_odds()
->>> print "%4.2f" % pssm.max_score
+>>> print "%4.2f" % pssm.max
6.15
->>> print "%4.2f" % pssm.min_score
+>>> print "%4.2f" % pssm.min
-11.87
\end{verbatim}
-To calculate the information content of this position-specific scoring matrix,
-use the \verb+.ic+ method on the position-weight matrix. For the uniform
-background, we find
+Similarly, the mean and standard deviation of the PSSM scores are stored in
+the \verb+.mean+ and \verb+.std+ attributes:
%cont-doctest
\begin{verbatim}
->>> print "%0.2f" % pwm.ic()
-3.22
-\end{verbatim}
-This gives us a number of bits of information provided by the motif,
-which tells us how much the motif differs from background.
-The information content of the position-specific scoring matrix against a
-background with a GC content of 40\% can be calculated by using the
-\verb+background+ argument:
-%cont-doctest
-\begin{verbatim}
->>> print "%0.2f" % pwm.ic(background={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
-3.08
-\end{verbatim}
-
-The mean and standard deviation of the PSSM scores can be calculated similarly:
-%cont-doctest
-\begin{verbatim}
->>> mean, std = pwm.exp_score(background={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
+>>> mean = pssm.mean
+>>> std = pssm.std
>>> print "mean = %0.2f, standard deviation = %0.2f" % (mean, std)
-mean = 3.21, standard deviation = 1.50
+mean = 3.22, standard deviation = 2.49
\end{verbatim}
+The mean is particularly important, as its value is equal to the
+Kullback-Leibler divergence or relative entropy, and is a measure for the
+information content of the motif compared to the background. As in Biopython
+the base-2 logarithm is used in the calculation of the log-odds scores, the
+information content has units of bits.
\section{Searching for instances}
\label{sec:search}
Please sign in to comment.
Something went wrong with that request. Please try again.