Use background as an argument to mean, std, distribution

biopython · Jan 13, 2013 · 012c852 · 012c852
1 parent fa41c9c
commit 012c852
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 35 deletions.
diff --git a/Bio/Motif/Matrix.py b/Bio/Motif/Matrix.py
@@ -264,9 +264,7 @@ def log_odds(self, background=None):
         values = {}
         alphabet = self.alphabet
         if background is None:
-            background = {}
-            for letter in alphabet.letters:
-                background[letter] = 1.0
+            background = dict.fromkeys(self._letters, 1.0)
         else:
             background = dict(background)
         total = sum(background.values())
@@ -296,7 +294,6 @@ def log_odds(self, background=None):
                         logodds = float("nan")
                 values[letter].append(logodds)
         pssm = PositionSpecificScoringMatrix(alphabet, values)
-        pssm._background = background
         return pssm
 
 
@@ -386,14 +383,15 @@ def min(self):
             score += min([self[letter][position] for letter in letters])
         return score
 
-    @property
-    def mean(self):
-        """Expected value of the score of a motif.
-
-        returns None if the expected value is undefined"""
-        if self._background is None:
-            return None
-        background = self._background
+    def mean(self, background=None):
+        """Expected value of the score of a motif."""
+        if background is None:
+            background = dict.fromkeys(self._letters, 1.0)
+        else:
+            background = dict(background)
+        total = sum(background.values())
+        for letter in self._letters:
+            background[letter] /= total
         sx = 0.0
         for i in range(self.length):
             for letter in self._letters:
@@ -402,16 +400,16 @@ def mean(self):
                 p = b * math.pow(2,logodds)
                 sx += p * logodds
         return sx
-
 
-    @property
-    def std(self):
-        """Standard deviation of the score of a motif.
-
-        returns None if the standard deviation is undefined"""
-        if self._background is None:
-            return None
-        background = self._background
+    def std(self, background=None):
+        """Standard deviation of the score of a motif."""
+        if background is None:
+            background = dict.fromkeys(self._letters, 1.0)
+        else:
+            background = dict(background)
+        total = sum(background.values())
+        for letter in self._letters:
+            background[letter] /= total
         variance = 0.0
         for i in range(self.length):
             sx = 0.0
@@ -472,12 +470,14 @@ def dist_pearson_at(self, other, offset):
         denominator = math.sqrt((sxx-sx*sx)*(syy-sy*sy))
         return numerator/denominator
 
-    def distribution(self, precision=10**3):
-        """calculate the distribution of the scores at the given precision
-
-        returns None if the distribution is undefined"""
-        background = self._background
-        if background==None:
-            return None
+    def distribution(self, background=None, precision=10**3):
+        """calculate the distribution of the scores at the given precision."""
         from Thresholds import ScoreDistribution
+        if background is None:
+            background = dict.fromkeys(self._letters, 1.0)
+        else:
+            background = dict(background)
+        total = sum(background.values())
+        for letter in self._letters:
+            background[letter] /= total
         return ScoreDistribution(precision=precision, pssm=self, background=background)
diff --git a/Bio/Motif/Thresholds.py b/Bio/Motif/Thresholds.py
@@ -24,7 +24,7 @@ def __init__(self, motif=None, precision=10**3, pssm=None, background=None):
             self.min_score = min(0.0, pssm.min)
             self.interval = max(0.0, pssm.max)-self.min_score
             self.n_points = precision * pssm.length
-            self.ic = pssm.mean
+            self.ic = pssm.mean(background)
         self.step = self.interval/(self.n_points-1)
         self.mo_density = [0.0]*self.n_points
         self.mo_density[-self._index_diff(self.min_score)] = 1.0

diff --git a/Doc/Tutorial.tex b/Doc/Tutorial.tex
@@ -11240,7 +11240,8 @@ \section{Position-Specific Scoring Matrices}
 For example, against a background with a 40\% GC content, use
 %cont-doctest
 \begin{verbatim}
->>> pssm = pwm.log_odds(background={'A':0.3,'C':0.2,'G':0.2,'T':0.3})
+>>> background = {'A':0.3,'C':0.2,'G':0.2,'T':0.3}
+>>> pssm = pwm.log_odds(background)
 >>> print pssm
         0      1      2      3      4
 A:   0.42   1.49  -2.17  -0.05  -0.75
@@ -11260,15 +11261,16 @@ \section{Position-Specific Scoring Matrices}
 -10.85
 \end{verbatim}
 
-Similarly, the mean and standard deviation of the PSSM scores are stored in
-the \verb+.mean+ and \verb+.std+ attributes:
+The mean and standard deviation of the PSSM scores with respect to a specific
+background are calculated by the \verb+.mean+ and \verb+.std+ methods.
 %cont-doctest
 \begin{verbatim}
->>> mean = pssm.mean
->>> std = pssm.std
+>>> mean = pssm.mean(background)
+>>> std = pssm.std(background)
 >>> print "mean = %0.2f, standard deviation = %0.2f" % (mean, std)
 mean = 3.21, standard deviation = 2.59
 \end{verbatim}
+A uniform background is used if \verb+background+ is not specified.
 The mean is particularly important, as its value is equal to the 
 Kullback-Leibler divergence or relative entropy, and is a measure for the
 information content of the motif compared to the background. As in Biopython
@@ -11374,7 +11376,7 @@ \subsection{Selecting a score threshold}
 approximation with a given precision to keep computation cost manageable:
 %cont-doctest
 \begin{verbatim}
->>> distribution = pssm.distribution(precision=10**4)
+>>> distribution = pssm.distribution(background=background, precision=10**4)
 \end{verbatim}
 The \verb+distribution+ object can be used to determine a number of different thresholds.
 We can specify the requested false-positive rate (probability of ``finding'' a motif instance in background generated sequence):