Permalink
Browse files

Adding a PositionWeightMatrix class

  • Loading branch information...
Michiel de Hoon Michiel de Hoon
Michiel de Hoon authored and Michiel de Hoon committed Dec 6, 2012
1 parent 5ff0fe1 commit 8a0e7250b2bd548c6586cc4bb3b3a2cbdaa99ab0
Showing with 116 additions and 44 deletions.
  1. +95 −25 Bio/Motif/_Motif.py
  2. +21 −19 Doc/Tutorial.tex
View
@@ -153,7 +153,6 @@ def set_mask(self,mask):
>>> motif.set_mask(mask)
please use
>>> motif.mask = mask
-instead.
""", PendingDeprecationWarning)
self._check_length(len(mask))
self.__mask=[]
@@ -226,6 +225,34 @@ def pwm(self,laplace=True):
self._pwm_is_current=1
return self._pwm
+ def make_pwm(self, pseudocounts=None):
+ """
+ return the position-weight matrix (calculated from the counts
+ matrix).
+
+ If pseudocounts is None (default), no pseudocounts are added
+ to the counts.
+ If pseudocounts is a number, it is added to the counts before
+ calculating the position-weight matrix.
+ Alternatively, the pseudocounts can be a dictionary with a key
+ for each letter in the alphabet associated with the motif.
+ """
+
+ counts = {}
+ if pseudocounts==None:
+ for letter in self.alphabet.letters:
+ counts[letter] = [0.0] * self.length
+ elif isinstance(pseudocounts, dict):
+ for letter in self.alphabet.letters:
+ counts[letter] = [float(pseudocounts[letter])] * self.length
+ else:
+ for letter in self.alphabet.letters:
+ counts[letter] = [float(pseudocounts)] * self.length
+ for i in xrange(self.length):
+ for letter in self.alphabet.letters:
+ counts[letter][i] += self.counts[letter][i]
+ return PositionWeightMatrix(self.alphabet, counts)
+
def log_odds(self,laplace=True):
"""
returns the logg odds matrix computed for the set of instances
@@ -243,33 +270,28 @@ def log_odds(self,laplace=True):
self._log_odds_is_current=1
return self._log_odds
- def ic(self, background=None):
- """\
-Returns the information content of a motif.
-
-By default, a uniform background is used. To specify a non-uniform
-background, use the 'background' argument to pass a dictionary containing
-the probability of each letter in the alphabet associated with the motif
-under the background distribution.
+ def ic(self):
+ """Method returning the information content of a motif.
"""
- result=0
- if background is None:
- background = {}
- for a in self.alphabet.letters:
- background[a] = 1.0
- total = sum(background.values())
- for a in self.alphabet.letters:
- background[a] /= total
- for a in self.alphabet.letters:
- if background[a]!=0:
- result-=background[a]*math.log(background[a],2)
- result *= self.length
+ warnings.warn("""\
+This function is now obsolete, and will be deprecated and removed
+in a future release of Biopython. As a replacement, instead of
+>>> motif.ic()
+please use
+>>> pwm = motif.make_pwm()
+>>> pwm.ic()
+Please be aware though that by default, motif.make_pwm() does not
+use psuedocounts, while motif.ic() does. See the documentation of
+motif.make_pwm for more details.
+""", PendingDeprecationWarning)
+ res=0
pwm=self.pwm()
for i in range(self.length):
+ res+=2
for a in self.alphabet.letters:
if pwm[i][a]!=0:
- result+=pwm[i][a]*math.log(pwm[i][a],2)
- return result
+ res+=pwm[i][a]*math.log(pwm[i][a],2)
+ return res
def exp_score(self,st_dev=False):
"""
@@ -559,14 +581,14 @@ def reverse_complement(self):
"""
Gives the reverse complement of the motif
"""
- alphabet = IUPAC.unambiguous_dna
if self.instances is not None:
instances = []
for instance in self.instances:
instance = instance.reverse_complement()
instances.append(instance)
- res = Motif(alphabet, instances)
+ res = Motif(instances=instances)
else: # has counts
+ alphabet = self.alphabet
res = Motif(alphabet)
res.counts={}
res.counts["A"]=self.counts["T"][:]
@@ -1140,3 +1162,51 @@ def _pwm_calculate(self, sequence):
else:
result[i] = score
return result
+
+
+class PositionWeightMatrix(dict):
+
+ def __init__(self, alphabet, counts):
+ self.alphabet = alphabet
+ self.length = None
+ for letter in alphabet.letters:
+ if self.length==None:
+ self.length = len(counts[letter])
+ elif len(counts[letter])!=self.length:
+ raise Exception("Inconsistent size found for the counts")
+ self[letter] = list(counts[letter])
+ for i in xrange(self.length):
+ total = sum([float(self[letter][i]) for letter in alphabet.letters])
+ for letter in self.alphabet.letters:
+ self[letter][i] /= total
+ for letter in self.alphabet.letters:
+ self[letter] = tuple(self[letter])
+
+ def ic(self, background=None):
+ """\
+Returns the information content of a motif.
+
+By default, a uniform background is used. To specify a non-uniform
+background, use the 'background' argument to pass a dictionary containing
+the probability of each letter in the alphabet associated with the motif
+under the background distribution.
+ """
+ result=0
+ if background is None:
+ background = {}
+ for a in self.alphabet.letters:
+ background[a] = 1.0
+ else:
+ background = dict(background)
+ total = sum(background.values())
+ for a in self.alphabet.letters:
+ background[a] /= total
+ for a in self.alphabet.letters:
+ if background[a]!=0:
+ result-=background[a]*math.log(background[a],2)
+ result *= self.length
+ for i in range(self.length):
+ for a in self.alphabet.letters:
+ if self[a][i]!=0:
+ result+=self[a][i]*math.log(self[a][i],2)
+ return result
View
@@ -10770,24 +10770,24 @@ \section{Motif objects}
\begin{verbatim}
>>> from Bio import Motif
\end{verbatim}
-and we can start creating our first motif objects. Let's create a DNA motif:
+and we can start creating our first motif objects.
+Suppose we have these instances of a DNA motif:
%cont-doctest
\begin{verbatim}
->>> from Bio.Alphabet import IUPAC
->>> m = Motif.Motif(alphabet=IUPAC.unambiguous_dna)
+>>> from Bio.Seq import Seq
+>>> instances = [Seq("TATAA"),
+... Seq("TATTA"),
+... Seq("TATAA"),
+... Seq("TATAA"),
+... ]
\end{verbatim}
-This is for now just an empty container, so let's add some sequences to our newly created motif:
+then we can create a Motif object as follows:
%cont-doctest
\begin{verbatim}
->>> from Bio.Seq import Seq
->>> m.add_instance(Seq("TATAA",m.alphabet))
->>> m.add_instance(Seq("TATTA",m.alphabet))
->>> m.add_instance(Seq("TATAA",m.alphabet))
->>> m.add_instance(Seq("TATAA",m.alphabet))
-\end{verbatim}
-Now we have a full \verb|Motif| instance, so we can try to get some
-basic information about it. Let's start with length and consensus
-sequence:
+>>> m = Motif.Motif(instances=instances)
+\end{verbatim}
+Now we can try to get some basic information about it.
+Let's start with length and consensus sequence:
%cont-doctest
\begin{verbatim}
>>> len(m)
@@ -10813,20 +10813,22 @@ \section{Motif objects}
We can also calculate the information content of a motif with a simple call:
%cont-doctest
\begin{verbatim}
->>> print "%0.2f" % m.ic()
+>>> pwm = m.make_pwm(pseudocounts=0.25)
+>>> print "%0.2f" % pwm.ic()
5.27
\end{verbatim}
This gives us a number of bits of information provided by the motif,
-which tells us how much the motif differs from background. By default,
-the information content is calculated with respect to a uniform background,
-in which all letters in the alphabet of the motif are assigend an equal
-probability. To specify a non-uniform background, pass a dictionary with
+which tells us how much the motif differs from background.
+
+By default, the information content is calculated with respect to a uniform
+background, in which all letters in the alphabet of the motif are assigend an
+equal probability. To specify a non-uniform background, pass a dictionary with
the background probabilities when calling this method. For example, the GC
content for human is approximately 40\%. To calculate the information
content of a motif with respect to this background, use
%cont-doctest
\begin{verbatim}
->>> print "%0.2f" % m.ic(background={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
+>>> print "%0.2f" % pwm.ic(background={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})
5.13
\end{verbatim}

1 comment on commit 8a0e725

Member

cbrueffer commented on 8a0e725 Dec 7, 2012

Hi Michiel,

could you use "if foo is None" or "if not foo" instead of a comparison? Those are recommended by PEP8.

Please sign in to comment.