Skip to content
Browse files

Implement PSSM score calculations

  • Loading branch information...
1 parent 6e09a4a commit a7154f20e395ec2e75ea2fed1252891a57780e4c @mdehoon mdehoon committed Dec 13, 2012
Showing with 60 additions and 12 deletions.
  1. +52 −6 Bio/Motif/_Motif.py
  2. +5 −5 Doc/Tutorial.tex
  3. +3 −1 Tests/test_Motif.py
View
58 Bio/Motif/_Motif.py
@@ -166,8 +166,9 @@ def exp_score(self, background=None):
ex2 = 0.0
for letter in self.alphabet.letters:
p = self[i][letter]
+ b = background[letter]
if p!=0:
- term = p*(math.log(p,2)-math.log(background[a],2))
+ term = p*(math.log(p,2)-math.log(b,2))
ex1 += term
ex2 += term**2
exs += ex1
@@ -220,12 +221,44 @@ class PositionSpecificScoringMatrix(GenericPositionMatrix):
def calculate(self, sequence):
"""
- returns the PWM score for a given sequence
+ returns the PWM score for a given sequence for all positions.
+
+ - the sequence can only be a DNA sequence
+ - the search is performed only on one strand
+ - if the sequence and the motif have the same length, a single
+ number is returned
+ - otherwise, the result is a one-dimensional list or numpy array
"""
- score = 0.0
- for position, letter in enumerate(sequence):
- score += self[letter][position]
- return score
+ if self.alphabet!=IUPAC.unambiguous_dna:
+ raise ValueError("Wrong alphabet! Use only with DNA motifs")
+ if sequence.alphabet!=IUPAC.unambiguous_dna:
+ raise ValueError("Wrong alphabet! Use only with DNA sequences")
+
+ sequence = str(sequence)
+ m = self.length
+ n = len(sequence)
+
+ scores = []
+ # check if the fast C code can be used
+ try:
+ import _pwm
+ except ImportError:
+ # use the slower Python code otherwise
+ for i in xrange(n-m+1):
+ score = 0.0
+ for position in xrange(m):
+ letter = sequence[i+position]
+ score += self[letter][position]
+ scores.append(score)
+ else:
+ # get the log-odds matrix into a proper shape
+ # (each row contains sorted (ACGT) log-odds values)
+ logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)]
+ scores = _pwm.calculate(sequence, logodds)
+ if len(scores)==1:
+ return scores[0]
+ else:
+ return scores
def search(self, sequence, threshold=0.0, both=True):
"""
@@ -352,13 +385,21 @@ def has_counts(self):
return self.counts is not None
def _check_length(self, len):
+ warnings.warn("""\
+This function is now obsolete, and will be deprecated and removed
+in a future release of Biopython.
+""", PendingDeprecationWarning)
if self.length is None:
self.length = len
elif self.length != len:
print "len",self.length,self.instances, len
raise ValueError("You can't change the length of the motif")
def _check_alphabet(self,alphabet):
+ warnings.warn("""\
+This function is now obsolete, and will be deprecated and removed
+in a future release of Biopython.
+""", PendingDeprecationWarning)
if self.alphabet is None:
self.alphabet=alphabet
elif self.alphabet != alphabet:
@@ -368,6 +409,11 @@ def add_instance(self,instance):
"""
adds new instance to the motif
"""
+ warnings.warn("""\
+This function is now obsolete, and will be deprecated and removed
+in a future release of Biopython. Instead of adding instances to an existing
+Motif object, please create a new Motif object.
+""", PendingDeprecationWarning)
self._check_alphabet(instance.alphabet)
self._check_length(len(instance))
if self.counts is not None:
View
10 Doc/Tutorial.tex
@@ -11048,11 +11048,11 @@ \subsection{Searching for instances}
>>> for position, score in pssm.search(test_seq, threshold=5.0):
... print position, score
...
-10 8.44065060871
--14 7.06213898545
-15 8.44065060871
--6 8.44065060871
-21 8.44065060871
+10 8.44065
+-14 7.06214
+15 8.44065
+-6 8.44065
+21 8.44065
\end{verbatim}
The negative positions refer to instances of the motif found on the
reverse strand of the test sequence, and follow the Python convention
View
4 Tests/test_Motif.py
@@ -1526,7 +1526,9 @@ def setUp(self):
def test_simple(self):
"""Test if Motif PWM scoring works."""
- result = self.m.scanPWM(self.s)
+ pwm = self.m.make_pwm(pseudocounts=0.25)
+ pssm = pwm.make_pssm()
+ result = pssm.calculate(self.s)
self.assertEqual(6, len(result))
# The fast C-code in Bio/Motif/_pwm.c stores all results as 32-bit
# floats; the slower Python code in Bio/Motif/_Motif.py uses 64-bit

0 comments on commit a7154f2

Please sign in to comment.
Something went wrong with that request. Please try again.