# biopython/biopython

Implement PSSM score calculations

1 parent 6e09a4a commit a7154f20e395ec2e75ea2fed1252891a57780e4c mdehoon committed Dec 13, 2012
Showing with 60 additions and 12 deletions.
1. +52 −6 Bio/Motif/_Motif.py
2. +5 −5 Doc/Tutorial.tex
3. +3 −1 Tests/test_Motif.py
58 Bio/Motif/_Motif.py
 @@ -166,8 +166,9 @@ def exp_score(self, background=None): ex2 = 0.0 for letter in self.alphabet.letters: p = self[i][letter] + b = background[letter] if p!=0: - term = p*(math.log(p,2)-math.log(background[a],2)) + term = p*(math.log(p,2)-math.log(b,2)) ex1 += term ex2 += term**2 exs += ex1 @@ -220,12 +221,44 @@ class PositionSpecificScoringMatrix(GenericPositionMatrix): def calculate(self, sequence): """ - returns the PWM score for a given sequence + returns the PWM score for a given sequence for all positions. + + - the sequence can only be a DNA sequence + - the search is performed only on one strand + - if the sequence and the motif have the same length, a single + number is returned + - otherwise, the result is a one-dimensional list or numpy array """ - score = 0.0 - for position, letter in enumerate(sequence): - score += self[letter][position] - return score + if self.alphabet!=IUPAC.unambiguous_dna: + raise ValueError("Wrong alphabet! Use only with DNA motifs") + if sequence.alphabet!=IUPAC.unambiguous_dna: + raise ValueError("Wrong alphabet! Use only with DNA sequences") + + sequence = str(sequence) + m = self.length + n = len(sequence) + + scores = [] + # check if the fast C code can be used + try: + import _pwm + except ImportError: + # use the slower Python code otherwise + for i in xrange(n-m+1): + score = 0.0 + for position in xrange(m): + letter = sequence[i+position] + score += self[letter][position] + scores.append(score) + else: + # get the log-odds matrix into a proper shape + # (each row contains sorted (ACGT) log-odds values) + logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)] + scores = _pwm.calculate(sequence, logodds) + if len(scores)==1: + return scores[0] + else: + return scores def search(self, sequence, threshold=0.0, both=True): """ @@ -352,13 +385,21 @@ def has_counts(self): return self.counts is not None def _check_length(self, len): + warnings.warn("""\ +This function is now obsolete, and will be deprecated and removed +in a future release of Biopython. +""", PendingDeprecationWarning) if self.length is None: self.length = len elif self.length != len: print "len",self.length,self.instances, len raise ValueError("You can't change the length of the motif") def _check_alphabet(self,alphabet): + warnings.warn("""\ +This function is now obsolete, and will be deprecated and removed +in a future release of Biopython. +""", PendingDeprecationWarning) if self.alphabet is None: self.alphabet=alphabet elif self.alphabet != alphabet: @@ -368,6 +409,11 @@ def add_instance(self,instance): """ adds new instance to the motif """ + warnings.warn("""\ +This function is now obsolete, and will be deprecated and removed +in a future release of Biopython. Instead of adding instances to an existing +Motif object, please create a new Motif object. +""", PendingDeprecationWarning) self._check_alphabet(instance.alphabet) self._check_length(len(instance)) if self.counts is not None:
10 Doc/Tutorial.tex
 @@ -11048,11 +11048,11 @@ \subsection{Searching for instances} >>> for position, score in pssm.search(test_seq, threshold=5.0): ... print position, score ... -10 8.44065060871 --14 7.06213898545 -15 8.44065060871 --6 8.44065060871 -21 8.44065060871 +10 8.44065 +-14 7.06214 +15 8.44065 +-6 8.44065 +21 8.44065 \end{verbatim} The negative positions refer to instances of the motif found on the reverse strand of the test sequence, and follow the Python convention
4 Tests/test_Motif.py
 @@ -1526,7 +1526,9 @@ def setUp(self): def test_simple(self): """Test if Motif PWM scoring works.""" - result = self.m.scanPWM(self.s) + pwm = self.m.make_pwm(pseudocounts=0.25) + pssm = pwm.make_pssm() + result = pssm.calculate(self.s) self.assertEqual(6, len(result)) # The fast C-code in Bio/Motif/_pwm.c stores all results as 32-bit # floats; the slower Python code in Bio/Motif/_Motif.py uses 64-bit