Adding Pearson distance measure between PSSMs

Commit f202f7cc5de1bd30480ed514b1aa6b75b51a32ac (1 parent: 9df4791), committed by Michiel de Hoon on Dec 29, 2012
Showing with 104 additions and 37 deletions.
  1. +57 −12 Bio/Motif/_Motif.py
  2. +42 −25 Doc/Tutorial.tex
  3. +5 −0 Tests/Motif/REB1.pfm
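
Before the per-file diffs, a minimal usage sketch of the new method (not part of the commit), paraphrasing the doctest this commit adds to Doc/Tutorial.tex. It uses the Python 2 era Bio.Motif API shown below; the PSSM object pssm for the tutorial's test motif m is assumed to have been built earlier in the tutorial with the same pseudocounts and background distribution.

from Bio import Motif

# Read the new REB1 test motif added by this commit (Tests/Motif/REB1.pfm).
m_reb1 = Motif.read(open("REB1.pfm"), "pfm")

# Position-weight matrix and position-specific scoring matrix, built with the
# same pseudocounts and background distribution as in the tutorial excerpt below.
pwm_reb1 = m_reb1.counts.normalize(pseudocounts={'A': 0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6})
pssm_reb1 = pwm_reb1.log_odds(background={'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3})

# The new method returns (1 - best Pearson correlation, offset of that alignment).
distance, offset = pssm.dist_pearson(pssm_reb1)
print "distance = %5.3g, offset = %d" % (distance, offset)

According to the doctest added below, this gives a distance of 0.239 at offset -2.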
Bio/Motif/_Motif.py
@@ -24,13 +24,13 @@ def __init__(self, alphabet, values):
raise Exception("Inconsistent lengths found in dictionary")
self[letter] = list(values[letter])
self.alphabet = alphabet
- self.__letters = sorted(self.alphabet.letters)
+ self._letters = sorted(self.alphabet.letters)
def __str__(self):
words = ["%6d" % i for i in range(self.length)]
line = " " + " ".join(words)
lines = [line]
- for letter in self.__letters:
+ for letter in self._letters:
words = ["%6.2f" % value for value in self[letter]]
line = "%c: " % letter + " ".join(words)
lines.append(line)
@@ -42,15 +42,15 @@ def __getitem__(self, key):
if len(key)==2:
key1, key2 = key
if isinstance(key1, slice):
- start1, stop1, stride1 = key1.indices(len(self.__letters))
+ start1, stop1, stride1 = key1.indices(len(self._letters))
indices1 = range(start1, stop1, stride1)
- letters1 = [self.__letters[i] for i in indices1]
+ letters1 = [self._letters[i] for i in indices1]
dim1 = 2
elif isinstance(key1, int):
- letter1 = self.__letters[key1]
+ letter1 = self._letters[key1]
dim1 = 1
elif isinstance(key1, tuple):
- letters1 = [self.__letters[i] for i in key1]
+ letters1 = [self._letters[i] for i in key1]
dim1 = 2
elif isinstance(key1, str):
if len(key1)==1:
@@ -84,7 +84,7 @@ def __getitem__(self, key):
for letter1 in letters1:
values = dict.__getitem__(self, letter1)
d[letter1] = [values[index2] for index2 in indices2]
- if sorted(letters1)==self.__letters:
+ if sorted(letters1)==self._letters:
return self.__class__(self.alphabet, d)
else:
return d
@@ -93,15 +93,15 @@ def __getitem__(self, key):
else:
raise KeyError("keys should be 1- or 2-dimensional")
if isinstance(key, slice):
- start, stop, stride = key.indices(len(self.__letters))
+ start, stop, stride = key.indices(len(self._letters))
indices = range(start, stop, stride)
- letters = [self.__letters[i] for i in indices]
+ letters = [self._letters[i] for i in indices]
dim = 2
elif isinstance(key, int):
- letter = self.__letters[key]
+ letter = self._letters[key]
dim = 1
elif isinstance(key, tuple):
- letters = [self.__letters[i] for i in key]
+ letters = [self._letters[i] for i in key]
dim = 2
elif isinstance(key, str):
if len(key)==1:
@@ -416,6 +416,51 @@ def std(self):
returns None if the standard deviation is undefined"""
return self._std
+ def dist_pearson(self, other):
+ """
+ Return the similarity score based on the Pearson correlation for the given motif against self.
+
+ We use the Pearson correlation of the respective probabilities.
+ """
+ if self.alphabet != other.alphabet:
+ raise ValueError("Cannot compare motifs with different alphabets")
+
+ max_p=-2
+ for offset in range(-self.length+1, other.length):
+ if offset<0:
+ p = self.dist_pearson_at(other, -offset)
+ else: # offset>=0
+ p = other.dist_pearson_at(self, offset)
+ if max_p<p:
+ max_p=p
+ max_o=-offset
+ return 1-max_p,max_o
+
+ def dist_pearson_at(self, other, offset):
+ letters = self._letters
+ sx = 0.0 # \sum x
+ sy = 0.0 # \sum y
+ sxx = 0.0 # \sum x^2
+ sxy = 0.0 # \sum x \cdot y
+ syy = 0.0 # \sum y^2
+ norm=max(self.length,offset+other.length)*len(letters)
+ for pos in range(min(self.length-offset, other.length)):
+ xi = [self[letter,pos+offset] for letter in letters]
+ yi = [other[letter,pos] for letter in letters]
+ sx += sum(xi)
+ sy += sum(yi)
+ sxx += sum([x*x for x in xi])
+ sxy += sum([x*y for x,y in zip(xi,yi)])
+ syy += sum([y*y for y in yi])
+ sx /= norm
+ sy /= norm
+ sxx /= norm
+ sxy /= norm
+ syy /= norm
+ numerator = sxy - sx*sy
+ denominator = math.sqrt((sxx-sx*sx)*(syy-sy*sy))
+ return numerator/denominator
+
class Motif(object):
"""
@@ -895,7 +940,7 @@ def dist_dpq(self,other):
m2[i].freq(alphabet[k])*log_2(m2[i].freq(alphabet[k])/m1[i].freq(alphabet[k]))
}
- over possible non-spaced alignemts of two motifs. See this reference:
+ over possible non-spaced alignments of two motifs. See this reference:
D. M Endres and J. E Schindelin, "A new metric for probability
distributions", IEEE transactions on Information Theory 49, no. 7
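
Before moving on to the documentation changes, a restatement of what the new dist_pearson_at above computes (this is derived from the code, not extra documentation from the commit): it is the ordinary Pearson correlation of the two sets of matrix values after aligning other against self at the given offset and flattening the columns, with positions where the matrices do not overlap contributing zero to every sum. With m and n the two motif lengths, N = max(m, offset + n) * |alphabet| (the norm in the code), and angle brackets denoting averages over those N terms,

r \;=\; \frac{\langle xy\rangle - \langle x\rangle\langle y\rangle}
             {\sqrt{\left(\langle x^{2}\rangle - \langle x\rangle^{2}\right)\left(\langle y^{2}\rangle - \langle y\rangle^{2}\right)}},
\qquad
\langle f\rangle = \frac{1}{N}\sum_{\ell,\,i} f_{\ell,i},

where x are the values of self and y the values of other in the aligned, zero-padded columns. dist_pearson then maximizes r over all offsets and returns 1 - r together with the corresponding offset, which is why the tutorial below treats it as a distance measure.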
Doc/Tutorial.tex
@@ -11255,7 +11255,7 @@ \section{Searching for instances}
21 8.44065060871
\end{verbatim}
-\subsection{Comparing motifs}
+\section{Comparing motifs}
\label{sec:comp}
Once we have more than one motif, we might want to compare them. For
that, we have currently three different methods of \verb|Bio.Motif|
@@ -11272,48 +11272,65 @@ \subsection{Comparing motifs}
In \verb|Bio.Motif| we have 3 different functions for motif
comparison, which are based on the same idea behind motif alignment,
but use different functions to compare aligned motifs. Briefly
-speaking, we are using ungapped alignment of PWMs and substitute the
-missing columns at the beginning and end of the matrices with
-background distribution. All three comparison functions are written in
-such a way, that they can be interpreted as distance measures, however
+speaking, we are using ungapped alignment of PSSMs and substitute zeros
+for any missing columns at the beginning and end of the matrices. This means
+that effectively we are using the background distribution for columns missing
+from the PSSM. All three comparison functions are written in
+such a way that they can be interpreted as distance measures. However
only one (\verb|dist_dpq|) satisfies the triangle inequality. All of
them return the minimal distance and the corresponding offset between
motifs.
To show how these functions work, let us first load another motif,
which is similar to our test motif \verb|m|:
+%cont-doctest
\begin{verbatim}
->>> ubx=Motif.read(open("Ubx.pfm"),"jaspar-pfm")
-<Bio.Motif.Motif.Motif object at 0xc29b90>
->>> ubx.consensus
-Seq('TAAT', IUPACUnambiguousDNA())
+>>> m_reb1 = Motif.read(open("REB1.pfm"), "pfm")
+>>> m_reb1.consensus
+Seq('GTTACCCGG', IUPACUnambiguousDNA())
+>>> print m_reb1.counts
+ 0 1 2 3 4 5 6 7 8
+A: 30.00 0.00 0.00 100.00 0.00 0.00 0.00 0.00 15.00
+C: 10.00 0.00 0.00 0.00 100.00 100.00 100.00 0.00 15.00
+G: 50.00 0.00 0.00 0.00 0.00 0.00 0.00 60.00 55.00
+T: 10.00 100.00 100.00 0.00 0.00 0.00 0.00 40.00 15.00
+<BLANKLINE>
\end{verbatim}
+We construct the position-weight matrix and the position-specific scoring matrix
+using the same values for the pseudocounts and the background distribution as our
+motif \verb|m|:
+%cont-doctest
+\begin{verbatim}
+>>> pwm_reb1 = m_reb1.counts.normalize(pseudocounts={'A':0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6})
+>>> pssm_reb1 = pwm_reb1.log_odds(background={'A':0.3,'C':0.2,'G':0.2,'T':0.3})
+>>> print pssm_reb1
+ 0 1 2 3 4 5 6 7 8
+A: 0.00 -5.67 -5.67 1.72 -5.67 -5.67 -5.67 -5.67 -0.97
+C: -0.97 -5.67 -5.67 -5.67 2.30 2.30 2.30 -5.67 -0.41
+G: 1.30 -5.67 -5.67 -5.67 -5.67 -5.67 -5.67 1.57 1.44
+T: -1.53 1.72 1.72 -5.67 -5.67 -5.67 -5.67 0.41 -0.97
+<BLANKLINE>
+\end{verbatim}
The first function we'll use to compare these motifs is based on the
Pearson correlation. Since we want it to resemble a distance
measure, we actually take $1-r$, where $r$ is the Pearson correlation
coefficient (PCC):
+%cont-doctest
\begin{verbatim}
->>> m.dist_pearson(ubx)
-(0.41740393308237722, 2)
+>>> distance, offset = pssm.dist_pearson(pssm_reb1)
+>>> print "distance = %5.3g" % distance
+distance = 0.239
+>>> print offset
+-2
\end{verbatim}
-This means, that the best PCC between motif \verb|m| and \verb|Ubx| is obtained with the following alignment:
+This means that the best PCC between motif \verb|m| and \verb|m_reb1| is obtained with the following alignment:
\begin{verbatim}
-bbTAAT
-TATAAb
+m: bbTACGCbb
+m_reb1: GTTACCCGG
\end{verbatim}
where \verb|b| stands for background distribution. The PCC itself is
-roughly $1-0.42=0.58$. If we try the reverse complement of the Ubx motif:
-
-\begin{verbatim}
->>> m.dist_pearson(ubx.reverse_complement())
-(0.25784180151584823, 1)
-\end{verbatim}
-We can see that the PCC is better (almost $0.75$), and the alignment is also different:
-\begin{verbatim}
-bATTA
-TATAA
-\end{verbatim}
+roughly $1-0.239=0.761$.
There are two other functions: \verb|dist_dpq|, which is a true metric (satisfying the triangle inequality) based on the Kullback-Leibler divergence
\begin{verbatim}
Tests/Motif/REB1.pfm
@@ -0,0 +1,5 @@
+30 0 0 100 0 0 0 0 15
+10 0 0 0 100 100 100 0 15
+50 0 0 0 0 0 0 60 55
+10 100 100 0 0 0 0 40 15
+
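
As a quick standalone sanity check (not part of the commit): the four rows of this new test file are the A, C, G and T counts per motif position, matching the counts printed for m_reb1 in the tutorial excerpt above, and every column sums to 100 sequences.

# Minimal sketch: map the rows of REB1.pfm onto the letters A, C, G, T.
with open("REB1.pfm") as handle:
    rows = [[float(x) for x in line.split()] for line in handle if line.strip()]

counts = dict(zip("ACGT", rows))
for letter in "ACGT":
    print letter, counts[letter]

# e.g. position 0: 30 + 10 + 50 + 10 = 100, as in the "print m_reb1.counts" output.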
