Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Separating the old and new Motif code, and adding automatic pwm, pssm…

… calculation to the new Motif class
  • Loading branch information...
commit d1a3efb2e937ed15118cd257890648b2418164d1 1 parent 57ae89c
mdehoon authored
View
2  Bio/Motif/AlignAce.py
@@ -6,7 +6,7 @@
"""Parsing AlignACE output files
"""
-from Bio.Motif import Motif
+from Bio.Motif import NewMotif as Motif
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
View
2  Bio/Motif/Jaspar.py
@@ -1,4 +1,4 @@
-from Bio.Motif import Motif
+from Bio.Motif import NewMotif as Motif
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
View
2  Bio/Motif/MEME.py
@@ -6,7 +6,7 @@
from Bio.Alphabet import IUPAC
from Bio import Seq
-from Bio.Motif import Motif as BaseMotif
+from Bio.Motif import NewMotif as BaseMotif
def read(handle):
View
483 Bio/Motif/Matrix.py
@@ -0,0 +1,483 @@
+# Copyright 2013 by Michiel de Hoon. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+"""Implementation of frequency (count) matrices, position-weight matrices,
+and position-specific scoring matrices.
+"""
+
+import math
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+
+
+class GenericPositionMatrix(dict):
+
+ def __init__(self, alphabet, values):
+ self.length = None
+ for letter in alphabet.letters:
+ if self.length is None:
+ self.length = len(values[letter])
+ elif self.length!=len(values[letter]):
+ raise Exception("Inconsistent lengths found in dictionary")
+ self[letter] = list(values[letter])
+ self.alphabet = alphabet
+ self._letters = sorted(self.alphabet.letters)
+
+ def __str__(self):
+ words = ["%6d" % i for i in range(self.length)]
+ line = " " + " ".join(words)
+ lines = [line]
+ for letter in self._letters:
+ words = ["%6.2f" % value for value in self[letter]]
+ line = "%c: " % letter + " ".join(words)
+ lines.append(line)
+ text = "\n".join(lines) + "\n"
+ return text
+
+ def __getitem__(self, key):
+ if isinstance(key, tuple):
+ if len(key)==2:
+ key1, key2 = key
+ if isinstance(key1, slice):
+ start1, stop1, stride1 = key1.indices(len(self._letters))
+ indices1 = range(start1, stop1, stride1)
+ letters1 = [self._letters[i] for i in indices1]
+ dim1 = 2
+ elif isinstance(key1, int):
+ letter1 = self._letters[key1]
+ dim1 = 1
+ elif isinstance(key1, tuple):
+ letters1 = [self._letters[i] for i in key1]
+ dim1 = 2
+ elif isinstance(key1, str):
+ if len(key1)==1:
+ letter1 = key1
+ dim1 = 1
+ else:
+ raise KeyError(key1)
+ else:
+ raise KeyError("Cannot understand key %s", str(key1))
+ if isinstance(key2, slice):
+ start2, stop2, stride2 = key2.indices(self.length)
+ indices2 = range(start2, stop2, stride2)
+ dim2 = 2
+ elif isinstance(key2, int):
+ index2 = key2
+ dim2 = 1
+ else:
+ raise KeyError("Cannot understand key %s", str(key2))
+ if dim1==1 and dim2==1:
+ return dict.__getitem__(self, letter1)[index2]
+ elif dim1==1 and dim2==2:
+ values = dict.__getitem__(self, letter1)
+ return tuple(values[index2] for index2 in indices2)
+ elif dim1==2 and dim2==1:
+ d = {}
+ for letter1 in letters1:
+ d[letter1] = dict.__getitem__(self, letter1)[index2]
+ return d
+ else:
+ d = {}
+ for letter1 in letters1:
+ values = dict.__getitem__(self, letter1)
+ d[letter1] = [values[index2] for index2 in indices2]
+ if sorted(letters1)==self._letters:
+ return self.__class__(self.alphabet, d)
+ else:
+ return d
+ elif len(key)==1:
+ key = key[0]
+ else:
+ raise KeyError("keys should be 1- or 2-dimensional")
+ if isinstance(key, slice):
+ start, stop, stride = key.indices(len(self._letters))
+ indices = range(start, stop, stride)
+ letters = [self._letters[i] for i in indices]
+ dim = 2
+ elif isinstance(key, int):
+ letter = self._letters[key]
+ dim = 1
+ elif isinstance(key, tuple):
+ letters = [self._letters[i] for i in key]
+ dim = 2
+ elif isinstance(key, str):
+ if len(key)==1:
+ letter = key
+ dim = 1
+ else:
+ raise KeyError(key)
+ else:
+ raise KeyError("Cannot understand key %s", str(key))
+ if dim==1:
+ return dict.__getitem__(self, letter)
+ elif dim==2:
+ d = {}
+ for letter in letters:
+ d[letter] = dict.__getitem__(self, letter)
+ return d
+ else:
+ raise RuntimeError("Should not get here")
+
+ @property
+ def consensus(self):
+ """Returns the consensus sequence.
+ """
+ sequence = ""
+ for i in range(self.length):
+ try:
+ maximum = float("-inf")
+ except ValueError:
+ # On Python 2.5 or older that was handled in C code,
+ # and failed on Windows XP 32bit
+ maximum = - 1E400
+ for letter in self.alphabet.letters:
+ count = self[letter][i]
+ if count > maximum:
+ maximum = count
+ sequence_letter = letter
+ sequence += sequence_letter
+ return Seq(sequence, self.alphabet)
+
+ @property
+ def anticonsensus(self):
+ sequence = ""
+ for i in range(self.length):
+ try:
+ minimum = float("inf")
+ except ValueError:
+ # On Python 2.5 or older that was handled in C code,
+ # and failed on Windows XP 32bit
+ minimum = 1E400
+ for letter in self.alphabet.letters:
+ count = self[letter][i]
+ if count < minimum:
+ minimum = count
+ sequence_letter = letter
+ sequence += sequence_letter
+ return Seq(sequence, self.alphabet)
+
+ @property
+ def degenerate_consensus(self):
+ # Following the rules adapted from
+ # D. R. Cavener: "Comparison of the consensus sequence flanking
+ # translational start sites in Drosophila and vertebrates."
+ # Nucleic Acids Research 15(4): 1353-1361. (1987).
+ # The same rules are used by TRANSFAC.
+ degenerate_nucleotide = {
+ 'A': 'A',
+ 'C': 'C',
+ 'G': 'G',
+ 'T': 'T',
+ 'AC': 'M',
+ 'AG': 'R',
+ 'AT': 'W',
+ 'CG': 'S',
+ 'CT': 'Y',
+ 'GT': 'K',
+ 'ACG': 'V',
+ 'ACT': 'H',
+ 'AGT': 'D',
+ 'CGT': 'B',
+ 'ACGT': 'N',
+ }
+ sequence = ""
+ for i in range(self.length):
+ def get(nucleotide):
+ return self[nucleotide][i]
+ nucleotides = sorted(self, key=get, reverse=True)
+ counts = [self[c][i] for c in nucleotides]
+ # Follow the Cavener rules:
+ if counts[0] >= sum(counts[1:]) and counts[0] >= 2*counts[1]:
+ key = nucleotides[0]
+ elif 4*sum(counts[:2]) > 3*sum(counts):
+ key = "".join(sorted(nucleotides[:2]))
+ elif counts[3]==0:
+ key = "".join(sorted(nucleotides[:3]))
+ else:
+ key = "ACGT"
+ nucleotide = degenerate_nucleotide[key]
+ sequence += nucleotide
+ return Seq(sequence, alphabet = IUPAC.ambiguous_dna)
+
+ def reverse_complement(self):
+ values = {}
+ values["A"] = self["T"][::-1]
+ values["T"] = self["A"][::-1]
+ values["G"] = self["C"][::-1]
+ values["C"] = self["G"][::-1]
+ alphabet = self.alphabet
+ return self.__class__(alphabet, values)
+
+
+class FrequencyPositionMatrix(GenericPositionMatrix):
+
+ def normalize(self, pseudocounts=None):
+ """
+ create and return a position-weight matrix by normalizing the counts matrix.
+
+ If pseudocounts is None (default), no pseudocounts are added
+ to the counts.
+ If pseudocounts is a number, it is added to the counts before
+ calculating the position-weight matrix.
+ Alternatively, the pseudocounts can be a dictionary with a key
+ for each letter in the alphabet associated with the motif.
+ """
+
+ counts = {}
+ if pseudocounts is None:
+ for letter in self.alphabet.letters:
+ counts[letter] = [0.0] * self.length
+ elif isinstance(pseudocounts, dict):
+ for letter in self.alphabet.letters:
+ counts[letter] = [float(pseudocounts[letter])] * self.length
+ else:
+ for letter in self.alphabet.letters:
+ counts[letter] = [float(pseudocounts)] * self.length
+ for i in xrange(self.length):
+ for letter in self.alphabet.letters:
+ counts[letter][i] += self[letter][i]
+ # Actual normalization is done in the PositionWeightMatrix initializer
+ return PositionWeightMatrix(self.alphabet, counts)
+
+
+class PositionWeightMatrix(GenericPositionMatrix):
+
+ def __init__(self, alphabet, counts):
+ GenericPositionMatrix.__init__(self, alphabet, counts)
+ for i in xrange(self.length):
+ total = sum([float(self[letter][i]) for letter in alphabet.letters])
+ for letter in alphabet.letters:
+ self[letter][i] /= total
+ for letter in alphabet.letters:
+ self[letter] = tuple(self[letter])
+
+ def log_odds(self, background=None):
+ """
+ returns the Position-Specific Scoring Matrix.
+
+ The Position-Specific Scoring Matrix (PSSM) contains the log-odds
+ scores computed from the probability matrix and the background
+ probabilities. If the background is None, a uniform background
+ distribution is assumed.
+ """
+ values = {}
+ alphabet = self.alphabet
+ if background is None:
+ background = {}
+ for letter in alphabet.letters:
+ background[letter] = 1.0
+ else:
+ background = dict(background)
+ total = sum(background.values())
+ for letter in alphabet.letters:
+ background[letter] /= total
+ values[letter] = []
+ for i in range(self.length):
+ for letter in alphabet.letters:
+ b = background[letter]
+ if b > 0:
+ p = self[letter][i]
+ if p > 0:
+ logodds = math.log(p/b, 2)
+ else:
+ #TODO - Ensure this has unittest coverage!
+ try:
+ logodds = float("-inf")
+ except ValueError:
+ # On Python 2.5 or older that was handled in C code,
+ # and failed on Windows XP 32bit
+ logodds = - 1E400
+ else:
+ p = self[letter][i]
+ if p > 0:
+ logodds = float("inf")
+ else:
+ logodds = float("nan")
+ values[letter].append(logodds)
+ pssm = PositionSpecificScoringMatrix(alphabet, values)
+ pssm._background = background
+ return pssm
+
+
+class PositionSpecificScoringMatrix(GenericPositionMatrix):
+
+ def calculate(self, sequence):
+ """
+ returns the PWM score for a given sequence for all positions.
+
+ - the sequence can only be a DNA sequence
+ - the search is performed only on one strand
+ - if the sequence and the motif have the same length, a single
+ number is returned
+ - otherwise, the result is a one-dimensional list or numpy array
+ """
+ if self.alphabet!=IUPAC.unambiguous_dna:
+ raise ValueError("Wrong alphabet! Use only with DNA motifs")
+ if sequence.alphabet!=IUPAC.unambiguous_dna:
+ raise ValueError("Wrong alphabet! Use only with DNA sequences")
+
+ sequence = str(sequence)
+ m = self.length
+ n = len(sequence)
+
+ scores = []
+ # check if the fast C code can be used
+ try:
+ import _pwm
+ except ImportError:
+ # use the slower Python code otherwise
+ for i in xrange(n-m+1):
+ score = 0.0
+ for position in xrange(m):
+ letter = sequence[i+position]
+ score += self[letter][position]
+ scores.append(score)
+ else:
+ # get the log-odds matrix into a proper shape
+ # (each row contains sorted (ACGT) log-odds values)
+ logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)]
+ scores = _pwm.calculate(sequence, logodds)
+ if len(scores)==1:
+ return scores[0]
+ else:
+ return scores
+
+ def search(self, sequence, threshold=0.0, both=True):
+ """
+ a generator function, returning found hits in a given sequence with the pwm score higher than the threshold
+ """
+ sequence = sequence.upper()
+ n = len(sequence)
+ m = self.length
+ if both:
+ rc = self.reverse_complement()
+ for position in xrange(0,n-m+1):
+ s = sequence[position:position+m]
+ score = self.calculate(s)
+ if score > threshold:
+ yield (position, score)
+ if both:
+ score = rc.calculate(s)
+ if score > threshold:
+ yield (position-n, score)
+
+ @property
+ def max(self):
+ """Maximal possible score for this motif.
+
+ returns the score computed for the consensus sequence.
+ """
+ score = 0.0
+ letters = self._letters
+ for position in xrange(0,self.length):
+ score += max([self[letter][position] for letter in letters])
+ return score
+
+ @property
+ def min(self):
+ """Minimal possible score for this motif.
+
+ returns the score computed for the anticonsensus sequence.
+ """
+ score = 0.0
+ letters = self._letters
+ for position in xrange(0,self.length):
+ score += min([self[letter][position] for letter in letters])
+ return score
+
+ @property
+ def mean(self):
+ """Expected value of the score of a motif.
+
+ returns None if the expected value is undefined"""
+ if self._background is None:
+ return None
+ background = self._background
+ sx = 0.0
+ for i in range(self.length):
+ for letter in self._letters:
+ logodds = self[letter,i]
+ b = background[letter]
+ p = b * math.pow(2,logodds)
+ sx += p * logodds
+ return sx
+
+
+ @property
+ def std(self):
+ """Standard deviation of the score of a motif.
+
+ returns None if the standard deviation is undefined"""
+ if self._background is None:
+ return None
+ background = self._background
+ variance = 0.0
+ for i in range(self.length):
+ sx = 0.0
+ sxx = 0.0
+ for letter in self._letters:
+ logodds = self[letter,i]
+ b = background[letter]
+ p = b * math.pow(2,logodds)
+ sx += p*logodds
+ sxx += p*logodds*logodds
+ sxx -= sx*sx
+ variance += sxx
+ variance = max(variance, 0) # to avoid roundoff problems
+ return math.sqrt(variance)
+
+ def dist_pearson(self, other):
+ """
+ return the similarity score based on pearson correlation for the given motif against self.
+
+ We use the Pearson's correlation of the respective probabilities.
+ """
+ if self.alphabet != other.alphabet:
+ raise ValueError("Cannot compare motifs with different alphabets")
+
+ max_p=-2
+ for offset in range(-self.length+1, other.length):
+ if offset<0:
+ p = self.dist_pearson_at(other, -offset)
+ else: # offset>=0
+ p = other.dist_pearson_at(self, offset)
+ if max_p<p:
+ max_p=p
+ max_o=-offset
+ return 1-max_p,max_o
+
+ def dist_pearson_at(self, other, offset):
+ letters = self._letters
+ sx = 0.0 # \sum x
+ sy = 0.0 # \sum y
+ sxx = 0.0 # \sum x^2
+ sxy = 0.0 # \sum x \cdot y
+ syy = 0.0 # \sum y^2
+ norm=max(self.length,offset+other.length)*len(letters)
+ for pos in range(min(self.length-offset, other.length)):
+ xi = [self[letter,pos+offset] for letter in letters]
+ yi = [other[letter,pos] for letter in letters]
+ sx += sum(xi)
+ sy += sum(yi)
+ sxx += sum([x*x for x in xi])
+ sxy += sum([x*y for x,y in zip(xi,yi)])
+ syy += sum([y*y for y in yi])
+ sx /= norm
+ sy /= norm
+ sxx /= norm
+ sxy /= norm
+ syy /= norm
+ numerator = sxy - sx*sy
+ denominator = math.sqrt((sxx-sx*sx)*(syy-sy*sy))
+ return numerator/denominator
+
+ def distribution(self, precision=10**3):
+ """calculate the distribution of the scores at the given precision
+
+ returns None if the distribution is undefined"""
+ background = self._background
+ if background==None:
+ return None
+ from Thresholds import ScoreDistribution
+ return ScoreDistribution(precision=precision, pssm=self, background=background)
View
1,441 Bio/Motif/_Motif.py
@@ -7,669 +7,85 @@
from Bio.Seq import Seq
from Bio.SubsMat import FreqTable
from Bio.Alphabet import IUPAC
-import math
-
-import warnings
-from Bio import BiopythonExperimentalWarning
-
-
-class GenericPositionMatrix(dict):
-
- def __init__(self, alphabet, values):
- self.length = None
- for letter in alphabet.letters:
- if self.length is None:
- self.length = len(values[letter])
- elif self.length!=len(values[letter]):
- raise Exception("Inconsistent lengths found in dictionary")
- self[letter] = list(values[letter])
- self.alphabet = alphabet
- self._letters = sorted(self.alphabet.letters)
-
- def __str__(self):
- words = ["%6d" % i for i in range(self.length)]
- line = " " + " ".join(words)
- lines = [line]
- for letter in self._letters:
- words = ["%6.2f" % value for value in self[letter]]
- line = "%c: " % letter + " ".join(words)
- lines.append(line)
- text = "\n".join(lines) + "\n"
- return text
-
- def __getitem__(self, key):
- if isinstance(key, tuple):
- if len(key)==2:
- key1, key2 = key
- if isinstance(key1, slice):
- start1, stop1, stride1 = key1.indices(len(self._letters))
- indices1 = range(start1, stop1, stride1)
- letters1 = [self._letters[i] for i in indices1]
- dim1 = 2
- elif isinstance(key1, int):
- letter1 = self._letters[key1]
- dim1 = 1
- elif isinstance(key1, tuple):
- letters1 = [self._letters[i] for i in key1]
- dim1 = 2
- elif isinstance(key1, str):
- if len(key1)==1:
- letter1 = key1
- dim1 = 1
- else:
- raise KeyError(key1)
- else:
- raise KeyError("Cannot understand key %s", str(key1))
- if isinstance(key2, slice):
- start2, stop2, stride2 = key2.indices(self.length)
- indices2 = range(start2, stop2, stride2)
- dim2 = 2
- elif isinstance(key2, int):
- index2 = key2
- dim2 = 1
- else:
- raise KeyError("Cannot understand key %s", str(key2))
- if dim1==1 and dim2==1:
- return dict.__getitem__(self, letter1)[index2]
- elif dim1==1 and dim2==2:
- values = dict.__getitem__(self, letter1)
- return tuple(values[index2] for index2 in indices2)
- elif dim1==2 and dim2==1:
- d = {}
- for letter1 in letters1:
- d[letter1] = dict.__getitem__(self, letter1)[index2]
- return d
- else:
- d = {}
- for letter1 in letters1:
- values = dict.__getitem__(self, letter1)
- d[letter1] = [values[index2] for index2 in indices2]
- if sorted(letters1)==self._letters:
- return self.__class__(self.alphabet, d)
- else:
- return d
- elif len(key)==1:
- key = key[0]
- else:
- raise KeyError("keys should be 1- or 2-dimensional")
- if isinstance(key, slice):
- start, stop, stride = key.indices(len(self._letters))
- indices = range(start, stop, stride)
- letters = [self._letters[i] for i in indices]
- dim = 2
- elif isinstance(key, int):
- letter = self._letters[key]
- dim = 1
- elif isinstance(key, tuple):
- letters = [self._letters[i] for i in key]
- dim = 2
- elif isinstance(key, str):
- if len(key)==1:
- letter = key
- dim = 1
- else:
- raise KeyError(key)
- else:
- raise KeyError("Cannot understand key %s", str(key))
- if dim==1:
- return dict.__getitem__(self, letter)
- elif dim==2:
- d = {}
- for letter in letters:
- d[letter] = dict.__getitem__(self, letter)
- return d
- else:
- raise RuntimeError("Should not get here")
-
- @property
- def consensus(self):
- """Returns the consensus sequence.
- """
- sequence = ""
- for i in range(self.length):
- try:
- maximum = float("-inf")
- except ValueError:
- # On Python 2.5 or older that was handled in C code,
- # and failed on Windows XP 32bit
- maximum = - 1E400
- for letter in self.alphabet.letters:
- count = self[letter][i]
- if count > maximum:
- maximum = count
- sequence_letter = letter
- sequence += sequence_letter
- return Seq(sequence, self.alphabet)
-
- @property
- def anticonsensus(self):
- sequence = ""
- for i in range(self.length):
- try:
- minimum = float("inf")
- except ValueError:
- # On Python 2.5 or older that was handled in C code,
- # and failed on Windows XP 32bit
- minimum = 1E400
- for letter in self.alphabet.letters:
- count = self[letter][i]
- if count < minimum:
- minimum = count
- sequence_letter = letter
- sequence += sequence_letter
- return Seq(sequence, self.alphabet)
-
- @property
- def degenerate_consensus(self):
- # Following the rules adapted from
- # D. R. Cavener: "Comparison of the consensus sequence flanking
- # translational start sites in Drosophila and vertebrates."
- # Nucleic Acids Research 15(4): 1353-1361. (1987).
- # The same rules are used by TRANSFAC.
- degenerate_nucleotide = {
- 'A': 'A',
- 'C': 'C',
- 'G': 'G',
- 'T': 'T',
- 'AC': 'M',
- 'AG': 'R',
- 'AT': 'W',
- 'CG': 'S',
- 'CT': 'Y',
- 'GT': 'K',
- 'ACG': 'V',
- 'ACT': 'H',
- 'AGT': 'D',
- 'CGT': 'B',
- 'ACGT': 'N',
- }
- sequence = ""
- for i in range(self.length):
- def get(nucleotide):
- return self[nucleotide][i]
- nucleotides = sorted(self, key=get, reverse=True)
- counts = [self[c][i] for c in nucleotides]
- # Follow the Cavener rules:
- if counts[0] >= sum(counts[1:]) and counts[0] >= 2*counts[1]:
- key = nucleotides[0]
- elif 4*sum(counts[:2]) > 3*sum(counts):
- key = "".join(sorted(nucleotides[:2]))
- elif counts[3]==0:
- key = "".join(sorted(nucleotides[:3]))
- else:
- key = "ACGT"
- nucleotide = degenerate_nucleotide[key]
- sequence += nucleotide
- return Seq(sequence, alphabet = IUPAC.ambiguous_dna)
-
- def reverse_complement(self):
- values = {}
- values["A"] = self["T"][::-1]
- values["T"] = self["A"][::-1]
- values["G"] = self["C"][::-1]
- values["C"] = self["G"][::-1]
- alphabet = self.alphabet
- return self.__class__(alphabet, values)
-
-
-class FrequencyPositionMatrix(GenericPositionMatrix):
-
- def normalize(self, pseudocounts=None):
- """
- create and return a position-weight matrix by normalizing the counts matrix.
-
- If pseudocounts is None (default), no pseudocounts are added
- to the counts.
- If pseudocounts is a number, it is added to the counts before
- calculating the position-weight matrix.
- Alternatively, the pseudocounts can be a dictionary with a key
- for each letter in the alphabet associated with the motif.
- """
-
- counts = {}
- if pseudocounts is None:
- for letter in self.alphabet.letters:
- counts[letter] = [0.0] * self.length
- elif isinstance(pseudocounts, dict):
- for letter in self.alphabet.letters:
- counts[letter] = [float(pseudocounts[letter])] * self.length
- else:
- for letter in self.alphabet.letters:
- counts[letter] = [float(pseudocounts)] * self.length
- for i in xrange(self.length):
- for letter in self.alphabet.letters:
- counts[letter][i] += self[letter][i]
- # Actual normalization is done in the PositionWeightMatrix initializer
- return PositionWeightMatrix(self.alphabet, counts)
-
-
-class PositionWeightMatrix(GenericPositionMatrix):
-
- def __init__(self, alphabet, counts):
- GenericPositionMatrix.__init__(self, alphabet, counts)
- for i in xrange(self.length):
- total = sum([float(self[letter][i]) for letter in alphabet.letters])
- for letter in alphabet.letters:
- self[letter][i] /= total
- for letter in alphabet.letters:
- self[letter] = tuple(self[letter])
-
- def log_odds(self, background=None):
- """
- returns the Position-Specific Scoring Matrix.
-
- The Position-Specific Scoring Matrix (PSSM) contains the log-odds
- scores computed from the probability matrix and the background
- probabilities. If the background is None, a uniform background
- distribution is assumed.
- """
- values = {}
- alphabet = self.alphabet
- if background is None:
- background = {}
- for letter in alphabet.letters:
- background[letter] = 1.0
- else:
- background = dict(background)
- total = sum(background.values())
- for letter in alphabet.letters:
- background[letter] /= total
- values[letter] = []
- for i in range(self.length):
- for letter in alphabet.letters:
- b = background[letter]
- if b > 0:
- p = self[letter][i]
- if p > 0:
- logodds = math.log(p/b, 2)
- else:
- #TODO - Ensure this has unittest coverage!
- try:
- logodds = float("-inf")
- except ValueError:
- # On Python 2.5 or older that was handled in C code,
- # and failed on Windows XP 32bit
- logodds = - 1E400
- else:
- p = self[letter][i]
- if p > 0:
- logodds = float("inf")
- else:
- logodds = float("nan")
- values[letter].append(logodds)
- pssm = PositionSpecificScoringMatrix(alphabet, values)
- pssm._background = background
- return pssm
-
-
-class PositionSpecificScoringMatrix(GenericPositionMatrix):
-
- def calculate(self, sequence):
- """
- returns the PWM score for a given sequence for all positions.
-
- - the sequence can only be a DNA sequence
- - the search is performed only on one strand
- - if the sequence and the motif have the same length, a single
- number is returned
- - otherwise, the result is a one-dimensional list or numpy array
- """
- if self.alphabet!=IUPAC.unambiguous_dna:
- raise ValueError("Wrong alphabet! Use only with DNA motifs")
- if sequence.alphabet!=IUPAC.unambiguous_dna:
- raise ValueError("Wrong alphabet! Use only with DNA sequences")
-
- sequence = str(sequence)
- m = self.length
- n = len(sequence)
-
- scores = []
- # check if the fast C code can be used
- try:
- import _pwm
- except ImportError:
- # use the slower Python code otherwise
- for i in xrange(n-m+1):
- score = 0.0
- for position in xrange(m):
- letter = sequence[i+position]
- score += self[letter][position]
- scores.append(score)
- else:
- # get the log-odds matrix into a proper shape
- # (each row contains sorted (ACGT) log-odds values)
- logodds = [[self[letter][i] for letter in "ACGT"] for i in range(m)]
- scores = _pwm.calculate(sequence, logodds)
- if len(scores)==1:
- return scores[0]
- else:
- return scores
-
- def search(self, sequence, threshold=0.0, both=True):
- """
- a generator function, returning found hits in a given sequence with the pwm score higher than the threshold
- """
- sequence = sequence.upper()
- n = len(sequence)
- m = self.length
- if both:
- rc = self.reverse_complement()
- for position in xrange(0,n-m+1):
- s = sequence[position:position+m]
- score = self.calculate(s)
- if score > threshold:
- yield (position, score)
- if both:
- score = rc.calculate(s)
- if score > threshold:
- yield (position-n, score)
-
- @property
- def max(self):
- """Maximal possible score for this motif.
-
- returns the score computed for the consensus sequence.
- """
- score = 0.0
- letters = self._letters
- for position in xrange(0,self.length):
- score += max([self[letter][position] for letter in letters])
- return score
-
- @property
- def min(self):
- """Minimal possible score for this motif.
-
- returns the score computed for the anticonsensus sequence.
- """
- score = 0.0
- letters = self._letters
- for position in xrange(0,self.length):
- score += min([self[letter][position] for letter in letters])
- return score
-
- @property
- def mean(self):
- """Expected value of the score of a motif.
-
- returns None if the expected value is undefined"""
- if self._background is None:
- return None
- background = self._background
- sx = 0.0
- for i in range(self.length):
- for letter in self._letters:
- logodds = self[letter,i]
- b = background[letter]
- p = b * math.pow(2,logodds)
- sx += p * logodds
- return sx
-
-
- @property
- def std(self):
- """Standard deviation of the score of a motif.
-
- returns None if the standard deviation is undefined"""
- if self._background is None:
- return None
- background = self._background
- variance = 0.0
- for i in range(self.length):
- sx = 0.0
- sxx = 0.0
- for letter in self._letters:
- logodds = self[letter,i]
- b = background[letter]
- p = b * math.pow(2,logodds)
- sx += p*logodds
- sxx += p*logodds*logodds
- sxx -= sx*sx
- variance += sxx
- variance = max(variance, 0) # to avoid roundoff problems
- return math.sqrt(variance)
-
- def dist_pearson(self, other):
- """
- return the similarity score based on pearson correlation for the given motif against self.
-
- We use the Pearson's correlation of the respective probabilities.
- """
- if self.alphabet != other.alphabet:
- raise ValueError("Cannot compare motifs with different alphabets")
-
- max_p=-2
- for offset in range(-self.length+1, other.length):
- if offset<0:
- p = self.dist_pearson_at(other, -offset)
- else: # offset>=0
- p = other.dist_pearson_at(self, offset)
- if max_p<p:
- max_p=p
- max_o=-offset
- return 1-max_p,max_o
-
- def dist_pearson_at(self, other, offset):
- letters = self._letters
- sx = 0.0 # \sum x
- sy = 0.0 # \sum y
- sxx = 0.0 # \sum x^2
- sxy = 0.0 # \sum x \cdot y
- syy = 0.0 # \sum y^2
- norm=max(self.length,offset+other.length)*len(letters)
- for pos in range(min(self.length-offset, other.length)):
- xi = [self[letter,pos+offset] for letter in letters]
- yi = [other[letter,pos] for letter in letters]
- sx += sum(xi)
- sy += sum(yi)
- sxx += sum([x*x for x in xi])
- sxy += sum([x*y for x,y in zip(xi,yi)])
- syy += sum([y*y for y in yi])
- sx /= norm
- sy /= norm
- sxx /= norm
- sxy /= norm
- syy /= norm
- numerator = sxy - sx*sy
- denominator = math.sqrt((sxx-sx*sx)*(syy-sy*sy))
- return numerator/denominator
-
- def distribution(self, precision=10**3):
- """calculate the distribution of the scores at the given precision
-
- returns None if the distribution is undefined"""
- background = self._background
- if background==None:
- return None
- from Thresholds import ScoreDistribution
- return ScoreDistribution(precision=precision, pssm=self, background=background)
+import math,random
class Motif(object):
"""
A class representing sequence motifs.
"""
- def __init__(self, alphabet=None, instances=None, counts=None):
+ def __init__(self,alphabet=IUPAC.unambiguous_dna):
+ self.instances = []
+ self.has_instances=False
+ self.counts = {}
+ self.has_counts=False
+ self.mask = []
self._pwm_is_current = False
self._pwm = []
self._log_odds_is_current = False
self._log_odds = []
+ self.alphabet=alphabet
self.length=None
- self.info=None
- self.name=""
- if counts is not None and instances is not None:
- raise Exception(ValueError,
- "Specify either instances or counts, don't specify both")
- elif counts is not None:
- warnings.warn("This is experimental code, and may change in future versions", BiopythonExperimentalWarning)
- if alphabet is None:
- alphabet = IUPAC.unambiguous_dna
- for letter in counts:
- length = len(counts[letter])
- if self.length is None:
- self.length = length
- elif self.length!=length:
- raise Exception("counts matrix has inconsistent lengths")
- self.instances = None
- self.counts = FrequencyPositionMatrix(alphabet, counts)
- elif instances is not None:
- warnings.warn("This is experimental code, and may change in future versions", BiopythonExperimentalWarning)
- self.instances = []
- for instance in instances:
- if alphabet != instance.alphabet:
- raise ValueError("Alphabets are inconsistent")
- if self.length is None:
- self.length = len(instance)
- elif self.length != len(instance):
- message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length)
- raise ValueError(message)
- self.instances.append(instance)
- counts = {}
- for letter in alphabet.letters:
- counts[letter] = [0] * self.length
- for instance in self.instances:
- for position, letter in enumerate(instance):
- counts[letter][position] += 1
- self.counts = FrequencyPositionMatrix(alphabet, counts)
- else:
- self.counts = None
- self.instances = None
- if alphabet is None:
- alphabet = IUPAC.unambiguous_dna
- self.alphabet = alphabet
- if self.length is None:
- self.__mask = ()
- else:
- self.__mask = (1,) * self.length
- self.background=dict((n, 1.0/len(self.alphabet.letters))
+ self.background=dict((n, 1.0/len(self.alphabet.letters)) \
for n in self.alphabet.letters)
self.beta=1.0
-
- @property
- def has_instances(self):
- """Legacy property, check if m.instances is None instead (DEPRECATED)."""
- from Bio import BiopythonDeprecationWarning
- warnings.warn("Instead of 'm.has_instances' use 'm.instances is not None'",
- BiopythonDeprecationWarning)
- return self.instances is not None
-
- @property
- def has_counts(self):
- """Legacy property, check if m.counts is None instead (DEPRECATED)."""
- from Bio import BiopythonDeprecationWarning
- warnings.warn("Instead of 'm.has_counts' use 'm.counts is not None'",
- BiopythonDeprecationWarning)
- return self.counts is not None
+ self.info=None
+ self.name=""
def _check_length(self, len):
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython.
-""", PendingDeprecationWarning)
- if self.length is None:
+ if self.length==None:
self.length = len
elif self.length != len:
print "len",self.length,self.instances, len
raise ValueError("You can't change the length of the motif")
def _check_alphabet(self,alphabet):
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython.
-""", PendingDeprecationWarning)
- if self.alphabet is None:
+ if self.alphabet==None:
self.alphabet=alphabet
elif self.alphabet != alphabet:
raise ValueError("Wrong Alphabet")
-
+
def add_instance(self,instance):
"""
adds new instance to the motif
"""
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython. Instead of adding instances to an existing
-Motif object, please create a new Motif object.
-""", PendingDeprecationWarning)
self._check_alphabet(instance.alphabet)
self._check_length(len(instance))
- if self.counts is not None:
+ if self.has_counts:
for i in range(self.length):
let=instance[i]
self.counts[let][i]+=1
- if self.instances is not None or self.counts is None:
- if self.instances is None:
- self.instances = []
+ if self.has_instances or not self.has_counts:
self.instances.append(instance)
-
+ self.has_instances=True
+
self._pwm_is_current = False
self._log_odds_is_current = False
+
def set_mask(self,mask):
"""
sets the mask for the motif
The mask should be a string containing asterisks in the position of significant columns and spaces in other columns
"""
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython. As a replacement, instead of
->>> motif.set_mask(mask)
-please use
->>> motif.mask = mask
-""", PendingDeprecationWarning)
self._check_length(len(mask))
- self.__mask=[]
+ self.mask=[]
for char in mask:
if char=="*":
- self.__mask.append(1)
+ self.mask.append(1)
elif char==" ":
- self.__mask.append(0)
+ self.mask.append(0)
else:
raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char)
- self.__mask = tuple(self.__mask)
-
- def __get_mask(self):
- return self.__mask
-
- def __set_mask(self, mask):
- if mask is None:
- self.__mask = (1,) * self.length
- elif len(mask)!=self.length:
- raise ValueError("The length (%d) of the mask is inconsistent with the length (%d) of the motif", (len(mask), self.length))
- elif isinstance(mask, str):
- self.__mask=[]
- for char in mask:
- if char=="*":
- self.__mask.append(1)
- elif char==" ":
- self.__mask.append(0)
- else:
- raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char)
- self.__mask = tuple(self.__mask)
- else:
- self.__mask = tuple(int(bool(c)) for c in mask)
-
- mask = property(__get_mask, __set_mask)
- del __get_mask
- del __set_mask
-
+
def pwm(self,laplace=True):
"""
returns the PWM computed for the set of instances
if laplace=True (default), pseudocounts equal to self.background multiplied by self.beta are added to all positions.
"""
-
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython. As a replacement, instead of
->>> motif.pwm()
-use
->>> pwm = motif.counts.normalize()
-See the documentation of motif.counts.normalize and pwm.log_odds for
-details on treatment of pseudocounts and background probabilities.
-""", PendingDeprecationWarning)
+
if self._pwm_is_current:
return self._pwm
#we need to compute new pwm
@@ -682,36 +98,27 @@ def pwm(self,laplace=True):
dict[letter]=self.beta*self.background[letter]
else:
dict[letter]=0.0
- if self.counts is not None:
+ if self.has_counts:
#taking the raw counts
for letter in self.alphabet.letters:
dict[letter]+=self.counts[letter][i]
- elif self.instances is not None:
+ elif self.has_instances:
#counting the occurences of letters in instances
for seq in self.instances:
#dict[seq[i]]=dict[seq[i]]+1
try:
dict[seq[i]]+=1
- except KeyError: # we need to ignore non-alphabet letters
+ except KeyError: #we need to ignore non-alphabet letters
pass
- self._pwm.append(FreqTable.FreqTable(dict,FreqTable.COUNT,self.alphabet))
+ self._pwm.append(FreqTable.FreqTable(dict,FreqTable.COUNT,self.alphabet))
self._pwm_is_current=1
return self._pwm
def log_odds(self,laplace=True):
"""
- returns the log odds matrix computed for the set of instances
+ returns the logg odds matrix computed for the set of instances
"""
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython. As a replacement, instead of
->>> motif.log_odds()
-use
->>> pwm = motif.counts.normalize()
->>> pssm = pwm.log_odds()
-See the documentation of motif.counts.normalize and pwm.log_odds for
-details on treatment of pseudocounts and background probabilities.
-""", PendingDeprecationWarning)
+
if self._log_odds_is_current:
return self._log_odds
#we need to compute new pwm
@@ -728,17 +135,6 @@ def log_odds(self,laplace=True):
def ic(self):
"""Method returning the information content of a motif.
"""
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython. As a replacement, instead of
->>> motif.ic()
-please use
->>> pwm = motif.counts.normalize()
->>> pwm.ic()
-Please be aware though that by default, motif.counts.normalize()
-does not use pseudocounts, while motif.ic() does. See the documentation
-of motif.counts.normalize for more details.
-""", PendingDeprecationWarning)
res=0
pwm=self.pwm()
for i in range(self.length):
@@ -752,16 +148,6 @@ def exp_score(self,st_dev=False):
"""
Computes expected score of motif's instance and its standard deviation
"""
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython. As a replacement, instead of
->>> motif.exp_score()
-please use
->>> pwm = motif.counts.normalize()
->>> pwm.exp_score()
-See the documentation of motif.counts.normalize for details on treatment of
-pseudocounts.
-""", PendingDeprecationWarning)
exs=0.0
var=0.0
pwm=self.pwm()
@@ -783,11 +169,11 @@ def search_instances(self,sequence):
"""
a generator function, returning found positions of instances of the motif in a given sequence
"""
- if self.instances is None:
- raise ValueError("This motif has no instances")
+ if not self.has_instances:
+ raise ValueError ("This motif has no instances")
for pos in xrange(0,len(sequence)-self.length+1):
for instance in self.instances:
- if str(instance) == str(sequence[pos:pos+self.length]):
+ if instance.tostring()==sequence[pos:pos+self.length].tostring():
yield(pos,instance)
break # no other instance will fit (we don't want to return multiple hits)
@@ -795,23 +181,11 @@ def score_hit(self,sequence,position,normalized=0,masked=0):
"""
give the pwm score for a given position
"""
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython. As a replacement, instead of
->>> motif.score_hit(sequence, position)
-please use
->>> pwm = motif.counts.normalize()
->>> pssm = pwm.log_odds()
->>> s = sequence[position:positon+len(pssm)]
->>> pssm.calculate(s)
-See the documentation of motif.counts.normalize() and pwm.log_odds
-for details on the treatment of pseudocounts and background probabilities.
-""", PendingDeprecationWarning)
lo=self.log_odds()
score = 0.0
for pos in xrange(self.length):
a = sequence[position+pos]
- if not masked or self.__mask[pos]:
+ if not masked or self.mask[pos]:
try:
score += lo[pos][a]
except:
@@ -820,37 +194,25 @@ def score_hit(self,sequence,position,normalized=0,masked=0):
if not masked:
score/=self.length
else:
- score/=len([x for x in self.__mask if x])
+ score/=len([x for x in self.mask if x])
return score
-
+
def search_pwm(self,sequence,normalized=0,masked=0,threshold=0.0,both=True):
"""
a generator function, returning found hits in a given sequence with the pwm score higher than the threshold
"""
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython. As a replacement, instead of
->>> motif.score_hit(sequence, position)
-please use
->>> pwm = motif.counts.normalize()
->>> pssm = pwm.log_odds()
->>> pssm.search(sequence)
-See the documentation of motif.counts.normalize() and pwm.log_odds
-for details on treatment of pseudocounts and background probabilities.
-""", PendingDeprecationWarning)
if both:
rc = self.reverse_complement()
-
- sequence=sequence.upper()
- n = len(sequence)
- for pos in xrange(0,n-self.length+1):
+
+ sequence=sequence.tostring().upper()
+ for pos in xrange(0,len(sequence)-self.length+1):
score = self.score_hit(sequence,pos,normalized,masked)
if score > threshold:
yield (pos,score)
if both:
rev_score = rc.score_hit(sequence,pos,normalized,masked)
if rev_score > threshold:
- yield (pos-n,rev_score)
+ yield (-pos,rev_score)
def dist_pearson(self, motif, masked = 0):
"""
@@ -858,17 +220,6 @@ def dist_pearson(self, motif, masked = 0):
We use the Pearson's correlation of the respective probabilities.
"""
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython. As a replacement, instead of
->>> motif1.dist_pearson(motif2)
-please use
->>> pwm1 = motif1.counts.normalize()
->>> pwm2 = motif2.counts.normalize()
->>> pwm1.dist_pearson(pwm2)
-Please see the documentation of motif.counts.normalize and
-pwm.dist_pearson for more details.
-""", PendingDeprecationWarning)
if self.alphabet != motif.alphabet:
raise ValueError("Cannot compare motifs with different alphabets")
@@ -877,25 +228,22 @@ def dist_pearson(self, motif, masked = 0):
for offset in range(-self.length+1,motif.length):
if offset<0:
p = self.dist_pearson_at(motif,-offset)
- else: # offset>=0
+ else: #offset>=0
p = motif.dist_pearson_at(self,offset)
-
+
if max_p<p:
max_p=p
max_o=-offset
return 1-max_p,max_o
def dist_pearson_at(self,motif,offset):
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython.""", PendingDeprecationWarning)
- sxx = 0 # \sum x^2
- sxy = 0 # \sum x \cdot y
- sx = 0 # \sum x
- sy = 0 # \sum y
- syy = 0 # \sum y^2
+ sxx = 0 # \sum x^2
+ sxy = 0 # \sum x \cdot y
+ sx = 0 # \sum x
+ sy = 0 # \sum y
+ syy = 0 # \sum x^2
norm=max(self.length,offset+motif.length)
-
+
for pos in range(max(self.length,offset+motif.length)):
for l in self.alphabet.letters:
xi = self[pos][l]
@@ -913,26 +261,20 @@ def dist_pearson_at(self,motif,offset):
def dist_product(self,other):
"""
- A similarity measure taking into account a product probability of generating overlapping instances of two motifs
+ A similarity measure taking into account a product probability of generating overlaping instances of two motifs
"""
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython.""", PendingDeprecationWarning)
max_p=0.0
for offset in range(-self.length+1,other.length):
if offset<0:
p = self.dist_product_at(other,-offset)
- else: # offset>=0
+ else: #offset>=0
p = other.dist_product_at(self,offset)
if max_p<p:
max_p=p
max_o=-offset
return 1-max_p/self.dist_product_at(self,0),max_o
-
+
def dist_product_at(self,other,offset):
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython.""", PendingDeprecationWarning)
s=0
for i in range(max(self.length,offset+other.length)):
f1=self[i]
@@ -942,26 +284,23 @@ def dist_product_at(self,other,offset):
return s/i
def dist_dpq(self,other):
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython.""", PendingDeprecationWarning)
r"""Calculates the DPQ distance measure between motifs.
It is calculated as a maximal value of DPQ formula (shown using LaTeX
markup, familiar to mathematicians):
-
+
\sqrt{\sum_{i=1}^{alignment.len()} \sum_{k=1}^alphabet.len() \
\{ m1[i].freq(alphabet[k])*log_2(m1[i].freq(alphabet[k])/m2[i].freq(alphabet[k])) +
m2[i].freq(alphabet[k])*log_2(m2[i].freq(alphabet[k])/m1[i].freq(alphabet[k]))
}
-
- over possible non-spaced alignments of two motifs. See this reference:
+
+ over possible non-spaced alignemts of two motifs. See this reference:
D. M Endres and J. E Schindelin, "A new metric for probability
distributions", IEEE transactions on Information Theory 49, no. 7
(July 2003): 1858-1860.
"""
-
+
min_d=float("inf")
min_o=-1
d_s=[]
@@ -970,7 +309,7 @@ def dist_dpq(self,other):
if offset<0:
d = self.dist_dpq_at(other,-offset)
overlap = self.length+offset
- else: # offset>=0
+ else: #offset>=0
d = other.dist_dpq_at(self,offset)
overlap = other.length-offset
overlap = min(self.length,other.length,overlap)
@@ -983,154 +322,127 @@ def dist_dpq(self,other):
if min_d> d:
min_d=d
min_o=-offset
- return min_d,min_o # ,d_s
-
+ return min_d,min_o#,d_s
+
def dist_dpq_at(self,other,offset):
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython.""", PendingDeprecationWarning)
"""
calculates the dist_dpq measure with a given offset.
offset should satisfy 0<=offset<=len(self)
"""
- def dpq(f1,f2,alpha):
+ def dpq (f1,f2,alpha):
s=0
for n in alpha.letters:
avg=(f1[n]+f2[n])/2
s+=f1[n]*math.log(f1[n]/avg,2)+f2[n]*math.log(f2[n]/avg,2)
return math.sqrt(s)
-
- s=0
+
+ s=0
for i in range(max(self.length,offset+other.length)):
f1=self[i]
f2=other[i-offset]
s+=dpq(f1,f2,self.alphabet)
return s
-
+
def _read(self,stream):
"""Reads the motif from the stream (in AlignAce format).
the self.alphabet variable must be set beforehand.
If the last line contains asterisks it is used for setting mask
"""
- warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython. As a replacement, please use Bio.Motif.parse instead.", PendingDeprecationWarning)
-
+
while 1:
ln = stream.readline()
if "*" in ln:
self.set_mask(ln.strip("\n\c"))
break
self.add_instance(Seq(ln.strip(),self.alphabet))
-
+
def __str__(self,masked=False):
""" string representation of a motif.
"""
- string = ""
- if self.instances is not None:
- for inst in self.instances:
- string += str(inst) + "\n"
+ str = ""
+ for inst in self.instances:
+ str = str + inst.tostring() + "\n"
if masked:
for i in xrange(self.length):
- if self.__mask[i]:
- string += "*"
+ if self.mask[i]:
+ str = str + "*"
else:
- string += " "
- string += "\n"
- return string
+ str = str + " "
+ str = str + "\n"
+ return str
def __len__(self):
"""return the length of a motif
- Please use this method (i.e. invoke len(m)) instead of referring to m.length directly.
+ Please use this method (i.e. invoke len(m)) instead of refering to the m.length directly.
"""
- if self.length is None:
+ if self.length==None:
return 0
else:
return self.length
-
+
def _write(self,stream):
"""
writes the motif to the stream
"""
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython.""")
stream.write(self.__str__())
+
+
def _to_fasta(self):
"""
FASTA representation of motif
"""
- if self.instances is None:
- warnings.warn("""\
-Creating simulated instances of a motif is now obsolete. This functionality
-will be deprecated and removed in a future release of Biopython.""", PendingDeprecationWarning)
- alpha="".join(self.alphabet.letters)
- #col[i] is a column taken from aligned motif instances
- col=[]
- instances=[]
- s = sum(self.counts[nuc][0] for nuc in self.alphabet.letters)
- for i in range(self.length):
- col.append("")
- for n in self.alphabet.letters:
- col[i] += n * self.counts[n][i]
- if len(col[i])<s:
- warnings.warn(UserWarning, "WARNING, column too short (%d; expected %d)" % (len(col[i]), s))
- col[i]+=(alpha*s)[:(s-len(col[i]))]
- #iterate over instances
- for i in range(s):
- instance = "" # start with empty seq
- for j in range(self.length): # iterate over positions
- instance+=col[j][i]
- instance = Seq(instance, self.alphabet)
- instances.append(instance)
- else:
- instances = self.instances
- text = ""
- for i, instance in enumerate(instances):
- text += ">instance%d\n%s\n" % (i, instance)
- return text
+ if not self.has_instances:
+ self.make_instances_from_counts()
+ str = ""
+ for i,inst in enumerate(self.instances):
+ str = str + ">instance%d\n"%i + inst.tostring() + "\n"
+
+ return str
def reverse_complement(self):
"""
Gives the reverse complement of the motif
"""
- alphabet = self.alphabet
- if self.instances is not None:
- instances = []
- for instance in self.instances:
- instance = instance.reverse_complement()
- instances.append(instance)
- res = Motif(instances=instances, alphabet=alphabet)
- else: # has counts
- res = Motif(alphabet)
- res.counts={}
- res.counts["A"]=self.counts["T"][::-1]
- res.counts["T"]=self.counts["A"][::-1]
- res.counts["G"]=self.counts["C"][::-1]
- res.counts["C"]=self.counts["G"][::-1]
+ res = Motif()
+ if self.has_instances:
+ for i in self.instances:
+ res.add_instance(i.reverse_complement())
+ else: # has counts
+ res.has_counts=True
+ res.counts["A"]=self.counts["T"][:]
+ res.counts["T"]=self.counts["A"][:]
+ res.counts["G"]=self.counts["C"][:]
+ res.counts["C"]=self.counts["G"][:]
+ res.counts["A"].reverse()
+ res.counts["C"].reverse()
+ res.counts["G"].reverse()
+ res.counts["T"].reverse()
res.length=self.length
- res.__mask = self.__mask[::-1]
+ res.mask = self.mask
return res
+
def _from_jaspar_pfm(self,stream,make_instances=False):
"""
reads the motif from Jaspar .pfm file
The instances are fake, but the pwm is accurate.
"""
- warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython. Please use the 'pfm' format instead of the 'jaspar-pfm' format", PendingDeprecationWarning)
return self._from_horiz_matrix(stream,letters="ACGT",make_instances=make_instances)
def _from_vert_matrix(self,stream,letters=None,make_instances=False):
"""reads a vertical count matrix from stream and fill in the counts.
"""
- warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython.", PendingDeprecationWarning)
+
self.counts = {}
- if letters is None:
+ self.has_counts=True
+ if letters==None:
letters=self.alphabet.letters
self.length=0
for i in letters:
@@ -1140,18 +452,18 @@ def _from_vert_matrix(self,stream,letters=None,make_instances=False):
for k,v in zip(letters,rec):
self.counts[k].append(v)
self.length+=1
- self.__mask = (1,) * self.length
- if make_instances is True:
+ self.set_mask("*"*self.length)
+ if make_instances==True:
self.make_instances_from_counts()
return self
-
+
def _from_horiz_matrix(self,stream,letters=None,make_instances=False):
"""reads a horizontal count matrix from stream and fill in the counts.
"""
- warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython.", PendingDeprecationWarning)
- if letters is None:
+ if letters==None:
letters=self.alphabet.letters
self.counts = {}
+ self.has_counts=True
for i in letters:
ln = stream.readline().strip().split()
@@ -1161,28 +473,29 @@ def _from_horiz_matrix(self,stream,letters=None,make_instances=False):
#print ln
try:
self.counts[i]=map(int,ln)
- except ValueError: # not integers
- self.counts[i]=map(float,ln) # map(lambda s: int(100*float(s)),ln)
+ except ValueError: #not integers
+ self.counts[i]=map(float,ln) #map(lambda s: int(100*float(s)),ln)
#print counts[i]
-
+
s = sum(self.counts[nuc][0] for nuc in letters)
l = len(self.counts[letters[0]])
self.length=l
- self.__mask = (1,) * l
- if make_instances is True:
+ self.set_mask("*"*l)
+ if make_instances==True:
self.make_instances_from_counts()
return self
+
def make_instances_from_counts(self):
"""Creates "fake" instances for a motif created from a count matrix.
- In case the sums of counts are different for different columns, the
+ In case the sums of counts are different for different columnes, the
shorter columns are padded with background.
"""
- warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython.", PendingDeprecationWarning)
alpha="".join(self.alphabet.letters)
#col[i] is a column taken from aligned motif instances
col=[]
+ self.has_instances=True
self.instances=[]
s = sum(map(lambda nuc: self.counts[nuc][0],self.alphabet.letters))
for i in range(self.length):
@@ -1194,12 +507,12 @@ def make_instances_from_counts(self):
col[i]+=(alpha*s)[:(s-len(col[i]))]
#print i,col[i]
#iterate over instances
- for i in range(s):
- inst="" # start with empty seq
- for j in range(self.length): # iterate over positions
+ for i in range(s):
+ inst="" #start with empty seq
+ for j in range(self.length): #iterate over positions
inst+=col[j][i]
#print i,inst
- inst=Seq(inst,self.alphabet)
+ inst=Seq(inst,self.alphabet)
self.add_instance(inst)
return self.instances
@@ -1207,13 +520,12 @@ def make_counts_from_instances(self):
"""Creates the count matrix for a motif with instances.
"""
- warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython.", PendingDeprecationWarning)
- raise Exception(remove)
#make strings for "columns" of motifs
#col[i] is a column taken from aligned motif instances
counts={}
for a in self.alphabet.letters:
counts[a]=[]
+ self.has_counts=True
s = len(self.instances)
for i in range(self.length):
ci = dict((a,0) for a in self.alphabet.letters)
@@ -1230,395 +542,165 @@ def _from_jaspar_sites(self,stream):
The instances and pwm are OK.
"""
- warnings.warn("This function is now obsolete, and will be deprecated and removed in a future release of Biopython. Please use the 'sites' format instead of the 'jaspar-sites' format", PendingDeprecationWarning)
- # Probably this should be in a separate submodule of Bio.Motif
- self.instances = []
- for line in stream:
- if not line.startswith(">"):
+
+ while True:
+ ln = stream.readline()# read the header "$>...."
+ if ln=="" or ln[0]!=">":
break
- # line contains the header ">...."
- # now read the actual sequence
- line = stream.next()
- instance = ""
- for c in line.strip():
- if c==c.upper():
- instance += c
- instance = Seq(instance, self.alphabet)
- length = len(instance)
- if self.length is None:
- self.length = length
- elif length!=self.length:
- raise ValueError("Inconsistent motif lengths found")
- self.instances.append(instance)
- self._pwm_is_current = False
- self._log_odds_is_current = False
- self.__mask = (1,) * length
+
+ ln=stream.readline().strip()#read the actual sequence
+ i=0
+ while ln[i]==ln[i].lower():
+ i+=1
+ inst=""
+ while i<len(ln) and ln[i]==ln[i].upper():
+ inst+=ln[i]
+ i+=1
+ inst=Seq(inst,self.alphabet)
+ self.add_instance(inst)
+
+ self.set_mask("*"*len(inst))
return self
+
def __getitem__(self,index):
"""Returns the probability distribution over symbols at a given position, padding with background.
If the requested index is out of bounds, the returned distribution comes from background.
"""
- warnings.warn("""\
-This function is now obsolete, and will be deprecated and removed
-in a future release of Biopython. Instead of
->>> motif[i]
-please use
->>> pwm = motif.counts.normalize()
->>> pwm[:,i]
-""", PendingDeprecationWarning)
if index in range(self.length):
return self.pwm()[index]
else:
return self.background
- class ConsensusSeq(Seq):
- # This is an ugly hack that allows us raise a warning if a user attempts
- # to call the method motif.consensus() instead of accessing the property
- # motif.consensus. It can be removed after consensus() as a method has
- # been removed from Biopython. Same thing for motif.anticonsensus.
- def __call__(self):
- warnings.warn("""\
-Motif.consensus and Motif.anticonsensus are now properties instead of methods.
-Please use yourmotif.consensus instead of yourmotif.consensus(), and
-yourmotif.anticonsensus instead of yourmotif.anticonsensus()""",
- PendingDeprecationWarning)
- return self
- ConsensusSeq.__name__ = Seq.__name__
-
- @property
def consensus(self):
- """Returns the consensus sequence.
+ """Returns the consensus sequence of a motif.
"""
- sequence = self.counts.consensus
- return Motif.ConsensusSeq(str(sequence), sequence.alphabet)
+ res=""
+ for i in range(self.length):
+ max_f=0
+ max_n="X"
+ for n in sorted(self[i]):
+ if self[i][n]>max_f:
+ max_f=self[i][n]
+ max_n=n
+ res+=max_n
+ return Seq(res,self.alphabet)
- @property
def anticonsensus(self):
"""returns the least probable pattern to be generated from this motif.
"""
- sequence = self.counts.anticonsensus
- return Motif.ConsensusSeq(str(sequence), sequence.alphabet)
-
- @property
- def degenerate_consensus(self):
- """Following the rules adapted from
-D. R. Cavener: "Comparison of the consensus sequence flanking
-translational start sites in Drosophila and vertebrates."
-Nucleic Acids Research 15(4): 1353-1361. (1987).
-The same rules are used by TRANSFAC."""
- return self.counts.degenerate_consensus
+ res=""
+ for i in range(self.length):
+ min_f=10.0
+ min_n="X"
+ for n in sorted(self[i]):
+ if self[i][n]<min_f:
+ min_f=self[i][n]
+ min_n=n
+ res+=min_n
+ return Seq(res,self.alphabet)
def max_score(self):
"""Maximal possible score for this motif.
returns the score computed for the consensus sequence.
"""
- warnings.warn("""\
-This function is now deprecated. Instead of
->>> motif.max_score()
-please use
->>> pwm = motif.counts.normalize()
->>> pssm = pwm.log_odds()
->>> pssm.max_score()
-""",
- PendingDeprecationWarning)
- return self.score_hit(self.consensus,0)
-
+ return self.score_hit(self.consensus(),0)
+
def min_score(self):
"""Minimal possible score for this motif.
returns the score computed for the anticonsensus sequence.
"""
- warnings.warn("""\
-This function is now deprecated. Instead of
->>> motif.min_score()
-please use
->>> pwm = motif.counts.normalize()
->>> pssm = pwm.log_odds()
->>> pssm.min_score()
-""",
- PendingDeprecationWarning)
- return self.score_hit(self.anticonsensus,0)
+ return self.score_hit(self.anticonsensus(),0)
- def weblogo(self,fname,format="PNG",version="2.8.2", **kwds):
+ def weblogo(self,fname,format="PNG",**kwds):
"""
- uses the Berkeley weblogo service to download and save a weblogo of
- itself
-
+ uses the Berkeley weblogo service to download and save a weblogo of itself
+
requires an internet connection.
The parameters from **kwds are passed directly to the weblogo server.
-
- Currently, this method uses WebLogo 2.8.2 by default. This is likely
- to change in a future release of Biopython in favor of WebLogo 3.3.
- To use WebLogo 3.3 now, use the 'version="3"' argument when calling
- this method.
-
- These are the arguments and their default values passed to
- WebLogo 2.8.2; see their website at http://weblogo.berkeley.edu
- for more information:
-
- 'logowidth' : '18',
- 'logoheight' : '5',
- 'logounits' : 'cm',
- 'kind' : 'AUTO',
- 'firstnum' : "1",
- 'command' : 'Create Logo',
- 'smallsamplecorrection' : "on",
- 'symbolsperline' : 32,
- 'res' : '96',
- 'res_units' : 'ppi',
- 'antialias' : 'on',
- 'title' : '',
- 'barbits' : '',
- 'xaxis': 'on',
- 'xaxis_label' : '',
- 'yaxis': 'on',
- 'yaxis_label' : '',
- 'showends' : 'on',
- 'shrink' : '0.5',
- 'fineprint' : 'on',
- 'ticbits' : '1',
- 'colorscheme' : 'DEFAULT',
- 'color1' : 'black',
- 'color2' : 'blue',
- 'color3' : 'red',
- 'color4' : 'black',
- 'color5' : 'purple',
- 'color6' : 'orange',
-
- These are the arguments and their default values passed to
- WebLogo 3.3; see their website at http://weblogo.threeplusone.com
- for more information:
-
- 'stack_width' : 'medium',
- 'stack_per_line' : '40',
- 'alphabet' : 'alphabet_dna',
- 'ignore_lower_case' : True,
- 'unit_name' : "bits",
- 'first_index' : '1',
- 'logo_start' : '1',
- 'logo_end': str(self.length),
- 'composition' : "comp_auto",
- 'percentCG' : '',
- 'scale_width' : True,
- 'show_errorbars' : True,
- 'logo_title' : '',
- 'logo_label' : '',
- 'show_xaxis': True,
- 'xaxis_label': '',
- 'show_yaxis': True,
- 'yaxis_label': '',
- 'yaxis_scale': 'auto',
- 'yaxis_tic_interval' : '1.0',
- 'show_ends' : True,
- 'show_fineprint' : True,
- 'symbols0': '',
- 'symbols1': '',
- 'symbols2': '',
- 'symbols3': '',
- 'symbols4': '',
- 'color0': '',
- 'color1': '',
- 'color2': '',
- 'color3': '',
- 'color4': '',
-
"""
import urllib
import urllib2
- version = str(version)
- if version.startswith("2"):
- warnings.warn("""\
-Please use WebLogo 3 instead of the older WebLogo 2.8.2.
-To use WebLogo 3, use the 'version="3"' argument when calling this method.
-By default, this method uses WebLogo 2.8.2; this default will be changed
-to WebLogo 3 in a future release of Biopython.
-Note that the arguments to WebLogo 3 can be different from those to
-WebLogo 2.8.2. See the WebLogo website (http://weblogo.threeplusone.com)
-for more information. You can also use help() on this method to see the
-arguments to WebLogo 2.8.2 and WebLogo 3.""",
- PendingDeprecationWarning)
- al= self._to_fasta()
- url = 'http://weblogo.berkeley.edu/logo.cgi'
- values = {'sequence' : al,
- 'format' : format,
- 'logowidth' : '18',
- 'logoheight' : '5',
- 'logounits' : 'cm',
- 'kind' : 'AUTO',
- 'firstnum' : "1",
- 'command' : 'Create Logo',
- 'smallsamplecorrection' : "on",
- 'symbolsperline' : 32,
- 'res' : '96',
- 'res_units' : 'ppi',
- 'antialias' : 'on',
- 'title' : '',
- 'barbits' : '',
- 'xaxis': 'on',
- 'xaxis_label' : '',
- 'yaxis': 'on',
- 'yaxis_label' : '',
- 'showends' : 'on',
- 'shrink' : '0.5',
- 'fineprint' : 'on',
- 'ticbits' : '1',
- 'colorscheme' : 'DEFAULT',
- 'color1' : 'black',
- 'color2' : 'blue',
- 'color3' : 'red',
- 'color4' : 'black',
- 'color5' : 'purple',
- 'color6' : 'orange',
- }
- for k,v in kwds.iteritems():
- values[k]=str(v)
- else:
- frequencies= self._to_transfac()
- url = 'http://weblogo.threeplusone.com/create.cgi'
- values = {'sequences' : frequencies,
- 'format' : format.lower(),
- 'stack_width' : 'medium',
- 'stack_per_line' : '40',
- 'alphabet' : 'alphabet_dna',
- 'ignore_lower_case' : True,
- 'unit_name' : "bits",
- 'first_index' : '1',
- 'logo_start' : '1',
- 'logo_end': str(self.length),
- 'composition' : "comp_auto",
- 'percentCG' : '',
- 'scale_width' : True,
- 'show_errorbars' : True,
- 'logo_title' : '',
- 'logo_label' : '',
- 'show_xaxis': True,
- 'xaxis_label': '',
- 'show_yaxis': True,
- 'yaxis_label': '',
- 'yaxis_scale': 'auto',
- 'yaxis_tic_interval' : '1.0',
- 'show_ends' : True,
- 'show_fineprint' : True,
- 'symbols0': '',
- 'symbols1': '',
- 'symbols2': '',
- 'symbols3': '',
- 'symbols4': '',
- 'color0': '',
- 'color1': '',
- 'color2': '',
- 'color3': '',
- 'color4': '',
- }
- for k,v in kwds.iteritems():
- if type(values[k])==bool:
- if not v:
- v = ""
- values[k]=str(v)
-
+ al= self._to_fasta()
+ url = 'http://weblogo.berkeley.edu/logo.cgi'
+ values = {'sequence' : al,
+ 'format' : format,
+ 'logowidth' : '18',
+ 'logoheight' : '5',
+ 'logounits' : 'cm',
+ 'kind' : 'AUTO',
+ 'firstnum' : "1",
+ 'command' : 'Create Logo',
+ 'smallsamplecorrection' : "on",
+ 'symbolsperline' : 32,
+ 'res' : '96',
+ 'res_units' : 'ppi',
+ 'antialias' : 'on',
+ 'title' : '',
+ 'barbits' : '',
+ 'xaxis': 'on',
+ 'xaxis_label' : '',
+ 'yaxis': 'on',
+ 'yaxis_label' : '',
+ 'showends' : 'on',
+ 'shrink' : '0.5',
+ 'fineprint' : 'on',
+ 'ticbits' : '1',
+ 'colorscheme' : 'DEFAULT',
+ 'color1' : 'green',
+ 'color2' : 'blue',
+ 'color3' : 'red',
+ 'color4' : 'black',
+ 'color5' : 'purple',
+ 'color6' : 'orange',
+ 'color1' : 'black',
+ }
+ for k,v in kwds.iteritems():
+ values[k]=str(v)
+
data = urllib.urlencode(values)
req = urllib2.Request(url, data)
response = urllib2.urlopen(req)
f=open(fname,"w")
im=response.read()
-
+
f.write(im)
f.close()
+
def _to_transfac(self):
"""Write the representation of a motif in TRANSFAC format
"""
- from Bio.Motif import TRANSFAC
- multiple_value_keys = TRANSFAC.Motif.multiple_value_keys
- sections = (('AC', 'AS',), # Accession
- ('ID',), # ID
- ('DT', 'CO'), # Date, copyright
- ('NA',), # Name
- ('DE',), # Short factor description
- ('TY',), # Type
- ('OS', 'OC'), # Organism
- ('HP', 'HC'), # Superfamilies, subfamilies
- ('BF',), # Binding factors
- ('P0',), # Frequency matrix
- ('BA',), # Statistical basis
- ('BS',), # Factor binding sites
- ('CC',), # Comments
- ('DR',), # External databases
- ('OV', 'PV',), # Versions
- )
- lines = []
- for section in sections:
- blank = False
- for key in section:
- if key=='P0':
- # Frequency matrix
- length = self.length
- if length==0:
- continue
- sequence = self.degenerate_consensus
- line = "P0 A C G T"
- lines.append(