Permalink
Browse files

Updating the documentation on JASPAR motifs

  • Loading branch information...
1 parent 341fa42 commit 3aedebad8d45b2d691872855c78b6bc1ffe20a75 Michiel de Hoon committed Jan 2, 2013
Showing with 126 additions and 57 deletions.
  1. +3 −9 Bio/Motif/_Motif.py
  2. +24 −0 Bio/Motif/__init__.py
  3. +98 −47 Doc/Tutorial.tex
  4. +1 −1 Tests/test_Motif.py
View
@@ -515,20 +515,14 @@ def __init__(self, alphabet=None, instances=None, counts=None):
warnings.warn("This is experimental code, and may change in future versions", BiopythonExperimentalWarning)
self.instances = []
for instance in instances:
- if alphabet is None:
- alphabet=instance.alphabet
- elif alphabet != instance.alphabet:
+ if alphabet != instance.alphabet:
raise ValueError("Alphabets are inconsistent")
if self.length is None:
self.length = len(instance)
elif self.length != len(instance):
message = "All instances should have the same length (%d found, %d expected)" % (len(instance), self.length)
raise ValueError(message)
self.instances.append(instance)
- if alphabet.letters is None:
- # If we didn't get a meaningful alphabet from the instances,
- # assume it is DNA.
- alphabet = IUPAC.unambiguous_dna
counts = {}
for letter in alphabet.letters:
counts[letter] = [0] * self.length
@@ -1104,14 +1098,14 @@ def reverse_complement(self):
"""
Gives the reverse complement of the motif
"""
+ alphabet = self.alphabet
if self.instances is not None:
instances = []
for instance in self.instances:
instance = instance.reverse_complement()
instances.append(instance)
- res = Motif(instances=instances)
+ res = Motif(instances=instances, alphabet=alphabet)
else: # has counts
- alphabet = self.alphabet
res = Motif(alphabet)
res.counts={}
res.counts["A"]=self.counts["T"][::-1]
View
@@ -14,6 +14,8 @@
from Bio.Motif.MEME import read as _MEME_read
from Bio.Motif import Jaspar
from Bio.Motif.Thresholds import ScoreDistribution
+from Bio.Alphabet import IUPAC
+from Bio.Seq import Seq
def _from_pfm(handle):
@@ -24,6 +26,28 @@ def _from_sites(handle):
return Motif()._from_jaspar_sites(handle)
+def create(instances, alphabet=None):
+ for instance in instances:
+ try:
+ a = instance.alphabet
+ except AttributeError:
+ # The instance is a plain string
+ continue
+ if alphabet is None:
+ alphabet = a
+ elif alphabet != a:
+ raise ValueError("Alphabets are inconsistent")
+ if alphabet is None or alphabet.letters is None:
+ # If we didn't get a meaningful alphabet from the instances,
+ # assume it is DNA.
+ alphabet = IUPAC.unambiguous_dna
+ seqs = []
+ for instance in instances:
+ seq = Seq(str(instance), alphabet=alphabet)
+ seqs.append(seq)
+ return Motif(instances=seqs, alphabet=alphabet)
+
+
def parse(handle,format):
"""Parses an output file of motif finding programs.
View
@@ -10770,7 +10770,13 @@ \section{Motif objects}
\begin{verbatim}
>>> from Bio import Motif
\end{verbatim}
-and we can start creating our first motif objects.
+and we can start creating our first motif objects. We can either create
+a \verb+Motif+ object from a list of instances of the motif, or we can
+obtain a \verb+Motif+ object by parsing a file from a motif database
+or motif finding software.
+
+\subsection{Creating a motif from instances}
+
Suppose we have these instances of a DNA motif:
%cont-doctest
\begin{verbatim}
@@ -10787,7 +10793,7 @@ \section{Motif objects}
then we can create a Motif object as follows:
%cont-doctest
\begin{verbatim}
->>> m = Motif.Motif(instances=instances)
+>>> m = Motif.create(instances)
\end{verbatim}
Printing out the Motif object shows the instances from which it was constructed:
%cont-doctest
@@ -10906,7 +10912,7 @@ \section{Motif objects}
The reverse complement and the degenerate consensus sequence are
only defined for DNA motifs.
-\subsection{Reading and writing}
+\subsection{Reading motifs}
\label{sec:io}
Creating motifs from instances by hand is a bit boring, so it's
@@ -10915,67 +10921,112 @@ \subsection{Reading and writing}
motifs, but there's a couple of formats which are more used than
others. The most important distinction is whether the motif
representation is based on instances or on some version of PWM matrix.
+
+\subsubsection*{JASPAR}
One of the most popular motif databases \href{http://jaspar.genereg.net}{JASPAR}
- stores motifs in both formats, so
-let's look at how we can import JASPAR motifs from instances:
+ stores motifs either as a list of instances, or as a frequency matrix.
+As an example, these are the beginning and ending lines of the JASPAR \verb+Arnt.sites+ file showing known binding sites of the mouse helix-loop-helix transcription factor Arnt:
+\begin{verbatim}
+>MA0004 ARNT 1
+CACGTGatgtcctc
+>MA0004 ARNT 2
+CACGTGggaggtac
+>MA0004 ARNT 3
+CACGTGccgcgcgc
+...
+>MA0004 ARNT 18
+AACGTGacagccctcc
+>MA0004 ARNT 19
+AACGTGcacatcgtcc
+>MA0004 ARNT 20
+aggaatCGCGTGc
+\end{verbatim}
+The parts of the sequence in capital letters are the motif instances that were found to align to each other.
+
+We can create a \verb+Motif+ object from these instances as follows:
%cont-doctest
\begin{verbatim}
>>> from Bio import Motif
->>> arnt = Motif.read(open("Arnt.sites"),"jaspar-sites")
+>>> arnt = Motif.read(open("Arnt.sites"), "sites")
\end{verbatim}
-and from a count matrix:
+The instances from which this motif was created is stored in the \verb+.instances+ property:
%cont-doctest
\begin{verbatim}
->>> srf = Motif.read(open("SRF.pfm"),"jaspar-pfm")
+>>> print arnt.instances[:3]
+[Seq('CACGTG', IUPACUnambiguousDNA()), Seq('CACGTG', IUPACUnambiguousDNA()), Seq('CACGTG', IUPACUnambiguousDNA())]
+>>> for instance in arnt.instances:
+... print instance
+...
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+CACGTG
+AACGTG
+AACGTG
+AACGTG
+AACGTG
+CGCGTG
+\end{verbatim}
+The counts matrix of this motif is automatically calculated from the instances:
+%cont-doctest
+\begin{verbatim}
+>>> print arnt.counts
+ 0 1 2 3 4 5
+A: 4.00 19.00 0.00 0.00 0.00 0.00
+C: 16.00 0.00 20.00 0.00 0.00 0.00
+G: 0.00 1.00 0.00 20.00 0.00 20.00
+T: 0.00 0.00 0.00 0.00 20.00 0.00
+<BLANKLINE>
\end{verbatim}
-The \verb|arnt| and \verb|srf| motifs can both do the same things for
-us, but they use different internal representations of the motif. We
-can tell that by inspecting the \verb|counts| and \verb|instances|
-properties:
+JASPAR also makes motifs available directly as a count matrix.
+For example, this is the JASPAR file \verb+SRF.pfm+ containing the count
+matrix for the human SRF transcription factor:
+\begin{verbatim}
+ 2 9 0 1 32 3 46 1 43 15 2 2
+ 1 33 45 45 1 1 0 0 0 1 0 1
+39 2 1 0 0 0 0 0 0 0 44 43
+ 4 2 0 0 13 42 0 45 3 30 0 0
+\end{verbatim}
+We can create a motif for this count matrix as follows:
%cont-doctest
\begin{verbatim}
->>> arnt.counts is None
-True
->>> arnt.instances is None
-False
->>> srf.counts is None
-False
->>> srf.instances is None
-True
+>>> srf = Motif.read(open("SRF.pfm"),"pfm")
+>>> print srf.counts
+ 0 1 2 3 4 5 6 7 8 9 10 11
+A: 2.00 9.00 0.00 1.00 32.00 3.00 46.00 1.00 43.00 15.00 2.00 2.00
+C: 1.00 33.00 45.00 45.00 1.00 1.00 0.00 0.00 0.00 1.00 0.00 1.00
+G: 39.00 2.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 44.00 43.00
+T: 4.00 2.00 0.00 0.00 13.00 42.00 0.00 45.00 3.00 30.00 0.00 0.00
+<BLANKLINE>
\end{verbatim}
-%TODO - sort order and formatting for doctest?
+As this motif was created from the counts matrix directly, it has no instances associated with it:
+%cont-doctest
\begin{verbatim}
->>> srf.counts
-{'A': [2, 9, 0, 1, 32, 3, 46, 1, 43, 15, 2, 2],
- 'C': [1, 33, 45, 45, 1, 1, 0, 0, 0, 1, 0, 1],
- 'G': [39, 2, 1, 0, 0, 0, 0, 0, 0, 0, 44, 43],
- 'T': [4, 2, 0, 0, 13, 42, 0, 45, 3, 30, 0, 0]}
+>>> print srf.instances
+None
\end{verbatim}
-
-There are conversion functions, which can help us convert between
-different representations:
+We can now ask for the consensus sequence of these two motifs:
+%cont-doctest
\begin{verbatim}
->>> arnt.make_counts_from_instances()
-{'A': [8, 38, 0, 0, 0, 0],
- 'C': [32, 0, 40, 0, 0, 0],
- 'G': [0, 2, 0, 40, 0, 40],
- 'T': [0, 0, 0, 0, 40, 0]}
-
->>> srf.make_instances_from_counts()
-[Seq('GGGAAAAAAAGG', IUPACUnambiguousDNA()),
- Seq('GGCCAAATAAGG', IUPACUnambiguousDNA()),
- Seq('GACCAAATAAGG', IUPACUnambiguousDNA()),
-....
+>>> print arnt.counts.consensus
+CACGTG
+>>> print srf.counts.consensus
+GCCCATATATGG
\end{verbatim}
-The important thing to remember here is that the method
-\verb|make_instances_from_counts()| creates fake instances, because
-usually there are very many possible sets of instances which give rise
-to the same PWM, and if we have only the count matrix, we cannot
-reconstruct the original one. This does not make any difference if we
-are using the PWM as the representation of the motif, but one should
-be careful with exporting instances from count-based motifs.
+\subsection{Writing motifs}
Speaking of exporting, let's look at export functions. We can export to fasta:
%cont-doctest
\begin{verbatim}
View
@@ -20,7 +20,7 @@ def setUp(self):
self.PFMout = "Motif/fa.out"
instance = Seq("ATATA")
instances = [instance]
- self.m=Motif.Motif(instances=instances)
+ self.m = Motif.create(instances)
def tearDown(self):
self.PFMin.close()

0 comments on commit 3aedeba

Please sign in to comment.