merged Andrew's Seq package with the tree

added MANIFEST.in and setup.py for distutils
biopython · Apr 30, 2000 · f726249 · f726249
1 parent 18dbba1
commit f726249
Show file tree

Hide file tree

Showing 26 changed files with 2,310 additions and 6 deletions.
diff --git a/AUTHORS b/AUTHORS
@@ -0,0 +1,4 @@
+Jeffrey Chang <jchang@smi.stanford.edu>
+Andrew Dalke <dalke@acm.org>
+Katharine Lindner <katel@worldpath.net>
+
diff --git a/Bio/Alphabet/IUPAC.py b/Bio/Alphabet/IUPAC.py
@@ -0,0 +1,103 @@
+# Define the IUPAC Alphabets you know and love
+
+from Bio import Alphabet
+from Bio.Data import IUPACData
+
+##################### Protein
+
+# From the IUPAC definition at:
+#   http://www.chem.qmw.ac.uk/iupac/AminoAcid/A2021.html#AA21
+class IUPACProtein(Alphabet.ProteinAlphabet):
+    """Protein alphabet whose letters are IUPACData.protein_letters."""
+    letters = IUPACData.protein_letters
+
+# Module-level instance of the alphabet.
+protein = IUPACProtein()
+
+# This could be considered the base class for the standard IUPAC
+# protein, except that some encodings will use "X" to mean "unknown
+# character", which causes a collision.  If you use X for
+# selenocysteines, then you'll need a new alphabet.
+
+class ExtendedIUPACProtein(Alphabet.ProteinAlphabet):
+    """Protein alphabet whose letters are IUPACData.extended_protein_letters.
+
+    Letters singled out for this alphabet:
+      B = "Asx";  aspartic acid or asparagine
+      X = "Sec";  selenocysteine
+      Z = "Glx";  glutamic acid or glutamine (or substances such as
+              4-carboxyglutamic acid and 5-oxoproline that yield glutamic
+              acid on acid hydrolysis of peptides)
+    """
+    # NOTE(review): many encodings use "X" for "unknown residue", and later
+    # IUPAC-IUBMB usage assigns "U" to selenocysteine -- confirm that "X"
+    # is still the intended letter for Sec here.
+    letters = IUPACData.extended_protein_letters
+
+# Module-level instance of the alphabet.
+extended_protein = ExtendedIUPACProtein()
+
+##################### DNA
+
+# The next two are the IUPAC definitions, from:
+#   http://www.chem.qmw.ac.uk/iubmb/misc/naseq.html
+class IUPACAmbiguousDNA(Alphabet.DNAAlphabet):
+    """DNA alphabet whose letters are IUPACData.ambiguous_dna_letters."""
+    letters = IUPACData.ambiguous_dna_letters
+
+# Module-level instance of the alphabet.
+ambiguous_dna = IUPACAmbiguousDNA()
+
+class IUPACUnambiguousDNA(IUPACAmbiguousDNA):
+    """DNA alphabet whose letters are IUPACData.unambiguous_dna_letters.
+
+    Declared as a subclass of IUPACAmbiguousDNA, so an instance of this
+    class is also an instance of the ambiguous alphabet.
+    """
+    letters = IUPACData.unambiguous_dna_letters
+
+# Module-level instance of the alphabet.
+unambiguous_dna = IUPACUnambiguousDNA()
+
+
+# Also from the URL, but not part of the standard
+class ExtendedIUPACDNA(Alphabet.DNAAlphabet):
+    """DNA alphabet whose letters are IUPACData.extended_dna_letters.
+
+    Extra letters noted for this alphabet:
+      B == 5-bromouridine
+      D == 5,6-dihydrouridine
+      S == thiouridine
+      W == wyosine
+    """
+    letters = IUPACData.extended_dna_letters
+
+# Module-level instance of the alphabet.
+extended_dna = ExtendedIUPACDNA()
+
+##################### RNA
+
+class IUPACAmbiguousRNA(Alphabet.RNAAlphabet):
+    """RNA alphabet whose letters are IUPACData.ambiguous_rna_letters."""
+    letters = IUPACData.ambiguous_rna_letters
+
+# Module-level instance of the alphabet.
+ambiguous_rna = IUPACAmbiguousRNA()
+
+class IUPACUnambiguousRNA(IUPACAmbiguousRNA):
+    """RNA alphabet whose letters are IUPACData.unambiguous_rna_letters.
+
+    Declared as a subclass of IUPACAmbiguousRNA, so an instance of this
+    class is also an instance of the ambiguous alphabet.
+    """
+    letters = IUPACData.unambiguous_rna_letters
+
+# Module-level instance of the alphabet.
+unambiguous_rna = IUPACUnambiguousRNA()
+
+# are there extended forms?
+#class ExtendedIUPACRNA(Alphabet.RNAAlphabet):
+#    letters = extended_rna_letters
+#    #   B == 5-bromouridine
+#    #   D == 5,6-dihydrouridine
+#    #   S == thiouridine
+#    #   W == wyosine
+
+
+# We need to load the property resolution information, but we need to
+# wait until after the systems have been loaded. (There's a nasty loop
+# where, eg, translation objects need an alphabet, which need to be
+# associated with translators.)
+
+from Bio.PropertyManager import default_manager
+
+def _bootstrap(manager, klass, property):
+    """Placeholder resolver that loads the IUPAC encodings on first use.
+
+    On its first call it removes the resolver entries of all seven IUPAC
+    alphabet classes from default_manager.class_resolver, imports
+    Bio.Encodings.IUPACEncoding, and then asks the manager to resolve
+    (klass, property) again, returning that result.
+
+    manager must be default_manager (checked with an assert).
+    """
+    assert manager is default_manager
+
+    # Drop every placeholder entry before importing the encodings, in the
+    # same order the entries were written out one by one originally.
+    for alphabet_class in (IUPACProtein, ExtendedIUPACProtein,
+                           IUPACAmbiguousDNA, IUPACUnambiguousDNA,
+                           ExtendedIUPACDNA,
+                           IUPACAmbiguousRNA, IUPACUnambiguousRNA):
+        del default_manager.class_resolver[alphabet_class]
+
+    # Imported only for its side effects; presumably this module sets up
+    # the real property resolution for the alphabets -- TODO confirm.
+    from Bio.Encodings import IUPACEncoding
+
+    return manager.resolve_class(klass, property)
+
+# Install _bootstrap as the class resolver of every IUPAC alphabet class.
+for _alphabet_class in (IUPACProtein, ExtendedIUPACProtein,
+                        IUPACAmbiguousDNA, IUPACUnambiguousDNA,
+                        ExtendedIUPACDNA,
+                        IUPACAmbiguousRNA, IUPACUnambiguousRNA):
+    default_manager.class_resolver[_alphabet_class] = _bootstrap
+del _alphabet_class  # do not leave the loop variable in the module namespace
diff --git a/Bio/Alphabet/__init__.py b/Bio/Alphabet/__init__.py
@@ -0,0 +1,101 @@
+import string, re
+
+# This is used by sequences which contain a finite number of similar
+# words.
+
+class Alphabet:
+    """Base class for alphabets.
+
+    size    -- word length; None here, meaning no fixed size
+    letters -- the allowed words; None here, meaning no fixed alphabet
+               (subclasses provide a list-like object)
+    """
+    size = None
+    letters = None
+
+    def __repr__(self):
+        """Return the class name followed by "()"."""
+        return "%s()" % self.__class__.__name__
+
+    def contains(self, other):
+        """Return whether other is an instance of this object's class."""
+        other_is_same_kind = isinstance(other, self.__class__)
+        return other_is_same_kind
+
+# Module-level instance of the base alphabet.
+generic_alphabet = Alphabet()
+
+class SingleLetterAlphabet(Alphabet):
+    """Alphabet whose words are single letters (size 1).
+
+    letters is None here; subclasses set it to a string of all letters
+    in the alphabet.
+    """
+    size = 1
+    letters = None
+
+########### Protein
+
+class ProteinAlphabet(SingleLetterAlphabet):
+    """Single-letter protein alphabet; adds nothing to its base class."""
+
+# Module-level instance of the alphabet.
+generic_protein = ProteinAlphabet()
+
+########### DNA
+class NucleotideAlphabet(SingleLetterAlphabet):
+    """Single-letter nucleotide alphabet; base class of the DNA and RNA ones."""
+
+# Module-level instance of the alphabet.
+generic_nucleotide = NucleotideAlphabet()
+
+class DNAAlphabet(NucleotideAlphabet):
+    """Nucleotide alphabet for DNA; adds nothing to its base class."""
+
+# Module-level instance of the alphabet.
+generic_dna = DNAAlphabet()
+
+
+########### RNA
+
+class RNAAlphabet(NucleotideAlphabet):
+    """Nucleotide alphabet for RNA; adds nothing to its base class."""
+
+# Module-level instance of the alphabet.
+generic_rna = RNAAlphabet()
+
+
+
+########### Other per-sequence encodings
+
+class SecondaryStructure(SingleLetterAlphabet):
+    """Single-letter alphabet with the four letters H, S, T and C.
+
+    NOTE(review): presumably helix / strand / turn / coil -- confirm.
+    """
+    letters = "HSTC"
+
+class ThreeLetterProtein(Alphabet):
+    """Protein alphabet whose words are three-letter codes (size 3).
+
+    letters is a list of the codes rather than a string.
+    """
+    size = 3
+    letters = [
+        "Ala", "Asx", "Cys", "Asp", "Glu", "Phe",
+        "Gly", "His", "Ile", "Lys", "Leu", "Met",
+        "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
+        "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
+        ]
+
+###### Non per-sequence modifications
+
+# (These are Decorator classes)
+
+class AlphabetEncoder:
+    """Decorator that wraps an alphabet and extends its letters.
+
+    alphabet    -- the wrapped alphabet object (stored as self.alphabet)
+    new_letters -- letters appended to alphabet.letters to form
+                   self.letters; if the wrapped alphabet has no fixed
+                   letters (None), self.letters is None as well
+
+    Attributes not found on the encoder itself are looked up on the
+    wrapped alphabet.
+    """
+    def __init__(self, alphabet, new_letters):
+        self.alphabet = alphabet
+        if alphabet.letters is not None:
+            self.letters = alphabet.letters + new_letters
+        else:
+            self.letters = None
+
+    def __getattr__(self, key):
+        """Delegate unknown attributes to the wrapped alphabet.
+
+        Raises AttributeError if the encoder has no wrapped alphabet yet
+        or the wrapped alphabet lacks the attribute.
+        """
+        # __getattr__ only runs when normal lookup failed.  If the missing
+        # name is "alphabet" itself (instance created without __init__,
+        # e.g. while being copied or unpickled), reading self.alphabet
+        # below would re-enter this method forever.
+        if key == "alphabet":
+            raise AttributeError(key)
+        return getattr(self.alphabet, key)
+
+    def contains(self, other):
+        """Return 0: a bare encoder contains nothing; subclasses override."""
+        return 0
+
+class Gapped(AlphabetEncoder):
+    """Alphabet decorator that adds a gap character to the wrapped alphabet.
+
+    alphabet -- the alphabet to wrap
+    gap_char -- the gap character; defaults to '-'
+    """
+    gap_char = '-'
+    def __init__(self, alphabet, gap_char = gap_char):
+        AlphabetEncoder.__init__(self, alphabet, gap_char)
+        # Remember the character actually used; without this a custom
+        # gap_char was added to the letters but self.gap_char still read
+        # the class default.
+        self.gap_char = gap_char
+
+    def contains(self, other):
+        """Return true if other is gapped the same way over a contained alphabet.
+
+        An other that has no gap_char or no wrapped alphabet is simply
+        not contained (returns 0) instead of raising AttributeError.
+        """
+        try:
+            other_gap_char = other.gap_char
+            other_alphabet = other.alphabet
+        except AttributeError:
+            return 0
+        return other_gap_char == self.gap_char and \
+               self.alphabet.contains(other_alphabet)
+
+class HasStopCodon(AlphabetEncoder):
+    """Alphabet decorator that adds a stop symbol to the wrapped alphabet.
+
+    alphabet    -- the alphabet to wrap
+    stop_symbol -- the stop symbol; defaults to "*"
+    """
+    stop_symbol = "*"
+    def __init__(self, alphabet, stop_symbol = stop_symbol):
+        AlphabetEncoder.__init__(self, alphabet, stop_symbol)
+        # Remember the symbol actually used; without this a custom
+        # stop_symbol was added to the letters but self.stop_symbol still
+        # read the class default, so __cmp__ and contains compared the
+        # wrong value.
+        self.stop_symbol = stop_symbol
+
+    def __cmp__(self, other):
+        """Order by wrapped alphabet first, then by stop symbol."""
+        x = cmp(self.alphabet, other.alphabet)
+        if x == 0:
+            return cmp(self.stop_symbol, other.stop_symbol)
+        return x
+
+    def contains(self, other):
+        """Return true if other has the same stop symbol over a contained alphabet.
+
+        An other that has no stop_symbol or no wrapped alphabet is simply
+        not contained (returns 0) instead of raising AttributeError.
+        """
+        try:
+            other_stop_symbol = other.stop_symbol
+            other_alphabet = other.alphabet
+        except AttributeError:
+            return 0
+        return other_stop_symbol == self.stop_symbol and \
+               self.alphabet.contains(other_alphabet)