Merge branch 'master' of github.com:biopython/biopython

commit 86689112a64b858a39d52c68bd7b86dfacfc5a54 (2 parents: dfa8eb5 + cf063bf)
Authored by Michiel de Hoon
Showing 163 changed files with 1,050 additions and 1,534 deletions.
  1. +0 −1  Bio/AlignIO/Interfaces.py
  2. +1 −1  Bio/AlignIO/StockholmIO.py
  3. +0 −1  Bio/Alphabet/__init__.py
  4. +2 −1  Bio/Application/__init__.py
  5. +2 −1  Bio/Blast/NCBIStandalone.py
  6. +2 −3 Bio/Blast/NCBIWWW.py
  7. +0 −3  Bio/Blast/ParseBlastTable.py
  8. +10 −10 Bio/Crystal/__init__.py
  9. +4 −1 Bio/Entrez/__init__.py
  10. +1 −1  Bio/ExPASy/ScanProsite.py
  11. +0 −1  Bio/FSSP/FSSPTools.py
  12. +0 −1  Bio/FSSP/__init__.py
  13. +0 −1  Bio/FSSP/fssp_rec.py
  14. +510 −3 Bio/File.py
  15. +0 −3  Bio/GA/Mutation/Simple.py
  16. +3 −3 Bio/GA/Organism.py
  17. +0 −1  Bio/GA/Repair/Stabilizing.py
  18. +0 −2  Bio/GA/Selection/Diversity.py
  19. +0 −1  Bio/GA/Selection/Tournament.py
  20. +2 −2 Bio/Geo/Record.py
  21. +0 −3  Bio/Graphics/DisplayRepresentation.py
  22. +2 −2 Bio/Graphics/GenomeDiagram/_AbstractDrawer.py
  23. +1 −1  Bio/Graphics/GenomeDiagram/_CircularDrawer.py
  24. +3 −7 Bio/Graphics/GenomeDiagram/_Colors.py
  25. +0 −1  Bio/Graphics/GenomeDiagram/_Diagram.py
  26. +0 −1  Bio/Graphics/GenomeDiagram/_FeatureSet.py
  27. +0 −2  Bio/Graphics/GenomeDiagram/_Graph.py
  28. +0 −3  Bio/HMM/DynamicProgramming.py
  29. +0 −1  Bio/HMM/MarkovModel.py
  30. +0 −9 Bio/HMM/Trainer.py
  31. +0 −5 Bio/HMM/Utilities.py
  32. +0 −3  Bio/HotRand.py
  33. +1 −1  Bio/Index.py
  34. +0 −1  Bio/KDTree/KDTree.py
  35. +0 −1  Bio/Motif/MEME.py
  36. +0 −2  Bio/Motif/Parsers/AlignAce.py
  37. +0 −1  Bio/Motif/Parsers/MEME.py
  38. +2 −2 Bio/Motif/_Motif.py
  39. +20 −10 Bio/NMR/xpktools.py
  40. +0 −1  Bio/NeuralNetwork/Gene/Motif.py
  41. +2 −2 Bio/NeuralNetwork/Gene/Pattern.py
  42. +0 −1  Bio/NeuralNetwork/Gene/Signature.py
  43. +0 −4 Bio/NeuralNetwork/Training.py
  44. +96 −93 Bio/Nexus/Nexus.py
  45. +25 −25 Bio/Nexus/Nodes.py
  46. +60 −63 Bio/Nexus/Trees.py
  47. +5 −5 Bio/Nexus/cnexus.c
  48. +0 −1  Bio/PDB/AbstractPropertyMap.py
  49. +0 −1  Bio/PDB/Chain.py
  50. +1 −2  Bio/PDB/DSSP.py
  51. +0 −1  Bio/PDB/Dice.py
  52. +0 −1  Bio/PDB/FragmentMapper.py
  53. +0 −2  Bio/PDB/HSExposure.py
  54. +1 −2  Bio/PDB/MMCIF2Dict.py
  55. +0 −1  Bio/PDB/MMCIFParser.py
  56. +0 −1  Bio/PDB/Model.py
  57. +2 −2 Bio/PDB/NACCESS.py
  58. +0 −1  Bio/PDB/NeighborSearch.py
  59. +0 −4 Bio/PDB/PDBIO.py
  60. +0 −1  Bio/PDB/Polypeptide.py
  61. +0 −1  Bio/PDB/Residue.py
  62. +0 −1  Bio/PDB/ResidueDepth.py
  63. +0 −2  Bio/PDB/Structure.py
  64. +0 −1  Bio/PDB/StructureAlignment.py
  65. +0 −2  Bio/PDB/StructureBuilder.py
  66. +0 −3  Bio/Pathway/Rep/HashSet.py
  67. +0 −1  Bio/Phylo/Applications/_Phyml.py
  68. +0 −1  Bio/Phylo/Applications/_Raxml.py
  69. +0 −1  Bio/Phylo/BaseTree.py
  70. +0 −1  Bio/Phylo/Newick.py
  71. +0 −1  Bio/Phylo/NewickIO.py
  72. +2 −2 Bio/Phylo/PAML/_paml.py
  73. +0 −2  Bio/Phylo/PAML/yn00.py
  74. +0 −1  Bio/Phylo/PhyloXMLIO.py
  75. +0 −1  Bio/Phylo/_utils.py
  76. +0 −1  Bio/PopGen/Async/Local.py
  77. +0 −1  Bio/PopGen/FDist/Controller.py
  78. +3 −4 Bio/PopGen/FDist/Utils.py
  79. +0 −1  Bio/PopGen/FDist/__init__.py
  80. +1 −3 Bio/PopGen/GenePop/Controller.py
  81. +5 −6 Bio/PopGen/GenePop/FileParser.py
  82. +0 −1  Bio/PopGen/GenePop/LargeFileParser.py
  83. +0 −2  Bio/PopGen/GenePop/__init__.py
  84. +0 −2  Bio/PopGen/SimCoal/Controller.py
  85. +0 −2  Bio/PopGen/SimCoal/Template.py
  86. +0 −2  Bio/Restriction/RanaConfig.py
  87. +0 −1  Bio/Restriction/Restriction_Dictionary.py
  88. +0 −2  Bio/Restriction/_Update/RestrictionCompiler.py
  89. +1 −1  Bio/SCOP/Raf.py
  90. +0 −8 Bio/SCOP/Residues.py
  91. +2 −1  Bio/SCOP/__init__.py
  92. +1 −1  Bio/SCOP/three_to_one_dict.py
  93. +37 −67 Bio/SearchIO/BlastIO/blast_xml.py
  94. +4 −6 Bio/SearchIO/FastaIO.py
  95. +45 −11 Bio/SearchIO/__init__.py
  96. +16 −439 Bio/SearchIO/_index.py
  97. +0 −1  Bio/SeqIO/PdbIO.py
  98. +5 −6 Bio/SeqIO/UniprotIO.py
  99. +23 −5 Bio/SeqIO/__init__.py
  100. +2 −492 Bio/SeqIO/_index.py
  101. +0 −1  Bio/SeqUtils/CodonUsage.py
  102. +0 −1  Bio/SeqUtils/ProtParam.py
  103. +1 −1  Bio/SeqUtils/__init__.py
  104. +0 −1  Bio/SubsMat/FreqTable.py
  105. +9 −6 Bio/SubsMat/__init__.py
  106. +1 −1  Bio/SwissProt/__init__.py
  107. +3 −5 Bio/UniGene/UniGene.py
  108. +3 −3 Bio/UniGene/__init__.py
  109. +2 −1  Bio/Wise/dnal.py
  110. +1 −1  Bio/_py3k/_ordereddict.py
  111. +2 −2 Bio/triefind.py
  112. +24 −8 BioSQL/BioSeq.py
  113. +0 −1  BioSQL/Loader.py
  114. +0 −1  BioSQL/__init__.py
  115. +0 −1  Doc/examples/clustal_run.py
  116. +0 −1  Doc/examples/fasta_iterator.py
  117. +4 −7 Doc/examples/getgene.py
  118. +0 −2  Doc/examples/local_blast.py
  119. +0 −1  Doc/examples/make_subsmat.py
  120. +0 −1  Doc/examples/swissprot.py
  121. +0 −2  Doc/examples/www_blast.py
  122. +0 −3  Scripts/GenBank/check_output_simple.py
  123. +0 −3  Scripts/GenBank/find_parser_problems.py
  124. +0 −1  Scripts/Performance/biosql_performance_load.py
  125. +1 −1  Scripts/SeqGui/SeqGui.py
  126. +2 −2 Scripts/debug/debug_blast_parser.py
  127. +0 −18 Scripts/scop_pdb.py
  128. +3 −1 Scripts/xbbtools/nextorf.py
  129. +3 −1 Scripts/xbbtools/xbb_blast.py
  130. +9 −4 Scripts/xbbtools/xbb_blastbg.py
  131. +0 −2  Scripts/xbbtools/xbb_help.py
  132. +3 −4 Scripts/xbbtools/xbb_search.py
  133. +0 −2  Scripts/xbbtools/xbb_sequence.py
  134. +0 −1  Scripts/xbbtools/xbb_translations.py
  135. +0 −1  Scripts/xbbtools/xbbtools.py
  136. BIN  Tests/Blast/wnts.xml.bgz
  137. +6 −6 Tests/test_Cluster.py
  138. +0 −1  Tests/test_ColorSpiral.py
  139. +0 −1  Tests/test_Crystal.py
  140. +0 −1  Tests/test_FSSP.py
  141. +0 −1  Tests/test_HMMCasino.py
  142. +1 −1  Tests/test_KEGG.py
  143. +0 −1  Tests/test_Location.py
  144. +0 −1  Tests/test_Medline.py
  145. +2 −1  Tests/test_NCBI_BLAST_tools.py
  146. +0 −1  Tests/test_PAML_baseml.py
  147. +0 −1  Tests/test_ParserSupport.py
  148. +0 −1  Tests/test_Phd.py
  149. +2 −2 Tests/test_PopGen_GenePop_nodepend.py
  150. +32 −4 Tests/test_SearchIO_index.py
  151. +0 −1  Tests/test_SeqIO_PdbIO.py
  152. +1 −1  Tests/test_SeqIO_SeqXML.py
  153. +0 −2  Tests/test_SeqIO_online.py
  154. +0 −1  Tests/test_SeqUtils.py
  155. +0 −1  Tests/test_SubsMat.py
  156. +0 −2  Tests/test_TogoWS.py
  157. +2 −1  Tests/test_Wise.py
  158. +20 −0 Tests/test_bgzf.py
  159. +2 −6 Tests/test_geo.py
  160. +2 −1  Tests/test_psw.py
  161. +1 −1  Tests/test_seq.py
  162. +0 −1  Tests/test_translate.py
  163. +3 −3 Tests/test_trie.py
1  Bio/AlignIO/Interfaces.py
@@ -144,4 +144,3 @@ def write_alignment(self, alignment):
# You SHOULD subclass this, to write the alignment #
# objects to the file handle #
#####################################################
-
2  Bio/AlignIO/StockholmIO.py
@@ -194,7 +194,7 @@ def _write_record(self, record):
seq_name = seq_name.replace(" ","_")
if "start" in record.annotations \
- and "end" in record.annotations:
+ and "end" in record.annotations:
suffix = "/%s-%s" % (str(record.annotations["start"]),
str(record.annotations["end"]))
if seq_name[-len(suffix):] != suffix:
1  Bio/Alphabet/__init__.py
@@ -400,4 +400,3 @@ def _verify_alphabet(sequence):
if letter not in letters:
return False
return True
-
3  Bio/Application/__init__.py
@@ -19,7 +19,8 @@
The finished command line strings are then normally invoked via the built-in
Python module subprocess.
"""
-import os, sys
+import os
+import sys
import StringIO
import subprocess
import re
3  Bio/Blast/NCBIStandalone.py
@@ -2083,7 +2083,8 @@ def _invoke_blast(cline):
Expects a command line wrapper object from Bio.Blast.Applications
"""
- import subprocess, sys
+ import subprocess
+ import sys
blast_cmd = cline.program_name
if not os.path.exists(blast_cmd):
raise ValueError("BLAST executable does not exist at %s" % blast_cmd)
5 Bio/Blast/NCBIWWW.py
@@ -61,7 +61,8 @@ def qblast(program, database, sequence,
http://www.ncbi.nlm.nih.gov/BLAST/blast_overview.html
"""
- import urllib, urllib2
+ import urllib
+ import urllib2
import time
assert program in ['blastn', 'blastp', 'blastx', 'tblastn', 'tblastx']
@@ -243,5 +244,3 @@ def _parse_qblast_ref_page(handle):
except ValueError:
raise ValueError("A non-integer RTOE found in " \
+"the 'please wait' page, %s" % repr(rtoe))
-
-
3  Bio/Blast/ParseBlastTable.py
@@ -98,6 +98,3 @@ def _parse_fields(self, inline):
return 0
def _Parse(self, method_name, inline):
return getattr(self,method_name)(inline)
-
-
-
20 Bio/Crystal/__init__.py
@@ -23,7 +23,7 @@ def wrap_line(line):
return output
def validate_key(key):
- if type(key) != type(''):
+ if not isinstance(key, str):
raise CrystalError('chain requires a string label')
if len(key) != 1:
raise CrystalError('chain label should contain one letter')
@@ -37,7 +37,7 @@ class Hetero(object):
"""
def __init__(self, data):
# Enforce string storage
- if type(data) != type(""):
+ if not isinstance(data, str):
raise CrystalError('Hetero data must be an alphameric string')
if data.isalnum() == 0:
raise CrystalError('Hetero data must be an alphameric string')
@@ -66,12 +66,12 @@ def __len__(self): return len(self.data)
class Chain(object):
def __init__(self, residues = ''):
self.data = []
- if type(residues) == type(''):
+ if isinstance(residues, str):
residues = residues.replace('*', ' ')
residues = residues.strip()
elements = residues.split()
self.data = map(Hetero, elements)
- elif type(residues) == type([]):
+ elif isinstance(residues, list):
for element in residues:
if not isinstance(element, Hetero):
raise CrystalError('Text must be a string')
@@ -185,7 +185,7 @@ def index(self, item):
def __add__(self, other):
if isinstance(other, Chain):
return self.__class__(self.data + other.data)
- elif type(other) == type(''):
+ elif isinstance(other, str):
return self.__class__(self.data + Chain(other).data)
else:
raise TypeError
@@ -193,7 +193,7 @@ def __add__(self, other):
def __radd__(self, other):
if isinstance(other, Chain):
return self.__class__(other.data + self.data)
- elif type(other) == type(''):
+ elif isinstance(other, str):
return self.__class__(Chain(other).data + self.data)
else:
raise TypeError
@@ -201,7 +201,7 @@ def __radd__(self, other):
def __iadd__(self, other):
if isinstance(other, Chain):
self.data += other.data
- elif type(other) == type(''):
+ elif isinstance(other, str):
self.data += Chain(other).data
else:
raise TypeError
@@ -210,7 +210,7 @@ def __iadd__(self, other):
class Crystal(object):
def __init__(self, data = {}):
# Enforce storage
- if type(data) != type({}):
+ if not isinstance(data, dict):
raise CrystalError('Crystal must be a dictionary')
self.data = data
self.fix()
@@ -221,7 +221,7 @@ def fix(self):
element = data[key]
if isinstance(element, Chain):
pass
- elif type(element) == type(''):
+ elif isinstance(element, str):
data[key] = Chain(element)
else:
raise TypeError
@@ -250,7 +250,7 @@ def __getitem__(self, key): return self.data[key]
def __setitem__(self, key, item):
if isinstance(item, Chain):
self.data[key] = item
- elif type(item) == type(''):
+ elif isinstance(item, str):
self.data[ key ] = Chain(item)
else:
raise TypeError
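
Note on the pattern above: the old type(x) == type('') comparison rejects subclasses of str, while isinstance() accepts them, which is why the isinstance form is preferred. A minimal illustration (ChainLabel is a made-up subclass, not part of Bio.Crystal):

    class ChainLabel(str):
        pass

    label = ChainLabel("A")
    print type(label) == type('')   # False: exact type check rejects the subclass
    print isinstance(label, str)    # True: subclasses are accepted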
5 Bio/Entrez/__init__.py
@@ -68,7 +68,10 @@
_open Internally used function.
"""
-import urllib, urllib2, time, warnings
+import urllib
+import urllib2
+import time
+import warnings
import os.path
from Bio._py3k import _binary_to_string_handle
2  Bio/ExPASy/ScanProsite.py
@@ -69,7 +69,7 @@ def feed(self, data, isFinal = 0):
# fed to the parser.
if self.firsttime:
if data[:5]!="<?xml":
- raise ValueError, data
+ raise ValueError(data)
self.firsttime = False
return ExpatParser.feed(self, data, isFinal)
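
Note on the change above: the comma form of raise is Python 2 only syntax (a SyntaxError on Python 3), while the call form parses on both, which is the point of this fix. A standalone sketch of the pattern, with check_xml_prefix as a made-up helper:

    def check_xml_prefix(data):
        # "raise ValueError, data" is Python 2 only; the call form
        # below is accepted by Python 2 and required by Python 3.
        if data[:5] != "<?xml":
            raise ValueError(data)

    check_xml_prefix("<?xml version='1.0'?>")   # passes silently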
1  Bio/FSSP/FSSPTools.py
@@ -89,4 +89,3 @@ def name_filter(sum_dict, align_dict, name_list):
new_align_dict.abs(pos_num).pos_align_dict[prot_num] = \
align_dict.abs(pos_num).pos_align_dict[prot_num]
return new_sum_dict, new_align_dict
-
1  Bio/FSSP/__init__.py
@@ -264,4 +264,3 @@ def read_fssp(fssp_handle):
del i.PosAlignList
align_dict.build_resnum_list()
return (header, sum_dict, align_dict)
-
1  Bio/FSSP/fssp_rec.py
@@ -31,4 +31,3 @@ class align(object):
turn5 = (20,22)
acc = (34,37)
start_aa_list = 42
-
513 Bio/File.py
@@ -1,4 +1,5 @@
# Copyright 1999 by Jeffrey Chang. All rights reserved.
+# Copyright 2009-2012 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
@@ -16,13 +17,33 @@
SGMLStripper Object that strips SGML. This is now DEPRECATED, and is likely
to be removed in a future release of Biopython.
+Additional private classes used in Bio.SeqIO and Bio.SearchIO for indexing
+files are also defined under Bio.File, but these are not intended for direct
+use.
"""
# For with statement in Python 2.5
from __future__ import with_statement
-import sys
+import codecs
+import os
import contextlib
import StringIO
+import itertools
+try:
+ from collections import UserDict as _dict_base
+except ImportError:
+ from UserDict import DictMixin as _dict_base
+
+try:
+ from sqlite3 import dbapi2 as _sqlite
+ from sqlite3 import IntegrityError as _IntegrityError
+ from sqlite3 import OperationalError as _OperationalError
+except ImportError:
+ #Not present on Jython, but should be included in Python 2.5
+ #or later (unless compiled from source without its dependencies)
+ #Still want to offer in-memory indexing.
+ _sqlite = None
+ pass
@contextlib.contextmanager
def as_handle(handleish, mode='r', **kwargs):
@@ -54,8 +75,7 @@ def as_handle(handleish, mode='r', **kwargs):
>>> fp.close()
"""
if isinstance(handleish, basestring):
- if 'encoding' in kwargs and sys.version_info[0] < 3:
- import codecs
+ if 'encoding' in kwargs:
with codecs.open(handleish, mode, **kwargs) as fp:
yield fp
else:
@@ -205,3 +225,490 @@ def strip(self, str):
return str
+#The rest of this file defines code used in Bio.SeqIO and Bio.SearchIO
+#for indexing
+
+class _IndexedSeqFileProxy(object):
+ """Base class for file format specific random access (PRIVATE).
+
+ This is subclassed in both Bio.SeqIO for indexing as SeqRecord
+ objects, and in Bio.SearchIO for indexing QueryResult objects.
+
+ Subclasses for each file format should define '__iter__', 'get'
+ and optionally 'get_raw' methods.
+ """
+
+ def __iter__(self):
+ """Returns (identifier, offset, length in bytes) tuples.
+
+ The length can be zero where it is not implemented or not
+ possible for a particular file format.
+ """
+ raise NotImplementedError("Subclass should implement this")
+
+ def get(self, offset):
+ """Returns parsed object for this entry."""
+ #Most file formats with self contained records can be handled by
+ #parsing StringIO(_bytes_to_string(self.get_raw(offset)))
+ raise NotImplementedError("Subclass should implement this")
+
+ def get_raw(self, offset):
+ """Returns bytes string (if implemented for this file format)."""
+ #Should be done by each sub-class (if possible)
+ raise NotImplementedError("Not available for this file format.")
+
+
+class _IndexedSeqFileDict(_dict_base):
+ """Read only dictionary interface to a sequential record file.
+
+ This code is used in both Bio.SeqIO for indexing as SeqRecord
+ objects, and in Bio.SearchIO for indexing QueryResult objects.
+
+ Keeps the keys and associated file offsets in memory, reads the file
+ to access entries as objects, parsing them on demand. This approach
+ is memory limited, but will work even with millions of records.
+
+ Note duplicate keys are not allowed. If this happens, a ValueError
+ exception is raised.
+
+ As used in Bio.SeqIO, by default the SeqRecord's id string is used
+ as the dictionary key. In Bio.SearchIO, the query's id string is
+ used. This can be changed by supplying an optional key_function,
+ a callback function which will be given the record id and must
+ return the desired key. For example, this allows you to parse
+ NCBI style FASTA identifiers, and extract the GI number to use
+ as the dictionary key.
+
+ Note that this dictionary is essentially read only. You cannot
+ add or change values, pop values, nor clear the dictionary.
+ """
+ def __init__(self, random_access_proxy, key_function,
+ repr, obj_repr):
+ #Use key_function=None for default value
+ self._proxy = random_access_proxy
+ self._key_function = key_function
+ self._repr = repr
+ self._obj_repr = obj_repr
+ if key_function:
+ offset_iter = (
+ (key_function(k), o, l) for (k, o, l) in random_access_proxy)
+ else:
+ offset_iter = random_access_proxy
+ offsets = {}
+ for key, offset, length in offset_iter:
+ #Note - we don't store the length because I want to minimise the
+ #memory requirements. With the SQLite backend the length is kept
+ #and is used to speed up the get_raw method (by about 3 times).
+ #The length should be provided by all the current backends except
+ #SFF where there is an existing Roche index we can reuse (very fast
+ #but lacks the record lengths)
+ #assert length or format in ["sff", "sff-trim"], \
+ # "%s at offset %i given length %r (%s format %s)" \
+ # % (key, offset, length, filename, format)
+ if key in offsets:
+ self._proxy._handle.close()
+ raise ValueError("Duplicate key '%s'" % key)
+ else:
+ offsets[key] = offset
+ self._offsets = offsets
+
+ def __repr__(self):
+ return self._repr
+
+ def __str__(self):
+ #TODO - How best to handle the __str__ for SeqIO and SearchIO?
+ if self:
+ return "{%r : %s(...), ...}" % (self.keys()[0], self._obj_repr)
+ else:
+ return "{}"
+
+ def __contains__(self, key):
+ return key in self._offsets
+
+ def __len__(self):
+ """How many records are there?"""
+ return len(self._offsets)
+
+ if hasattr(dict, "iteritems"):
+ #Python 2, use iteritems but not items etc
+ def values(self):
+ """Would be a list of the SeqRecord objects, but not implemented.
+
+ In general you can be indexing very very large files, with millions
+ of sequences. Loading all these into memory at once as SeqRecord
+ objects would (probably) use up all the RAM. Therefore we simply
+ don't support this dictionary method.
+ """
+ raise NotImplementedError("Due to memory concerns, when indexing a "
+ "sequence file you cannot access all the "
+ "records at once.")
+
+ def items(self):
+ """Would be a list of the (key, SeqRecord) tuples, but not implemented.
+
+ In general you can be indexing very very large files, with millions
+ of sequences. Loading all these into memory at once as SeqRecord
+ objects would (probably) use up all the RAM. Therefore we simply
+ don't support this dictionary method.
+ """
+ raise NotImplementedError("Due to memory concerns, when indexing a "
+ "sequence file you cannot access all the "
+ "records at once.")
+
+ def keys(self):
+ """Return a list of all the keys (SeqRecord identifiers)."""
+ #TODO - Stick a warning in here for large lists? Or just refuse?
+ return self._offsets.keys()
+
+ def itervalues(self):
+ """Iterate over the SeqRecord) items."""
+ for key in self.__iter__():
+ yield self.__getitem__(key)
+
+ def iteritems(self):
+ """Iterate over the (key, SeqRecord) items."""
+ for key in self.__iter__():
+ yield key, self.__getitem__(key)
+
+ def iterkeys(self):
+ """Iterate over the keys."""
+ return self.__iter__()
+
+ else:
+ #Python 3 - define items and values as iterators
+ def items(self):
+ """Iterate over the (key, SeqRecord) items."""
+ for key in self.__iter__():
+ yield key, self.__getitem__(key)
+
+ def values(self):
+ """Iterate over the SeqRecord items."""
+ for key in self.__iter__():
+ yield self.__getitem__(key)
+
+ def keys(self):
+ """Iterate over the keys."""
+ return self.__iter__()
+
+ def __iter__(self):
+ """Iterate over the keys."""
+ return iter(self._offsets)
+
+ def __getitem__(self, key):
+ """x.__getitem__(y) <==> x[y]"""
+ #Pass the offset to the proxy
+ record = self._proxy.get(self._offsets[key])
+ if self._key_function:
+ key2 = self._key_function(record.id)
+ else:
+ key2 = record.id
+ if key != key2:
+ raise ValueError("Key did not match (%s vs %s)" % (key, key2))
+ return record
+
+ def get(self, k, d=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self.__getitem__(k)
+ except KeyError:
+ return d
+
+ def get_raw(self, key):
+ """Similar to the get method, but returns the record as a raw string.
+
+ If the key is not found, a KeyError exception is raised.
+
+ Note that on Python 3 a bytes string is returned, not a typical
+ unicode string.
+
+ NOTE - This functionality is not supported for every file format.
+ """
+ #Pass the offset to the proxy
+ return self._proxy.get_raw(self._offsets[key])
+
+ def __setitem__(self, key, value):
+ """Would allow setting or replacing records, but not implemented."""
+ raise NotImplementedError("An indexed a sequence file is read only.")
+
+ def update(self, *args, **kwargs):
+ """Would allow adding more values, but not implemented."""
+ raise NotImplementedError("An indexed a sequence file is read only.")
+
+ def pop(self, key, default=None):
+ """Would remove specified record, but not implemented."""
+ raise NotImplementedError("An indexed a sequence file is read only.")
+
+ def popitem(self):
+ """Would remove and return a SeqRecord, but not implemented."""
+ raise NotImplementedError("An indexed a sequence file is read only.")
+
+ def clear(self):
+ """Would clear dictionary, but not implemented."""
+ raise NotImplementedError("An indexed a sequence file is read only.")
+
+ def fromkeys(self, keys, value=None):
+ """A dictionary method which we don't implement."""
+ raise NotImplementedError("An indexed a sequence file doesn't "
+ "support this.")
+
+ def copy(self):
+ """A dictionary method which we don't implement."""
+ raise NotImplementedError("An indexed a sequence file doesn't "
+ "support this.")
+
+class _SQLiteManySeqFilesDict(_IndexedSeqFileDict):
+ """Read only dictionary interface to many sequential record files.
+
+ This code is used in both Bio.SeqIO for indexing as SeqRecord
+ objects, and in Bio.SearchIO for indexing QueryResult objects.
+
+ Keeps the keys, file-numbers and offsets in an SQLite database. To access
+ a record by key, reads from the offset in the appropriate file and then
+ parses the record into an object.
+
+ There are OS limits on the number of files that can be open at once,
+ so a pool is kept. If a record is required from a closed file, then
+ one of the open handles is closed first.
+ """
+ def __init__(self, index_filename, filenames,
+ proxy_factory, format,
+ key_function, repr, max_open=10):
+ self._proxy_factory = proxy_factory
+ self._repr = repr
+ random_access_proxies = {}
+ #TODO? - Don't keep filename list in memory (just in DB)?
+ #Should save a chunk of memory if dealing with 1000s of files.
+ #Furthermore could compare a generator to the DB on reloading
+ #(no need to turn it into a list)
+ if not _sqlite:
+ # Hack for Jython (or if Python is compiled without it)
+ from Bio import MissingPythonDependencyError
+ raise MissingPythonDependencyError("Requires sqlite3, which is "
+ "included Python 2.5+")
+ if filenames is not None:
+ filenames = list(filenames) # In case it was a generator
+ if os.path.isfile(index_filename):
+ #Reuse the index.
+ con = _sqlite.connect(index_filename)
+ self._con = con
+ #Check the count...
+ try:
+ count, = con.execute(
+ "SELECT value FROM meta_data WHERE key=?;",
+ ("count",)).fetchone()
+ self._length = int(count)
+ if self._length == -1:
+ con.close()
+ raise ValueError("Unfinished/partial database")
+ count, = con.execute(
+ "SELECT COUNT(key) FROM offset_data;").fetchone()
+ if self._length != int(count):
+ con.close()
+ raise ValueError("Corrupt database? %i entries not %i"
+ % (int(count), self._length))
+ self._format, = con.execute(
+ "SELECT value FROM meta_data WHERE key=?;",
+ ("format",)).fetchone()
+ if format and format != self._format:
+ con.close()
+ raise ValueError("Index file says format %s, not %s"
+ % (self._format, format))
+ self._filenames = [row[0] for row in
+ con.execute("SELECT name FROM file_data "
+ "ORDER BY file_number;").fetchall()]
+ if filenames and len(filenames) != len(self._filenames):
+ con.close()
+ raise ValueError("Index file says %i files, not %i"
+ % (len(self._filenames), len(filenames)))
+ if filenames and filenames != self._filenames:
+ con.close()
+ raise ValueError("Index file has different filenames")
+ except _OperationalError, err:
+ con.close()
+ raise ValueError("Not a Biopython index database? %s" % err)
+ #Now we have the format (from the DB if not given to us),
+ if not proxy_factory(self._format):
+ con.close()
+ raise ValueError("Unsupported format '%s'" % self._format)
+ else:
+ self._filenames = filenames
+ self._format = format
+ if not format or not filenames:
+ raise ValueError("Filenames to index and format required")
+ if not proxy_factory(format):
+ raise ValueError("Unsupported format '%s'" % format)
+ #Create the index
+ con = _sqlite.connect(index_filename)
+ self._con = con
+ #print "Creating index"
+ # Sqlite PRAGMA settings for speed
+ con.execute("PRAGMA synchronous=OFF")
+ con.execute("PRAGMA locking_mode=EXCLUSIVE")
+ #Don't index the key column until the end (faster)
+ #con.execute("CREATE TABLE offset_data (key TEXT PRIMARY KEY, "
+ # "offset INTEGER);")
+ con.execute("CREATE TABLE meta_data (key TEXT, value TEXT);")
+ con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
+ ("count", -1))
+ con.execute("INSERT INTO meta_data (key, value) VALUES (?,?);",
+ ("format", format))
+ #TODO - Record the alphabet?
+ #TODO - Record the file size and modified date?
+ con.execute(
+ "CREATE TABLE file_data (file_number INTEGER, name TEXT);")
+ con.execute("CREATE TABLE offset_data (key TEXT, file_number INTEGER, offset INTEGER, length INTEGER);")
+ count = 0
+ for i, filename in enumerate(filenames):
+ con.execute(
+ "INSERT INTO file_data (file_number, name) VALUES (?,?);",
+ (i, filename))
+ random_access_proxy = proxy_factory(format, filename)
+ if key_function:
+ offset_iter = ((key_function(
+ k), i, o, l) for (k, o, l) in random_access_proxy)
+ else:
+ offset_iter = (
+ (k, i, o, l) for (k, o, l) in random_access_proxy)
+ while True:
+ batch = list(itertools.islice(offset_iter, 100))
+ if not batch:
+ break
+ #print "Inserting batch of %i offsets, %s ... %s" \
+ # % (len(batch), batch[0][0], batch[-1][0])
+ con.executemany(
+ "INSERT INTO offset_data (key,file_number,offset,length) VALUES (?,?,?,?);",
+ batch)
+ con.commit()
+ count += len(batch)
+ if len(random_access_proxies) < max_open:
+ random_access_proxies[i] = random_access_proxy
+ else:
+ random_access_proxy._handle.close()
+ self._length = count
+ #print "About to index %i entries" % count
+ try:
+ con.execute("CREATE UNIQUE INDEX IF NOT EXISTS "
+ "key_index ON offset_data(key);")
+ except _IntegrityError, err:
+ self._proxies = random_access_proxies
+ self.close()
+ con.close()
+ raise ValueError("Duplicate key? %s" % err)
+ con.execute("PRAGMA locking_mode=NORMAL")
+ con.execute("UPDATE meta_data SET value = ? WHERE key = ?;",
+ (count, "count"))
+ con.commit()
+ #print "Index created"
+ self._proxies = random_access_proxies
+ self._max_open = max_open
+ self._index_filename = index_filename
+ self._key_function = key_function
+
+ def __repr__(self):
+ return self._repr
+
+ def __contains__(self, key):
+ return bool(
+ self._con.execute("SELECT key FROM offset_data WHERE key=?;",
+ (key,)).fetchone())
+
+ def __len__(self):
+ """How many records are there?"""
+ return self._length
+ #return self._con.execute("SELECT COUNT(key) FROM offset_data;").fetchone()[0]
+
+ def __iter__(self):
+ """Iterate over the keys."""
+ for row in self._con.execute("SELECT key FROM offset_data;"):
+ yield str(row[0])
+
+ if hasattr(dict, "iteritems"):
+ #Python 2, use iteritems but not items etc
+ #Just need to override this...
+ def keys(self):
+ """Return a list of all the keys (SeqRecord identifiers)."""
+ return [str(row[0]) for row in
+ self._con.execute("SELECT key FROM offset_data;").fetchall()]
+
+ def __getitem__(self, key):
+ """x.__getitem__(y) <==> x[y]"""
+ #Pass the offset to the proxy
+ row = self._con.execute(
+ "SELECT file_number, offset FROM offset_data WHERE key=?;",
+ (key,)).fetchone()
+ if not row:
+ raise KeyError
+ file_number, offset = row
+ proxies = self._proxies
+ if file_number in proxies:
+ record = proxies[file_number].get(offset)
+ else:
+ if len(proxies) >= self._max_open:
+ #Close an old handle...
+ proxies.popitem()[1]._handle.close()
+ #Open a new handle...
+ proxy = self._proxy_factory(self._format, self._filenames[file_number])
+ record = proxy.get(offset)
+ proxies[file_number] = proxy
+ if self._key_function:
+ key2 = self._key_function(record.id)
+ else:
+ key2 = record.id
+ if key != key2:
+ raise ValueError("Key did not match (%s vs %s)" % (key, key2))
+ return record
+
+ def get(self, k, d=None):
+ """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
+ try:
+ return self.__getitem__(k)
+ except KeyError:
+ return d
+
+ def get_raw(self, key):
+ """Similar to the get method, but returns the record as a raw string.
+
+ If the key is not found, a KeyError exception is raised.
+
+ Note that on Python 3 a bytes string is returned, not a typical
+ unicode string.
+
+ NOTE - This functionality is not supported for every file format.
+ """
+ #Pass the offset to the proxy
+ row = self._con.execute(
+ "SELECT file_number, offset, length FROM offset_data WHERE key=?;",
+ (key,)).fetchone()
+ if not row:
+ raise KeyError
+ file_number, offset, length = row
+ proxies = self._proxies
+ if file_number in proxies:
+ if length:
+ #Shortcut if we have the length
+ h = proxies[file_number]._handle
+ h.seek(offset)
+ return h.read(length)
+ else:
+ return proxies[file_number].get_raw(offset)
+ else:
+ #This code is duplicated from __getitem__ to avoid a function call
+ if len(proxies) >= self._max_open:
+ #Close an old handle...
+ proxies.popitem()[1]._handle.close()
+ #Open a new handle...
+ proxy = self._proxy_factory(self._format, self._filenames[file_number])
+ proxies[file_number] = proxy
+ if length:
+ #Shortcut if we have the length
+ h = proxy._handle
+ h.seek(offset)
+ return h.read(length)
+ else:
+ return proxy.get_raw(offset)
+
+ def close(self):
+ """Close any open file handles."""
+ proxies = self._proxies
+ while proxies:
+ proxies.popitem()[1]._handle.close()
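
A side note on the class above: _SQLiteManySeqFilesDict is the machinery behind the SQLite-backed Bio.SeqIO.index_db() interface. A minimal usage sketch, assuming two hypothetical FASTA files with NCBI-style "gi|12345|..." identifiers:

    from Bio import SeqIO

    # Index two FASTA files into one SQLite database, keying each record
    # by the second pipe-separated field of its id (the GI number).
    index = SeqIO.index_db("records.idx", ["one.fasta", "two.fasta"], "fasta",
                           key_function=lambda name: name.split("|")[1])
    print len(index)          # total records across both files
    record = index["12345"]   # file number + offset lookup, parsed on demand
    index.close()             # close the pooled file handles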
3  Bio/GA/Mutation/Simple.py
@@ -83,6 +83,3 @@ def mutate(self, organism):
mutated_org.genome[gene_index] = new_letter
return mutated_org
-
-
-
6 Bio/GA/Organism.py
@@ -53,14 +53,14 @@ def random_population(genome_alphabet, genome_size, num_organisms,
letter_rand = random.Random()
# figure out what type of characters are in the alphabet
- if type(genome_alphabet.letters[0]) == type("A"):
+ if isinstance(genome_alphabet.letters[0], str):
if sys.version_info[0] == 3:
alphabet_type = "u" #Use unicode string on Python 3
else:
alphabet_type = "c" #Use byte string on Python 2
- elif type(genome_alphabet.letters[0]) == type(1):
+ elif isinstance(genome_alphabet.letters[0], int):
alphabet_type = "i"
- elif type(genome_alphabet.letters[0]) == type(1.0):
+ elif isinstance(genome_alphabet.letters[0], float):
alphabet_type = "d"
else:
raise ValueError(\
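
Note on the one-letter codes above: 'u', 'c', 'i' and 'd' read as array-module typecodes (unicode, byte char, int, double), matching the Python 2 vs 3 comments in the diff; treating them that way is an assumption about the surrounding Bio.GA code. On their own they behave like this:

    import array

    print array.array("i", [1, 2, 3])     # "i": signed integer values
    print array.array("d", [0.25, 0.75])  # "d": double precision floats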
1  Bio/GA/Repair/Stabilizing.py
@@ -56,4 +56,3 @@ def repair(self, organism):
new_org.genome[to_change] = new_gene
return new_org
-
2  Bio/GA/Selection/Diversity.py
@@ -94,5 +94,3 @@ def select(self, population):
# return the new population, which should have the same number
# of individuals we started with.
return new_population[:len(population)]
-
-
1  Bio/GA/Selection/Tournament.py
@@ -72,4 +72,3 @@ def select(self, population):
new_population.extend([new_org_1, new_org_2])
return new_population
-
4 Bio/Geo/Record.py
@@ -42,14 +42,14 @@ def __str__( self ):
att_keys.sort()
for key in att_keys:
contents = self.entity_attributes[ key ]
- if( type( contents ) == type( [] ) ):
+ if isinstance(contents, list):
for item in contents:
try:
output = output + '%s: %s\n' % ( key, item[ :40 ] )
output = output + out_block( item[ 40: ] )
except:
pass
- elif( type( contents ) == type( '' ) ):
+ elif isinstance(contents, str):
output = output + '%s: %s\n' % ( key, contents[ :40 ] )
output = output + out_block( contents[ 40: ] )
else:
3  Bio/Graphics/DisplayRepresentation.py
@@ -179,6 +179,3 @@ def _color_from_count(self, count):
# if we got here we didn't find a color for the count
raise ValueError("Count value %s was not found in the color scheme."
% count)
-
-
-
4 Bio/Graphics/GenomeDiagram/_AbstractDrawer.py
@@ -457,9 +457,9 @@ def set_page_size(self, pagesize, orientation):
Set the size of the drawing
"""
- if type(pagesize) == type('a'): # A string, so translate
+ if isinstance(pagesize, str): # A string, so translate
pagesize = page_sizes(pagesize)
- elif type(pagesize) == type((1,2)): # A tuple, so don't translate
+ elif isinstance(pagesize, tuple): # A tuple, so don't translate
pagesize = pagesize
else:
raise ValueError("Page size %s not recognised" % pagesize)
2  Bio/Graphics/GenomeDiagram/_CircularDrawer.py
@@ -231,7 +231,7 @@ def __init__(self, parent=None, pagesize='A3', orientation='landscape',
# Useful measurements on the page
self.track_size = track_size
self.circle_core = circle_core
- if circular == False: # Determine the proportion of the circumference
+ if not circular: # Determine the proportion of the circumference
self.sweep = 0.9 # around which information will be drawn
else:
self.sweep = 1
10 Bio/Graphics/GenomeDiagram/_Colors.py
@@ -79,7 +79,7 @@ def translate(self, color=None, colour=None):
color = colour
if color is None:
- raise ValueError, "Passed color (or colour) must be a valid color type"
+ raise ValueError("Passed color (or colour) must be a valid color type")
elif isinstance(color, int):
color = self.scheme_color(color)
elif isinstance(color, colors.Color):
@@ -87,9 +87,9 @@ def translate(self, color=None, colour=None):
elif isinstance(color, basestring):
#Assume its a named reportlab color like "red".
color = colors.toColor(color)
- elif type(color) == type((1., 2., 3.)) and type(color[0]) == type(1.):
+ elif isinstance(color, tuple) and isinstance(color[0], float):
color = self.float1_color(color)
- elif type(color) == type((1, 2, 3)) and type(color[0]) == type(1):
+ elif isinstance(color, tuple) and isinstance(color[0], int):
color = self.int255_color(color)
return color
@@ -224,7 +224,3 @@ def float1_color(self, values):
print gdct.translate((1, 75, 240))
print gdct.translate(7)
print gdct.translate(2)
-
-
-
-
1  Bio/Graphics/GenomeDiagram/_Diagram.py
@@ -485,4 +485,3 @@ def __str__(self):
outstr.append("Track %d: %s\n" % (level, self.tracks[level]))
outstr = '\n'.join(outstr)
return outstr
-
1  Bio/Graphics/GenomeDiagram/_FeatureSet.py
@@ -298,4 +298,3 @@ def __str__(self):
#for feature in gdfs.get_features():
# print feature.id, feature.start, feature.end
#print gdfs[500]
-
2  Bio/Graphics/GenomeDiagram/_Graph.py
@@ -280,5 +280,3 @@ def __str__(self):
outstr.append("Minimum: %s\n1Q: %s\n2Q: %s\n3Q: %s\nMaximum: %s" % self.quartiles())
outstr.append("Sequence Range: %s..%s" % self.range())
return "\n".join(outstr)
-
-
3  Bio/HMM/DynamicProgramming.py
@@ -323,6 +323,3 @@ class LogDPAlgorithms(AbstractDPAlgorithms):
"""
def __init__(self, markov_model, sequence):
raise NotImplementedError("Haven't coded this yet...")
-
-
-
1  Bio/HMM/MarkovModel.py
@@ -655,4 +655,3 @@ def _log_transform(self, probability):
log_prob[key] = neg_inf
return log_prob
-
9 Bio/HMM/Trainer.py
@@ -418,12 +418,3 @@ def _count_transitions(self, state_seq, transition_counts):
(cur_state, next_state))
return transition_counts
-
-
-
-
-
-
-
-
-
5 Bio/HMM/Utilities.py
@@ -50,8 +50,3 @@ def pretty_print_prediction(emissions, real_state, predicted_state,
break
cur_position += seq_length
-
-
-
-
-
3  Bio/HotRand.py
@@ -71,6 +71,3 @@ def hot_rand( self, high, low = 0 ):
nums = [ '0000', 'abcd', '1234', '5555', '4321', 'aaaa', 'ffff' ]
for num in nums:
print hex_convert( num )
-
-
-
2  Bio/Index.py
@@ -60,7 +60,7 @@ def __init__(self, indexname, truncate=None):
% (version, self.__version))
def __del__(self):
- if self.__dict__.has_key('data'):
+ if 'data' in self.__dict__:
self.data.close()
class _InMemoryIndex(dict):
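
Note: dict.has_key() was removed in Python 3, so membership is now spelled with the in operator; the two forms agree on Python 2:

    d = {'data': 1}
    # d.has_key('data')   # Python 2 only; removed in Python 3
    print 'data' in d     # True on both, and the idiomatic spelling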
1  Bio/KDTree/KDTree.py
@@ -259,4 +259,3 @@ def all_get_radii(self):
x, y, z=center
print "Found %i points in radius %f around center (%.2f, %.2f, %.2f)." % (len(indices), query_radius, x, y, z)
-
1  Bio/Motif/MEME.py
@@ -396,4 +396,3 @@ def __skip_unused_lines(handle):
raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
if not line.startswith('***'):
raise ValueError("Line does not start with '***':\n%s" % line)
-
2  Bio/Motif/Parsers/AlignAce.py
@@ -61,5 +61,3 @@ def read(handle):
else:
raise ValueError(line)
return record
-
-
1  Bio/Motif/Parsers/MEME.py
@@ -363,4 +363,3 @@ def __skip_unused_lines(handle):
raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
if not line.startswith('***'):
raise ValueError("Line does not start with '***':\n%s" % line)
-
4 Bio/Motif/_Motif.py
@@ -554,7 +554,7 @@ def _from_vert_matrix(self,stream,letters=None,make_instances=False):
self.counts[k].append(v)
self.length+=1
self.set_mask("*"*self.length)
- if make_instances==True:
+ if make_instances is True:
self.make_instances_from_counts()
return self
@@ -582,7 +582,7 @@ def _from_horiz_matrix(self,stream,letters=None,make_instances=False):
l = len(self.counts[letters[0]])
self.length=l
self.set_mask("*"*l)
- if make_instances==True:
+ if make_instances is True:
self.make_instances_from_counts()
return self
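
Note on the change above: == True becomes is True, an identity test against the True singleton rather than an equality test; for a plain boolean flag, a bare truth test is the usual idiom. For illustration:

    make_instances = True
    if make_instances is True:   # identity comparison with the singleton
        print "identity check passed"
    if make_instances:           # plain truth test, most idiomatic for a flag
        print "truth test passed"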
30 Bio/NMR/xpktools.py
@@ -68,7 +68,8 @@ def residue_dict(self,index):
# The nucleus should be given as the input argument in the
# same form as it appears in the xpk label line (H1, 15N for example)
- maxres=-1; minres=-1
+ maxres=-1
+ minres=-1
# Cast the data lines into the xpentry class
self.dict={}
@@ -104,12 +105,18 @@ def residue_dict(self,index):
def write_header(self,outfn):
outfile=_try_open_write(outfn)
- outfile.write(self.firstline);outfile.write("\012")
- outfile.write(self.axislabels);outfile.write("\012")
- outfile.write(self.dataset);outfile.write("\012")
- outfile.write(self.sw);outfile.write("\012")
- outfile.write(self.sf);outfile.write("\012")
- outfile.write(self.datalabels);outfile.write("\012")
+ outfile.write(self.firstline)
+ outfile.write("\012")
+ outfile.write(self.axislabels)
+ outfile.write("\012")
+ outfile.write(self.dataset)
+ outfile.write("\012")
+ outfile.write(self.sw)
+ outfile.write("\012")
+ outfile.write(self.sf)
+ outfile.write("\012")
+ outfile.write(self.datalabels)
+ outfile.write("\012")
outfile.close()
def _try_open_read(fn):
@@ -193,7 +200,8 @@ def data_table(fn_list, datalabel, keyatom):
[dict_list,label_line_list]=_read_dicts(fn_list,keyatom)
# Find global max and min residue numbers
- minr=dict_list[0]["minres"]; maxr=dict_list[0]["maxres"]
+ minr=dict_list[0]["minres"]
+ maxr=dict_list[0]["maxres"]
for dictionary in dict_list:
if (maxr < dictionary["maxres"]):
@@ -225,9 +233,11 @@ def _sort_keys(dictionary):
def _read_dicts(fn_list, keyatom):
# Read multiple files into a list of residue dictionaries
- dict_list=[]; datalabel_list=[]
+ dict_list=[]
+ datalabel_list=[]
for fn in fn_list:
- peaklist=Peaklist(fn); dict=peaklist.residue_dict(keyatom)
+ peaklist=Peaklist(fn)
+ dict=peaklist.residue_dict(keyatom)
dict_list.append(dict)
datalabel_list.append(peaklist.datalabels)
1  Bio/NeuralNetwork/Gene/Motif.py
@@ -209,4 +209,3 @@ def representation(self, sequence):
motif_amounts.append(seq_motifs[motif])
return motif_amounts
-
4 Bio/NeuralNetwork/Gene/Pattern.py
@@ -42,8 +42,8 @@ def write(self, pattern_list, output_handle):
"""
for pattern in pattern_list:
# deal with signatures, concatenate them with the separator
- if (type(pattern) == type([]) or
- type(pattern) == type(tuple([]))):
+ if (isinstance(pattern, list) or
+ isinstance(pattern, tuple)):
string_pattern = self.separator.join(pattern)
# deal with the normal cases
else:
1  Bio/NeuralNetwork/Gene/Signature.py
@@ -215,4 +215,3 @@ def representation(self, sequence):
sig_amounts.append(sequence_sigs[sig])
return sig_amounts
-
4 Bio/NeuralNetwork/Training.py
@@ -83,7 +83,3 @@ def add_examples(self, training_examples):
self.validation_examples.append(example)
else:
self.test_examples.append(example)
-
-
-
-
189 Bio/Nexus/Nexus.py
@@ -12,7 +12,10 @@
# For with in Python/Jython 2.5
from __future__ import with_statement
-import sys, math, random, copy
+import copy
+import math
+import random
+import sys
from Bio import File
from Bio.Alphabet import IUPAC
@@ -43,7 +46,7 @@ def __init__(self,string):
self.buffer=list(string)
else:
self.buffer=[]
-
+
def peek(self):
if self.buffer:
return self.buffer[0]
@@ -56,7 +59,7 @@ def peek_nonwhitespace(self):
return b[0]
else:
return None
-
+
def next(self):
if self.buffer:
return self.buffer.pop(0)
@@ -68,14 +71,14 @@ def next_nonwhitespace(self):
p=self.next()
if p is None:
break
- if p not in WHITESPACE:
+ if p not in WHITESPACE:
return p
return None
def skip_whitespace(self):
while self.buffer[0] in WHITESPACE:
self.buffer=self.buffer[1:]
-
+
def next_until(self,target):
for t in target:
try:
@@ -91,7 +94,7 @@ def next_until(self,target):
def peek_word(self,word):
return ''.join(self.buffer[:len(word)])==word
-
+
def next_word(self):
"""Return the next NEXUS word from a string.
@@ -110,22 +113,22 @@ def next_word(self):
quoted='"'
elif first in PUNCTUATION: # if it's punctuation, return immediately
return first
- while True:
+ while True:
c=self.peek()
if c==quoted: # a quote?
- word.append(self.next()) # store quote
+ word.append(self.next()) # store quote
if self.peek()==quoted: # double quote
- skip=self.next() # skip second quote
+ skip=self.next() # skip second quote
elif quoted: # second single quote ends word
break
elif quoted:
word.append(self.next()) # if quoted, then add anything
- elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop
+ elif not c or c in PUNCTUATION or c in WHITESPACE: # if not quoted and special character, stop
break
else:
word.append(self.next()) # standard character
return ''.join(word)
-
+
def rest(self):
"""Return the rest of the string without parsing."""
return ''.join(self.buffer)
@@ -135,7 +138,7 @@ class StepMatrix(object):
See Wheeler (1990), Cladistics 6:269-275.
"""
-
+
def __init__(self,symbols,gap):
self.data={}
self.symbols=[s for s in symbols]
@@ -157,7 +160,7 @@ def add(self,x,y,value):
self.data[x+y]+=value
def sum(self):
- return reduce(lambda x,y:x+y,self.data.values())
+ return reduce(lambda x,y:x+y,self.data.values())
def transformation(self):
total=self.sum()
@@ -165,7 +168,7 @@ def transformation(self):
for k in self.data:
self.data[k]=self.data[k]/float(total)
return self
-
+
def weighting(self):
for k in self.data:
if self.data[k]!=0:
@@ -192,7 +195,7 @@ def smprint(self,name='your_name_here'):
matrix+='\n'
matrix+=';\n'
return matrix
-
+
def safename(name,mrbayes=False):
"""Return a taxon identifier according to NEXUS standard.
@@ -236,15 +239,15 @@ def get_start_end(sequence, skiplist=['-','?']):
if start==length and end==-1: # empty sequence
return -1,-1
else:
- return start,end
-
+ return start,end
+
def _sort_keys_by_values(p):
- """Returns a sorted list of keys of p sorted by values of p."""
+ """Returns a sorted list of keys of p sorted by values of p."""
startpos=[(p[pn],pn) for pn in p if p[pn]]
startpos.sort()
# parenthesis added because of py3k
return (zip(*startpos))[1]
-
+
def _make_unique(l):
"""Check that all values in list are unique and return a pruned and sorted list."""
l=list(set(l))
@@ -268,14 +271,14 @@ def _compact4nexus(orig_list):
"""Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
"""
-
+
if not orig_list:
return ''
orig_list=list(set(orig_list))
orig_list.sort()
shortlist=[]
clist=orig_list[:]
- clist.append(clist[-1]+.5) # dummy value makes it easier
+ clist.append(clist[-1]+.5) # dummy value makes it easier
while len(clist)>1:
step=1
for i,x in enumerate(clist):
@@ -283,7 +286,7 @@ def _compact4nexus(orig_list):
continue
elif i==1 and len(clist)>3 and clist[i+1]-x==x-clist[0]:
# second element, and possibly at least 3 elements to link,
- # and the next one is in the right step
+ # and the next one is in the right step
step=x-clist[0]
else: # pattern broke, add all values before current position to new list
sub=clist[:i]
@@ -303,9 +306,9 @@ def combine(matrices):
combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
Character sets, character partitions and taxon sets are prefixed, readjusted
- and present in the combined matrix.
+ and present in the combined matrix.
"""
-
+
if not matrices:
return None
name=matrices[0][0]
@@ -361,7 +364,7 @@ def combine(matrices):
for (i,label) in m.charlabels.iteritems()))
combined.nchar+=m.nchar # update nchar and ntax
combined.ntax+=len(m_only)
-
+
# some prefer partitions, some charsets:
# make separate charset for each initial dataset
for c in combined.charpartitions['combined']:
@@ -371,7 +374,7 @@ def combine(matrices):
def _kill_comments_and_break_lines(text):
"""Delete []-delimited comments out of a file and break into lines separated by ';'.
-
+
stripped_text=_kill_comments_and_break_lines(text):
Nested and multiline comments are allowed. [ and ] symbols within single
or double quotes are ignored, newline ends a quote, all symbols with quotes are
@@ -380,11 +383,11 @@ def _kill_comments_and_break_lines(text):
Quotes inside special [& and [\ are treated as normal characters,
but no nesting inside these special comments allowed (like [& [\ ]]).
';' is deleted from end of line.
-
+
NOTE: this function is very slow for large files, and obsolete when using C extension cnexus
"""
contents=iter(text)
- newtext=[]
+ newtext=[]
newline=[]
quotelevel=''
speciallevel=False
@@ -408,14 +411,14 @@ def _kill_comments_and_break_lines(text):
speciallevel=True
else:
commlevel+=1
- elif not quotelevel and t==']': # closing bracket ioutside a quote
+ elif not quotelevel and t==']': # closing bracket ioutside a quote
if speciallevel:
speciallevel=False
else:
commlevel-=1
if commlevel<0:
raise NexusError('Nexus formatting error: unmatched ]')
- continue
+ continue
if commlevel==0: # copy if we're not in comment
if t==';' and not quotelevel:
newtext.append(''.join(newline))
@@ -434,10 +437,10 @@ def _adjust_lines(lines):
"""Adjust linebreaks to match ';', strip leading/trailing whitespace.
list_of_commandlines=_adjust_lines(input_text)
- Lines are adjusted so that no linebreaks occur within a commandline
+ Lines are adjusted so that no linebreaks occur within a commandline
(except matrix command line)
"""
- formatted_lines=[]
+ formatted_lines=[]
for l in lines:
#Convert line endings
l=l.replace('\r\n','\n').replace('\r','\n').strip()
@@ -448,7 +451,7 @@ def _adjust_lines(lines):
if l:
formatted_lines.append(l)
return formatted_lines
-
+
def _replace_parenthesized_ambigs(seq,rev_ambig_values):
"""Replaces ambigs in xxx(ACG)xxx format by IUPAC ambiguity code."""
@@ -465,13 +468,13 @@ def _replace_parenthesized_ambigs(seq,rev_ambig_values):
ambig_code=rev_ambig_values[ambig.upper()]
if ambig!=ambig.upper():
ambig_code=ambig_code.lower()
- seq=seq[:opening]+ambig_code+seq[closing+1:]
+ seq=seq[:opening]+ambig_code+seq[closing+1:]
opening=seq.find('(')
return seq
class Commandline(object):
"""Represent a commandline as command and options."""
-
+
def __init__(self, line, title):
self.options={}
options=[]
@@ -486,9 +489,9 @@ def __init__(self, line, title):
self.command = self.command.strip().lower()
if self.command in SPECIAL_COMMANDS: # special command that need newlines and order of options preserved
self.options=options.strip()
- else:
+ else:
if len(options) > 0:
- try:
+ try:
options = options.replace('=', ' = ').split()
valued_indices=[(n-1,n,n+1) for n in range(len(options)) if options[n]=='=' and n!=0 and n!=len((options))]
indices = []
@@ -502,7 +505,7 @@ def __init__(self, line, title):
self.options[options[token].lower()] = None
except ValueError:
raise NexusError('Incorrect formatting in line: %s' % line)
-
+
class Block(object):
"""Represent a NEXUS block with block name and list of commandlines."""
def __init__(self,title=None):
@@ -528,8 +531,8 @@ def __init__(self, input=None):
self.labels=None # left, right, no
self.transpose=False # whether matrix is transposed
self.interleave=False # whether matrix is interleaved
- self.tokens=False # unsupported
- self.eliminate=None # unsupported
+ self.tokens=False # unsupported
+ self.eliminate=None # unsupported
self.matrix=None # ...
self.unknown_blocks=[] # blocks we don't care about
self.taxsets={}
@@ -539,13 +542,13 @@ def __init__(self, input=None):
self.trees=[] # list of Trees (instances of Tree class)
self.translate=None # Dict to translate taxon <-> taxon numbers
self.structured=[] # structured input representation
- self.set={} # dict of the set command to set various options
+ self.set={} # dict of the set command to set various options
self.options={} # dict of the options command in the data block
self.codonposset=None # name of the charpartition that defines codon positions
# some defaults
self.options['gapmode']='missing'
-
+
if input:
self.read(input)
else:
@@ -560,7 +563,7 @@ def set_original_taxon_order(self,value):
original_taxon_order=property(get_original_taxon_order,set_original_taxon_order)
def read(self,input):
- """Read and parse NEXUS imput (a filename, file-handle, or string)."""
+ """Read and parse NEXUS input (a filename, file-handle, or string)."""
# 1. Assume we have the name of a file in the execution dir or a
# file-like object.
@@ -626,7 +629,7 @@ def _unknown_nexus_block(self,title, contents):
block = Block()
block.commandlines.append(contents)
block.title = title
- self.unknown_blocks.append(block)
+ self.unknown_blocks.append(block)
def _parse_nexus_block(self,title, contents):
"""Parse a known Nexus Block (PRIVATE)."""
@@ -634,14 +637,14 @@ def _parse_nexus_block(self,title, contents):
self._apply_block_structure(title, contents)
#now check for taxa,characters,data blocks. If this stuff is defined more than once
#the later occurences will override the previous ones.
- block=self.structured[-1]
+ block=self.structured[-1]
for line in block.commandlines:
try:
getattr(self,'_'+line.command)(line.options)
except AttributeError:
raise
raise NexusError('Unknown command: %s ' % line.command)
-
+
def _title(self,options):
pass
@@ -733,10 +736,10 @@ def _format(self,options):
def _set(self,options):
- self.set=options;
+ self.set=options
def _options(self,options):
- self.options=options;
+ self.options=options
def _eliminate(self,options):
self.eliminate=options
@@ -758,7 +761,7 @@ def _taxlabels(self,options):
# break
# self.taxlabels.append(taxon)
- def _check_taxlabels(self,taxon):
+ def _check_taxlabels(self,taxon):
"""Check for presence of taxon in self.taxlabels."""
# According to NEXUS standard, underscores shall be treated as spaces...,
# so checking for identity is more difficult
@@ -775,7 +778,7 @@ def _charlabels(self,options):
w=opts.next_word()
if w is None: # McClade saves and reads charlabel-lists with terminal comma?!
break
- identifier=self._resolve(w,set_type=CHARSET)
+ identifier=self._resolve(w,set_type=CHARSET)
state=quotestrip(opts.next_word())
self.charlabels[identifier]=state
# check for comma or end of command
@@ -804,8 +807,8 @@ def _matrix(self,options):
self.matrix={}
taxcount=0
first_matrix_block=True
-
- #eliminate empty lines and leading/trailing whitespace
+
+ #eliminate empty lines and leading/trailing whitespace
lines=[l.strip() for l in options.split('\n') if l.strip()!='']
lineiter=iter(lines)
while 1:
@@ -879,7 +882,7 @@ def _matrix(self,options):
#check all sequences for length according to nchar
for taxon in self.matrix:
if len(self.matrix[taxon])!=self.nchar:
- raise NexusError('Matrx Nchar %d does not match data length (%d) for taxon %s' \
+ raise NexusError('Matrix Nchar %d does not match data length (%d) for taxon %s' \
% (self.nchar, len(self.matrix[taxon]),taxon))
#check that taxlabels is identical with matrix.keys. If not, it's a problem
matrixkeys=sorted(self.matrix)
@@ -893,7 +896,7 @@ def _translate(self,options):
while True:
try:
# get id and state
- identifier=int(opts.next_word())
+ identifier=int(opts.next_word())
label=quotestrip(opts.next_word())
self.translate[identifier]=label
# check for comma or end of command
@@ -910,7 +913,7 @@ def _translate(self,options):
def _utree(self,options):
"""Some software (clustalx) uses 'utree' to denote an unrooted tree."""
self._tree(options)
-
+
def _tree(self,options):
opts=CharBuffer(options)
if opts.peek_nonwhitespace()=='*': # a star can be used to make it the default tree in some software packages
@@ -943,25 +946,25 @@ def _tree(self,options):
try:
tree.node(n).data.taxon=safename(self.translate[int(tree.node(n).data.taxon)])
except (ValueError,KeyError):
- raise NexusError('Unable to substitue %s using \'translate\' data.' \
+ raise NexusError('Unable to substitute %s using \'translate\' data.' \
% tree.node(n).data.taxon)
self.trees.append(tree)
def _apply_block_structure(self,title,lines):
block=Block('')
- block.title = title
+ block.title = title
for line in lines:
block.commandlines.append(Commandline(line, title))
self.structured.append(block)
-
+
def _taxset(self, options):
name,taxa=self._get_indices(options,set_type=TAXSET)
self.taxsets[name]=_make_unique(taxa)
-
+
def _charset(self, options):
name,sites=self._get_indices(options,set_type=CHARSET)
self.charsets[name]=_make_unique(sites)
-
+
def _taxpartition(self, options):
taxpartition={}
quotelevel=False
@@ -1002,7 +1005,7 @@ def _codonposset(self,options):
raise NexusError('Formatting Error in codonposset: %s ' % options)
else:
self.codonposset=codonname[0]
-
+
def _codeset(self,options):
pass
@@ -1066,7 +1069,7 @@ def _name_n_vector(self,opts,separator='='):
if opts.next_nonwhitespace()!=separator:
raise NexusError('Formatting error in line: %s ' % rest)
return name
-
+
def _parse_list(self,options_buffer,set_type):
"""Parse a NEXUS list (PRIVATE).
@@ -1091,7 +1094,7 @@ def _parse_list(self,options_buffer,set_type):
if options_buffer.peek_nonwhitespace()=='\\': # followed by \
backslash=options_buffer.next_nonwhitespace()
step=int(options_buffer.next_word()) # get backslash and step
- plain_list.extend(range(start,end+1,step))
+ plain_list.extend(range(start,end+1,step))
else:
if type(start)==list or type(end)==list:
raise NexusError('Name if character sets not allowed in range definition: %s'\
@@ -1110,7 +1113,7 @@ def _parse_list(self,options_buffer,set_type):
except:
return None
return plain_list
-
+
def _resolve(self,identifier,set_type=None):
"""Translate identifier in list into character/taxon index.
@@ -1220,7 +1223,7 @@ def write_nexus_data_partitions(self, matrix=None, filename=None, blocksize=None
exclude=exclude,delete=delete,comment=comment,append_sets=False,
mrbayes=mrbayes)
return fn
-
+
def write_nexus_data(self, filename=None, matrix=None, exclude=[], delete=[],\
blocksize=None, interleave=False, interleave_by_partition=False,\
comment=None,omit_NEXUS=False,append_sets=True,mrbayes=False,\
@@ -1368,12 +1371,12 @@ def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codo
if not codons_only:
for n,ns in self.charsets.iteritems():
cset=[offlist[c] for c in ns if c not in exclude]
- if cset:
- setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
+ if cset:
+ setsb.append('charset %s = %s' % (safename(n),_compact4nexus(cset)))
for n,s in self.taxsets.iteritems():
tset=[safename(t,mrbayes=mrbayes) for t in s if t not in delete]
if tset:
- setsb.append('taxset %s = %s' % (safename(n),' '.join(tset)))
+ setsb.append('taxset %s = %s' % (safename(n),' '.join(tset)))
for n,p in self.charpartitions.iteritems():
if not include_codons and n==CODONPOSITIONS:
continue
@@ -1412,9 +1415,9 @@ def append_sets(self,exclude=[],delete=[],mrbayes=False,include_codons=True,codo
return ''
else:
return ';\n'.join(setsb)
-
+
def export_fasta(self, filename=None, width=70):
- """Writes matrix into a fasta file: (self, filename=None, width=70)."""
+ """Writes matrix into a fasta file."""
if not filename:
if '.' in self.filename and self.filename.split('.')[-1].lower() in ['paup','nexus','nex','dat']:
filename='.'.join(self.filename.split('.')[:-1])+'.fas'
@@ -1424,12 +1427,12 @@ def export_fasta(self, filename=None, width=70):
for taxon in self.taxlabels:
fh.write('>'+safename(taxon)+'\n')
for i in range(0, len(str(self.matrix[taxon])), width):
- fh.write(str(self.matrix[taxon])[i:i+width] + '\n')
+ fh.write(str(self.matrix[taxon])[i:i+width] + '\n')
fh.close()
return filename
def export_phylip(self, filename=None):
- """Writes matrix into a PHYLIP file: (self, filename=None).
+ """Writes matrix into a PHYLIP file.
Note that this writes a relaxed PHYLIP format file, where the names
are not truncated, nor checked for invalid characters."""
@@ -1444,7 +1447,7 @@ def export_phylip(self, filename=None):
fh.write('%s %s\n' % (safename(taxon), str(self.matrix[taxon])))
fh.close()
return filename
-
+
def constant(self,matrix=None,delete=[],exclude=[]):
"""Return a list with all constant characters."""
if not matrix:
@@ -1455,7 +1458,7 @@ def constant(self,matrix=None,delete=[],exclude=[]):
elif len(undelete)==1:
return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
# get the first sequence and expand all ambiguous values
- constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for
+ constant=[(x,self.ambiguous_values.get(n.upper(),n.upper())) for
x,n in enumerate(str(matrix[undelete[0]])) if x not in exclude]
for taxon in undelete[1:]:
@@ -1464,7 +1467,7 @@ def constant(self,matrix=None,delete=[],exclude=[]):
#print '%d (paup=%d)' % (site[0],site[0]+1),
seqsite=matrix[taxon][site[0]].upper()
#print seqsite,'checked against',site[1],'\t',
- if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]:
+ if seqsite==self.missing or (seqsite==self.gap and self.options['gapmode'].lower()=='missing') or seqsite==site[1]:
# missing or same as before -> ok
newconstant.append(site)
elif seqsite in site[1] or site[1]==self.missing or (self.options['gapmode'].lower()=='missing' and site[1]==self.gap):
@@ -1512,7 +1515,7 @@ def weighted_stepmatrix(self,name='your_name_here',exclude=[],delete=[]):
See Wheeler (1990), Cladistics 6:269-275 and
Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196
- """
+ """
m=StepMatrix(self.unambiguous_letters,self.gap)
for site in [s for s in range(self.nchar) if s not in exclude]:
cstatus=self.cstatus(site,delete)
@@ -1543,7 +1546,7 @@ def crop_matrix(self,matrix=None, delete=[], exclude=[]):
return dict(zip(undelete,m))
else:
return dict([(t,matrix[t]) for t in self.taxlabels if t in matrix and t not in delete])
-
+
def bootstrap(self,matrix=None,delete=[],exclude=[]):
"""Return a bootstrapped matrix."""
if not matrix:
@@ -1554,7 +1557,7 @@ def bootstrap(self,matrix=None,delete=[],exclude=[]):
return {}
elif len(cm[cm.keys()[0]])==0: # everything excluded?
return cm
- undelete=[t for t in self.taxlabels if t in cm]
+ undelete=[t for t in self.taxlabels if t in cm]
if seqobjects:
sitesm=zip(*[str(cm[t]) for t in undelete])
alphabet=matrix[matrix.keys()[0]].alphabet
@@ -1564,11 +1567,11 @@ def bootstrap(self,matrix=None,delete=[],exclude=[]):
bootstrapseqs=map(''.join,zip(*bootstrapsitesm))
if seqobjects:
bootstrapseqs=[Seq(s,alphabet) for s in bootstrapseqs]
- return dict(zip(undelete,bootstrapseqs))
+ return dict(zip(undelete,bootstrapseqs))
def add_sequence(self,name,sequence):
"""Adds a sequence (string) to the matrix."""
-
+
if not name:
raise NexusError('New sequence must have a name')
@@ -1593,16 +1596,16 @@ def add_sequence(self,name,sequence):
def insert_gap(self,pos,n=1,leftgreedy=False):
"""Add a gap into the matrix and adjust charsets and partitions.
-
+
pos=0: first position
pos=nchar: last position
"""
def _adjust(set,x,d,leftgreedy=False):
- """Adjusts chartacter sets if gaps are inserted, taking care of
- new gaps within a coherent character set."""
+ """Adjusts character sets if gaps are inserted, taking care of
+ new gaps within a coherent character set."""
# if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15
- # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18
+ # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18
# but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18
set.sort()
addpos=0