Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Removing deprecated Bio.Enzyme and Bio.Prosite modules

  • Loading branch information...
commit dcbdd45539b2eb5c4cb13f313faafc571b4924af 1 parent 2da96bf
@peterjc peterjc authored
View
256 Bio/Enzyme/__init__.py
@@ -1,256 +0,0 @@
# Copyright 1999 by Jeffrey Chang. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.

"""Module to work with enzyme.dat file (DEPRECATED).

This module provides code to work with the enzyme.dat file from
Enzyme (OBSOLETE as of Biopython version 1.50).
http://www.expasy.ch/enzyme/

The functionality of Bio.Enzyme has moved to Bio.ExPASy.Enzyme;
please use that module instead of Bio.Enzyme. Bio.Enzyme is now
deprecated and will be removed in a future release of Biopython.
"""

import warnings
import Bio
# Emit the deprecation warning at import time, before any of the legacy
# parser classes below can be used.
warnings.warn("Bio.Enzyme is deprecated, and will be removed in a"\
              " future release of Biopython. Most of the functionality "
              " is now provided by Bio.ExPASy.Enzyme. If you want to "
              " continue to use Bio.Enzyme, please get in contact "
              " via the mailing lists to avoid its permanent removal from"\
              " Biopython.", Bio.BiopythonDeprecationWarning)

from Bio import File
from Bio.ParserSupport import *
-
class _Scanner:
    """Scans Enzyme data (PRIVATE).

    Emits one event per recognized line type to a consumer object.

    Tested with:
    Release 33
    """

    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in Enzyme data for scanning. handle is a file-like object
        that contains keyword information. consumer is a Consumer
        object that will receive events as the report is scanned.

        """
        # Wrap plain handles in an UndoHandle so lines can be peeked
        # at and pushed back during lookahead.
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        # A blank line (or EOF) at the top of the loop means there is
        # nothing left to scan.
        while not is_blank_line(uhandle.peekline()):   # Am I done yet?
            self._scan_record(uhandle, consumer)

    def _scan_record(self, uhandle, consumer):
        # The first record is just copyright information embedded in
        # comments. Check to see if I'm at the first record. If so,
        # then just scan the comments and the terminator.
        consumer.start_record()
        line = uhandle.peekline()
        if line[:2] == 'CC':
            self._scan_cc(uhandle, consumer)
            self._scan_terminator(uhandle, consumer)
        else:
            # A real entry: scan each line type in its fixed order.
            for fn in self._scan_fns:
                fn(self, uhandle, consumer)
        consumer.end_record()

    def _scan_line(self, line_type, uhandle, event_fn,
                   exactly_one=None, one_or_more=None, any_number=None,
                   up_to_one=None):
        # Callers must set exactly one of exactly_one, one_or_more, or
        # any_number to a true value. I do not explicitly check to
        # make sure this function is called correctly.

        # This does not guarantee any parameter safety, but I
        # like the readability. The other strategy I tried was have
        # parameters min_lines, max_lines.

        if exactly_one or one_or_more:
            # The first occurrence is mandatory.
            read_and_call(uhandle, event_fn, start=line_type)
        if one_or_more or any_number:
            # Consume as many further occurrences as are present.
            while 1:
                if not attempt_read_and_call(uhandle, event_fn,
                                             start=line_type):
                    break
        if up_to_one:
            attempt_read_and_call(uhandle, event_fn, start=line_type)

    def _scan_id(self, uhandle, consumer):
        self._scan_line('ID', uhandle, consumer.identification, exactly_one=1)

    def _scan_de(self, uhandle, consumer):
        self._scan_line('DE', uhandle, consumer.description, one_or_more=1)

    def _scan_an(self, uhandle, consumer):
        self._scan_line('AN', uhandle, consumer.alternate_name, any_number=1)

    def _scan_ca(self, uhandle, consumer):
        self._scan_line('CA', uhandle, consumer.catalytic_activity,
                        any_number=1)

    def _scan_cf(self, uhandle, consumer):
        self._scan_line('CF', uhandle, consumer.cofactor, any_number=1)

    def _scan_cc(self, uhandle, consumer):
        self._scan_line('CC', uhandle, consumer.comment, any_number=1)

    def _scan_di(self, uhandle, consumer):
        self._scan_line('DI', uhandle, consumer.disease, any_number=1)

    def _scan_pr(self, uhandle, consumer):
        self._scan_line('PR', uhandle, consumer.prosite_reference,
                        any_number=1)

    def _scan_dr(self, uhandle, consumer):
        self._scan_line('DR', uhandle, consumer.databank_reference,
                        any_number=1)

    def _scan_terminator(self, uhandle, consumer):
        self._scan_line('//', uhandle, consumer.terminator, exactly_one=1)

    # Line-type scanners, in the order the lines appear in an entry.
    _scan_fns = [
        _scan_id,
        _scan_de,
        _scan_an,
        _scan_ca,
        _scan_cf,
        _scan_cc,
        _scan_di,
        _scan_pr,
        _scan_dr,
        _scan_terminator
        ]
class DataRecord:
    """A single databank cross-reference: a TrEMBL/SwissProt code pair."""

    def __init__(self, tr_code='', sw_code=''):
        self.tr_code = tr_code
        self.sw_code = sw_code

    def __str__(self):
        # Render as "TR, SW", matching the layout of a DR line entry.
        return "%s, %s" % (self.tr_code, self.sw_code)
-
class EnzymeRecord:
    """In-memory representation of one ENZYME entry.

    Attribute names mirror the two-letter line codes of enzyme.dat.
    """

    def __init__(self):
        self.ID = ''   # EC number
        self.DE = []   # description lines
        self.AN = []   # alternate names
        self.CA = ''   # catalytic activity (lines concatenated)
        self.CF = []   # cofactors
        self.CC = []   # one comment per line
        self.DI = []   # disease references
        self.PR = []   # Prosite references
        self.DR = []   # databank references (DataRecord objects)

    def __repr__(self):
        cls_name = self.__class__.__name__
        if not self.ID:
            return "%s ( )" % (cls_name)
        if self.DE:
            return "%s (%s, %s)" % (cls_name, self.ID, self.DE[0])
        return "%s (%s)" % (cls_name, self.ID)

    def __str__(self):
        # One long line summarising every field.
        parts = [
            "ID: " + self.ID,
            " DE: " + repr(self.DE),
            " AN: " + repr(self.AN),
            " CA: '" + self.CA + "'",
            " CF: " + repr(self.CF),
            " CC: " + repr(self.CC),
            " DI: " + repr(self.DI),
            " PR: " + repr(self.PR),
            " DR: %d Records" % len(self.DR),
        ]
        return "".join(parts)
-
class RecordParser(AbstractParser):
    """Parses a single ENZYME entry into an EnzymeRecord object."""

    def __init__(self):
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        """Parse one record from handle and return the EnzymeRecord."""
        # The scanner needs an UndoHandle; wrap plain handles.
        if not isinstance(handle, File.UndoHandle):
            handle = File.UndoHandle(handle)
        self._scanner.feed(handle, self._consumer)
        return self._consumer.enzyme_record
-
class Iterator:
    """Iterate over the records of an enzyme.dat style file."""

    def __init__(self, handle, parser=None):
        # NOTE(review): the ``parser`` argument is accepted but never
        # stored or used -- next() below unconditionally creates its own
        # RecordParser, so records are always returned parsed and the
        # raw-data branch in next() is unreachable.  Confirm whether
        # parser=None was ever meant to yield raw strings before fixing.
        self._uhandle = File.UndoHandle(handle)

    def next(self):
        """Return the next parsed record, or None at end of file."""
        self._parser = RecordParser()
        lines = []
        # Collect lines up to (but not including) the "//" terminator.
        while True:
            line = self._uhandle.readline()
            if not line: break
            if line[:2] == '//':
                break
            lines.append(line)
        if not lines:
            return None
        # Re-append a terminator so the chunk parses as a full entry.
        lines.append('//')
        data = ''.join(lines)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __iter__(self):
        # iter(callable, sentinel): call next() until it returns None.
        return iter(self.next, None)
-
class _RecordConsumer(AbstractConsumer):
    """Builds an EnzymeRecord from scanner events (PRIVATE)."""

    def __init__(self):
        self.enzyme_record = EnzymeRecord()
    def identification(self, id_info):
        # e.g. "ID   1.1.1.1" -> keep the second whitespace token.
        self.enzyme_record.ID = id_info.split()[1]
    def description(self,de_info):
        # Strip the two-letter line code, keep the rest of the line.
        self.enzyme_record.DE.append(de_info[2:].strip())
    def alternate_name(self,an_info):
        self.enzyme_record.AN.append(an_info[2:].strip())
    def catalytic_activity(self, ca_info):
        # CA lines are concatenated (no separator) into one string.
        self.enzyme_record.CA = ''.join([self.enzyme_record.CA, ca_info[2:].strip()])
    def cofactor(self, cf_info):
        self.enzyme_record.CF.append(cf_info[2:].strip())
    def comment(self, cc_info):
        cc = cc_info[2:].strip()
        if cc.startswith("-!-"):
            # "-!-" marks the start of a new comment.
            self.enzyme_record.CC.append(cc[len("-!-"):].strip())
        else:
            # The header is all CC, but doesn't start with -!-
            # Continuation line: fold it into the previous comment.
            if self.enzyme_record.CC:
                pre_cc = self.enzyme_record.CC.pop()
            else:
                pre_cc = ""
            new_cc = pre_cc + " " + cc
            self.enzyme_record.CC.append(new_cc)
    def disease(self, di_info):
        self.enzyme_record.DI.append(di_info[2:].strip())

    def prosite_reference(self,pr_info):
        # Keep the first ";"-separated field after the line code
        # (presumably the PROSITE accession -- confirm against data).
        self.enzyme_record.PR.append(pr_info.split(';')[1].strip())

    def databank_reference(self,dr_info):
        # DR lines hold ";"-separated "tr_code, sw_code" pairs.
        good_data = dr_info[2:].strip()
        pair_data = good_data.split(';')
        for pair in pair_data:
            if not pair: continue
            data_record = DataRecord()
            t1, t2 = pair.split(',')
            data_record.tr_code, data_record.sw_code = \
                t1.strip(), t2.strip()
            self.enzyme_record.DR.append(data_record)

    def terminator(self,schwarzenegger):
        pass # Hasta la Vista, baby!
View
494 Bio/Prosite/Pattern.py
@@ -1,494 +0,0 @@
-# Copyright 2000 by Andrew Dalke. All rights reserved.
-# This code is part of the Biopython distribution and governed by its
-# license. Please see the LICENSE file that should have been included
-# as part of this package.
-
-# The Prosite patterns are defined at http://www.expasy.ch/txt/prosuser.txt
-#
-# The PA (PAttern) lines contains the definition of a PROSITE pattern. The
-# patterns are described using the following conventions:
-#
-# - The standard IUPAC one-letter codes for the amino acids are used.
-# - The symbol `x' is used for a position where any amino acid is accepted.
-# - Ambiguities are indicated by listing the acceptable amino acids for a
-# given position, between square parentheses `[ ]'. For example: [ALT]
-# stands for Ala or Leu or Thr.
-# - Ambiguities are also indicated by listing between a pair of curly
-# brackets `{ }' the amino acids that are not accepted at a given
-# position. For example: {AM} stands for any amino acid except Ala and
-# Met.
-# - Each element in a pattern is separated from its neighbor by a `-'.
-# - Repetition of an element of the pattern can be indicated by following
-# that element with a numerical value or a numerical range between
-# parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to
-# x-x or x-x-x or x-x-x-x.
-# - When a pattern is restricted to either the N- or C-terminal of a
-# sequence, that pattern either starts with a `<' symbol or respectively
-# ends with a `>' symbol.
-# - A period ends the pattern.
-#
-# That boils down to doing these conversions
-#
-# [] -> []
-# {} -> [^ ]
-# - ->
-# () -> {}
-# < -> ^
-# > -> $
-# x->X
-# . ->
-
-# Note:
-# [G>] is a valid Prosite pattern, equivalent to "([G]|$)"
-
-# I assume then that
-# [>G] is equivalent to "(^|[G])"
-# It is conceivable that [G>]-G-G is valid, meaning a "G" at the end
-# of the sequence or followed by two more Gs. I did not implement
-# this. I haven't gotten an answer to my query on either of these two
-# non-documented possibilities.
-
import string, re
from Bio import Seq, Alphabet


# Syntactic conversion to two types of regular expressions

# Character translation table for converting a Prosite pattern into
# regexp syntax: a-z -> A-Z (except "x" -> "."), "X" -> ".",
# "}" -> "]", "(" -> "{", ")" -> "}", "<" -> "^", ">" -> "$".
# The "-" separators and final "." are removed via the deletechars
# argument of str.translate (Python 2), not by this table.
_prosite_trans = string.maketrans("abcdefghijklmnopqrstuvwxyzX}()<>",
                                  "ABCDEFGHIJKLMNOPQRSTUVW.YZ.]{}^$")
-
# This does not verify that the pattern is correct - invalid patterns
# can be converted!
def prosite_to_re(pattern):
    """convert a valid Prosite pattern into an re string"""
    # Remember whether the pattern starts with the special "[<" form
    # before the translation below destroys that information.
    flg = (pattern[:2] == "[<")
    # "{...}" (excluded residues) becomes a negated character class.
    s = pattern.replace("{", "[^")
    # Python 2 str.translate: map characters through _prosite_trans
    # and delete every "-" (separator) and "." (terminator).
    s = s.translate(_prosite_trans, "-.")
    # special case "[<" and ">]", if they exist
    if flg:
        i = s.index("]")
        s = "(?:^|[" + s[2:i] + "])" + s[i+1:]
    if s[-2:] == "$]":
        # "[G>]" style ending: the letters, or end-of-sequence.
        i = s.rindex("[")
        s = s[:i] + "(?:" + s[i:-2] + "]|$)"
    elif s[-3:] == "$]$":
        # Same, but the pattern itself was also anchored with ">".
        i = s.rindex("[")
        s = s[:i] + "(?:" + s[i:-3] + "]|$)$"
    return s
-
-
# This does not verify the pattern is correct - invalid patterns can
# be converted!
def prosite_to_grouped_re(pattern):
    """convert a valid Prosite pattern into an re with groups for each term"""
    flg = (pattern[:2] == "[<")
    s = pattern.replace("{", "[^")
    # Don't delete the "-" characters: use them to place the ()s
    s = s.translate(_prosite_trans, ".")

    # Get the [< and >] terms correct
    if flg:
        i = s.index("]")
        s = "(?:^|[" + s[2:i] + "])" + s[i+1:]
    if s[-2:] == "$]":
        i = s.rindex("[")
        s = s[:i] + "(?:" + s[i:-2] + "]|$)"
    if s[-3:] == "$]$":
        i = s.rindex("[")
        s = s[:i] + "(?:" + s[i:-3] + "]|$)$"

    # Watch out for unescaped < and > terms
    if s[:1] == "^":
        s = "^(" + s[1:]
    else:
        s = "(" + s
    if s[-1:] == "$":
        s = s[:-1] + ")$"
    else:
        s = s + ")"

    # Each remaining "-" becomes a group boundary, giving one capture
    # group per Prosite term.
    return s.replace("-", ")(")
-
-
-
# Both the Prosite pattern and match result act like sequences.
class PrositeAlphabet(Alphabet.Alphabet):
    """Alphabet tag for Seq-like Prosite patterns and matches."""
    pass
# Shared singleton instance used as the class-level alphabet below.
prosite_alphabet = PrositeAlphabet()
-
def compile(pattern):
    """Compile a Prosite pattern string into a Prosite object.

    Raises TypeError if the pattern is not syntactically valid.
    (NOTE: shadows the builtin ``compile`` within this module, in the
    style of ``re.compile``.)
    """
    if verify_pattern(pattern):
        return Prosite(pattern=pattern)
    raise TypeError("not a legal prosite pattern")
-
class Prosite:
    """A compiled Prosite pattern (sequence-like; one item per term).

    Constructed either from a pattern string (assumed already verified)
    or from a list of PrositeTerm objects; the other representation and
    the compiled regexps are built lazily on first attribute access.
    """
    alphabet = prosite_alphabet

    # Don't like having two different types of input - not very pythonic
    # However, it is faster since I can assume the input has already been
    # verified (if it's a pattern).
    def __init__(self, pattern = None, data = None):
        assert (pattern is None and data is not None) ^ \
               (pattern is not None and data is None), \
               "one and only one of pattern and data can have a value"
        if pattern is not None:
            self.pattern = pattern
        if data is not None:
            self.data = data

    def __repr__(self):
        return "Prosite(%s)" % repr(str(self))
    def __str__(self):
        # Terms joined by "-", closed by the mandatory final ".".
        return '-'.join(map(str, self.data)) + "."
    def __len__(self): return len(self.data)
    def __getitem__(self, i): return self.data[i]
    def __getslice__(self, i, j):
        # Python 2 slice protocol; a slice is itself a Prosite.
        i = max(i, 0); j = max(j, 0)
        return Prosite(data = self.data[i:j])
    def __getattr__(self, name):
        # Lazy creation of these elements / cache results
        # (__getattr__ only fires for attributes not yet set, so each
        # value is computed at most once per instance).
        if name == "re":
            self.re = re.compile(prosite_to_re(self.pattern))
            return self.re
        elif name == "grouped_re":
            self.grouped_re = re.compile(prosite_to_grouped_re(self.pattern))
            return self.grouped_re
        elif name == "data":
            self.data = find_terms(self.pattern)
            return self.data
        elif name == "pattern":
            self.pattern = str(self)
            return self.pattern
        raise AttributeError(name)

    def tostring(self):
        return str(self)

    def search(self, seq, pos=0, endpos=None):
        """Search seq for this pattern; return a PrositeMatch or None."""
        # buffer()/tostring() are Python 2 era Seq APIs.
        if endpos is not None:
            m = self.grouped_re.search(buffer(seq.tostring()), pos, endpos)
        else:
            m = self.grouped_re.search(buffer(seq.tostring()), pos)
        if m is None:
            return None
        return PrositeMatch(self, seq, m)
    def match(self, seq, pos=0, endpos=None):
        """Match the pattern at the start of seq; PrositeMatch or None."""
        if endpos is not None:
            m = self.grouped_re.match(buffer(seq.tostring()), pos, endpos)
        else:
            m = self.grouped_re.match(buffer(seq.tostring()), pos)
        if m is None:
            return None
        return PrositeMatch(self, seq, m)

    # I was thinking about adding sub, subn, findall, etc., but either
    # you just want the string (in which case, use the ".re") or
    # you could be changing to a different alphabet (eg, T->U).
-
-
-# Elements of a Prosite pattern
class PrositeTerm:
    """One element of a Prosite pattern, e.g. "[AP](2,3)" or "<x"."""

    def __init__(self, letters, ignore, is_begin, is_end,
                 min_count, max_count, can_begin, can_end):
        self.letters = letters        # amino-acid letters for this term
        self.ignore = ignore          # true for "{..}" (excluded letters)
        self.is_begin = is_begin      # term was prefixed with "<"
        self.is_end = is_end          # term was suffixed with ">"
        self.min_count = min_count    # minimum repeat count
        self.max_count = max_count    # maximum repeat count
        self.can_begin = can_begin    # "[<...]" form (may match start)
        self.can_end = can_end        # "[...>]" form (may match end)

    def copy(self):
        """Return a new PrositeTerm with identical field values."""
        return PrositeTerm(self.letters, self.ignore, self.is_begin,
                           self.is_end, self.min_count, self.max_count,
                           self.can_begin, self.can_end)

    def __str__(self):
        """Render the term back into Prosite syntax, repeat count included."""
        text = self.base_str()
        lo, hi = self.min_count, self.max_count
        if lo != hi:
            text = text + "(%d,%d)" % (lo, hi)
        elif lo != 1:
            text = text + "(%d)" % lo
        if self.is_end:
            text = text + ">"
        return text

    def base_str(self):
        """Render the term without its repeat count field."""
        pieces = []
        if self.is_begin:
            pieces.append("<")
        if self.ignore:
            pieces.append("{" + self.letters + "}")
        elif len(self.letters) == 1 and not (self.can_begin or self.can_end):
            # A lone residue letter needs no brackets.
            pieces.append(self.letters)
        else:
            pieces.append("[")
            if self.can_begin:
                pieces.append("<")
            pieces.append(self.letters)
            if self.can_end:
                pieces.append(">")
            pieces.append("]")
        return "".join(pieces)
-
-# Results of a Prosite match. Wrapper to the re.MatchObj, but returns
-# Seq objects instead of strings. And lookee - it implements the Seq
-# interface too!
class PrositeMatch:
    """Result of matching a Prosite pattern against a sequence.

    Wraps an re match object but returns Seq objects instead of plain
    strings, and implements enough of the (old) Seq interface --
    data/alphabet attributes, len/indexing/slicing -- to be used as one.
    """
    def __init__(self, prosite, seq, match):
        self.prosite = prosite     # the Prosite pattern that matched
        self.seq = seq             # the sequence that was searched
        self.match = match         # the underlying re match object
        self.pos = match.pos
        # BUG FIX: was ``match.pos``; the end of the search region is
        # ``match.endpos``.
        self.endpos = match.endpos

        # for Seq.Seq initialization
        self.data = match.group(0)
        self.alphabet = seq.alphabet

    def __repr__(self):
        # XXX this isn't the right way
        return "<PrositeMatch instance at %x>" % id(self)
    def __str__(self):
        return str(self.data)
    def __len__(self): return len(self.data)
    def __getitem__(self, i): return self.data[i]
    def __getslice__(self, i, j):
        # Python 2 slice protocol; slices come back as Seq objects.
        i = max(i, 0); j = max(j, 0)
        return Seq.Seq(self.data[i:j], self.alphabet)

    def mapping(self):
        """return a list of numbers mapping to items of the original pattern

        For example, if the Prosite pattern is "[AP](2)-D." matched against
        "PAD", then the mapping is [1, 1, 2], meaning the first character
        of the match ("P") is from the first Prosite group ("[AP]"), as
        is the second letter ("A").  The 3rd letter ("D") is mapped to
        group 2 of the pattern.
        """
        vals = []
        i = 0
        start = self.start(0)
        try:
            # Walk the capture groups; emit index ``i`` once for every
            # matched character covered by group i+1.
            while 1:
                end = self.match.end(i+1)
                while start < end:
                    vals.append(i)
                    start = start + 1
                i = i + 1
        except IndexError:
            # Ran past the last group -- done.
            pass
        return vals

    def mapped_pattern(self):
        """returns the specific Prosite pattern used to find this sequence

        >>> p = Prosite.compile("[AP](2,3)-D.")
        >>> m = p.search(Seq.Seq("PAD"))
        >>> mapping = m.mapping()
        >>> mapped = m.mapped_pattern()
        >>> print str(m[1]), str(p[mapping[1]]), str(mapped[1])
        P [AP](2,3) [AP]
        >>> print str(mapped)
        [AP]-[AP]-D.
        >>>

        Note that the original term includes the count, while the
        mapped pattern does the expansion.

        """
        return pattern_mapping(self.prosite, self.mapping())

    def start(self, g=0):
        return self.match.start(g)
    def end(self, g=0):
        return self.match.end(g)
    def span(self, g=0):
        # Default of 0 added for consistency with start()/end();
        # explicit group arguments behave exactly as before.
        return self.match.span(g)
    def groups(self, default=None):
        result = []
        alphabet = self.alphabet
        for g in self.match.groups(default):
            result.append( Seq.Seq(g, alphabet) )
        return tuple(result)
    def group(self, *groups):
        result = self.match.group(*groups)
        # BUG FIX: re's group() returns a single string when zero or one
        # group is requested and a tuple when several are.  The old code
        # tested ``len(result) == 1``, which split any multi-character
        # single-group result into a tuple of one-character Seqs.
        if isinstance(result, tuple):
            retval = []
            for x in result:
                retval.append(Seq.Seq(x, self.alphabet))
            return tuple(retval)
        return Seq.Seq(result, self.alphabet)
-
def pattern_mapping(prosite, mapping):
    """Expand *prosite* along *mapping* into a new Prosite pattern.

    ``mapping`` is a list of term indices (as produced by
    PrositeMatch.mapping()); each referenced term is copied with its
    repeat count collapsed to exactly 1.
    """
    expanded = []
    for index in mapping:
        term = prosite[index].copy()
        term.min_count = term.max_count = 1
        expanded.append(term)
    return Prosite(data=expanded)
-
# Matches one "-"-separated term of a Prosite pattern (anchored at $;
# find_terms strips any leading "<" / trailing ">" first).  Groups:
#   1: single residue letter          2: "<" inside "[" (can_begin)
#   3: letters inside "[]"            4: ">" inside "]" (can_end)
#   5: letters inside "{}"            6: minimum repeat count
#   7: ",max" part of the count (comma included)
prosite_term_re = re.compile(r"""
(?:
 ([ABCDEFGHIKLMNPQRSTVWXYZx])| # a character OR
 \[(<?)([ABCDEFGHIKLMNPQRSTVWXYZ]+)(>?)\]| # something in []s OR
 \{([ABCDEFGHIKLMNPQRSTVWXYZ]+)\} # something in {}s
)(?:\((\d+)(,\d+)?\))? # optional count of the form "(i,j)", ",j" optional
$
""", re.VERBOSE)
-
# This does not verify the pattern is correct - invalid patterns can
# be converted!
def find_terms(pattern):
    """Parse a Prosite pattern string into a list of PrositeTerm objects.

    Raises TypeError when a term cannot be parsed.  This is not a full
    validity check -- see verify_pattern for that.
    """
    if pattern[-1:] != ".":
        raise TypeError("not a prosite pattern - needs a final '.'")
    pattern = pattern[:-1]
    terms = pattern.split("-")
    result = []
    i = 0
    for term in terms:
        can_begin = can_end = 0
        # Starts with a "<"?
        if term[:1] == "<":
            term = term[1:]
            is_begin = 1
        else:
            is_begin = 0

        # Ends with a ">"?
        if term[-1:] == ">":
            term = term[:-1]
            is_end = 1
        else:
            is_end = 0

        match = prosite_term_re.match(term)
        if match is None:
            raise TypeError("not a Prosite term (%s)" % repr(term))
        if match.group(1) is not None:
            # Single letter
            ignore = 0
            letters = match.group(1)
        elif match.group(3) is not None:
            # Letters inside of "[]"s
            ignore = 0
            letters = match.group(3)
            if match.group(2):
                # "[<...]": may match start of sequence, but only legal
                # as the very first term.
                can_begin = 1
                if i != 0:
                    raise TypeError("[<] only allowed for first term (%s)" \
                                    % repr(term))

            if match.group(4):
                # "[...>]": may match end of sequence, last term only.
                can_end = 1
                if i != len(terms) - 1:
                    raise TypeError("[>] only allowed for last term (%s)" \
                                    % repr(term))

        elif match.group(5) is not None:
            # Letters inside of "{}"s
            ignore = 1
            letters = match.group(5)
        else:
            raise TypeError("not a prosite term (%s)" % repr(term))

        if match.group(6) is not None:
            # there is a minimum number
            min_count = int(match.group(6))
        else:
            # no min, so it's 1
            min_count = 1
        if match.group(7) is not None:
            # there is a maximum number (strip the leading comma)
            max_count = int(match.group(7)[1:])
        else:
            # no max specified, so use the same as the min
            max_count = min_count

        result.append(PrositeTerm(letters, ignore, is_begin,
                                  is_end, min_count, max_count,
                                  can_begin, can_end))

        i = i + 1
    return result
-
-
-
-
# Matches an entire Prosite pattern string; used by verify_pattern().
# Note that only the first alternative permits the "[<...]" form, in
# line with "[<" being legal only at the start of a pattern.
prosite_re = re.compile(r"""
^<? # starts with an optional "<"
(
 [ABCDEFGHIKLMNPQRSTVWXYZx]| # a character OR
 (\[<?[ABCDEFGHIKLMNPQRSTVWXYZ]+>?\])| # something in []s OR
 \{[ABCDEFGHIKLMNPQRSTVWXYZ]+\} # something in {}s
)(\(\d+(,\d+)?\))? # optional count of the form "(i,j)" (",j" is optional)
(- # new terms seperated by a '-'
 (
 [ABCDEFGHIKLMNPQRSTVWXYZx]| # a character OR
 \[[ABCDEFGHIKLMNPQRSTVWXYZ]+>?\]| # something in []s OR
 \{[ABCDEFGHIKLMNPQRSTVWXYZ]+\} # something in {}s
 )(\(\d+(,\d+)?\))? # optional count
)* # repeat until done
>? # pattern ends with an optional ">"
\.$ # description ends with a required "."
""", re.VERBOSE)
-
# This verifies the pattern is correct.
def verify_pattern(pattern):
    """returns 1 if the Prosite pattern is syntactically correct, else 0"""
    if prosite_re.match(pattern) is None:
        return 0
    # "[<" is only legal at the very beginning of the pattern, and
    # ">]" only immediately before the final ".".
    misplaced_begin = pattern.find("[<", 1) != -1
    misplaced_end = pattern.find(">]", 0, len(pattern) - 2) != -1
    if misplaced_begin or misplaced_end:
        return 0
    return 1
-
def _verify_test(infile):
    """verify the patterns from a Prosite file handle"""
    # Developer smoke test (Python 2 syntax): accumulate the PA lines
    # of each entry, then round-trip every complete pattern through
    # compile() and both regexp converters, printing any mismatch.
    pattern = ""
    while 1:
        line = infile.readline()
        if not line:
            break
        if line[:2] != "PA":
            continue

        # PA lines carry the pattern text from column 6 onward.
        pattern = pattern + line[5:-1]
        if line[-2] == ".":
            # A "." just before the newline terminates the pattern.
            try:
                print "*" * 60
                print pattern
                p = compile(pattern)
                print prosite_to_re(pattern)
                print repr(p.re)
                print prosite_to_grouped_re(pattern)
                print repr(p.grouped_re)
                terms = str(p)
                if terms != pattern:
                    print "DIFFER", terms, pattern
            except TypeError, msg:
                print "PROBLEM", pattern, msg
            pattern = ""
-
-# Commented out by jchang 4/13/00.
-# Specific to Andrew's test environment.
-#if __name__ == "__main__":
-# import os
-# infile = os.popen("bzcat /home/dalke/ftps/prosite/prosite.dat.bz2 | grep ^PA")
-# _verify_test(infile)
-
View
334 Bio/Prosite/Prodoc.py
@@ -1,334 +0,0 @@
# Copyright 2000 by Jeffrey Chang. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.

"""
This module is OBSOLETE.
Most of the functionality in this module has moved to Bio.ExPASy.Prodoc;
please see

Bio.ExPASy.Prodoc.read To read a Prodoc file containing one entry.
Bio.ExPASy.Prodoc.parse Iterates over entries in a Prodoc file.
Bio.ExPASy.Prodoc.Record Holds Prodoc data.
Bio.ExPASy.Prodoc.Reference Holds data from a Prodoc reference.

The other functions and classes in Bio.Prosite.Prodoc (including
Bio.Prosite.Prodoc.index_file and Bio.Prosite.Prodoc.Dictionary) are
considered deprecated, and were not moved to Bio.ExPASy.Prodoc. If you use
this functionality, please contact the Biopython developers at
biopython-dev@biopython.org to avoid permanent removal of this module from
Biopython.




This module provides code to work with the prosite.doc file from
Prosite, available at http://www.expasy.ch/prosite/.

Tested with:
Release 15.0, July 1998
Release 16.0, July 1999
Release 20.22, 13 November 2007


Functions:
parse Iterates over entries in a Prodoc file.
index_file Index a Prodoc file for a Dictionary.
_extract_record Extract Prodoc data from a web page.


Classes:
Record Holds Prodoc data.
Reference Holds data from a Prodoc reference.
Dictionary Accesses a Prodoc file using a dictionary interface.
RecordParser Parses a Prodoc record into a Record object.

_Scanner Scans Prodoc-formatted data.
_RecordConsumer Consumes Prodoc data to a Record object.
"""

import warnings
# Obsolescence notice raised at import time (PendingDeprecationWarning,
# one step milder than the DeprecationWarning used by Bio.Enzyme).
warnings.warn("This module is OBSOLETE. Most of the functionality in this module has moved to Bio.ExPASy.Prodoc.", PendingDeprecationWarning)

# NOTE(review): ``types``, ``os`` and ``sgmllib`` appear unused in this
# module (os is re-imported locally in index_file) -- candidates for
# removal; confirm before deleting.
from types import *
import os
import sgmllib
from Bio import File
from Bio import Index
from Bio.ParserSupport import *
-
def parse(handle):
    """Iterate over a Prodoc file, yielding one Record per entry."""
    import cStringIO
    parser = RecordParser()
    buffered = ""
    for line in handle:
        buffered += line
        if line[:5] == '{END}':
            # Record terminator reached: parse everything accumulated.
            record = parser.parse(cStringIO.StringIO(buffered))
            buffered = ""
            yield record
-
def read(handle):
    """Read exactly one Prodoc record from handle.

    Raises ValueError if the handle contains more than one record.
    """
    record = RecordParser().parse(handle)
    # We should have consumed the whole file; anything left over means
    # there was more than one record.
    if handle.read():
        raise ValueError("More than one Prodoc record found")
    return record
-
-
-# It may be a good idea to rewrite read(), parse() at some point to avoid
-# using the old-style "parser = RecordParser(); parser.parse(handle)" approach.
-
class Record:
    """Holds information from a Prodoc record.

    Members:
    accession     Accession number of the record.
    prosite_refs  List of tuples (prosite accession, prosite name).
    text          Free format text.
    references    List of Reference objects.
    """

    def __init__(self):
        # Each instance gets its own (empty) containers.
        self.accession = ''
        self.text = ''
        self.prosite_refs = []
        self.references = []
-
class Reference:
    """Holds information from a Prodoc citation.

    Members:
    number    Number of the reference. (string)
    authors   Names of the authors.
    citation  Describes the citation.
    """

    def __init__(self):
        # All fields start empty and are filled in by the consumer as
        # reference lines are scanned.
        self.number = ''
        self.authors = ''
        self.citation = ''
-
class Dictionary:
    """Accesses a Prodoc file using a dictionary interface.

    """
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a Prodoc Dictionary.  indexname is the name of the
        index for the dictionary.  The index should have been created
        using the index_file function.  parser is an optional Parser
        object to change the results into another form.  If set to None,
        then the raw contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        # The indexed data file's name is stored in the index itself.
        self._handle = open(self._index[Dictionary.__filename_key])
        self._parser = parser

    def __len__(self):
        return len(self._index)

    def __getitem__(self, key):
        # Each index entry is a (start offset, length) pair into the
        # data file.  (Renamed the second local from "len", which
        # shadowed the builtin.)
        start, length = self._index[key]
        self._handle.seek(start)
        data = self._handle.read(length)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __getattr__(self, name):
        # Delegate any other attribute (e.g. keys()) to the index.
        return getattr(self._index, name)
-
class RecordParser(AbstractParser):
    """Parses Prodoc data into a Record object.

    """
    def __init__(self):
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        # Feed the scanner; the consumer accumulates the Record.
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data
-
class _Scanner:
    """Scans Prodoc-formatted data.

    Tested with:
    Release 15.0, July 1998

    """
    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in Prodoc data for scanning.  handle is a file-like
        object that contains prosite data.  consumer is a
        Consumer object that will receive events as the report is scanned.

        """
        # Wrap plain handles so lines can be pushed back (saveline).
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        while 1:
            line = uhandle.peekline()
            if not line:
                # End of file.
                break
            elif is_blank_line(line):
                # Skip blank lines between records
                uhandle.readline()
                continue
            else:
                self._scan_record(uhandle, consumer)

    def _scan_record(self, uhandle, consumer):
        # Record layout: {PDOCnnnnn}, zero or more {PS...} lines,
        # {BEGIN}, free text, references, optional copyright box, {END}.
        consumer.start_record()

        self._scan_accession(uhandle, consumer)
        self._scan_prosite_refs(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{BEGIN}')
        self._scan_text(uhandle, consumer)
        self._scan_refs(uhandle, consumer)
        self._scan_copyright(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{END}')

        consumer.end_record()

    def _scan_accession(self, uhandle, consumer):
        read_and_call(uhandle, consumer.accession, start='{PDOC')

    def _scan_prosite_refs(self, uhandle, consumer):
        while attempt_read_and_call(uhandle, consumer.prosite_reference,
                                    start='{PS'):
            pass

    def _scan_text(self, uhandle, consumer):
        while 1:
            line = safe_readline(uhandle)
            # Stop at the first reference header ("[n] ") or at {END},
            # pushing that line back for the next scanner.
            if (line[0] == '[' and line[3] == ']' and line[4] == ' ') or \
               line[:5] == '{END}':
                uhandle.saveline(line)
                break
            consumer.text(line)

    def _scan_refs(self, uhandle, consumer):
        while 1:
            line = safe_readline(uhandle)
            if line[:5] == '{END}' or is_blank_line(line):
                uhandle.saveline(line)
                break
            consumer.reference(line)

    def _scan_copyright(self, uhandle, consumer):
        # Cayte Lindner found some PRODOC records with the copyrights
        # appended at the end. We'll try and recognize these.
        read_and_call_while(uhandle, consumer.noevent, blank=1)
        if attempt_read_and_call(uhandle, consumer.noevent, start='+----'):
            read_and_call_until(uhandle, consumer.noevent, start='+----')
            read_and_call(uhandle, consumer.noevent, start='+----')
        read_and_call_while(uhandle, consumer.noevent, blank=1)
-
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a Prodoc record to a Record object.

    Members:
    data    Record with Prodoc data.

    """
    def __init__(self):
        self.data = None
        # The reference currently being built; None until the first
        # "[n]" header line of a record has been seen.
        self._ref = None

    def start_record(self):
        self.data = Record()
        # BUG FIX: reset the in-progress reference for each record.
        # Previously self._ref was only ever assigned inside reference(),
        # so a continuation line arriving before any "[n]" header raised
        # AttributeError instead of the intended ValueError below.
        self._ref = None

    def end_record(self):
        self._clean_data()

    def accession(self, line):
        line = line.rstrip()
        if line[0] != '{' or line[-1] != '}':
            raise ValueError("I don't understand accession line\n%s" % line)
        acc = line[1:-1]
        if acc[:4] != 'PDOC':
            raise ValueError("Invalid accession in line\n%s" % line)
        self.data.accession = acc

    def prosite_reference(self, line):
        # "{PS00123; NAME}" -> ("PS00123", "NAME")
        line = line.rstrip()
        if line[0] != '{' or line[-1] != '}':
            raise ValueError("I don't understand accession line\n%s" % line)
        acc, name = line[1:-1].split('; ')
        self.data.prosite_refs.append((acc, name))

    def text(self, line):
        self.data.text = self.data.text + line

    def reference(self, line):
        if line[0] == '[' and line[3] == ']':  # new reference
            self._ref = Reference()
            self._ref.number = line[1:3].strip()
            if line[1] == 'E':
                # If it's an electronic reference, then the URL is on the
                # line, instead of the author.
                self._ref.citation = line[4:].strip()
            else:
                self._ref.authors = line[4:].strip()
            self.data.references.append(self._ref)
        elif line[:4] == '    ':
            # Continuation of the previous reference's citation.
            if not self._ref:
                raise ValueError("Unnumbered reference lines\n%s" % line)
            self._ref.citation = self._ref.citation + line[5:]
        else:
            raise Exception("I don't understand the reference line\n%s" % line)

    def _clean_data(self):
        # get rid of trailing newlines
        for ref in self.data.references:
            ref.citation = ref.citation.rstrip()
            ref.authors = ref.authors.rstrip()
-
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prodoc file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the id name will be used.

    """
    import os
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    # Truncate any existing index, and record which data file it covers
    # under the Dictionary's private filename key.
    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    records = parse(handle)
    end = 0L   # Python 2 long literal
    for record in records:
        # NOTE(review): handle.tell() is sampled after the parse
        # generator yields, so each entry is stored as the (start,
        # length) byte span between successive yields; this assumes the
        # generator reads no further than the record terminator --
        # confirm before relying on exact offsets.
        start = end
        end = handle.tell()
        length = end - start

        if rec2key is not None:
            key = rec2key(record)
        else:
            key = record.accession

        if not key:
            raise KeyError("empty key was produced")
        elif key in index:
            raise KeyError("duplicate key %s found" % key)

        index[key] = start, length
View
747 Bio/Prosite/__init__.py
@@ -1,747 +0,0 @@
-# Copyright 1999 by Jeffrey Chang. All rights reserved.
-# Copyright 2000 by Jeffrey Chang. All rights reserved.
-# Revisions Copyright 2007 by Peter Cock. All rights reserved.
-# This code is part of the Biopython distribution and governed by its
-# license. Please see the LICENSE file that should have been included
-# as part of this package.
-"""Module for working with Prosite files from ExPASy (DEPRECATED).
-
-Most of the functionality in this module has moved to Bio.ExPASy.Prosite;
-please see
-
-Bio.ExPASy.Prosite.read To read a Prosite file containing one entry.
-Bio.ExPASy.Prosite.parse Iterates over entries in a Prosite file.
-Bio.ExPASy.Prosite.Record Holds Prosite data.
-
-For
-scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
-_extract_pattern_hits Extract Prosite patterns from a web page.
-PatternHit Holds data from a hit against a Prosite pattern.
-please see the new module Bio.ExPASy.ScanProsite.
-
-The other functions and classes in Bio.Prosite (including
-Bio.Prosite.index_file and Bio.Prosite.Dictionary) are considered deprecated,
-and were not moved to Bio.ExPASy.Prosite. If you use this functionality,
-please contact the Biopython developers at biopython-dev@biopython.org to
-avoid permanent removal of this module from Biopython.
-
-
-This module provides code to work with the prosite dat file from
-Prosite.
-http://www.expasy.ch/prosite/
-
-Tested with:
-Release 15.0, July 1998
-Release 16.0, July 1999
-Release 17.0, Dec 2001
-Release 19.0, Mar 2006
-
-
-Functions:
-parse Iterates over entries in a Prosite file.
-scan_sequence_expasy Scan a sequence for occurrences of Prosite patterns.
-index_file Index a Prosite file for a Dictionary.
-_extract_record Extract Prosite data from a web page.
-_extract_pattern_hits Extract Prosite patterns from a web page.
-
-
-Classes:
-Record Holds Prosite data.
-PatternHit Holds data from a hit against a Prosite pattern.
-Dictionary Accesses a Prosite file using a dictionary interface.
-RecordParser Parses a Prosite record into a Record object.
-
-_Scanner Scans Prosite-formatted data.
-_RecordConsumer Consumes Prosite data to a Record object.
-
-"""
-
-import warnings
-import Bio
-warnings.warn("Bio.Prosite is deprecated, and will be removed in a"\
- " future release of Biopython. Most of the functionality "
- " is now provided by Bio.ExPASy.Prosite. If you want to "
- " continue to use Bio.Prosite, please get in contact "
- " via the mailing lists to avoid its permanent removal from"\
- " Biopython.", Bio.BiopythonDeprecationWarning)
-
-from types import *
-import re
-import sgmllib
-from Bio import File
-from Bio import Index
-from Bio.ParserSupport import *
-
-# There is probably a cleaner way to write the read/parse functions
-# if we don't use the "parser = RecordParser(); parser.parse(handle)"
-# approach. Leaving that for the next revision of Bio.Prosite.
-def parse(handle):
- import cStringIO
- parser = RecordParser()
- text = ""
- for line in handle:
- text += line
- if line[:2]=='//':
- handle = cStringIO.StringIO(text)
- record = parser.parse(handle)
- text = ""
- if not record: # Then this was the copyright notice
- continue
- yield record
-
-def read(handle):
- parser = RecordParser()
- try:
- record = parser.parse(handle)
- except ValueError, error:
- if error.message=="There doesn't appear to be a record":
- raise ValueError("No Prosite record found")
- else:
- raise error
- # We should have reached the end of the record by now
- remainder = handle.read()
- if remainder:
- raise ValueError("More than one Prosite record found")
- return record
-
-class Record:
- """Holds information from a Prosite record.
-
- Members:
- name ID of the record. e.g. ADH_ZINC
- type Type of entry. e.g. PATTERN, MATRIX, or RULE
- accession e.g. PS00387
- created Date the entry was created. (MMM-YYYY)
- data_update Date the 'primary' data was last updated.
- info_update Date data other than 'primary' data was last updated.
- pdoc ID of the PROSITE DOCumentation.
-
- description Free-format description.
- pattern The PROSITE pattern. See docs.
- matrix List of strings that describes a matrix entry.
- rules List of rule definitions (from RU lines). (strings)
- prorules List of prorules (from PR lines). (strings)
-
- NUMERICAL RESULTS
- nr_sp_release SwissProt release.
- nr_sp_seqs Number of seqs in that release of Swiss-Prot. (int)
- nr_total Number of hits in Swiss-Prot. tuple of (hits, seqs)
- nr_positive True positives. tuple of (hits, seqs)
- nr_unknown Could be positives. tuple of (hits, seqs)
- nr_false_pos False positives. tuple of (hits, seqs)
- nr_false_neg False negatives. (int)
- nr_partial False negatives, because they are fragments. (int)
-
- COMMENTS
- cc_taxo_range Taxonomic range. See docs for format
- cc_max_repeat Maximum number of repetitions in a protein
- cc_site Interesting site. list of tuples (pattern pos, desc.)
- cc_skip_flag Can this entry be ignored?
- cc_matrix_type
- cc_scaling_db
- cc_author
- cc_ft_key
- cc_ft_desc
- cc_version version number (introduced in release 19.0)
-
- DATA BANK REFERENCES - The following are all
- lists of tuples (swiss-prot accession,
- swiss-prot name)
- dr_positive
- dr_false_neg
- dr_false_pos
- dr_potential Potential hits, but fingerprint region not yet available.
- dr_unknown Could possibly belong
-
- pdb_structs List of PDB entries.
-
- """
- def __init__(self):
- self.name = ''
- self.type = ''
- self.accession = ''
- self.created = ''
- self.data_update = ''
- self.info_update = ''
- self.pdoc = ''
-
- self.description = ''
- self.pattern = ''
- self.matrix = []
- self.rules = []
- self.prorules = []
- self.postprocessing = []
-
- self.nr_sp_release = ''
- self.nr_sp_seqs = ''
- self.nr_total = (None, None)
- self.nr_positive = (None, None)
- self.nr_unknown = (None, None)
- self.nr_false_pos = (None, None)
- self.nr_false_neg = None
- self.nr_partial = None
-
- self.cc_taxo_range = ''
- self.cc_max_repeat = ''
- self.cc_site = []
- self.cc_skip_flag = ''
-
- self.dr_positive = []
- self.dr_false_neg = []
- self.dr_false_pos = []
- self.dr_potential = []
- self.dr_unknown = []
-
- self.pdb_structs = []
-
-class PatternHit:
- """Holds information from a hit against a Prosite pattern.
-
- Members:
- name ID of the record. e.g. ADH_ZINC
- accession e.g. PS00387
- pdoc ID of the PROSITE DOCumentation.
- description Free-format description.
- matches List of tuples (start, end, sequence) where
- start and end are indexes of the match, and sequence is
- the sequence matched.
-
- """
- def __init__(self):
- self.name = None
- self.accession = None
- self.pdoc = None
- self.description = None
- self.matches = []
- def __str__(self):
- lines = []
- lines.append("%s %s %s" % (self.accession, self.pdoc, self.name))
- lines.append(self.description)
- lines.append('')
- if len(self.matches) > 1:
- lines.append("Number of matches: %s" % len(self.matches))
- for i in range(len(self.matches)):
- start, end, seq = self.matches[i]
- range_str = "%d-%d" % (start, end)
- if len(self.matches) > 1:
- lines.append("%7d %10s %s" % (i+1, range_str, seq))
- else:
- lines.append("%7s %10s %s" % (' ', range_str, seq))
- return "\n".join(lines)
-
-
-class Dictionary:
- """Accesses a Prosite file using a dictionary interface.
-
- """
- __filename_key = '__filename'
-
- def __init__(self, indexname, parser=None):
- """__init__(self, indexname, parser=None)
-
- Open a Prosite Dictionary. indexname is the name of the
- index for the dictionary. The index should have been created
- using the index_file function. parser is an optional Parser
- object to change the results into another form. If set to None,
- then the raw contents of the file will be returned.
-
- """
- self._index = Index.Index(indexname)
- self._handle = open(self._index[Dictionary.__filename_key])
- self._parser = parser
-
- def __len__(self):
- return len(self._index)
-
- def __getitem__(self, key):
- start, len = self._index[key]
- self._handle.seek(start)
- data = self._handle.read(len)
- if self._parser is not None:
- return self._parser.parse(File.StringHandle(data))
- return data
-
- def __getattr__(self, name):
- return getattr(self._index, name)
-
-class RecordParser(AbstractParser):
- """Parses Prosite data into a Record object.
-
- """
- def __init__(self):
- self._scanner = _Scanner()
- self._consumer = _RecordConsumer()
-
- def parse(self, handle):
- self._scanner.feed(handle, self._consumer)
- return self._consumer.data
-
-class _Scanner:
- """Scans Prosite-formatted data.
-
- Tested with:
- Release 15.0, July 1998
-
- """
- def feed(self, handle, consumer):
- """feed(self, handle, consumer)
-
- Feed in Prosite data for scanning. handle is a file-like
- object that contains prosite data. consumer is a
- Consumer object that will receive events as the report is scanned.
-
- """
- if isinstance(handle, File.UndoHandle):
- uhandle = handle
- else:
- uhandle = File.UndoHandle(handle)
-
- consumer.finished = False
- while not consumer.finished:
- line = uhandle.peekline()
- if not line:
- break
- elif is_blank_line(line):
- # Skip blank lines between records
- uhandle.readline()
- continue
- elif line[:2] == 'ID':
- self._scan_record(uhandle, consumer)
- elif line[:2] == 'CC':
- self._scan_copyrights(uhandle, consumer)
- else:
- raise ValueError("There doesn't appear to be a record")
-
- def _scan_copyrights(self, uhandle, consumer):
- consumer.start_copyrights()
- self._scan_line('CC', uhandle, consumer.copyright, any_number=1)
- self._scan_terminator(uhandle, consumer)
- consumer.end_copyrights()
-
- def _scan_record(self, uhandle, consumer):
- consumer.start_record()
- for fn in self._scan_fns:
- fn(self, uhandle, consumer)
-
- # In Release 15.0, C_TYPE_LECTIN_1 has the DO line before
- # the 3D lines, instead of the other way around.
- # Thus, I'll give the 3D lines another chance after the DO lines
- # are finished.
- if fn is self._scan_do.im_func:
- self._scan_3d(uhandle, consumer)
- consumer.end_record()
-
- def _scan_line(self, line_type, uhandle, event_fn,
- exactly_one=None, one_or_more=None, any_number=None,
- up_to_one=None):
- # Callers must set exactly one of exactly_one, one_or_more, or
- # any_number to a true value. I do not explicitly check to
- # make sure this function is called correctly.
-
- # This does not guarantee any parameter safety, but I
- # like the readability. The other strategy I tried was have
- # parameters min_lines, max_lines.
-
- if exactly_one or one_or_more:
- read_and_call(uhandle, event_fn, start=line_type)
- if one_or_more or any_number:
- while 1:
- if not attempt_read_and_call(uhandle, event_fn,
- start=line_type):
- break
- if up_to_one:
- attempt_read_and_call(uhandle, event_fn, start=line_type)
-
- def _scan_id(self, uhandle, consumer):
- self._scan_line('ID', uhandle, consumer.identification, exactly_one=1)
-
- def _scan_ac(self, uhandle, consumer):
- self._scan_line('AC', uhandle, consumer.accession, exactly_one=1)
-
- def _scan_dt(self, uhandle, consumer):
- self._scan_line('DT', uhandle, consumer.date, exactly_one=1)
-
- def _scan_de(self, uhandle, consumer):
- self._scan_line('DE', uhandle, consumer.description, exactly_one=1)
-
- def _scan_pa(self, uhandle, consumer):
- self._scan_line('PA', uhandle, consumer.pattern, any_number=1)
-
- def _scan_ma(self, uhandle, consumer):
- self._scan_line('MA', uhandle, consumer.matrix, any_number=1)
-## # ZN2_CY6_FUNGAL_2, DNAJ_2 in Release 15
-## # contain a CC line buried within an 'MA' line. Need to check
-## # for that.
-## while 1:
-## if not attempt_read_and_call(uhandle, consumer.matrix, start='MA'):
-## line1 = uhandle.readline()
-## line2 = uhandle.readline()
-## uhandle.saveline(line2)
-## uhandle.saveline(line1)
-## if line1[:2] == 'CC' and line2[:2] == 'MA':
-## read_and_call(uhandle, consumer.comment, start='CC')
-## else:
-## break
-
- def _scan_pp(self, uhandle, consumer):
- #New PP line, PostProcessing, just after the MA line
- self._scan_line('PP', uhandle, consumer.postprocessing, any_number=1)
-
- def _scan_ru(self, uhandle, consumer):
- self._scan_line('RU', uhandle, consumer.rule, any_number=1)
-
- def _scan_nr(self, uhandle, consumer):
- self._scan_line('NR', uhandle, consumer.numerical_results,
- any_number=1)
-
- def _scan_cc(self, uhandle, consumer):
- self._scan_line('CC', uhandle, consumer.comment, any_number=1)
-
- def _scan_dr(self, uhandle, consumer):
- self._scan_line('DR', uhandle, consumer.database_reference,
- any_number=1)
-
- def _scan_3d(self, uhandle, consumer):
- self._scan_line('3D', uhandle, consumer.pdb_reference,
- any_number=1)
-
- def _scan_pr(self, uhandle, consumer):
- #New PR line, ProRule, between 3D and DO lines
- self._scan_line('PR', uhandle, consumer.prorule, any_number=1)
-
- def _scan_do(self, uhandle, consumer):
- self._scan_line('DO', uhandle, consumer.documentation, exactly_one=1)
-
- def _scan_terminator(self, uhandle, consumer):
- self._scan_line('//', uhandle, consumer.terminator, exactly_one=1)
-
-    #This is a list of scan functions in the order expected in the file.
-    #The function definitions define how many times each line type is expected
- #(or if optional):
- _scan_fns = [
- _scan_id,
- _scan_ac,
- _scan_dt,
- _scan_de,
- _scan_pa,
- _scan_ma,
- _scan_pp,
- _scan_ru,
- _scan_nr,
- _scan_cc,
-
- # This is a really dirty hack, and should be fixed properly at
- # some point. ZN2_CY6_FUNGAL_2, DNAJ_2 in Rel 15 and PS50309
- # in Rel 17 have lines out of order. Thus, I have to rescan
- # these, which decreases performance.
- _scan_ma,
- _scan_nr,
- _scan_cc,
-
- _scan_dr,
- _scan_3d,
- _scan_pr,
- _scan_do,
- _scan_terminator
- ]
-
-class _RecordConsumer(AbstractConsumer):
- """Consumer that converts a Prosite record to a Record object.
-
- Members:
- data Record with Prosite data.
-
- """
- def __init__(self):
- self.data = None
-
- def start_record(self):
- self.data = Record()
-
- def end_record(self):
- self._clean_record(self.data)
-
- def identification(self, line):
- cols = line.split()
- if len(cols) != 3:
- raise ValueError("I don't understand identification line\n%s" \
- % line)
- self.data.name = self._chomp(cols[1]) # don't want ';'
- self.data.type = self._chomp(cols[2]) # don't want '.'
-
- def accession(self, line):
- cols = line.split()
- if len(cols) != 2:
- raise ValueError("I don't understand accession line\n%s" % line)
- self.data.accession = self._chomp(cols[1])
-
- def date(self, line):
- uprline = line.upper()
- cols = uprline.split()
-
- # Release 15.0 contains both 'INFO UPDATE' and 'INF UPDATE'
- if cols[2] != '(CREATED);' or \
- cols[4] != '(DATA' or cols[5] != 'UPDATE);' or \
- cols[7][:4] != '(INF' or cols[8] != 'UPDATE).':
- raise ValueError("I don't understand date line\n%s" % line)
-
- self.data.created = cols[1]
- self.data.data_update = cols[3]
- self.data.info_update = cols[6]
-
- def description(self, line):
- self.data.description = self._clean(line)
-
- def pattern(self, line):
- self.data.pattern = self.data.pattern + self._clean(line)
-
- def matrix(self, line):
- self.data.matrix.append(self._clean(line))
-
- def postprocessing(self, line):
- postprocessing = self._clean(line).split(";")
- self.data.postprocessing.extend(postprocessing)
-
- def rule(self, line):
- self.data.rules.append(self._clean(line))
-
- def numerical_results(self, line):
- cols = self._clean(line).split(";")
- for col in cols:
- if not col:
- continue
- qual, data = [word.lstrip() for word in col.split("=")]
- if qual == '/RELEASE':
- release, seqs = data.split(",")
- self.data.nr_sp_release = release
- self.data.nr_sp_seqs = int(seqs)
- elif qual == '/FALSE_NEG':
- self.data.nr_false_neg = int(data)
- elif qual == '/PARTIAL':
- self.data.nr_partial = int(data)
- elif qual in ['/TOTAL', '/POSITIVE', '/UNKNOWN', '/FALSE_POS']:
- m = re.match(r'(\d+)\((\d+)\)', data)
- if not m:
- raise Exception("Broken data %s in comment line\n%s" \
- % (repr(data), line))
- hits = tuple(map(int, m.groups()))
- if(qual == "/TOTAL"):
- self.data.nr_total = hits
- elif(qual == "/POSITIVE"):
- self.data.nr_positive = hits
- elif(qual == "/UNKNOWN"):
- self.data.nr_unknown = hits
- elif(qual == "/FALSE_POS"):
- self.data.nr_false_pos = hits
- else:
- raise ValueError("Unknown qual %s in comment line\n%s" \
- % (repr(qual), line))
-
- def comment(self, line):
- #Expect CC lines like this:
- #CC /TAXO-RANGE=??EPV; /MAX-REPEAT=2;
- #Can (normally) split on ";" and then on "="
- cols = self._clean(line).split(";")
- for col in cols:
- if not col or col[:17] == 'Automatic scaling':
- # DNAJ_2 in Release 15 has a non-standard comment line:
- # CC Automatic scaling using reversed database
- # Throw it away. (Should I keep it?)
- continue
- if col.count("=") == 0:
- #Missing qualifier! Can we recover gracefully?
- #For example, from Bug 2403, in PS50293 have:
- #CC /AUTHOR=K_Hofmann; N_Hulo
- continue
- qual, data = [word.lstrip() for word in col.split("=")]
- if qual == '/TAXO-RANGE':
- self.data.cc_taxo_range = data
- elif qual == '/MAX-REPEAT':
- self.data.cc_max_repeat = data
- elif qual == '/SITE':
- pos, desc = data.split(",")
- self.data.cc_site.append((int(pos), desc))
- elif qual == '/SKIP-FLAG':
- self.data.cc_skip_flag = data
- elif qual == '/MATRIX_TYPE':
- self.data.cc_matrix_type = data
- elif qual == '/SCALING_DB':
- self.data.cc_scaling_db = data
- elif qual == '/AUTHOR':
- self.data.cc_author = data
- elif qual == '/FT_KEY':
- self.data.cc_ft_key = data
- elif qual == '/FT_DESC':
- self.data.cc_ft_desc = data
- elif qual == '/VERSION':
- self.data.cc_version = data
- else:
- raise ValueError("Unknown qual %s in comment line\n%s" \
- % (repr(qual), line))
-
- def database_reference(self, line):
- refs = self._clean(line).split(";")
- for ref in refs:
- if not ref:
- continue
- acc, name, type = [word.strip() for word in ref.split(",")]
- if type == 'T':
- self.data.dr_positive.append((acc, name))
- elif type == 'F':
- self.data.dr_false_pos.append((acc, name))
- elif type == 'N':
- self.data.dr_false_neg.append((acc, name))
- elif type == 'P':
- self.data.dr_potential.append((acc, name))
- elif type == '?':
- self.data.dr_unknown.append((acc, name))
- else:
- raise ValueError("I don't understand type flag %s" % type)
-
- def pdb_reference(self, line):
- cols = line.split()
- for id in cols[1:]: # get all but the '3D' col
- self.data.pdb_structs.append(self._chomp(id))
-
- def prorule(self, line):
- #Assume that each PR line can contain multiple ";" separated rules
- rules = self._clean(line).split(";")
- self.data.prorules.extend(rules)
-
- def documentation(self, line):
- self.data.pdoc = self._chomp(self._clean(line))
-
- def terminator(self, line):
- self.finished = True
-
- def _chomp(self, word, to_chomp='.,;'):
- # Remove the punctuation at the end of a word.
- if word[-1] in to_chomp:
- return word[:-1]
- return word
-
- def _clean(self, line, rstrip=1):
- # Clean up a line.
- if rstrip:
- return line[5:].rstrip()
- return line[5:]
-
-def scan_sequence_expasy(seq=None, id=None, exclude_frequent=None):
- """scan_sequence_expasy(seq=None, id=None, exclude_frequent=None) ->
- list of PatternHit's
-
- Search a sequence for occurrences of Prosite patterns. You can
- specify either a sequence in seq or a SwissProt/trEMBL ID or accession
- in id. Only one of those should be given. If exclude_frequent
- is true, then the patterns with the high probability of occurring
- will be excluded.
-
- """
- from Bio import ExPASy
- if (seq and id) or not (seq or id):
- raise ValueError("Please specify either a sequence or an id")
- handle = ExPASy.scanprosite1(seq, id, exclude_frequent)
- return _extract_pattern_hits(handle)
-
-def _extract_pattern_hits(handle):
- """_extract_pattern_hits(handle) -> list of PatternHit's
-
- Extract hits from a web page. Raises a ValueError if there
- was an error in the query.
-
- """
- class parser(sgmllib.SGMLParser):
- def __init__(self):
- sgmllib.SGMLParser.__init__(self)
- self.hits = []
- self.broken_message = 'Some error occurred'
- self._in_pre = 0
- self._current_hit = None
- self._last_found = None # Save state of parsing
- def handle_data(self, data):
- if data.find('try again') >= 0:
- self.broken_message = data
- return
- elif data == 'illegal':
- self.broken_message = 'Sequence contains illegal characters'
- return
- if not self._in_pre:
- return
- elif not data.strip():
- return
- if self._last_found is None and data[:4] == 'PDOC':
- self._current_hit.pdoc = data
- self._last_found = 'pdoc'
- elif self._last_found == 'pdoc':
- if data[:2] != 'PS':
- raise ValueError("Expected accession but got:\n%s" % data)
- self._current_hit.accession = data
- self._last_found = 'accession'
- elif self._last_found == 'accession':
- self._current_hit.name = data
- self._last_found = 'name'
- elif self._last_found == 'name':
- self._current_hit.description = data
- self._last_found = 'description'
- elif self._last_found == 'description':
- m = re.findall(r'(\d+)-(\d+) (\w+)', data)
- for start, end, seq in m:
- self._current_hit.matches.append(
- (int(start), int(end), seq))
-
- def do_hr(self, attrs):
- # <HR> inside a <PRE> section means a new hit.
- if self._in_pre:
- self._current_hit = PatternHit()
- self.hits.append(self._current_hit)
- self._last_found = None
- def start_pre(self, attrs):
- self._in_pre = 1
- self.broken_message = None # Probably not broken
- def end_pre(self):
- self._in_pre = 0
- p = parser()
- p.feed(handle.read())
- if p.broken_message:
- raise ValueError(p.broken_message)
- return p.hits
-
-
-
-
-def index_file(filename, indexname, rec2key=None):
- """index_file(filename, indexname, rec2key=None)
-
- Index a Prosite file. filename is the name of the file.
- indexname is the name of the dictionary. rec2key is an
- optional callback that takes a Record and generates a unique key
- (e.g. the accession number) for the record. If not specified,
- the id name will be used.
-
- """
- import os
- if not os.path.exists(filename):
- raise ValueError("%s does not exist" % filename)
-
- index = Index.Index(indexname, truncate=1)
- index[Dictionary._Dictionary__filename_key] = filename
-
- handle = open(filename)
- records = parse(handle)
- end = 0L
- for record in records:
- start = end
- end = handle.tell()
- length = end - start
-
- if rec2key is not None:
- key = rec2key(record)
- else:
- key = record.name
-
- if not key:
- raise KeyError("empty key was produced")
- elif key in index:
- raise KeyError("duplicate key %s found" % key)
-
- index[key] = start, length
View
4 DEPRECATED
@@ -112,8 +112,8 @@ from Bio.SwissProt.
Bio.Prosite and Bio.Enzyme
==========================
-Declared obsolete in Release 1.50, and deprecated in Release 1.53.
-Most of the functionality has moved to Bio.ExPASy.Prosite and
+Declared obsolete in Release 1.50, deprecated in Release 1.53, and removed in
+Release 1.57. Most of the functionality has moved to Bio.ExPASy.Prosite and
Bio.ExPASy.Enzyme, respectively.
Bio.EZRetrieve, Bio.NetCatch, Bio.File.SGMLHandle, Bio.FilteredReader
View
3,764 Tests/test_prosite_patterns.py
0 additions, 3,764 deletions not shown
View
2  setup.py
@@ -221,7 +221,6 @@ def is_Numpy_installed():
'Bio.Data',
'Bio.Emboss',
'Bio.Entrez',
- 'Bio.Enzyme',
'Bio.ExPASy',
'Bio.FSSP',
'Bio.GA',
@@ -259,7 +258,6 @@ def is_Numpy_installed():
'Bio.PopGen.FDist',
'Bio.PopGen.GenePop',
'Bio.PopGen.SimCoal',
- 'Bio.Prosite',
'Bio.Restriction',
'Bio.Restriction._Update',
'Bio.SCOP',
Please sign in to comment.
Something went wrong with that request. Please try again.