Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Added Compass parsing code contributed by James Casbon.

  • Loading branch information...
commit 76ce5ca7b3e84b87c7629393877cb75d6990de11 1 parent 836d8e5
chapmanb authored
View
260 Bio/Compass/__init__.py
@@ -0,0 +1,260 @@
+# Copyright 2004 by James Casbon. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""
+Code to deal with COMPASS output, a program for profile/profile comparison.
+
+Compass is described in:
+
+Sadreyev R, Grishin N. COMPASS: a tool for comparison of multiple protein
+alignments with assessment of statistical significance. J Mol Biol. 2003 Feb
+7;326(1):317-36.
+
+Tested with COMPASS 1.24.
+
+Classes:
+Record One result of a compass file
+_Scanner Scan compass results
+_Consumer Consume scanner events
+RecordParser Parse one compass record
+Iterator Iterate through a number of compass records
+"""
+from Bio import File
+from Bio.ParserSupport import *
+import re,string
+
+class Record:
+ """
+ Hold information from one compass hit.
+ Ali1 one is the query, Ali2 the hit.
+ """
+
+ def __init__(self):
+ self.query=''
+ self.hit=''
+ self.gap_threshold=0
+ self.query_length=0
+ self.query_filtered_length=0
+ self.query_nseqs=0
+ self.query_neffseqs=0
+ self.hit_length=0
+ self.hit_filtered_length=0
+ self.hit_nseqs=0
+ self.hit_neffseqs=0
+ self.sw_score=0
+ self.evalue=-1
+ self.query_start=-1
+ self.hit_start=-1
+ self.query_aln=''
+ self.hit_aln=''
+ self.positives=''
+
+
+ def query_coverage(self):
+ """Return the length of the query covered in alignment"""
+ s = string.replace(self.query_aln, "=", "")
+ return len(s)
+
+ def hit_coverage(self):
+ """Return the length of the hit covered in the alignment"""
+ s = string.replace(self.hit_aln, "=", "")
+ return len(s)
+
+class _Scanner:
+ """Reads compass output and generate events"""
+
+ def feed(self, handle, consumer):
+ """Feed in COMPASS ouput"""
+
+ if isinstance(handle, File.UndoHandle):
+ pass
+ else:
+ handle = File.UndoHandle(handle)
+
+
+ assert isinstance(handle, File.UndoHandle), \
+ "handle must be an UndoHandle"
+ if handle.peekline():
+ self._scan_record(handle, consumer)
+
+
+ def _scan_record(self,handle,consumer):
+ self._scan_names(handle, consumer)
+ self._scan_threshold(handle, consumer)
+ self._scan_lengths(handle,consumer)
+ self._scan_profilewidth(handle, consumer)
+ self._scan_scores(handle,consumer)
+ self._scan_alignment(handle,consumer)
+
+ def _scan_names(self,handle,consumer):
+ """
+ Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln
+ """
+ read_and_call(handle, consumer.names, contains="Ali1:")
+
+ def _scan_threshold(self,handle, consumer):
+ """
+ Threshold of effective gap content in columns: 0.5
+ """
+ read_and_call(handle, consumer.threshold, start="Threshold")
+
+ def _scan_lengths(self,handle, consumer):
+ """
+ length1=388 filtered_length1=386 length2=145 filtered_length2=137
+ """
+ read_and_call(handle, consumer.lengths, start="length1=")
+
+ def _scan_profilewidth(self,handle,consumer):
+ """
+ Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=6.099
+ """
+ read_and_call(handle, consumer.profilewidth, contains="Nseqs1")
+
+ def _scan_scores(self,handle, consumer):
+ """
+ Smith-Waterman score = 37 Evalue = 5.75e+02
+ """
+ read_and_call(handle, consumer.scores, start="Smith-Waterman")
+
+ def _scan_alignment(self,handle, consumer):
+ """
+ QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
+
+
+ QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
+
+ """
+ while 1:
+ line = handle.readline()
+ if not line:
+ break
+ if is_blank_line(line):
+ continue
+ else:
+ consumer.query_alignment(line)
+ read_and_call(handle, consumer.positive_alignment)
+ read_and_call(handle, consumer.hit_alignment)
+
+class _Consumer:
+ # all regular expressions used -- compile only once
+ _re_names = re.compile("Ali1:\s+(\S+)\s+Ali2:\s+(\S+)\s+")
+ _re_threshold = \
+ re.compile("Threshold of effective gap content in columns: (\S+)")
+ _re_lengths = \
+ re.compile("length1=(\S+)\s+filtered_length1=(\S+)\s+length2=(\S+)"
+ + "\s+filtered_length2=(\S+)")
+ _re_profilewidth = \
+ re.compile("Nseqs1=(\S+)\s+Neff1=(\S+)\s+Nseqs2=(\S+)\s+Neff2=(\S+)")
+ _re_scores = re.compile("Smith-Waterman score = (\S+)\s+Evalue = (\S+)")
+ _re_start = re.compile("(\d+)")
+ _re_align = re.compile("^.{15}(\S+)")
+ _re_positive_alignment = re.compile("^.{15}(.+)")
+
+ def __init__(self):
+ self.data = None
+
+ def names(self, line):
+ """
+ Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln
+ ------query----- -------hit-------------
+ """
+ self.data = Record()
+ m = self.__class__._re_names.search(line)
+ self.data.query = m.group(1)
+ self.data.hit = m.group(2)
+
+ def threshold(self,line):
+ m = self.__class__._re_threshold.search(line)
+ self.data.gap_threshold = float(m.group(1))
+
+ def lengths(self,line):
+ m = self.__class__._re_lengths.search(line)
+ self.data.query_length = int(m.group(1))
+ self.data.query_filtered_length = float(m.group(2))
+ self.data.hit_length = int(m.group(3))
+ self.data.hit_filtered_length = float(m.group(4))
+
+ def profilewidth(self,line):
+ m = self.__class__._re_profilewidth.search(line)
+ self.data.query_nseqs = int(m.group(1))
+ self.data.query_neffseqs = float(m.group(2))
+ self.data.hit_nseqs = int(m.group(3))
+ self.data.hit_neffseqs = float(m.group(4))
+
+ def scores(self, line):
+ m = self.__class__._re_scores.search(line)
+ if m:
+ self.data.sw_score = int(m.group(1))
+ self.data.evalue = float(m.group(2))
+ else:
+ self.data.sw_score = 0
+ self.data.evalue = -1.0
+
+ def query_alignment(self, line):
+ m = self.__class__._re_start.search(line)
+ if m:
+ self.data.query_start = int(m.group(1))
+ m = self.__class__._re_align.match(line)
+ assert m!=None, "invalid match"
+ self.data.query_aln = self.data.query_aln + m.group(1)
+
+ def positive_alignment(self,line):
+ m = self.__class__._re_positive_alignment.match(line)
+ assert m!=None, "invalid match"
+ self.data.positives = self.data.positives + m.group(1)
+
+ def hit_alignment(self,line):
+ m = self.__class__._re_start.search(line)
+ if m:
+ self.data.hit_start = int(m.group(1))
+ m = self.__class__._re_align.match(line)
+ assert m!=None, "invalid match"
+ self.data.hit_aln = self.data.hit_aln + m.group(1)
+
+class RecordParser(AbstractParser):
+ """Parses compass results into a Record object.
+ """
+ def __init__(self):
+ self._scanner = _Scanner()
+ self._consumer = _Consumer()
+
+
+ def parse(self, handle):
+ if isinstance(handle, File.UndoHandle):
+ uhandle = handle
+ else:
+ uhandle = File.UndoHandle(handle)
+ self._scanner.feed(uhandle, self._consumer)
+ return self._consumer.data
+
+class Iterator:
+ """Iterate through a file of compass results"""
+ def __init__(self, handle):
+ self._uhandle = File.UndoHandle(handle)
+ self._parser = RecordParser()
+
+ def next(self):
+ lines = []
+ while 1:
+ line = self._uhandle.readline()
+ if not line:
+ break
+ if line[0:4] == "Ali1" and lines:
+ self._uhandle.saveline(line)
+ break
+
+ lines.append(line)
+
+
+ if not lines:
+ return None
+
+ data = string.join(lines, '')
+ return self._parser.parse(File.StringHandle(data))
+
View
1  CONTRIB
@@ -5,7 +5,6 @@ This is a list of people who have made contributions to Biopython.
This is certainly not comprehensive, and if you've been overlooked
(sorry!), please write Jeff at jchang@smi.stanford.edu.
-
Cecilia Alsmark <Cecilia.Alsmark@ebc.uu.se>
Sebastian Bassi <sbassi@asalup.org>
Yves Bastide <ybastide@irisa.fr>
View
40 Tests/Compass/comtest1
@@ -0,0 +1,40 @@
+....Ali1: 60456.blo.gz.aln Ali2: 60456.blo.gz.aln
+Threshold of effective gap content in columns: 0.5
+length1=388 filtered_length1=386 length2=388 filtered_length2=386
+Nseqs1=399 Neff1=12.972 Nseqs2=399 Neff2=12.972
+Smith-Waterman score = 2759 Evalue = 0.00e+00
+
+QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+QUERY 2 LSDRLELVSASEIRKLFDIAAGMKDVISLGIGEPDFDTPQHIKEYAKEALDKGLTHYGPN
+
+
+QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+QUERY IGLLELREAIAEKLKKQNGIEADPKTEIMVLLGANQAFLMGLSAFLKDGEEVLIPTPAFV
+
+
+QUERY SYAPAVILAGGKPVEVPTYEEDEFRLNVDELKKYVTDKTRALIINSPCNPTGAVLTKKDL
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+QUERY SYAPAVILAGGKPVEVPTYEEDEFRLNVDELKKYVTDKTRALIINSPCNPTGAVLTKKDL
+
+
+QUERY EEIADFVVEHDLIVISDEVYEHFIYDDARHYSIASLDGMFERTITVNGFSKTFAMTGWRL
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+QUERY EEIADFVVEHDLIVISDEVYEHFIYDDARHYSIASLDGMFERTITVNGFSKTFAMTGWRL
+
+
+QUERY GFVAAPSWIIERMVKFQMYNATCPVTFIQYAAAKALKDERSWKAVEEMRKEYDRRRKLVW
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+QUERY GFVAAPSWIIERMVKFQMYNATCPVTFIQYAAAKALKDERSWKAVEEMRKEYDRRRKLVW
+
+
+QUERY KRLNEMGLPTVKPKGAFYIFPRIRDTGLTSKKFSELMLKEARVAVVPGSAFGKAGEGYVR
+ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+QUERY KRLNEMGLPTVKPKGAFYIFPRIRDTGLTSKKFSELMLKEARVAVVPGSAFGKAGEGYVR
+
+
+QUERY ISYATAYEKLEEAMDRMERVLKERKL
+ ++++++++++++++++++++++++++
+QUERY ISYATAYEKLEEAMDRMERVLKERKL
+
View
41 Tests/Compass/comtest2
@@ -0,0 +1,41 @@
+Ali1: 60456.blo.gz.aln Ali2: allscop//14982.blo.gz.aln
+Threshold of effective gap content in columns: 0.5
+length1=388 filtered_length1=386 length2=116 filtered_length2=115
+Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=11.313
+Smith-Waterman score = 35 Evalue = 1.01e+03
+
+QUERY 178 KKDLEEIAD
+ ++ ++++++
+QUERY 9 QAAVQAVTA
+
+Ali1: 60456.blo.gz.aln Ali2: allscop//14983.blo.gz.aln
+Threshold of effective gap content in columns: 0.5
+length1=388 filtered_length1=386 length2=121 filtered_length2=119
+Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=11.168
+Smith-Waterman score = 35 Evalue = 1.01e+03
+
+QUERY 178 KKDLEEIAD
+ ++ ++++++
+QUERY 9 REAVEAAVD
+
+Ali1: 60456.blo.gz.aln Ali2: allscop//14984.blo.gz.aln
+Threshold of effective gap content in columns: 0.5
+length1=388 filtered_length1=386 length2=145 filtered_length2=137
+Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=5.869
+Smith-Waterman score = 37 Evalue = 5.75e+02
+
+QUERY 371 LEEAMDRMER~~~V
+ + ++++ + + +
+QUERY 76 LQNFIDQLDNpddL
+
+Ali1: 60456.blo.gz.aln Ali2: allscop//15010.blo.gz.aln
+Threshold of effective gap content in columns: 0.5
+length1=388 filtered_length1=386 length2=141 filtered_length2=141
+Nseqs1=399 Neff1=12.972 Nseqs2=1 Neff2=6.099
+Smith-Waterman score = 37 Evalue = 5.75e+02
+
+QUERY 163 LIINSP
+ ++++++
+QUERY 32 LFDAHD
+
+
View
12 Tests/output/test_Compass
@@ -0,0 +1,12 @@
+test_Compass
+testAlignmentParsingOne (test_Compass.CompassTest) ... ok
+testAlignmentParsingTwo (test_Compass.CompassTest) ... ok
+testCompassIteratorEasy (test_Compass.CompassTest) ... ok
+testCompassIteratorHard (test_Compass.CompassTest) ... ok
+testCompassParser (test_Compass.CompassTest) ... ok
+testCompassScanAndConsume (test_Compass.CompassTest) ... ok
+
+----------------------------------------------------------------------
+Ran 6 tests in 0.007s
+
+OK
View
118 Tests/test_Compass.py
@@ -0,0 +1,118 @@
+"""Tests for parsing Compass output.
+"""
+import os
+import sys
+import unittest
+
+from Bio import Compass
+
+def run_tests(argv):
+ test_suite = testing_suite()
+ runner = unittest.TextTestRunner(sys.stdout, verbosity = 2)
+ runner.run(test_suite)
+
+def testing_suite():
+ """Generate the suite of tests.
+ """
+ test_suite = unittest.TestSuite()
+
+ test_loader = unittest.TestLoader()
+ test_loader.testMethodPrefix = 'test'
+ tests = [CompassTest]
+
+ for test in tests:
+ cur_suite = test_loader.loadTestsFromTestCase(test)
+ test_suite.addTest(cur_suite)
+
+ return test_suite
+
+class CompassTest(unittest.TestCase):
+ def setUp(self):
+ file_dir = os.path.join("Compass")
+ self.test_files = [
+ os.path.join(file_dir, "comtest1"),
+ os.path.join(file_dir, "comtest2")]
+
+ def testCompassScanAndConsume(self):
+ cons = Compass._Consumer()
+ scan = Compass._Scanner()
+ scan.feed(open(self.test_files[0]), cons)
+
+ com_record = cons.data
+
+ self.assertEquals("60456.blo.gz.aln", com_record.query)
+ self.assertEquals("60456.blo.gz.aln", com_record.hit)
+ self.assertEquals(0.5, com_record.gap_threshold)
+
+ self.assertEquals(388, com_record.query_length)
+ self.assertEquals(386, com_record.query_filtered_length)
+ self.assertEquals(388, com_record.hit_length)
+ self.assertEquals(386, com_record.hit_filtered_length)
+
+ self.assertEquals(399, com_record.query_nseqs)
+ self.assertEquals(12.972, com_record.query_neffseqs)
+ self.assertEquals(399, com_record.hit_nseqs)
+ self.assertEquals(12.972, com_record.hit_neffseqs)
+
+ self.assertEquals(2759, com_record.sw_score)
+ self.assertEquals(float("0.00e+00"), com_record.evalue)
+
+ def testCompassParser(self):
+ parser = Compass.RecordParser()
+ com_record = parser.parse(open(self.test_files[0]))
+
+ self.assertEquals("60456.blo.gz.aln", com_record.query)
+
+ def testCompassIteratorEasy(self):
+ it = Compass.Iterator(open(self.test_files[0]))
+
+ com_record = it.next()
+ self.assertEquals("60456.blo.gz.aln", com_record.query)
+
+ com_record = it.next()
+ self.assertEquals(None, com_record)
+ pass
+
+ def testCompassIteratorHard(self):
+ it = Compass.Iterator(open(self.test_files[1]))
+
+ com_record = it.next()
+ self.assertEquals("allscop//14982.blo.gz.aln", com_record.hit)
+ self.assertEquals(float('1.01e+03'), com_record.evalue)
+
+ com_record = it.next()
+ self.assertEquals("allscop//14983.blo.gz.aln", com_record.hit)
+ self.assertEquals(float('1.01e+03'), com_record.evalue)
+
+ com_record = it.next()
+ self.assertEquals("allscop//14984.blo.gz.aln", com_record.hit)
+ self.assertEquals(float('5.75e+02'), com_record.evalue)
+
+ def testAlignmentParsingOne(self):
+ it = Compass.Iterator(open(self.test_files[1]))
+
+ com_record = it.next()
+ self.assertEquals(178, com_record.query_start)
+ self.assertEquals("KKDLEEIAD", com_record.query_aln)
+ self.assertEquals(9, com_record.hit_start)
+ self.assertEquals("QAAVQAVTA", com_record.hit_aln)
+ self.assertEquals("++ ++++++", com_record.positives)
+
+ com_record = it.next()
+ com_record = it.next()
+ self.assertEquals(371, com_record.query_start)
+ self.assertEquals("LEEAMDRMER~~~V", com_record.query_aln)
+ self.assertEquals(76, com_record.hit_start)
+ self.assertEquals("LQNFIDQLDNpddL", com_record.hit_aln)
+ self.assertEquals("+ ++++ + + +", com_record.positives)
+
+ def testAlignmentParsingTwo(self):
+ it = Compass.Iterator(open(self.test_files[0]))
+
+ com_record = it.next()
+ self.assertEquals(2, com_record.query_start)
+ self.assertEquals(2, com_record.hit_start)
+ self.assertEquals("LKERKL", com_record.hit_aln[-6:])
+
+if __name__ == "__main__":
+ sys.exit(run_tests(sys.argv))
View
1  setup.py
@@ -293,6 +293,7 @@ def is_reportlab_installed():
'Bio.builders.Search',
'Bio.builders.SeqRecord',
'Bio.CDD',
+ 'Bio.Compass',
'Bio.Clustalw',
'Bio.config',
'Bio.Crystal',
Please sign in to comment.
Something went wrong with that request. Please try again.