From 6afcce88a5a19cd4e9ff67f7073c2ae69035833b Mon Sep 17 00:00:00 2001
From: Adam Novak <anovak@soe.ucsc.edu>
Date: Mon, 6 Oct 2014 13:56:52 -0700
Subject: [PATCH] Fixing minor defects.

Fixed MafIndex not inheriting from object.

Fixed use of "+1" and "-1" to represent strands; changed to 1 and -1 (ints) like in Bio.SeqFeature.

Removed trailing whitespace in MafIO.py.
---
 Bio/AlignIO/MafIO.py | 200 +++++++++++++++++++++----------------------
 1 file changed, 100 insertions(+), 100 deletions(-)

diff --git a/Bio/AlignIO/MafIO.py b/Bio/AlignIO/MafIO.py
index a2314358184..fedff7d561b 100644
--- a/Bio/AlignIO/MafIO.py
+++ b/Bio/AlignIO/MafIO.py
@@ -22,7 +22,7 @@
 
 class MafWriter(SequentialAlignmentWriter):
     """Accepts a MultipleSeqAlignment object, writes a MAF file"""
-    
+
     def write_header(self):
         """Writes the MAF header"""
 
@@ -32,10 +32,10 @@ def write_header(self):
     def _write_record (self, record):
         """Writes a single SeqRecord object to an 's' line in a MAF block"""
 
-        # convert biopython-style +1/-1 strand to MAF-style +/- strand
-        if record.annotations.get("strand") == "+1":
+        # convert biopython-style 1/-1 strand to MAF-style +/- strand
+        if record.annotations.get("strand") == 1:
             strand = "+"
-        elif record.annotations.get("strand") == "-1":
+        elif record.annotations.get("strand") == -1:
             strand = "-"
         else:
             strand = "+"
@@ -55,10 +55,10 @@ def write_alignment(self, alignment):
         Writes every SeqRecord in a MultipleSeqAlignment object to its own
         MAF block (beginning with an 'a' line, containing 's' lines)
         """
-        
+
         if not isinstance(alignment, Alignment):
             raise TypeError("Expected an alignment object")
-        
+
         if len(set([len(x) for x in alignment])) > 1:
             raise ValueError("Sequences must all be the same length")
 
@@ -86,26 +86,26 @@ def write_alignment(self, alignment):
         self.handle.write("\n")
 
         return recs_out
-        
+
 def MafIterator(handle, seq_count = None, alphabet = single_letter_alphabet):
     """
     Iterates over lines in a MAF file-like object (handle), yielding
-    MultipleSeqAlignment objects. SeqRecord IDs generally correspond to 
+    MultipleSeqAlignment objects. SeqRecord IDs generally correspond to
     species names
     """
 
     in_a_bundle = False
-    
+
     annotations = []
     records = []
-    
+
     while True:
         # allows parsing of the last bundle without duplicating code
         try:
             line = next(handle)
         except StopIteration:
             line = ""
-        
+
         if in_a_bundle:
             if line.startswith("s"):
                 # add a SeqRecord to the bundle
@@ -114,35 +114,35 @@ def MafIterator(handle, seq_count = None, alphabet = single_letter_alphabet):
                 if len(line_split) != 7:
                     raise ValueError("Error parsing alignment - 's' line must have 7 fields")
 
-                # convert MAF-style +/- strand to biopython-stype +1/-1
+                # convert MAF-style +/- strand to biopython-type 1/-1
                 if line_split[4] == "+":
-                    strand = "+1"
+                    strand = 1
                 elif line_split[4] == "-":
-                    strand = "-1"
+                    strand = -1
                 else:
-                    strand = "+1"
+                    strand = 1
 
                 # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
                 anno = {"start": int(line_split[2]),
                         "size": int(line_split[3]),
                         "strand": strand,
                         "srcSize": int(line_split[5])}
-                        
+
                 sequence = line_split[6]
-                
+
                 # interpret a dot/period to mean same the first sequence
                 if "." in sequence:
                     if not records:
                         raise ValueError("Found dot/period in first sequence of alignment")
-                        
+
                     ref = str(records[0].seq)
                     new = []
-                    
+
                     for (s, r) in zip(sequence, ref):
                         new.append(r if s == "." else s)
-                             
+
                     sequence = "".join(new)
-                    
+
                 records.append(SeqRecord(Seq(sequence, alphabet),
                                id = line_split[1],
                                name = line_split[1],
@@ -157,17 +157,17 @@ def MafIterator(handle, seq_count = None, alphabet = single_letter_alphabet):
                 # end a bundle of records
                 if seq_count is not None:
                     assert len(records) == seq_count
-                    
+
                 alignment = MultipleSeqAlignment(records, alphabet)
                 #TODO - Introduce an annotated alignment class?
-                #See also Bio/AlignIO/FastaIO.py for same requirement.        
+                #See also Bio/AlignIO/FastaIO.py for same requirement.
                 #For now, store the annotation a new private property:
                 alignment._annotations = annotations
-                
+
                 yield alignment
-                
+
                 in_a_bundle = False
-                
+
                 annotations = []
                 records = []
             else:
@@ -175,7 +175,7 @@ def MafIterator(handle, seq_count = None, alphabet = single_letter_alphabet):
         elif line.startswith("a"):
             # start a bundle of records
             in_a_bundle = True
-            
+
             if len(line.strip().split()[1:]) != line.count("="):
                 raise ValueError("Error parsing alignment - invalid key in 'a' line")
 
@@ -186,28 +186,28 @@ def MafIterator(handle, seq_count = None, alphabet = single_letter_alphabet):
         elif not line:
             break
 
-class MafIndex():
+class MafIndex(object):
     def __init__(self, sqlite_file, maf_file, target_seqname):
         """Indexes or loads the index of a MAF file"""
 
         import os
-        
+
         try:
             from sqlite3 import dbapi2 as _sqlite
         except ImportError:
             from Bio import MissingPythonDependencyError
             raise MissingPythonDependencyError("Requires sqlite3, which is "
                                                "included Python 2.5+")
-        
+
         self._target_seqname = target_seqname
         self._maf_file = maf_file
-        
+
         # make sure maf_file exists, then open it up
         if os.path.isfile(self._maf_file):
             self._maf_fp = open(self._maf_file, "r")
         else:
             raise ValueError("Error opening %s -- file not found" % (self._maf_file,))
-        
+
         # if sqlite_file exists, use the existing db, otherwise index the file
         if os.path.isfile(sqlite_file):
             self._con = _sqlite.connect(sqlite_file)
@@ -215,7 +215,7 @@ def __init__(self, sqlite_file, maf_file, target_seqname):
         else:
             self._con = _sqlite.connect(sqlite_file)
             self._record_count = self.__make_new_index()
-            
+
         # lastly, setup a MafIterator pointing at the open maf_file
         self._mafiter = MafIterator(self._maf_fp)
 
@@ -227,7 +227,7 @@ def __check_existing_db(self):
         from sqlite3 import DatabaseError as _DatabaseError
 
         try:
-            idx_version = int(self._con.execute("SELECT value FROM meta_data WHERE key = 'version'").fetchone()[0])   
+            idx_version = int(self._con.execute("SELECT value FROM meta_data WHERE key = 'version'").fetchone()[0])
             if idx_version != MAFINDEX_VERSION:
                 raise ValueError("Index version (%s) incompatible with this version of MafIndex" % (idx_version,))
 
@@ -239,10 +239,10 @@ def __check_existing_db(self):
             if db_target != self._target_seqname:
                 raise ValueError("Provided database indexed for %s, expected %s" % (db_target, self._target_seqname))
 
-            record_count = int(self._con.execute("SELECT value FROM meta_data WHERE key = 'record_count'").fetchone()[0])          
+            record_count = int(self._con.execute("SELECT value FROM meta_data WHERE key = 'record_count'").fetchone()[0])
             if record_count == -1:
                 raise ValueError("Unfinished/partial database provided")
-                
+
             records_found = int(self._con.execute("SELECT COUNT(*) FROM offset_data").fetchone()[0])
             if records_found != record_count:
                 raise ValueError("Expected %s records, found %s.  Corrupt index?" % (record_count, records_found))
@@ -253,7 +253,7 @@ def __check_existing_db(self):
 
     def __make_new_index(self):
         """Read MAF file and generate SQLite index"""
-        
+
         import itertools
 
         # make the tables
@@ -263,32 +263,32 @@ def __make_new_index(self):
         self._con.execute("INSERT INTO meta_data (key, value) VALUES ('target_seqname', '%s');" % (self._target_seqname,))
         self._con.execute("INSERT INTO meta_data (key, value) VALUES ('filename', '%s');" % (self._maf_file,))
         self._con.execute("CREATE TABLE offset_data (bin INTEGER, start INTEGER, end INTEGER, offset INTEGER);")
-        
+
         insert_count = 0
-        
+
         # iterate over the entire file and insert in batches
         mafindex_func = self.__maf_indexer()
-                    
+
         while True:
             batch = list(itertools.islice(mafindex_func, 100))
             if not batch: break
-        
+
             self._con.executemany("INSERT INTO offset_data (bin, start, end, offset) VALUES (?,?,?,?);", batch)
             self._con.commit()
-            
+
             insert_count += len(batch)
-        
+
         # then make indexes on the relevant fields
         self._con.execute("CREATE INDEX IF NOT EXISTS bin_index ON offset_data(bin);")
         self._con.execute("CREATE INDEX IF NOT EXISTS start_index ON offset_data(start);")
         self._con.execute("CREATE INDEX IF NOT EXISTS end_index ON offset_data(end);")
-        
+
         self._con.execute("UPDATE meta_data SET value = '%s' WHERE key = 'record_count'" % (insert_count,))
-        
+
         self._con.commit()
-        
+
         return insert_count
-        
+
     def __maf_indexer(self):
         """Generator function, returns index information for each bundle"""
 
@@ -298,17 +298,17 @@ def __maf_indexer(self):
             if line.startswith("a"):
                 # note the offset
                 offset = self._maf_fp.tell() - len(line)
-                
+
                 # search the following lines for a match to target_seqname
                 while True:
                     line = self._maf_fp.readline()
-                    
+
                     if not line.strip() or line.startswith("a"):
                         raise ValueError("Target for indexing (%s) not found in this bundle" % (self._target_seqname,))
                     elif line.startswith("s"):
                         # s (literal), src (ID), start, size, strand, srcSize, text (sequence)
                         line_split = line.strip().split()
-                        
+
                         if line_split[1] == self._target_seqname:
                             start = int(line_split[2])
                             end = int(line_split[2]) + int(line_split[3])
@@ -318,15 +318,15 @@ def __maf_indexer(self):
                                                 (end - start, len(line_split[6].replace("-", ""))))
 
                             yield (self._ucscbin(start, end), start, end, offset)
-                            
-                            break                    
+
+                            break
 
             line = self._maf_fp.readline()
 
     @staticmethod
     def _region2bin(start, end):
         """Finds bins that a region may belong to.
-        
+
         Converts a region to a list of bins that it may belong to, including largest
         and smallest bins.
         """
@@ -343,7 +343,7 @@ def _region2bin(start, end):
     @staticmethod
     def _ucscbin(start, end):
         """Returns the smallest bin a given region will fit into.
-        
+
         Taken from http://genomewiki.ucsc.edu/index.php/Bin_indexing_system
         """
 
@@ -366,30 +366,30 @@ def _ucscbin(start, end):
             endBin >>= _binNextShift
 
         return 0
-            
+
     def _get_record(self, offset):
         """Retrieves a single MAF record located at the offset provided."""
-        
+
         self._maf_fp.seek(offset)
         return next(self._mafiter)
         
     def search(self, starts, ends):
         """Searches index database for MAF records overlapping ranges provided.
+
         Returns results in order by start, then end, then internal offset 
         field."""
-        
         # verify the provided exon coordinates
         if len(starts) != len(ends):
             raise ValueError("Every position in starts must have a match in ends")
-        
+
         for exonstart, exonend in zip(starts, ends):
             if exonstart >= exonend:
                 raise ValueError("Exon coordinates invalid (%s >= %s)" % (exonstart, exonend))
-        
+
         con = self._con
-        
+
         # search for every exon
-        
+
         for exonstart, exonend in zip(starts, ends):
             try:
                 possible_bins = ", ".join(map(str, self._region2bin(exonstart, exonend)))
@@ -402,7 +402,7 @@ def search(self, starts, ends):
                      "OR %s BETWEEN start AND end) ORDER BY start, end, "
                      "offset ASC;" \
                      % (possible_bins, exonstart, exonend, exonend))
-    
+
             rows = result.fetchall()
             
             for rec_start, rec_end, offset in rows:
@@ -424,41 +424,41 @@ def search(self, starts, ends):
                             
                 yield fetched
             
-    def get_spliced(self, starts, ends, strand = "+1"):
+    def get_spliced(self, starts, ends, strand="+1"):
         """Returns a multiple alignment of the exact sequence range provided.
-        
+
         Accepts two lists of start and end positions on target_seqname, representing
         exons to be spliced in silico.  Returns a MultipleSeqAlignment of the
         desired sequences spliced together.
         """
 
         # validate strand
-        if strand not in ("+1", "-1"):
-            raise ValueError("Strand must be +1 or -1")
-                
+        if strand not in (1, -1):
+            raise ValueError("Strand must be 1 or -1")
+
         # pull all alignments that span the desired intervals
         fetched = [x for x in self.search(starts, ends)]
-        
+
         # keep track of the expected letter count
         expected_letters = sum([y - x for x, y in zip(starts,ends)])
-        
+
         # if there's no alignment, return filler for the assembly of the length given
         if len(fetched) == 0:
             return MultipleSeqAlignment([SeqRecord(Seq("N" * expected_letters),
                                                    id=self._target_seqname)])
-        
+
         # find the union of all IDs in these alignments
         all_seqnames = set([x.id for y in fetched for x in y])
-                
+
         # split every record by base position
         split_by_position = dict([(x, {}) for x in all_seqnames])
-        
+
         # keep track of what the total number of (unspliced) letters should be
         total_rec_length = 0
-        
+
         # track first strand encountered on the target seqname
         ref_first_strand = None
-        
+
         for multiseq in fetched:
             # find the target_seqname in this MultipleSeqAlignment and use it to
             # set the parameters for the rest of this iteration
@@ -467,66 +467,66 @@ def get_spliced(self, starts, ends, strand = "+1"):
                     try:
                         if ref_first_strand == None:
                             ref_first_strand = seqrec.annotations["strand"]
-                            
-                            if ref_first_strand not in ("+1", "-1"):
-                                raise ValueError("Strand must be +1 or -1")
+
+                            if ref_first_strand not in (1, -1):
+                                raise ValueError("Strand must be 1 or -1")
                         elif ref_first_strand != seqrec.annotations["strand"]:
                             raise ValueError("Encountered strand='%s' on target seqname, expected '%s'" % \
                             (seqrec.annotations["strand"], ref_first_strand))
                     except KeyError:
                         raise ValueError("No strand information for target seqname (%s)" % \
                         (self._target_seqname,))
-                        
+
                     rec_length = len(seqrec)
                     rec_start = seqrec.annotations["start"]
                     rec_end = seqrec.annotations["start"] + seqrec.annotations["size"]
 
                     total_rec_length += rec_end - rec_start
-                    
+
                     # blank out these positions for every seqname
                     for seqrec in multiseq:
                         for pos in range(rec_start, rec_end):
                             split_by_position[seqrec.id][pos] = ""
-                    
+
                     break
             else:
                 raise ValueError("Did not find %s in alignment bundle" % (self._target_seqname,))
-                
+
             # the true, chromosome/contig/etc position in the target seqname
             real_pos = rec_start
-            
+
             for gapped_pos in range(0, rec_length):
                 for seqrec in multiseq:
                     # keep track of this position's value for the target seqname
                     if seqrec.id == self._target_seqname: track_val = seqrec.seq[gapped_pos]
-                    
+
                     split_by_position[seqrec.id][real_pos] += seqrec.seq[gapped_pos]
-                    
+
                 # increment the real_pos counter only when non-gaps are found in
                 # the target_seqname, and we haven't reached the end of the record
                 if track_val != "-" and real_pos < rec_end - 1: real_pos += 1
-                
+
         # make sure the number of bp entries equals the sum of the record lengths
         if len(split_by_position[self._target_seqname]) != total_rec_length:
             raise ValueError("Target seqname (%s) has %s records, expected %s" % \
             (self._target_seqname, len(split_by_position[self._target_seqname]), total_rec_length))
 
-        # translates a position in the target_seqname sequence to its gapped length        
+        # translates a position in the target_seqname sequence to its gapped length
         realpos_to_len = dict([(x, len(y)) for x, y in split_by_position[self._target_seqname].items() if len(y) > 1])
 
-        # splice together the exons            
+        # splice together the exons
         subseq = {}
-        
+
         for seqid in all_seqnames:
             seq_split = split_by_position[seqid]
             seq_splice = []
-            
+
             filler_char = "N" if seqid == self._target_seqname else "-"
 
             # iterate from start to end, taking bases from split_by_position when
-            # they exist, using N or - for gaps when there is no alignment.            
+            # they exist, using N or - for gaps when there is no alignment.
             append = seq_splice.append
-            
+
             for exonstart, exonend in zip(starts, ends):
                 for real_pos in range(exonstart, exonend):
                     # if this seqname has this position, add it
@@ -538,37 +538,37 @@ def get_spliced(self, starts, ends, strand = "+1"):
                     # it's not in either, so add a single filler character
                     else:
                         append(filler_char)
-                        
+
             subseq[seqid] = "".join(seq_splice)
 
         # make sure we're returning the right number of letters
         if len(subseq[self._target_seqname].replace("-", "")) != expected_letters:
             raise ValueError("Returning %s letters for target seqname (%s), expected %s" % \
             (len(subseq[self._target_seqname].replace("-", "")), self._target_seqname, expected_letters))
-                
+
         # check to make sure all sequences are the same length as the target seqname
         ref_subseq_len = len(subseq[self._target_seqname])
-        
+
         for seqid, seq in subseq.items():
             if len(seq) != ref_subseq_len:
                 raise ValueError("Returning length %s for %s, expected %s" % \
                 (len(seq), seqid, ref_subseq_len))
-                
+
         # finally, build a MultipleSeqAlignment object for our final sequences
         result_multiseq = []
-        
+
         for seqid, seq in subseq.items():
             seq = Seq(seq)
-            
+
             seq = seq if strand == ref_first_strand else seq.reverse_complement()
-            
+
             result_multiseq.append(SeqRecord(seq,
                                              id = seqid,
                                              name = seqid,
                                              description = ""))
-                                             
+
         return MultipleSeqAlignment(result_multiseq)
-        
+
 
     def __repr__(self):
         return "MafIO.MafIndex(%r, target_seqname=%r)" % (self._maf_fp.name,