Skip to content

Commit

Permalink
replacing the C-based tokenizer by a Python-based tokenizer using the shlex module of the Python standard library
Browse files Browse the repository at this point in the history
  • Loading branch information
Michiel de Hoon authored and Michiel de Hoon committed Mar 15, 2013
1 parent 203345d commit b2bafdf
Showing 1 changed file with 48 additions and 106 deletions.
154 changes: 48 additions & 106 deletions Bio/PDB/MMCIF2Dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,118 +5,60 @@

"""Turn an mmCIF file into a dictionary."""

import os.path
import warnings
import Bio.PDB.mmCIF.MMCIFlex as MMCIFlex

import shlex

class MMCIF2Dict(dict):
    """Parse an mmCIF file into a dictionary of data name -> value.

    The leading ``data_`` header is stored under the key ``"data_"``.
    Plain name/value pairs map the data name to a string; columns inside a
    ``loop_`` table map each column name to a list of strings.
    """

    def __init__(self, filename):
        """Read and parse the mmCIF file at *filename*.

        filename -- path of the mmCIF file to read

        Raises IOError/OSError if the file cannot be opened, and
        StopIteration on a file that contains no tokens at all.
        """
        # 'with' guarantees the handle is closed even if parsing raises;
        # the original open()/close() pair leaked the handle on error.
        with open(filename) as handle:
            loop_flag = False
            key = None
            tokens = self._tokenize(handle)
            # The first token is the data block header, e.g. "data_1ABC";
            # split it into {"data_": "1ABC"}.
            token = next(tokens)
            self[token[0:5]] = token[5:]
            for token in tokens:
                if token == "loop_":
                    # Start of a loop_ table: first collect the column
                    # names, then distribute the data items across them.
                    loop_flag = True
                    keys = []
                    i = 0  # number of data items consumed so far
                    n = 0  # number of columns in the loop header
                    continue
                elif loop_flag:
                    if token.startswith("_"):
                        if i > 0:
                            # A data name after loop data has started ends
                            # the loop; fall through and treat it as an
                            # ordinary name/value pair.
                            loop_flag = False
                        else:
                            # Still in the loop header: one list per column.
                            self[token] = []
                            keys.append(token)
                            n += 1
                            continue
                    else:
                        # Loop data item: append to its column, cycling
                        # through the columns in order.
                        self[keys[i % n]].append(token)
                        i += 1
                        continue
                # Plain name/value pair outside any loop: the first token
                # is the name, the next one its value.
                if key is None:
                    key = token
                else:
                    self[key] = token
                    key = None

    def _tokenize(self, handle):
        """Yield the tokens of an mmCIF file one at a time.

        handle -- an open text-mode file handle at the start of the file

        Full-line comments (lines starting with '#') are skipped.  A
        semicolon-delimited multi-line value is yielded as a single token
        with the delimiter lines removed.  All other lines are split with
        shlex, so quoted strings become single tokens.
        """
        for line in handle:
            if line.startswith("#"):
                # Full-line comment.
                continue
            elif line.startswith(";"):
                # Multi-line value: concatenate everything up to the line
                # that consists of a single ';' into one token.
                token = line[1:].strip()
                for line in handle:
                    line = line.strip()
                    if line == ';':
                        break
                    token += line
                yield token
            else:
                # shlex honours single and double quotes, so quoted
                # values with embedded spaces stay one token.
                for token in shlex.split(line):
                    yield token


if __name__=="__main__":
Expand Down

0 comments on commit b2bafdf

Please sign in to comment.