Skip to content

Commit

Permalink
replacing the C-based tokenizer by a Python-based tokenizer using the shlex module of the Python standard library
Browse files Browse the repository at this point in the history
  • Loading branch information
Michiel de Hoon authored and Michiel de Hoon committed Mar 15, 2013
1 parent 203345d commit b2bafdf
Showing 1 changed file with 48 additions and 106 deletions.
154 changes: 48 additions & 106 deletions Bio/PDB/MMCIF2Dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,118 +5,60 @@

"""Turn an mmCIF file into a dictionary."""

import os.path
import warnings
import Bio.PDB.mmCIF.MMCIFlex as MMCIFlex

import shlex

class MMCIF2Dict(dict):
    """Parse an mmCIF file into a dictionary of data name -> value.

    The leading ``data_`` header is stored under the key ``"data_"``.
    Plain name/value pairs map the data name to a string; columns inside a
    ``loop_`` table map each column name to a list of strings.
    """

    def __init__(self, filename):
        """Read and parse the mmCIF file at *filename*.

        filename -- path of the mmCIF file to read

        Raises IOError/OSError if the file cannot be opened, and
        StopIteration on a file that contains no tokens at all.
        """
        # 'with' guarantees the handle is closed even if parsing raises;
        # the original open()/close() pair leaked the handle on error.
        with open(filename) as handle:
            loop_flag = False
            key = None
            tokens = self._tokenize(handle)
            # The first token is the data block header, e.g. "data_1ABC";
            # split it into {"data_": "1ABC"}.
            token = next(tokens)
            self[token[0:5]] = token[5:]
            for token in tokens:
                if token == "loop_":
                    # Start of a loop_ table: first collect the column
                    # names, then distribute the data items across them.
                    loop_flag = True
                    keys = []
                    i = 0  # number of data items consumed so far
                    n = 0  # number of columns in the loop header
                    continue
                elif loop_flag:
                    if token.startswith("_"):
                        if i > 0:
                            # A data name after loop data has started ends
                            # the loop; fall through and treat it as an
                            # ordinary name/value pair.
                            loop_flag = False
                        else:
                            # Still in the loop header: one list per column.
                            self[token] = []
                            keys.append(token)
                            n += 1
                            continue
                    else:
                        # Loop data item: append to its column, cycling
                        # through the columns in order.
                        self[keys[i % n]].append(token)
                        i += 1
                        continue
                # Plain name/value pair outside any loop: the first token
                # is the name, the next one its value.
                if key is None:
                    key = token
                else:
                    self[key] = token
                    key = None

    def _tokenize(self, handle):
        """Yield the tokens of an mmCIF file one at a time.

        handle -- an open text-mode file handle at the start of the file

        Full-line comments (lines starting with '#') are skipped.  A
        semicolon-delimited multi-line value is yielded as a single token
        with the delimiter lines removed.  All other lines are split with
        shlex, so quoted strings become single tokens.
        """
        for line in handle:
            if line.startswith("#"):
                # Full-line comment.
                continue
            elif line.startswith(";"):
                # Multi-line value: concatenate everything up to the line
                # that consists of a single ';' into one token.
                token = line[1:].strip()
                for line in handle:
                    line = line.strip()
                    if line == ';':
                        break
                    token += line
                yield token
            else:
                # shlex honours single and double quotes, so quoted
                # values with embedded spaces stay one token.
                for token in shlex.split(line):
                    yield token


if __name__=="__main__":
Expand Down

0 comments on commit b2bafdf

Please sign in to comment.