Skip to content

Commit

Permalink
Added Functionality for Parsing XML Schema Type Files
Browse files Browse the repository at this point in the history
  • Loading branch information
David authored and peterjc committed May 12, 2015
1 parent cf54c6c commit d8c9245
Showing 1 changed file with 72 additions and 2 deletions.
74 changes: 72 additions & 2 deletions Bio/Entrez/Parser.py
Expand Up @@ -34,11 +34,13 @@
contents may change over time. About half the code in this parser deals
with parsing the DTD, and the other half with the XML itself.
"""

import pdb
import re
import os
import warnings
from xml.parsers import expat
from io import BytesIO
import xml.etree.ElementTree as ET

# Importing these functions with leading underscore as not intended for reuse
from Bio._py3k import urlopen as _urlopen
Expand Down Expand Up @@ -161,6 +163,7 @@ class DataHandler(object):
directory = os.path.join(home, '.config', 'biopython')
del home
local_dtd_dir = os.path.join(directory, 'Bio', 'Entrez', 'DTDs')
local_xsd_dir = os.path.join(directory,'Bio', 'Entrez', 'XSDs')
del directory
del platform
try:
Expand All @@ -174,6 +177,7 @@ class DataHandler(object):

from Bio import Entrez
global_dtd_dir = os.path.join(str(Entrez.__path__[0]), "DTDs")
global_xsd_dir = os.path.join(str(Entrez.__path__[0]), "XSDs")
del Entrez

def __init__(self, validate):
Expand All @@ -190,6 +194,7 @@ def __init__(self, validate):
self.parser = expat.ParserCreate(namespace_separator=" ")
self.parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
self.parser.XmlDeclHandler = self.xmlDeclHandler
self.is_schema = False

def read(self, handle):
"""Set up the parser and let it parse the XML results"""
Expand Down Expand Up @@ -233,6 +238,7 @@ def parse(self, handle):
text = handle.read(BLOCK)
if not text:
# We have reached the end of the XML file
# print self.stack
if self.stack:
# No more XML data, but there is still some unfinished
# business
Expand Down Expand Up @@ -287,9 +293,27 @@ def xmlDeclHandler(self, version, encoding, standalone):
self.parser.StartNamespaceDeclHandler = self.startNamespaceDeclHandler

def startNamespaceDeclHandler(self, prefix, un):
    """Handle a namespace declaration reported by the expat parser.

    This method is installed as the parser's StartNamespaceDeclHandler.
    A namespace whose URI contains "Schema" marks the document as being
    described by an XML Schema; ``self.is_schema`` is then set to True so
    that later handlers can treat the document accordingly.

    Arguments:
     - prefix - the namespace prefix being declared (not used here).
     - un     - the namespace URI associated with the prefix.

    Raises NotImplementedError for any other namespace, since the parser
    does not support general XML namespaces.
    """
    # Only schema-related namespaces are tolerated; the check must come
    # before the error is raised, otherwise it can never be reached.
    if un is not None and "Schema" in un:
        self.is_schema = True
    else:
        raise NotImplementedError("The Bio.Entrez parser cannot handle XML data that make use of XML namespaces")

def startElementHandler(self, name, attrs):
#preprocessing the xml schema
if self.is_schema:
if len(attrs) > 0 and "http" in attrs.keys()[0]:
handle = self.open_xsd_file(os.path.basename(attrs.values()[0]))
#if there is no local xsd file grab the url and parse the file
if not handle:
handle = _urlopen(attrs.values()[0])
text = handle.read()
self.save_xsd_file(os.path.basename(attrs.values()[0]), text)
handle.close()
self.parse_xsd(ET.fromstring(text))
else:
self.parse_xsd(ET.fromstring(handle.read()))
handle.close()
self.content = ""
if name in self.lists:
object = ListElement()
Expand Down Expand Up @@ -364,6 +388,9 @@ def endElementHandler(self, name):
name = self.object.itemname
else:
self.object = self.stack.pop()
value = re.sub(r"[\s]+", "",value)
if self.is_schema and value:
self.object.update({'data':value})
return
value.tag = name
if self.attributes:
Expand All @@ -379,6 +406,22 @@ def endElementHandler(self, name):
def characterDataHandler(self, content):
    """Append a chunk of character data to the text collected so far.

    The accumulated text is kept in ``self.content``.
    """
    self.content = self.content + content

def parse_xsd(self, root):
    """Register the elements declared in a parsed XML Schema.

    Each direct child of ``root`` is walked together with all of its
    descendants.  The name recorded for the child is the ``name``
    attribute of the last node in that walk whose tag contains
    "element".  If any node in the walk has a tag containing
    "attribute", the name is appended to ``self.dictionaries``;
    otherwise it is appended to ``self.lists``.

    Arguments:
     - root - an ElementTree element for the schema document.

    Children that declare no named element (for example annotations)
    are skipped, rather than raising an error or re-registering the
    name found for a previous child.
    """
    # NOTE(review): the loop nesting was reconstructed from a flattened
    # listing - confirm the classification happens once per child.
    for child in root:
        name = None
        has_attribute = False
        for element in child.iter():
            tag = element.tag
            if "element" in tag and "name" in element.attrib:
                name = element.attrib['name']
            if "attribute" in tag:
                has_attribute = True
        if name is None:
            # Nothing to register for this child.
            continue
        if has_attribute:
            self.dictionaries.append(name)
        else:
            self.lists.append(name)


def elementDecl(self, name, model):
"""This callback function is called for each element declaration:
<!ELEMENT name (...)>
Expand Down Expand Up @@ -475,6 +518,23 @@ def open_dtd_file(self, filename):
return handle
return None

def open_xsd_file(self, filename):
    """Open a cached XSD file for reading in binary mode.

    The user-specific XSD directory is tried first, then the XSD
    directory shipped with the package.  Returns the open file handle
    of the first match, or None if the file is in neither location.
    """
    search_dirs = (DataHandler.local_xsd_dir, DataHandler.global_xsd_dir)
    for folder in search_dirs:
        candidate = os.path.join(folder, filename)
        try:
            return open(candidate, "rb")
        except IOError:
            # Not available here; fall through to the next location.
            continue
    return None

def save_dtd_file(self, filename, text):
path = os.path.join(DataHandler.local_dtd_dir, filename)
try:
Expand All @@ -485,6 +545,16 @@ def save_dtd_file(self, filename, text):
handle.write(text)
handle.close()

def save_xsd_file(self, filename, text):
    """Save the contents of an XSD file in the local XSD directory.

    Arguments:
     - filename - base name of the XSD file to write.
     - text     - the data to store; the file is opened in binary mode.

    Saving is best effort: if the file cannot be opened for writing, a
    warning is issued and nothing is written.
    """
    path = os.path.join(DataHandler.local_xsd_dir, filename)
    try:
        handle = open(path, "wb")
    except IOError:
        warnings.warn("Failed to save %s at %s" % (filename, path))
        return
    # The context manager closes the handle even if the write fails.
    with handle:
        handle.write(text)

def externalEntityRefHandler(self, context, base, systemId, publicId):
"""The purpose of this function is to load the DTD locally, instead
of downloading it from the URL specified in the XML. Using the local
Expand Down

0 comments on commit d8c9245

Please sign in to comment.