more mixed content and conversion options

atl · May 20, 2010 · 88c3d60 · 88c3d60
1 parent 60b4fb9
commit 88c3d60
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 12 deletions.
diff --git a/README b/README
@@ -18,17 +18,19 @@ Dulse is unapologetically lossy. Besides the ordering and attribute simplificati
 Customization
 =============
 
-By default, Dulse tries to be as clever and sensitive as possible about the underlying content, including checking for mixed content and converting to numeric types as aggressively as possible. If you know more about the underlying content model, you can turn off features as desired. For example, if you know there's no mixed content, you can instantiate a parser that doesn't constantly check for it:
+By default, Dulse tries to be as clever and sensitive as possible about the underlying content, including checking for mixed content and converting to numeric types as aggressively as possible. If you know more about the underlying content model, you can turn off features as desired, sometimes resulting in a significant performance gain. For example, if you know there's no mixed content, you can instantiate a parser that doesn't constantly check for it:
 
     import dulse
     p = dulse.Parser(mixed_content=False)
     d = p.parse("hamlet.xml")
 
-Similarly, if you don't need to convert to numeric types, you can use one of the provided conversion functions, NUMBER (default), STRING, or NO_CONVERSION:
+If you know ahead of time which elements are expected to contain mixed content, you can sidestep the expensive blind checking by naming those elements explicitly when you parse:
 
-    import dulse
-    p = dulse.Parser(conversion=dulse.STRING)
-    d = p.parse("hamlet.xml")
+    d = dulse.parse("hamlet.xml", mixed_elements=['LINE'])
+
+Similarly, if you don't need to convert to numeric types, you can use one of the provided conversion functions, NUMBER_OR_COLLAPSE (default), NUMBER, COLLAPSE_WHITESPACE, STRING, or NO_CONVERSION:
+
+    d = dulse.parse("hamlet.xml", conversion=dulse.STRING)
 
 Alternatively, you can convert each value using a function of your devising, and passing it into the conversion option of dulse.Parser().
 
@@ -37,4 +39,4 @@ Alternatively, you can convert each value using a function of your devising, and
 History & approach
 ==================
 
-Dulse began as an exercise in the pulldom and in trying to unify access between different textual encodings. The unifying project fell by the wayside, but I revisited the code for another project that perhaps should have used JSON. The basic design worked well, but then I did some basic speed profiling, and discovered that although it was simple, our approach using the pulldom wasn't necessarily as fast as expected. After reworking the code to use ElementTree, the module was ready to be let out into the wild.
+Dulse began as an exercise in the pulldom and in trying to unify access between different textual encodings. The unifying project fell by the wayside, but I revisited the code for another project that used XML, though it could have used JSON had I been the one to choose the data carrier format. The basic design worked well, but then I did some basic speed profiling and discovered that, although it was simple, the approach using the pulldom wasn't necessarily as fast as expected. After reworking the code to use ElementTree, the module was ready to be let out into the wild.
diff --git a/dulse/__init__.py b/dulse/__init__.py
@@ -1,3 +1,3 @@
 from dulse.parser import SimpleXMLParser as Parser
-from dulse.parser import NUMBER, STRING, NO_CONVERSION
+from dulse.parser import NUMBER, STRING, NO_CONVERSION, NUMBER_OR_COLLAPSE, COLLAPSE_WHITESPACE
 from dulse.parser import parse, parse_string
diff --git a/dulse/parser.py b/dulse/parser.py
@@ -22,6 +22,18 @@ def addtodict(dictionary, key, value):
         else:
             dictionary[key].append(value)
 
+def NUMBER_OR_COLLAPSE(string):
+    try:
+        return int(string)
+    except (ValueError, TypeError):
+        try:
+            return float(string)
+        except (ValueError, TypeError):
+            if string:
+                return " ".join(string.split()) or None
+            else:
+                return None
+
 def NUMBER(string):
     try:
         return int(string)
@@ -34,6 +46,12 @@ def NUMBER(string):
             else:
                 return None
 
+def COLLAPSE_WHITESPACE(string):
+    try:
+        return " ".join(string.split())
+    except AttributeError:
+        return None
+
 def STRING(string):
     try:
         return string.strip()
@@ -80,13 +98,25 @@ class SimpleXMLParser(object):
          'endnote': ['This is the first.', 'This is the last.'],
          'title': 'My Book'}
     
+    This might not be the ideal form for your uses. If so, then you can change
+    the way mixed content is handled and the way elements are converted::
+    
+        parse("sample.xml", conversion=COLLAPSE_WHITESPACE, mixed_elements=['content'])
+    
+    resulting in::
+        {'author': 'Me',
+         'content': "<h1>Chapter 1</h1><p><em>These</em> are the times that try mens' soles....</p>",
+         'date': '23 April 2009',
+         'endnote': ['This is the first.', 'This is the last.'],
+         'title': 'My Book'}
+    
     """
     def __init__(self, conversion=None, mixed_content=True, mixed_elements=None):
         self.events = None
         if conversion:
             self.conversion = conversion
         else:
-            self.conversion = NUMBER
+            self.conversion = NUMBER_OR_COLLAPSE
         if not mixed_content:
             self.is_mixed = lambda x: False
         if mixed_elements:
@@ -96,14 +126,13 @@ def __init__(self, conversion=None, mixed_content=True, mixed_elements=None):
     def is_mixed(ele):
         return bool((none_strip(ele.text) and ele.getchildren()) or 
             any(map(lambda x: none_strip(x.tail), ele.getchildren())))
-
-    @staticmethod
-    def get_mixed(z):
+
+    def get_mixed(self, z):
         if z.text:
             y = z.text + ''.join(map(etree.tostring, z.getchildren()))
         else:
             y = ''.join(map(etree.tostring, z.getchildren()))
-        return y.strip()
+        return self.conversion(y)
 
     def get_simple_xml_content(self, parentnode):
         d = {}