Permalink
Browse files

beef up encoding support, and tests

  • Loading branch information...
1 parent 0ef5c8b commit 595a82baecac40ab2fc0801855831cd84078e603 @bukzor committed May 30, 2012
@@ -55,8 +55,8 @@ def add_comment(self, comment_text):
def make_comment(self, comment_text):
- comment = CheetahNode('Comment')
- comment_start = CheetahNode('CommentStart')
+ comment = self.makeelement('Comment')
+ comment_start = self.makeelement('CommentStart')
comment_start.text = '##'
comment_start.tail = ' ' + comment_text
@@ -91,25 +91,25 @@ def is_in_context(self, directive_string):
else:
return False
-def call(method, arguments):
+def call(method, arguments, makeelement):
"""
return an lxml node representing a call to a method, with arguments.
`method` is a string
`arguments` is an lxml node
"""
- call = CheetahNode('Placeholder')
+ call = makeelement('Placeholder')
- varstart = CheetahNode('CheetahVarStart')
+ varstart = makeelement('CheetahVarStart')
varstart.text = '$'
call.append(varstart)
- namechunks = CheetahNode('CheetahVarNameChunks')
+ namechunks = makeelement('CheetahVarNameChunks')
- name = CheetahNode('DottedName')
+ name = makeelement('DottedName')
name.text = method
namechunks.append(name)
- argstring = CheetahNode('CallArgsString')
+ argstring = makeelement('CallArgsString')
argstring.text = '('
argstring.append(arguments)
namechunks.append(argstring)
@@ -209,7 +209,7 @@ def remove_self(self):
class CheetahDirective(CheetahNodeBase):
def replace_directive(self, other):
if isinstance(other, basestring):
- var = CheetahNode('CheetahVar')
+ var = self.makeelement('CheetahVar')
try:
directive, var.text = other.split(None, 1)
except ValueError:
@@ -268,7 +268,7 @@ def get_end_directive(self):
# Look at sibling Directives after this node, take first one that is an EndDirective.
return self.xpath_one('./following-sibling::Directive[./EndDirective][1]')
-class CheetahNodeLookup(etree.PythonElementClassLookup):
+class NodeLookup(etree.PythonElementClassLookup):
"""
Specify how to assign Python classes to lxml objects.
see: http://lxml.de/element_classes.html#tree-based-element-class-lookup-in-python
@@ -285,9 +285,7 @@ def lookup(self, document, element):
else:
return CheetahNodeBase
-CHEETAH_PARSER = etree.XMLParser()
-CHEETAH_PARSER.set_element_class_lookup(CheetahNodeLookup())
-
-CheetahNode = CHEETAH_PARSER.makeelement
+node_lookup = NodeLookup()
+del NodeLookup # This is a singleton class.
__all__ = ('CheetahNode',)
@@ -175,7 +175,7 @@ def detect_encoding(source):
# We didn't find anything.
return None
-def parse(cheetah_content):
+def parse(cheetah_content, encoding=None):
from Cheetah.Compiler import Compiler
# This is very screwy, but so is cheetah. Apologies.
@@ -193,8 +193,8 @@ def parse(cheetah_content):
dictnode = parser_data_to_dictnode(data, cheetah_content)
from refactorlib.parse import dictnode_to_lxml
- from refactorlib.cheetah.node import CheetahNode
- root = dictnode_to_lxml(dictnode, CheetahNode)
+ from refactorlib.cheetah.node import node_lookup
+ root = dictnode_to_lxml(dictnode, node_lookup, encoding)
return root
def remove_empty(data):
View
@@ -98,10 +98,6 @@ def one(mylist):
return mylist[0]
-parser_lookup = etree.ElementDefaultClassLookup(element=RefactorLibNodeBase)
-parser = etree.XMLParser()
-parser.set_element_class_lookup(parser_lookup)
-
-RefactorLibNode = parser.makeelement
+node_lookup = etree.ElementDefaultClassLookup(element=RefactorLibNodeBase)
__all__ = ('RefactorLibNodeBase',)
View
@@ -15,9 +15,9 @@ def parse(filename, filetype=None, encoding=None):
# I don't see why encoding=None is different from not specifying the encoding.
source = unicode(source)
- return filetype.parser(source)
+ return filetype.parser(source, encoding)
-def dictnode_to_lxml(tree, element_factory=None):
+def dictnode_to_lxml(tree, node_lookup=None, encoding=None):
"""
Input: A dictionary-based representation of a node tree.
Output: An lxml representation of the same.
@@ -29,26 +29,36 @@ def dictnode_to_lxml(tree, element_factory=None):
attrs -- A dictionary of any extra attributes.
children -- An ordered list of more node-dictionaries.
"""
- if element_factory:
- Element = element_factory
- else:
- from node import RefactorLibNode as Element
+ if not node_lookup:
+ from node import node_lookup
+
+ from lxml.etree import XMLParser
+ lxml_parser_object = XMLParser(encoding=encoding)
+ lxml_parser_object.set_element_class_lookup(node_lookup)
+ Element = lxml_parser_object.makeelement
root = None
stack = [ (tree,root) ]
while stack:
node, parent = stack.pop()
- lxmlnode = Element(node['name'], attrib=node['attrs'])
- lxmlnode.text = node['text']
- lxmlnode.tail = node['tail']
if parent is None:
+ # We use this roundabout method because the encoding is always set
+ # to 'UTF8' if we use parser.makeelement()
+ lxml_parser_object.feed('<trash></trash>')
+ lxmlnode = lxml_parser_object.close()
+ lxmlnode.tag = node['name']
+ lxmlnode.attrib.update(node['attrs'])
root = lxmlnode
else:
+ lxmlnode = Element(node['name'], attrib=node['attrs'])
parent.append(lxmlnode)
+ lxmlnode.text = node['text']
+ lxmlnode.tail = node['tail']
+
for child in reversed(node['children']):
stack.append((child, lxmlnode))
@@ -14,15 +14,15 @@ def detect_encoding(source):
# We didn't find anything.
return None
-def parse(python_contents):
+def parse(python_contents, encoding):
"""
- Given some python contents, as a string, return the lxml representation.
+ Given some python contents, as a unicode string, return the lxml representation.
"""
lib2to3_python = lib2to3_parse(python_contents)
dictnode_python = lib2to3_to_dictnode(lib2to3_python)
from refactorlib.parse import dictnode_to_lxml
- return dictnode_to_lxml(dictnode_python)
+ return dictnode_to_lxml(dictnode_python, encoding=encoding)
def lib2to3_parse(python_contents):
from lib2to3 import pygram, pytree
@@ -1,2 +0,0 @@
-7[?47h)0[?25lPuDB 2011.3 - ?:help n:next s:step into b:breakpoint o:output t:run to cursor !:python shell    1 from Cheetah.Parser import Parser  Variables:    2   args: tuple    3 DEBUG = True  kwargs: dict    4   name: 'getExpressionParts'    5 class InstrumentedMethod(object):  pudb: <module 'pudb' from '/nail/home/buck/mypy/lib/p   6  def __init__(self, method, parent):   ython2.6/site-packages/pudb/__init__.pyc'>    7  self.method = method  self: InstrumentedMethod    8  self.parent = parent  start_pos: 6    9       10  def __call__(self, *args, **kwargs):      11  # I want the data to be arranged in *call* order      12  start_pos = self.parent.pos()      13  name = self.method.__name__      14       15  if name == 'getExpressionParts':      16  import pudb; pudb.set_trace()  Stack:    17   >> __call__ [InstrumentedMethod] parse.py:18 >  18 mydata = [start_pos, None, name]   getExpression [InstrumentedParser] Parser.py:1210    19  self.parent.data.append(mydata)   __call__ [InstrumentedMethod] parse.py:20    20  result = self.method(*args, **kwargs) # Call the wrapped method.   eatSet [InstrumentedParser] Parser.py:2157    21  mydata[1] = self.parent.pos()   __call__ [InstrumentedMethod] parse.py:20    22    eatDirective [InstrumentedParser] Parser.py:1613    23  return result   __call__ [InstrumentedMethod] parse.py:20    24    parse [InstrumentedParser] Parser.py:1489    25 class AnyString(str):   compile [ModuleCompiler] Compiler.py:1687    26  'Represents "any string".'   parse parse.py:149    27  def startswith(self, other):   parse parse.py:5    28  return True   <module> <string>:1    29  def __eq__(self, other):      30  return True      31   Breakpoints:    32 from collections import defaultdict      33 class AutoDict(defaultdict):      34  "Like defaultdict, but auto-populates for .get() as well."      
35  no_default = []      36  def get(self, key, default=no_default):      37  if default is self.no_default:      38  return self[key]      39  else:      40  return super(AutoDict, self).get(key, default)      41       42 class InstrumentedParser(Parser):      43  dont_care_methods = (      44  'getc', 'getRowCol', 'getRowColLine', 'getLine',    [?1002l[?1000l[?25h[?47l8
-[?25h
@@ -0,0 +1,11 @@
+<?xml version='1.0' encoding='CP850'?>
+<file_input><simple_stmt><print_stmt># vim:encoding=CP850:
+
+# German: fuel oil recoil absorber
+# jqvwxy missing, but all non-ASCII letters in one word
+# See: http://www.cl.cam.ac.uk/~mgk25/ucs/examples/quickbrown.txt
+
+# These should be equivalent:
+<NAME>print</NAME> <STRING>'Heiz”lrckstoáabd„mpfung'</STRING></print_stmt><NEWLINE>
+</NEWLINE></simple_stmt><simple_stmt><print_stmt><NAME>print</NAME><power> <STRING>u'Heiz”lrckstoáabd„mpfung'</STRING><trailer><DOT>.</DOT><NAME>encode</NAME></trailer><trailer><LPAR>(</LPAR><STRING>'cp850'</STRING><RPAR>)</RPAR></trailer></power></print_stmt><NEWLINE>
+</NEWLINE></simple_stmt><ENDMARKER></ENDMARKER></file_input>
@@ -0,0 +1,9 @@
+# vim:encoding=CP850:
+
+# German: fuel oil recoil absorber
+# jqvwxy missing, but all non-ASCII letters in one word
+# See: http://www.cl.cam.ac.uk/~mgk25/ucs/examples/quickbrown.txt
+
+# These should be equivalent:
+print 'Heiz”lrckstoáabd„mpfung'
+print u'Heiz”lrckstoáabd„mpfung'.encode('cp850')
@@ -7,6 +7,18 @@ def test_can_make_round_trip(example):
example = parse(example)
assert text == example.totext()
+@parametrize(get_examples)
+def test_encoding_detection(example):
+ from refactorlib.python.parse import detect_encoding
+ text = open(example).read()
+ example = parse(example)
+ detected_encoding = detect_encoding(text)
+
+ assert (
+ example.encoding == detected_encoding or
+ (example.encoding, detected_encoding) == ('UTF-8', None)
+ )
+
@parametrize(get_output('xml'))
def test_matches_known_good_parsing(example, output):
example = parse(example).tostring()

0 comments on commit 595a82b

Please sign in to comment.