# microtron.py -- microformat parser driven by an XML description of the formats.
import isodate, re
import lxml.etree, lxml.html
class ParseError(Exception):
    """Raised by Parser when a document violates a format definition."""
class Parser(object):
    """Extract microformat data from a document using declarative definitions.

    ``tree`` is the document to search and ``formats`` holds the format
    definitions.  Both only need an lxml-style ``xpath()`` method (the script
    entry point of this module passes the results of ``lxml.html.parse`` and
    ``lxml.etree.parse``).  Definitions are looked up directly below a
    ``/microformats`` root, either by element name or by ``name`` attribute.

    With ``strict=True`` a missing mandatory property, a value that cannot be
    parsed, or a value outside a property's allowed set raises ParseError.
    Otherwise the parser falls back to the plain node value and carries on.
    """

    # XPath node test matching any element whose whitespace-separated
    # attribute (first placeholder) contains the given token (second one).
    _TOKEN_TEST = '*[contains(concat(" ", normalize-space(@%s), " "), " %s ")]'

    def __init__(self, tree, formats, strict=False):
        self.root = tree
        self.formats = formats
        self.strict = strict

    def parse_format(self, mf, root=None):
        """Return every occurrence of microformat ``mf`` found under ``root``.

        ``root`` defaults to the tree given to the constructor.  Returns None
        when no definition named ``mf`` exists.  Otherwise returns a list:
        one dict per matching node for a ``compound`` definition, or one
        ``(feature name, href, text)`` tuple per matching node for an
        ``elemental`` definition.  Any other definition type yields ``[]``.
        """
        if root is None:
            root = self.root

        matches = self.formats.xpath('/microformats/*[@name="%s"] | /microformats/%s' % (mf, mf))
        if not matches:
            return None
        spec = matches[0]

        results = []
        kind = spec.attrib['type']
        if kind == 'compound':
            expr = 'descendant-or-self::' + self._TOKEN_TEST % ('class', spec.tag)
            for node in root.xpath(expr):
                results.append(self._parse_node(node, spec))
        elif kind == 'elemental':
            for feature in spec:
                attribute = feature.attrib['attribute']
                value = feature.tag
                expr = 'descendant-or-self::' + self._TOKEN_TEST % (attribute, value)
                for node in root.xpath(expr):
                    # Collected into ``results`` (the original appended to an
                    # undefined name).  ``href`` is None when the attribute is
                    # absent rather than a KeyError.
                    results.append((value, node.attrib.get('href'), node.text))
        return results

    def _parse_node(self, node, format):
        """Parse one compound-format ``node`` into a ``{property: value}`` dict.

        ``format`` is the definition element; each of its children describes
        one property through its tag (the class name to look for) and the
        optional attributes ``type``, ``mandatory``, ``many``, ``couldbe`` and
        ``values``.  Raises ParseError only in strict mode.
        """
        result = {}
        format_name = format.tag
        for prop in format:
            attrs = prop.attrib
            prop_name = prop.tag
            prop_type = attrs.get('type')
            prop_mandatory = attrs.get('mandatory') == 'yes'
            prop_many = attrs.get('many', False)
            prop_couldbe = attrs['couldbe'].split('|') if 'couldbe' in attrs else []
            prop_values = set(attrs['values'].split(',')) if 'values' in attrs else None

            prop_nodes = self._own_property_nodes(node, format_name, prop_name)
            if self.strict and not prop_nodes and prop_mandatory:
                raise ParseError("Missing mandatory property: %s" % (prop_name))

            # Fresh accumulator per property so nothing leaks over from the
            # previous one; None means "single value, first match wins".
            if prop_many == 'many':
                values = []
            elif prop_many == 'manyasone':
                values = ""
            else:
                values = None

            for prop_node in prop_nodes:
                try:
                    value = self._parse_property_value(prop, prop_node, prop_type, prop_couldbe)
                except Exception as e:
                    if self.strict:
                        raise ParseError("Error parsing value for property %s: %s" % (prop_name, e))
                    # Best effort: fall back to the node's plain value.
                    value = self._parse_value(prop_node)

                # Non-string values (tuples, dicts) can never be one of the
                # allowed strings, so they count as invalid instead of
                # failing on the missing .lower().
                if self.strict and prop_values and (
                        not hasattr(value, 'lower') or value.lower() not in prop_values):
                    raise ParseError("Invalid value for property %s: %s" % (prop_name, value))

                if prop_many == 'many':
                    values.append(value)
                elif prop_many == 'manyasone':
                    values += value
                else:
                    result[prop_name] = value
                    break

            if prop_many and values:
                result[prop_name] = values
        return result

    def _own_property_nodes(self, node, format_name, prop_name):
        """Return nodes carrying class ``prop_name`` that belong to ``node`` itself.

        A candidate belongs to ``node`` when its nearest ancestor carrying
        class ``format_name`` is ``node``.  This excludes properties of a
        format nested inside ``node``.  Candidates with no such ancestor
        (for instance ``node`` itself) are skipped.
        """
        prop_expr = 'descendant-or-self::' + self._TOKEN_TEST % ('class', prop_name)
        # The trailing [1] selects the closest match on the reverse ancestor
        # axis, independent of the order in which results are returned.
        owner_expr = 'ancestor::' + self._TOKEN_TEST % ('class', format_name) + '[1]'
        owned = []
        for candidate in node.xpath(prop_expr):
            owners = candidate.xpath(owner_expr)
            if owners and owners[0] == node:
                owned.append(candidate)
        return owned

    def _parse_property_value(self, prop, prop_node, prop_type, prop_couldbe):
        """Compute the value of one property node; may raise on bad input.

        The formats listed in ``prop_couldbe`` are tried first.  If none of
        them yields a result the value is derived from the definition: nested
        child definitions, a plain value, an ISO date, or another named
        format.
        """
        for mf in prop_couldbe:
            try:
                value = self.parse_format(mf, prop_node)[0]
            except Exception:
                # A candidate that is unknown or does not match is expected;
                # move on to the next one.
                continue
            if value:
                return value

        if len(prop):
            return self._parse_node(prop_node, prop) or self._parse_value(prop_node)
        if not prop_type or prop_type in ('text', 'url', 'image'):
            return self._parse_value(prop_node)
        if prop_type == 'date':
            return isodate.parse_date(self._parse_value(prop_node))
        return self.parse_format(prop_type, prop_node)[0] or self._parse_value(prop_node)

    def _parse_value(self, node):
        """Return the plain value of ``node``.

        Sources are tried in this order: descendants with class ``value``
        (their text joined by spaces), the ``title`` of an ``abbr`` element,
        ``href`` (returned as an ``(href, text)`` tuple with a leading
        ``mailto:``/``tel:``/``fax:``/``modem:`` removed), ``src``, and
        finally the node's own text content.
        """
        value_nodes = node.xpath('descendant::' + self._TOKEN_TEST % ('class', 'value'))
        if value_nodes:
            return self._normalize_space(" ".join(part.text_content() for part in value_nodes))
        if node.tag == 'abbr' and 'title' in node.attrib:
            return node.attrib['title']
        if 'href' in node.attrib:
            href = node.attrib['href']
            for prefix in ('mailto:', 'tel:', 'fax:', 'modem:'):
                if href.startswith(prefix):
                    href = href[len(prefix):]
                    break
            return (href, self._normalize_space(node.text_content()))
        if 'src' in node.attrib:
            return node.attrib['src']
        return self._normalize_space(node.text_content())

    def _normalize_space(self, text):
        """Strip ``text`` and collapse each run of whitespace to one space."""
        return re.sub(r'\s+', ' ', text.strip())
if __name__ == "__main__":
    # Command-line entry point: parse an HTML file against a format
    # definition file and pretty-print every occurrence of one format.
    import pprint
    import sys

    # Without this check a missing argument surfaces as a bare IndexError.
    if len(sys.argv) < 4:
        sys.exit("usage: %s <html-file> <formats-xml> <format-name>" % sys.argv[0])

    tree = lxml.html.parse(sys.argv[1])
    formats = lxml.etree.parse(sys.argv[2])
    pprint.pprint(Parser(tree, formats).parse_format(sys.argv[3]))