-
Notifications
You must be signed in to change notification settings - Fork 20
/
filter_alt_tags.py
61 lines (52 loc) · 2.25 KB
/
filter_alt_tags.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import re

from lxml import etree
###############################
## DEAL WITH ALT TAG ##########
# Select the first part of an ALT tag when none of its parts contains an EM tag.
# Select the part that contains an EM tag when the other parts contain none.
# Select the part with the highest number of EM tags when several parts contain EM tags.
# NOTE: an ALT tag can have more than two parts.
#http://stackoverflow.com/questions/4624062/get-all-text-inside-a-tag-in-lxml
def stringify_children(node):
    """Return the inner content of *node* as one string, child markup included.

    The result is the concatenation of ``node.text``, then for every child
    its serialized markup (without its tail) followed by its tail, and
    finally ``node.tail``.  Missing (None) and empty pieces are skipped.

    Note that the node's own tail is part of the result; callers that only
    want the content between the node's tags must clear ``node.tail`` first.
    """
    parts = [node.text]
    # Iterating an element yields its children; getchildren() is deprecated.
    for child in node:
        markup = etree.tostring(child, with_tail=False)
        # tostring() serializes to ASCII bytes by default. On Python 3 that is
        # a bytes object which cannot be joined with the text pieces, so decode
        # it. On Python 2 it is already a str and is left untouched.
        if not isinstance(markup, str):
            markup = markup.decode('ascii')
        parts.append(markup)
        parts.append(child.tail)
    parts.append(node.tail)
    # Drop the Nones (and empty strings) coming from absent texts and tails.
    return ''.join(part for part in parts if part)
# Select the first alternative when an EM attribute lists several, separated by '|'.
def filter_categories_alternatives(tree):
    """Reduce '|'-separated EM attribute values to their first alternative.

    For each of the CATEG, TIPO and SUBTIPO attributes, every EM element
    whose attribute value contains a '|' gets that value replaced by the
    text before the first '|'.  The tree is modified in place; nothing is
    returned.
    """
    attribute_queries = (
        ('CATEG', "//EM[contains(@CATEG,'|')]"),
        ('TIPO', "//EM[contains(@TIPO,'|')]"),
        ('SUBTIPO', "//EM[contains(@SUBTIPO,'|')]"),
    )
    for attr_name, query in attribute_queries:
        for em in tree.xpath(query):
            # partition() yields the text before the first '|' as item 0.
            first_choice = em.attrib[attr_name].partition('|')[0]
            em.attrib[attr_name] = first_choice
# Matches an ampersand that does not start an entity or character reference,
# e.g. the one in "R&D" but not the one in "&amp;" or "&#233;".
_BARE_AMPERSAND = re.compile(
    r'&(?!(?:[A-Za-z_:][\w.:-]*|#[0-9]+|#x[0-9A-Fa-f]+);)')


def _parse_alt_side(side):
    """Parse one '|'-separated part of an ALT element as ``<ALT>side</ALT>``.

    Returns the parsed element, or None (after printing the offending
    markup) when the part is not well-formed XML even once bare ampersands
    have been escaped.
    """
    wrapped = "<ALT>" + side + "</ALT>"
    try:
        return etree.fromstring(wrapped)
    except etree.XMLSyntaxError:
        pass
    # Element text and tails are concatenated unescaped by
    # stringify_children(), so a literal "&" in the text makes the markup
    # ill-formed.  Escape those (leaving real references alone) and retry.
    try:
        return etree.fromstring(_BARE_AMPERSAND.sub("&amp;", wrapped))
    except etree.XMLSyntaxError:
        print("Exception in: " + wrapped)
        return None


def filter_alt_tags(tree):
    """Resolve every ALT element in *tree* to one alternative, then unwrap it.

    The content of an ALT element is a list of alternatives separated by
    '|' (there may be more than two):

    * when the ALT has no direct EM child, its text is cut at the first '|';
    * otherwise the serialized content is split on '|', each part is parsed
      on its own, and the ALT is replaced by the part holding the most
      direct EM children (the first such part on a tie).

    Finally all ALT tags are stripped, leaving their content in place.
    The tree is modified in place; nothing is returned.

    NOTE(review): the split is done on serialized markup, so a '|' inside an
    attribute value of a child also splits the content.  EM attributes can
    contain '|' (see filter_categories_alternatives), so that function
    should presumably be run first -- confirm against the caller.
    """
    for alt in tree.xpath("//ALT"):
        if len(alt.findall('EM')) == 0:
            # An empty ALT has text None; there is nothing to cut then.
            if alt.text is not None:
                alt.text = alt.text.split('|')[0]
            continue

        parent = alt.getparent()
        if parent is None:
            # A root ALT has no parent in which it could be replaced.
            continue

        saved_tail = alt.tail
        # stringify_children() appends the node's own tail; clear it so the
        # text following the ALT is not taken as part of the last alternative.
        alt.tail = None
        candidates = []
        for side in stringify_children(alt).split('|'):
            parsed = _parse_alt_side(side)
            if parsed is not None:
                candidates.append(parsed)

        if not candidates:
            # Nothing usable: leave the ALT as it was, tail included.
            alt.tail = saved_tail
            continue

        # max() returns the first maximal item, so ties go to the earliest
        # alternative.
        chosen = max(candidates, key=lambda elem: len(elem.findall('EM')))
        chosen.tail = saved_tail
        parent.replace(alt, chosen)
    etree.strip_tags(tree, 'ALT')