Skip to content

Commit

Permalink
* Chase cfdict format change
Browse files (browse the repository at this point in the history)
(cherry picked from commit 802c4b4)
  • Loading branch information
audreyt committed Jul 22, 2014
1 parent 642d0cd commit ceebc87
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions translation-data/xml2txt.py
Expand Up @@ -7,7 +7,7 @@
from lxml import etree
from collections import defaultdict as dd

pinyinRE = re.compile(ur"(?P<pinyin>[^\]1-5A-Z]+\d)", re.UNICODE)
pinyinRE = re.compile(ur"(?P<py>[^\]1-5A-Z]+\d)", re.UNICODE)
alphaRE = re.compile(ur"(?P<alpha>[A-Z]+)", re.UNICODE)

cfdictXMLFile = "./translation-data/cfdict.xml"
Expand All @@ -26,8 +26,8 @@ def read_xml_dict(infile):
if ele.tag != None:
if ele.text != None:
text = ele.text.strip(' ')
if ele.tag == 'pinyin':
text = pinyinRE.sub(ur"\g<pinyin> ", text, re.UNICODE)
if ele.tag == 'py':
text = pinyinRE.sub(ur"\g<py> ", text, re.UNICODE)
text = alphaRE.sub(ur"\g<alpha> ", text, re.UNICODE)
text = text.rstrip(' ')
parsed_word[ele.tag].append(text)
Expand All @@ -40,11 +40,11 @@ def read_xml_dict(infile):

f = codecs.open(cfdictFile, 'w', 'utf-8')
for item in cfdict:
if len(item['traditional']) > 0:
line = item['traditional'][0] + " " + item['simplified'][0] + " [" + item['pinyin'][0] + "] "
for trans in item['translation']:
if len(item['trad']) > 0:
line = item['trad'][0] + " " + item['simp'][0] + " [" + item['py'][0] + "] "
for trans in item['fr']:
line = line + "/" + trans
if len(item['translation']) > 0:
if len(item['fr']) > 0:
line = line + "/"
f.write(line)
f.write("\n")
Expand Down

0 comments on commit ceebc87

Please sign in to comment.