Skip to content

Commit

Permalink
Added an extra replace section to tidy up the output when retaining tables
Browse files Browse the repository at this point in the history
  • Loading branch information
Nathan Davies committed Jan 23, 2017
1 parent e835e8c commit 3701b77
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion WikiExtractor.py
Expand Up @@ -688,7 +688,8 @@ def clean(self, text):
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
text = text.replace(',,', ',').replace(',.', '.')
if keep_tables:
text = re.sub(r'!(?:\s)?style=\"width:(?:\d+)%;\"', r'', text)
text = re.sub(r'!(?:\s)?style=\"[a-z]+:(?:\d+)%;\"', r'', text)
text = re.sub(r'!(?:\s)?style="[a-z]+:(?:\d+)%;[a-z]+:(?:#)?(?:[0-9a-z]+)?"', r'', text)
text = text.replace('|-', '')
text = text.replace('|', '')
if Extractor.toHTML:
Expand Down

0 comments on commit 3701b77

Please sign in to comment.