Skip to content

Commit

Permalink
#53 improved support for long text (#62)
Browse files Browse the repository at this point in the history
  • Loading branch information
spacemansteve authored and romanchyla committed Dec 20, 2017
1 parent 220b53f commit 7571be1
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
6 changes: 4 additions & 2 deletions adsft/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,11 @@ def test_trim(self):
b = 'a' + b'\xc2\xa0' + 'b' #utf-8 bytecode
c = u'a' + u'\xa0' + u'b' # unicode
d = u'a' + chr(160).decode('latin1') + u'b'
- for x in (a, b, c, d):
+ # string with a large token representing table data
+ e = u'a ' + ('123%5.7890' * 100) + u' b'
+ for x in (a, b, c, d, e):
r = utils.TextCleaner(x).run(translate=False, decode=True, normalise=True, trim=True)
self.assertEqual(r, u'a b')

if __name__ == '__main__':
- unittest.main()
+ unittest.main()
2 changes: 1 addition & 1 deletion adsft/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def trimwords(self, maxlength=200):
:param maxlength: maximum length of words to keep
:return: no return
"""
- self.text = re.sub(r'\b\w{'+str(maxlength)+r',}\b', '', self.text)
+ self.text = re.sub(r'\S{'+str(maxlength)+r',}\b', '', self.text)
# Substitute multiple spaces with just one space:
self.text = re.sub(' +', ' ', self.text)

Expand Down

0 comments on commit 7571be1

Please sign in to comment.