Skip to content

Commit

Permalink
Added trimwords() method to TextCleaner
Browse files Browse the repository at this point in the history
  • Loading branch information
nasaads committed Dec 1, 2016
1 parent f319f4a commit c18f29b
Showing 1 changed file with 26 additions and 3 deletions.
29 changes: 26 additions & 3 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def translate(self):
def decode(self):
"""
Decodes the text into unicode expected UTF-8 encoding
:return:
:return: no return
"""

if type(self.text) == str:
Expand All @@ -301,14 +301,35 @@ def normalise(self):
self.text = unicodedata.normalize('NFKC', unicode(self.text))
self.text = re.sub('\s+', ' ', self.text)

def run(self, translate=True, decode=True, normalise=True):
def trimwords(self, maxlength=200):
    """
    Removes "words" longer than maxlength characters, which tend to be
    artifacts generated by the text extraction pipeline (typically tables).
    We do this because these huge words cause problems further down the line
    when they are indexed in SOLR.

    The text is processed line by line so that the sequence of lines is
    kept; within each line the surviving words are re-joined with a single
    blank, and a line made up only of over-long words becomes an empty line.
    :param maxlength: maximum length of words to keep (words of exactly
        this length are kept)
    :return: no return
    """

    # note: we want to keep the original text in the proper sequence of lines
    # to avoid messing up text analysis downstream
    trimmed_lines = []
    for line in self.text.splitlines():
        # drop over-long words outright instead of substituting '' for them,
        # which would leave doubled/leading/trailing blanks in the line
        kept = [word for word in line.split() if len(word) <= maxlength]
        trimmed_lines.append(' '.join(kept))

    self.text = '\n'.join(trimmed_lines)


def run(self, translate=True, decode=True, normalise=True, trim=True):
"""
Wrapper method that can run all of the methods wanted by the user
in one executable.
:param translate: should it translate, boolean
:param decode: should it decode, boolean
:param normalise: should it normalise, boolean
:param trimwords: remove long sequences of non-blank characters (usually garbage)
:return: cleaned text
"""

Expand All @@ -318,5 +339,7 @@ def run(self, translate=True, decode=True, normalise=True):
self.decode()
if normalise:
self.normalise()
if trim:
self.trimwords()

return self.text
return self.text

0 comments on commit c18f29b

Please sign in to comment.