Skip to content

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
...
Checking mergeability… Don’t worry, you can still create the pull request.
  • 4 commits
  • 1 file changed
  • 0 commit comments
  • 2 contributors
Showing with 45 additions and 10 deletions.
  1. +45 −10 fulltext/__init__.py
View
55 fulltext/__init__.py 100644 → 100755
@@ -25,13 +25,17 @@ def is_exe(fpath):
def read_content(f, type):
"A handler that simply reads a file's output. Used on unrecognized types."
+ print "Unable to recognize file type of ", f, type
if isinstance(f, basestring):
f = file(f, 'r')
return f.read()
-def run_command(f, type):
+def run_command(f, type, use_uno=False, **kwargs):
"The default handler. It runs a command and reads its output."
- cmds = PROG_MAP[type]
+ if use_uno and type in UNO_FORMATS:
+ cmds = UNO_COMMANDS
+ else:
+ cmds = PROG_MAP[type]
if isinstance(f, basestring):
cmd = cmds[0]
cmd = map(lambda x: x.format(f), cmd)
@@ -49,23 +53,37 @@ def run_command(f, type):
# We use regular subprocess module here. No timeout is allowed with communicate()
# If there are problems with timeouts, I will investigate other options, like:
# http://pypi.python.org/pypi/EasyProcess
+ print "Running ", cmd
p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=DEVNULL)
return p.communicate(i)[0]
-def strip_unrtf_header(f, type):
+def strip_unrtf_header(f, type, **kwargs):
"Can't find a way to turn off the stupid header in unrtf."
- text = run_command(f, type)
+ text = run_command(f, type, **kwargs)
parts = text.split('-----------------')
return '-----------------'.join(parts[1:])
-def csv_to_text(f, type):
+def csv_to_text(f, type, **kwargs):
"Can convert xls to csv, but this will go from csv to plain old text."
- text = run_command(f, type)
+ text = run_command(f, type, **kwargs)
buffer = []
for row in csv.reader(text.splitlines(), dialect="excel"):
buffer.append(' '.join(row))
return ' '.join(buffer)
+UNO_FORMATS = (
+ ('application/pdf', None),
+ ('application/vnd.openxmlformats-officedocument.wordprocessingml.document', None),
+ ('application/rtf', None),
+ ('text/rtf', None),
+ ('application/vnd.oasis.opendocument.text', None),
+)
+
+UNO_COMMANDS = (
+ ('unoconv', '-f', 'text', '{0}'),
+ None,
+)
+
PROG_MAP = {
('application/pdf', None): (
('pdftotext', '-q', '-nopgbrk', '{0}', '-'),
@@ -87,6 +105,10 @@ def csv_to_text(f, type):
('unrtf', '--text', '--nopict', '{0}'),
('unrtf', '--text', '--nopict'),
),
+ ('text/rtf', None): (
+ ('unrtf', '--text', '--nopict', '{0}'),
+ ('unrtf', '--text', '--nopict'),
+ ),
('application/vnd.oasis.opendocument.text', None): (
('odt2txt', '{0}'),
None,
@@ -142,7 +164,8 @@ def csv_to_text(f, type):
('application/pdf', None): run_command,
('application/msword', None): run_command,
('application/vnd.openxmlformats-officedocument.wordprocessingml.document', None): run_command,
- ('application/vnd.ms-excel', None): csv_to_text,
+ ('text/vnd.ms-excel', None): csv_to_text,
+ ('text/rtf', None): strip_unrtf_header,
('application/rtf', None): strip_unrtf_header,
('application/vnd.oasis.opendocument.text', None): run_command,
('application/vnd.oasis.opendocument.spreadsheet', None): run_command,
@@ -246,7 +269,7 @@ class NoDefault(object):
pass
-def get(f, default=NoDefault, filename=None, type=None):
+def get(f, default=NoDefault, filename=None, type=None, strip_whitespace=True, use_uno=False):
"""
Gets text from a given file. The first parameter can be a path or a file-like object that
has a read method. Default is a way to suppress errors and just return the default text.
@@ -273,7 +296,16 @@ def get(f, default=NoDefault, filename=None, type=None):
return default
raise FullTextException('File not found')
if type is None:
- type = get_type(filename)
+ if not filename:
+ try:
+ print "Getting it from headers ", f.headers
+ type = (f.headers.type, None)
+ except AttributeError:
+ pass
+ else:
+ print "Getting it from filename ", filename
+ type = get_type(filename)
+ print "Type is ", type
handler = FUNC_MAP.get(type, read_content)
try:
text = handler(f, type)
@@ -281,7 +313,10 @@ def get(f, default=NoDefault, filename=None, type=None):
if default is not NoDefault:
return default
raise
- return STRIP_WHITE.sub(' ', text).strip()
+ if strip_whitespace:
+ return STRIP_WHITE.sub(' ', text).strip()
+ else:
+ return text
def check():
"""

No commit comments for this range

Something went wrong with that request. Please try again.