Skip to content

Commit

Permalink
Added UNO support
Browse files Browse the repository at this point in the history
  • Loading branch information
JordanReiter committed Nov 15, 2012
1 parent 2420318 commit 06effd8
Showing 1 changed file with 39 additions and 9 deletions.
48 changes: 39 additions & 9 deletions fulltext/__init__.py
Expand Up @@ -29,9 +29,12 @@ def read_content(f, type):
f = file(f, 'r')
return f.read()

def run_command(f, type):
def run_command(f, type, use_uno=False, **kwargs):
"The default handler. It runs a command and reads it's output."
cmds = PROG_MAP[type]
if use_uno and type in UNO_FORMATS:
cmds = UNO_COMMANDS
else:
cmds = PROG_MAP[type]
if isinstance(f, basestring):
cmd = cmds[0]
cmd = map(lambda x: x.format(f), cmd)
Expand All @@ -52,20 +55,33 @@ def run_command(f, type):
p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=DEVNULL)
return p.communicate(i)[0]

def strip_unrtf_header(f, type, **kwargs):
    """Extract text via ``run_command`` and strip the unrtf banner.

    unrtf offers no switch to suppress the header it prints before the
    document text, so everything up to and including the first dashed
    separator is removed here.

    ``f`` and ``type`` are passed straight through to ``run_command``, as
    are any extra keyword arguments (for example ``use_uno``).

    Returns the text following the first separator, or the text unchanged
    when no separator is present.
    """
    separator = '-----------------'
    text = run_command(f, type, **kwargs)
    _header, found, body = text.partition(separator)
    if not found:
        # No banner to strip (e.g. the output did not come from unrtf);
        # keep the text rather than discarding the entire document.
        return text
    return body

def csv_to_text(f, type, **kwargs):
    """Flatten CSV output from ``run_command`` into plain text.

    The command registered for ``type`` is run through ``run_command``
    (extra keyword arguments are forwarded unchanged) and its output is
    parsed as "excel"-dialect CSV.

    Returns a single string: the cells of each row joined by one space,
    and the rows joined by one space.
    """
    text = run_command(f, type, **kwargs)
    rows = csv.reader(text.splitlines(), dialect="excel")
    return ' '.join(' '.join(row) for row in rows)

# Type keys for which run_command switches to UNO_COMMANDS when called with
# use_uno=True. Each key is a (mime type, None) pair, the same shape as the
# keys of PROG_MAP.
UNO_FORMATS = tuple(
    (mime_type, None)
    for mime_type in (
        'application/pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/rtf',
        'text/rtf',
        'application/vnd.oasis.opendocument.text',
    )
)

# Command pair that run_command uses in place of the PROG_MAP entry when
# use_uno is set and the type is listed in UNO_FORMATS.
UNO_COMMANDS = (
# Slot 0: argv used when f is a path string; '{0}' is filled in with the
# path via str.format.
# NOTE(review): run_command returns the child's stdout, but this argv has
# no '--stdout' flag -- confirm unoconv writes the converted text to stdout
# when invoked this way.
('unoconv', '-f', 'text', '{0}'),
# Slot 1: presumably the argv for file-like input fed through stdin, as in
# the PROG_MAP entries; None appears to mean "not supported" -- TODO confirm.
None,
)

PROG_MAP = {
('application/pdf', None): (
('pdftotext', '-q', '-nopgbrk', '{0}', '-'),
Expand All @@ -83,6 +99,10 @@ def csv_to_text(f, type):
('xls2csv', '{0}'), # as provided by catdoc
None, # Supposedly this works, but I get segmentation fault.
),
('application/rtf', None): (
('unrtf', '--text', '--nopict', '{0}'),
('unrtf', '--text', '--nopict'),
),
('text/rtf', None): (
('unrtf', '--text', '--nopict', '{0}'),
('unrtf', '--text', '--nopict'),
Expand Down Expand Up @@ -144,6 +164,7 @@ def csv_to_text(f, type):
('application/vnd.openxmlformats-officedocument.wordprocessingml.document', None): run_command,
('text/vnd.ms-excel', None): csv_to_text,
('text/rtf', None): strip_unrtf_header,
('application/rtf', None): strip_unrtf_header,
('application/vnd.oasis.opendocument.text', None): run_command,
('application/vnd.oasis.opendocument.spreadsheet', None): run_command,
('application/zip', None): run_command,
Expand Down Expand Up @@ -246,7 +267,7 @@ class NoDefault(object):
pass


def get(f, default=NoDefault, filename=None, type=None):
def get(f, default=NoDefault, filename=None, type=None, strip_whitespace=True, use_uno=False):
"""
Gets text from a given file. The first parameter can be a path or a file-like object that
has a read method. Default is a way to supress errors and just return the default text.
Expand All @@ -273,15 +294,24 @@ def get(f, default=NoDefault, filename=None, type=None):
return default
raise FullTextException('File not found')
if type is None:
type = get_type(filename)
if not filename:
try:
type = (f.headers.type, None)
except AttributeError:
pass
else:
type = get_type(filename)
handler = FUNC_MAP.get(type, read_content)
try:
text = handler(f, type)
except:
if default is not NoDefault:
return default
raise
return STRIP_WHITE.sub(' ', text).strip()
if strip_whitespace:
return STRIP_WHITE.sub(' ', text).strip()
else:
return text

def check():
"""
Expand Down

0 comments on commit 06effd8

Please sign in to comment.