Permalink
Browse files

Fixed issue with capitalised PDF extension. Added returntext option t…

…o extract_text for returning the text extracted from the document as a unicode string.
  • Loading branch information...
1 parent 7a82a6f commit 9793650e2ec544cd0a4c9c0b5358e208dfe6aaeb @anderser committed Jan 24, 2010
Showing with 25 additions and 6 deletions.
  1. +2 −3 README.md
  2. +23 −3 pydocsplit/docsplit.py
View
@@ -28,13 +28,12 @@ See the Docsplit docs for howto: <http://documentcloud.github.com/docsplit/>
d = Docsplit()
d.extract_pdf('/path/to/my/document.doc', output='/path/to/outputdir/')
d.extract_pages('/path/to/my/pdffile.pdf', output='/path/to/outputdir/', pages='1-2')
- d.extract_text('/path/to/my/pdffile.pdf', output='/path/to/outputdir/')
+ d.extract_text('/path/to/my/pdffile.pdf', output='/path/to/outputdir/', returntext=True)
d.extract_images('/path/to/my/pdffile.pdf', output='/path/to/outputdir/', sizes=['500x', '250x'], formats=['png', 'jpg'], pages=[1,2,5,7])
documenttitle = d.extract_meta('/path/to/my/pdffile.pdf', 'title')
##TODO:
- Support multiple pdfs as input
- Enhance parsing of pages options/ranges
-- Fix page numbers on generated images of PDF pages
-- And probably a lot of bug fixes...
+- Fix page numbers on generated images of PDF pages
View
@@ -45,14 +45,34 @@ def extract_text(self, pdf, **kwargs):
"""
Extracts text from a PDF
The text is saved as a text file with same base name as your document in the
- output dir specified
+ output dir specified.
+
+ Using the returntext=True returns the text extracted in addition
+ to saving the text file. At the moment the returntext only works if all pages are
+ extracted i.e. there is no pages argument.
Usage:
>>>d = Docsplit()
>>>d.extract_text('/path/to/my/pdffile.pdf', output='/path/to/outputdir/')
+ >>>d.extract_text('/path/to/my/pdffile.pdf', output='/path/to/outputdir/', returntext=True)
"""
+
+ returntext = False
+ if 'returntext' in kwargs:
+ if kwargs['returntext'] == True:
+ returntext = True
+ kwargs.pop('returntext')
+
+ basename, ext = os.path.splitext(os.path.basename(pdf))
pdf = self.ensure_pdf(pdf)
- return self.run("org.documentcloud.ExtractText", pdf, **kwargs)
+ response = self.run("org.documentcloud.ExtractText", pdf, **kwargs)
+
+ if returntext == True and response is not None and 'pages' not in kwargs:
+ txtfile = open("%s.txt" % os.path.join(kwargs['output'], basename), 'r')
+ response = txtfile.read()
+ txtfile.close()
+
+ return response
def extract_pdf(self, doc, **kwargs):
"""
@@ -106,7 +126,7 @@ def ensure_pdf(self, doc):
basename, ext = os.path.splitext(os.path.basename(doc))
- if ext == '.pdf':
+ if ext.lower() == '.pdf':
return doc
else:
tempdir = os.path.join(tempfile.gettempdir(), 'docsplit')

0 comments on commit 9793650

Please sign in to comment.