Skip to content
Browse files

Added ensure_pdf method that converts non-PDF documents to PDF before extracting pages, text, images, and metadata

  • Loading branch information...
1 parent ef62e90 commit bb00cbf9263617361f37e7ffe1b8ca0b550fe3bb @anderser committed Jan 3, 2010
Showing with 27 additions and 6 deletions.
  1. +1 −0 .gitignore
  2. +18 −3 pydocsplit/docsplit.py
  3. +8 −3 pydocsplit/imageextract.py
View
1 .gitignore
@@ -1,3 +1,4 @@
*.pyc
.pydevproject
.project
+/test.py
View
21 pydocsplit/docsplit.py
@@ -5,6 +5,7 @@
#
import os
import subprocess
+import tempfile
from imageextract import ImageExtractor
#DOCSPLIT settings - change this to your value
@@ -36,7 +37,7 @@ def extract_pages(self, pdf, **kwargs):
>>>d = Docsplit()
>>>d.extract_pages('/path/to/my/document.doc', output='/path/to/outputdir/', pages='1-2')
"""
-
+ pdf = self.ensure_pdf(pdf)
return self.run("org.documentcloud.ExtractPages", pdf, **kwargs)
def extract_text(self, pdf, **kwargs):
@@ -49,7 +50,7 @@ def extract_text(self, pdf, **kwargs):
>>>d = Docsplit()
>>>d.extract_text('/path/to/my/pdffile.pdf', output='/path/to/outputdir/')
"""
-
+ pdf = self.ensure_pdf(pdf)
return self.run("org.documentcloud.ExtractText", pdf, **kwargs)
def extract_pdf(self, doc, **kwargs):
@@ -78,7 +79,7 @@ def extract_images(self, pdf, **kwargs):
>>>d = Docsplit()
>>>d.extract_images('/path/to/my/pdffile.pdf', output='/path/to/outputdir/', sizes=['500x', '250x'], formats=['png', 'jpg'], pages=[1,2,5,7])
"""
-
+ pdf = self.ensure_pdf(pdf)
i = ImageExtractor()
return i.extract(pdf, **kwargs)
@@ -92,15 +93,29 @@ def extract_meta(self, pdf, meta, **kwargs):
>>>d = Docsplit()
>>>d.extract_meta('/path/to/my/pdffile.pdf', 'title')
"""
+ pdf = self.ensure_pdf(pdf)
return self.run("org.documentcloud.ExtractInfo %s" % meta, pdf, **kwargs)
def kwargs_parse(self, kwargs):
return ' '.join(["--%s %s" % (key, kwargs[key]) for key in kwargs])
+ def ensure_pdf(self, doc):
+
+ basename, ext = os.path.splitext(os.path.basename(doc))
+
+ if ext == '.pdf':
+ return doc
+ else:
+ tempdir = os.path.join(tempfile.gettempdir(), 'docsplit')
+ self.extract_pdf(doc, output=tempdir)
+ return "%s.pdf" % os.path.join(tempdir, basename)
+
+
def run(self, command, pdf, **kwargs):
args = self.kwargs_parse(kwargs)
+
cmd = "java %s %s -cp %s %s %s %s 2>&1" % (DOCSPLIT_HEADLESS, DOCSPLIT_LOGGING, DOCSPLIT_CLASSPATH, command, args, pdf)
try:
View
11 pydocsplit/imageextract.py
@@ -37,9 +37,13 @@ def extract(self, pdf, **kwargs):
self.options.update(kwargs)
- for s in self.options['sizes']:
- for f in self.options['formats']:
- self.convert(pdf, s.lower(), f.lower())
+ try:
+ for s in self.options['sizes']:
+ for f in self.options['formats']:
+ self.convert(pdf, s.lower(), f.lower())
+ return True
+ except:
+ return False
def normalize_option(self, key):
@@ -60,6 +64,7 @@ def quality_arg(self, format):
def convert(self, pdf, size, format):
+
basename, ext = os.path.splitext(os.path.basename(pdf))
if size > 1:
subfolder = str(size)

0 comments on commit bb00cbf

Please sign in to comment.
Something went wrong with that request. Please try again.