-
Notifications
You must be signed in to change notification settings - Fork 2
/
imageextract.py
113 lines (81 loc) · 3.22 KB
/
imageextract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
# Python implementation of DocumentCloud's Docsplit Image Extractor
#Original Ruby implementation: http://github.com/documentcloud/docsplit/blob/master/lib/docsplit/image_extractor.rb
# Fallback values for the 'sizes' and 'formats' extraction options; one
# conversion is run for every (size, format) pair.
DEFAULT_SIZES = ["500x"]
DEFAULT_FORMATS = ["png"]

# First flag placed on every `gm convert` command line.
# NOTE(review): presumably the rendering resolution used when rasterizing
# the PDF -- confirm against the GraphicsMagick documentation.
DENSITY_ARG = "-density 150"
import re
import subprocess
import os
class ImageExtractionError(Exception):
    """Describe a failed image-conversion command.

    Attributes:
        cmd: The command (or argument string) that was executed.
        msg: The output captured from the failed command.
    """

    def __init__(self, cmd, msg):
        # Forward both values to Exception so that ``args``, ``repr()`` and
        # pickling carry the failure details instead of being empty.
        super(ImageExtractionError, self).__init__(cmd, msg)
        self.cmd = cmd
        self.msg = msg

    def __str__(self):
        # Human-readable form for tracebacks and log lines.
        return "%s: %s" % (self.cmd, self.msg)
class ImageExtractor:
    """Render the pages of a PDF to image files by shelling out to ``gm convert``.

    Options live in ``self.options``:

    * ``output``  -- directory that receives the images (default ``'.'``)
    * ``sizes``   -- list of resize geometries, e.g. ``['500x', '250x']``
    * ``formats`` -- list of image formats / file extensions, e.g. ``['png']``
    * ``pages``   -- ``None`` for all pages, or a string / list of 1-based
      page numbers and ranges, e.g. ``'1-3'`` or ``[1, 2, 5]``
    """

    def __init__(self):
        self.options = {
            'output': '.',
            # Copy the module-level defaults so that changing one instance's
            # options can never alter the shared default lists.
            'sizes': list(DEFAULT_SIZES),
            'formats': list(DEFAULT_FORMATS),
            'pages': None,
        }

    def extract(self, pdf, **kwargs):
        """Extract images of each page in a PDF document.

        Keyword arguments are merged into ``self.options`` and therefore
        persist for later calls on the same instance.

        Usage:
            >>> i = ImageExtractor()
            >>> i.extract("/path/to/my/pdffile.pdf", output="/path/to/my/output/dir/", sizes=['500x', '250x'], formats=['png', 'jpg'])

        Returns:
            True when every size/format conversion succeeded, False when any
            conversion failed or an error was raised along the way.
        """
        self.options.update(kwargs)
        succeeded = True
        try:
            for size in self.options['sizes']:
                # A size of None means "do not resize" (see resize_arg).
                size_key = size.lower() if size is not None else None
                for fmt in self.options['formats']:
                    # Keep going after a failure so the remaining variants are
                    # still produced, but remember that something went wrong.
                    if not self.convert(pdf, size_key, fmt.lower()):
                        succeeded = False
        except Exception:
            # Best-effort contract: report failure through the return value.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return False
        return succeeded

    def normalize_option(self, key):
        """Collapse a list/tuple option into a comma-separated string, in place."""
        value = self.options[key]
        if isinstance(value, (list, tuple)):
            self.options[key] = ",".join(str(v) for v in value)

    def resize_arg(self, size):
        """Return the ``-resize`` flag for *size*, or '' when *size* is None."""
        if size is None:
            return ''
        return "-resize %s" % size

    def quality_arg(self, format):
        """Return the ``-quality`` flag: 85 for JPEG output, 100 otherwise."""
        # Membership test; ``format == "jpeg" or "jpg"`` is always truthy.
        if format in ("jpeg", "jpg"):
            return "-quality 85"
        return "-quality 100"

    def convert(self, pdf, size, format):
        """Convert *pdf* to images of one *size* and *format*.

        Images are written to ``<output>/<size>/<basename>_%05d.<format>``;
        the size subfolder is omitted when *size* is None or empty.  Missing
        directories are created.

        Returns:
            The result of run_gm: True on success, False on failure.
        """
        basename = os.path.splitext(os.path.basename(pdf))[0]
        # Every non-empty size gets its own subfolder.  (Comparing the size
        # string against an int, as before, raises TypeError on Python 3.)
        subfolder = str(size) if size else ''
        directory = os.path.join(self.options['output'], subfolder)
        if not os.path.isdir(directory):
            # makedirs also creates a missing output directory above the
            # size subfolder.
            os.makedirs(directory)
        # ``%%05d`` leaves a literal ``%05d`` placeholder in the file name.
        out_file = os.path.join(directory, "%s_%%05d.%s" % (basename, format))
        # NOTE(review): this string is run through a shell; a path containing
        # '"', '$' or '`' is not escaped.  Do not pass untrusted file names.
        args = '%s %s %s "%s%s" "%s" 2>&1' % (
            DENSITY_ARG,
            self.resize_arg(size),
            self.quality_arg(format),
            pdf,
            self.pages_arg(),
            out_file,
        )
        return self.run_gm(args.strip())

    def pages_arg(self):
        """Return the bracketed page selector appended to the PDF path.

        Every number in the ``pages`` option is decremented by one.  Returns
        '' when no pages are configured (None, empty string or empty list).
        """
        self.normalize_option("pages")
        pages = self.options['pages']
        if not pages:
            return ''
        return "[%s]" % re.sub(r'\d+', self.page_subtract, pages)

    def page_subtract(self, match):
        """Regex callback: return the matched page number minus one, as a string."""
        # NOTE(review): presumably maps 1-based page numbers to a zero-based
        # selector -- confirm against the GraphicsMagick documentation.
        return str(int(match.group()) - 1)

    def run_gm(self, args):
        """Run ``gm convert <args>`` through the shell.

        On a non-zero exit status the command arguments and captured output
        are printed and False is returned; otherwise True is returned.
        """
        proc = subprocess.Popen('gm convert %s' % args, shell=True,
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
        # communicate() drains the pipe while waiting; calling wait() first
        # can deadlock once the child fills the pipe buffer.
        output = proc.communicate()[0]
        if proc.returncode != 0:
            # The failure is reported, not raised, to keep the boolean contract.
            err = ImageExtractionError(args, output)
            print("%s %s" % (err.cmd, err.msg))
            return False
        return True