enable e-print graphics
ehenneken committed Nov 14, 2016
1 parent 0bb4a30 commit f70dab2
Showing 5 changed files with 59 additions and 15 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
@@ -1,3 +1,7 @@
### 1.0.11

* graphics for arXiv

### 1.0.10

* maintenance update
2 changes: 1 addition & 1 deletion service/config.py
@@ -1,5 +1,5 @@
GRAPHICS_SECRET_KEY = 'this should be changed'
GRAPHICS_INCLUDE_ARXIV = False
GRAPHICS_INCLUDE_ARXIV = True
SQLALCHEMY_BINDS = {}
# Proper handling of database connections
SQLALCHEMY_COMMIT_ON_TEARDOWN = True
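The `GRAPHICS_INCLUDE_ARXIV` flag is what `service/graphics.py` checks (see the change below) before adding arXiv-extracted figures to a response, so changing the default to `True` switches the feature on service-wide. A minimal sketch of reading such a Flask config flag, with the app setup reduced to one hard-coded value rather than the service's actual factory:

```python
from flask import Flask, current_app

app = Flask(__name__)
app.config['GRAPHICS_INCLUDE_ARXIV'] = True  # mirrors the new default in service/config.py

with app.app_context():
    if current_app.config.get('GRAPHICS_INCLUDE_ARXIV'):
        print('arXiv e-print graphics are enabled')
```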
18 changes: 16 additions & 2 deletions service/file_ops.py
@@ -5,6 +5,7 @@
'''
import sys
import os
import re
import tarfile
import magic
import commands
@@ -13,7 +14,18 @@
from invenio_tools import get_converted_image_name
from PIL import Image

def untar(tar_archive):
def atoi(text):
return int(text) if text.isdigit() else text

def natural_keys(text):
'''
alist.sort(key=natural_keys) sorts in human order
http://nedbatchelder.com/blog/200712/human_sorting.html
(See Toothy's implementation in the comments)
'''
return [ atoi(c) for c in re.split('(\d+)', text) ]

def untar(tar_archive, bibcode):
'''
Check validity of TAR archive and unpack in temporary directory
:param tar_archive:
@@ -30,7 +42,7 @@ def untar(tar_archive):
except:
contents = []
TMP_DIR = current_app.config.get('GRAPHICS_TMP_DIR')
extract_dir = "%s/%s" % (TMP_DIR, os.path.basename(tar_archive).split('.')[0])
extract_dir = "%s/%s" % (TMP_DIR, bibcode)
t = tarfile.open(tar_archive, 'r:*')
t.extractall(extract_dir)
for f in contents:
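A plausible motivation for keying the extraction directory on the bibcode instead of the tar file's base name (this reading of the change is an assumption, and the file name below is made up): arXiv file names contain dots, so `split('.')[0]` truncates the identifier and different e-prints can end up in the same directory.

```python
import os

# old scheme: directory name derived from the tar file's base name
archive = '/tmp/1611.01234.tar.gz'   # hypothetical file name
print(os.path.basename(archive).split('.')[0])   # prints '1611'

# new scheme: the caller passes the bibcode, which is unique per record
```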
@@ -84,6 +96,8 @@ def convert_images(image_list):
if os.path.exists(png_image):
done_list.append(png_image)
remainder.append(image)

done_list.sort(key=natural_keys)
return remainder, done_list

@timeout_decorator.timeout(15)
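The `natural_keys` helper added above exists so the converted figures are returned in human order; a plain lexicographic sort puts `fig10.png` before `fig2.png`. A small standalone illustration (the file names are invented for the example):

```python
import re

def atoi(text):
    return int(text) if text.isdigit() else text

def natural_keys(text):
    # split on runs of digits so 'fig10.png' compares as ['fig', 10, '.png']
    return [atoi(c) for c in re.split(r'(\d+)', text)]

names = ['fig10.png', 'fig2.png', 'fig1.png']
print(sorted(names))                    # ['fig1.png', 'fig10.png', 'fig2.png']
print(sorted(names, key=natural_keys))  # ['fig1.png', 'fig2.png', 'fig10.png']
```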
4 changes: 2 additions & 2 deletions service/graphics.py
@@ -46,6 +46,7 @@ def get_graphics(bibcode):
else:
results = {
'query': 'failed', 'error': 'PostgreSQL problem (%s)' % err}
print results
if results and 'figures' in results:
if len(results['figures']) == 0:
# There are cases where an entry exists, but the 'figures'
@@ -101,7 +102,7 @@ def get_graphics(bibcode):
ADS_image_url % (
bibcode.replace('&', '%26'), figure['page'] - 1))
elif source.upper() == 'ARXIV' \
and current_app.config.get('GRAPHICSINCLUDE_ARXIV'):
and current_app.config.get('GRAPHICS_INCLUDE_ARXIV'):
results['header'] = 'Images extracted from the arXiv e-print'
try:
display_image = random.choice(display_figure['images'])
@@ -115,7 +116,6 @@
for image in images:
thumb_url = image['thumbnail']
highr_url = image['highres']
lowrs_url = image['lowres']
elif source.upper() == 'TEST':
results['pick'] = display_figure
return results
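The one-character config-key fix above is the substantive part of this file: with the old, misspelled `'GRAPHICSINCLUDE_ARXIV'` key absent from the config, `config.get()` returned `None`, so the arXiv branch never ran regardless of how the flag was set. A minimal demonstration with a plain dict standing in for the Flask config:

```python
config = {'GRAPHICS_INCLUDE_ARXIV': True}

# Misspelled key: .get() silently returns None, which is falsy,
# so the `elif source.upper() == 'ARXIV' and ...` branch never fired.
print(config.get('GRAPHICSINCLUDE_ARXIV'))   # None
print(config.get('GRAPHICS_INCLUDE_ARXIV'))  # True
```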
46 changes: 36 additions & 10 deletions service/utils.py
@@ -390,7 +390,7 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr
else:
graphic = None
# First get lists of (La)TeX and image files
tex_files, img_files, xdir = file_ops.untar(ft_file)
tex_files, img_files, xdir = file_ops.untar(ft_file, bibcode)
# If we didn't find any image files, skip
if len(img_files) == 0:
return
@@ -423,8 +423,17 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr
extracted_image_data.extend((extract_context(tex_file,
cleaned_image_data)))
extracted_image_data = remove_dups(extracted_image_data)
# For those images where no metadata was captured, keep them with
# empty strings
try:
skipped_images = [i for i in converted_images if i not in [e[0] for e in extracted_image_data]]
except:
skipped_images = converted_images
if len(skipped_images) > 0:
extracted_image_data += [(im,'','',[]) for im in skipped_images]
fid = 1
source2target = {}
source2AWS = {}
for item in extracted_image_data:
if not os.path.exists(item[0]) or not item[0].strip():
continue
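The block above keeps figures whose converted PNGs never matched any extracted caption or label, padding them with empty strings instead of dropping them. A small sketch of that filtering step in isolation (paths and tuple contents are hypothetical):

```python
converted_images = ['/tmp/bib/fg1.png', '/tmp/bib/fg2.png']          # hypothetical paths
extracted_image_data = [('/tmp/bib/fg1.png', 'ctx', 'label', [])]    # tuple shape is illustrative

# figures with no extracted metadata are kept, padded with empty strings
matched = [entry[0] for entry in extracted_image_data]
skipped = [im for im in converted_images if im not in matched]
extracted_image_data += [(im, '', '', []) for im in skipped]

print(skipped)  # ['/tmp/bib/fg2.png']
```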
@@ -443,6 +452,11 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr
subdir,
eprdir,
figure_id)
source2AWS[item[0]] = "seri/arXiv/%s/%s/%s/%s.png" % (
category,
subdir,
eprdir,
figure_id)
fig_data['figure_id'] = figure_id
try:
fig_data['figure_label'] = item[2].encode('ascii','ignore')
@@ -453,14 +467,10 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr
except:
fig_data['figure_caption'] = ''
image_url = "http://arxiv.org/abs/%s" % arx_id.replace('arXiv:','')
thumb_url = "%s/%s/%s/%s/%s.png/%s" % (
current_app.config.get('GRAPHICS_BASE_URL'),
category,
subdir,
eprdir,
figure_id,
current_app.config.get('GRAPHICS_THMB_PAR'),
)
thumb_url = "%s/%s/%s" % (
current_app.config.get('GRAPHICS_AWS_S3_URL'),
current_app.config.get('GRAPHICS_AWS_S3_BUCKET'),
source2AWS[item[0]])
fig_data['images'] = [
{
'image_id': fid,
@@ -473,15 +483,31 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr
fid += 1
# Now it is time to move the PNGs to their final location, renaming
# them in the process
# 1. Store them on a local server
# 2. Store them on AWS S3
# Create the S3 session and copy over the files
client = get_boto_session().client('s3')
# Currently we just process PNG files
mimetype = 'image/png'
bucket = current_app.config.get('GRAPHICS_AWS_S3_BUCKET')
for source, target in source2target.items():
# Copy image file from TMP location to final location on disk
target_dir, fname = os.path.split(target)
if not os.path.exists(target_dir):
cmmd = 'mkdir -p %s' % target_dir
commands.getoutput(cmmd)
shutil.copy(source, target)
# Upload image file to S3
key = source2AWS[source]
try:
data = open(source, 'rb')
except Exception, e:
sys.stderr.write('Error loading image data for %s: %s\n' % (source, str(e)))
continue
client.put_object(Key=key, Bucket=bucket, Body=data, ACL='public-read', ContentType=mimetype)
# Now it's time to clean up stuff we've extracted
TMP_DIR = current_app.config.get('GRAPHICS_TMP_DIR')
extract_dir = "%s/%s" % (TMP_DIR, os.path.basename(ft_file).split('.')[0])
extract_dir = "%s/%s" % (TMP_DIR, bibcode)
try:
shutil.rmtree(extract_dir)
except:
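The new upload loop copies each renamed PNG to its final location on disk and also pushes it to S3 with `put_object`; the thumbnail URL returned to clients is then assembled from `GRAPHICS_AWS_S3_URL`, the bucket name, and the object key stored in `source2AWS`. A minimal sketch of the S3 side with boto3 (bucket, base URL, key, and file path are placeholders, and the service obtains its session via `get_boto_session()` rather than creating one directly):

```python
import boto3

session = boto3.session.Session()      # the service wraps this in get_boto_session()
client = session.client('s3')

s3_base_url = 'https://s3.amazonaws.com'            # stand-in for GRAPHICS_AWS_S3_URL
bucket = 'my-graphics-bucket'                       # stand-in for GRAPHICS_AWS_S3_BUCKET
key = 'seri/arXiv/astro-ph/0611/0611123/fg1.png'    # same layout as source2AWS

with open('/tmp/fg1.png', 'rb') as data:            # hypothetical local PNG
    client.put_object(Key=key, Bucket=bucket, Body=data,
                      ACL='public-read', ContentType='image/png')

thumb_url = '%s/%s/%s' % (s3_base_url, bucket, key)  # URL of the kind stored in fig_data['images']
print(thumb_url)
```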
