enable e-print graphics
ehenneken committed Nov 14, 2016
1 parent 0bb4a30 commit f70dab2
Showing 5 changed files with 59 additions and 15 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
@@ -1,3 +1,7 @@
### 1.0.11

* graphics for arXiv

### 1.0.10

* maintenance update
2 changes: 1 addition & 1 deletion service/config.py
@@ -1,5 +1,5 @@
GRAPHICS_SECRET_KEY = 'this should be changed'
GRAPHICS_INCLUDE_ARXIV = False
GRAPHICS_INCLUDE_ARXIV = True
SQLALCHEMY_BINDS = {}
# Proper handling of database connections
SQLALCHEMY_COMMIT_ON_TEARDOWN = True
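The `GRAPHICS_INCLUDE_ARXIV` flag is what `service/graphics.py` checks (see the change below) before adding arXiv-extracted figures to a response, so changing the default to `True` switches the feature on service-wide. A minimal sketch of reading such a Flask config flag, with the app setup reduced to one hard-coded value rather than the service's actual factory:

```python
from flask import Flask, current_app

app = Flask(__name__)
app.config['GRAPHICS_INCLUDE_ARXIV'] = True  # mirrors the new default in service/config.py

with app.app_context():
    if current_app.config.get('GRAPHICS_INCLUDE_ARXIV'):
        print('arXiv e-print graphics are enabled')
```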
18 changes: 16 additions & 2 deletions service/file_ops.py
@@ -5,6 +5,7 @@
'''
import sys
import os
import re
import tarfile
import magic
import commands
@@ -13,7 +14,18 @@
from invenio_tools import get_converted_image_name
from PIL import Image

def untar(tar_archive):
def atoi(text):
return int(text) if text.isdigit() else text

def natural_keys(text):
'''
alist.sort(key=natural_keys) sorts in human order
http://nedbatchelder.com/blog/200712/human_sorting.html
(See Toothy's implementation in the comments)
'''
return [ atoi(c) for c in re.split('(\d+)', text) ]

def untar(tar_archive, bibcode):
'''
Check validity of TAR archive and unpack in temporary directory
:param tar_archive:
@@ -30,7 +42,7 @@ def untar(tar_archive):
except:
contents = []
TMP_DIR = current_app.config.get('GRAPHICS_TMP_DIR')
extract_dir = "%s/%s" % (TMP_DIR, os.path.basename(tar_archive).split('.')[0])
extract_dir = "%s/%s" % (TMP_DIR, bibcode)
t = tarfile.open(tar_archive, 'r:*')
t.extractall(extract_dir)
for f in contents:
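A plausible motivation for keying the extraction directory on the bibcode instead of the tar file's base name (this reading of the change is an assumption, and the file name below is made up): arXiv file names contain dots, so `split('.')[0]` truncates the identifier and different e-prints can end up in the same directory.

```python
import os

# old scheme: directory name derived from the tar file's base name
archive = '/tmp/1611.01234.tar.gz'   # hypothetical file name
print(os.path.basename(archive).split('.')[0])   # prints '1611'

# new scheme: the caller passes the bibcode, which is unique per record
```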
@@ -84,6 +96,8 @@ def convert_images(image_list):
if os.path.exists(png_image):
done_list.append(png_image)
remainder.append(image)

done_list.sort(key=natural_keys)
return remainder, done_list

@timeout_decorator.timeout(15)
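The `natural_keys` helper added above exists so the converted figures are returned in human order; a plain lexicographic sort puts `fig10.png` before `fig2.png`. A small standalone illustration (the file names are invented for the example):

```python
import re

def atoi(text):
    return int(text) if text.isdigit() else text

def natural_keys(text):
    # split on runs of digits so 'fig10.png' compares as ['fig', 10, '.png']
    return [atoi(c) for c in re.split(r'(\d+)', text)]

names = ['fig10.png', 'fig2.png', 'fig1.png']
print(sorted(names))                    # ['fig1.png', 'fig10.png', 'fig2.png']
print(sorted(names, key=natural_keys))  # ['fig1.png', 'fig2.png', 'fig10.png']
```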
4 changes: 2 additions & 2 deletions service/graphics.py
@@ -46,6 +46,7 @@ def get_graphics(bibcode):
else:
results = {
'query': 'failed', 'error': 'PostgreSQL problem (%s)' % err}
print results
if results and 'figures' in results:
if len(results['figures']) == 0:
# There are cases where an entry exists, but the 'figures'
@@ -101,7 +102,7 @@ def get_graphics(bibcode):
ADS_image_url % (
bibcode.replace('&', '%26'), figure['page'] - 1))
elif source.upper() == 'ARXIV' \
and current_app.config.get('GRAPHICSINCLUDE_ARXIV'):
and current_app.config.get('GRAPHICS_INCLUDE_ARXIV'):
results['header'] = 'Images extracted from the arXiv e-print'
try:
display_image = random.choice(display_figure['images'])
@@ -115,7 +116,6 @@
for image in images:
thumb_url = image['thumbnail']
highr_url = image['highres']
lowrs_url = image['lowres']
elif source.upper() == 'TEST':
results['pick'] = display_figure
return results
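The one-character config-key fix above is the substantive part of this file: with the old, misspelled `'GRAPHICSINCLUDE_ARXIV'` key absent from the config, `config.get()` returned `None`, so the arXiv branch never ran regardless of how the flag was set. A minimal demonstration with a plain dict standing in for the Flask config:

```python
config = {'GRAPHICS_INCLUDE_ARXIV': True}

# Misspelled key: .get() silently returns None, which is falsy,
# so the `elif source.upper() == 'ARXIV' and ...` branch never fired.
print(config.get('GRAPHICSINCLUDE_ARXIV'))   # None
print(config.get('GRAPHICS_INCLUDE_ARXIV'))  # True
```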
46 changes: 36 additions & 10 deletions service/utils.py
@@ -390,7 +390,7 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr
else:
graphic = None
# First get lists of (La)TeX and image files
tex_files, img_files, xdir = file_ops.untar(ft_file)
tex_files, img_files, xdir = file_ops.untar(ft_file, bibcode)
# If we didn't find any image files, skip
if len(img_files) == 0:
return
@@ -423,8 +423,17 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr
extracted_image_data.extend((extract_context(tex_file,
cleaned_image_data)))
extracted_image_data = remove_dups(extracted_image_data)
# For those images where no metadata was captured, keep them with
# empty strings
try:
skipped_images = [i for i in converted_images if i not in [e[0] for e in extracted_image_data]]
except:
skipped_images = converted_images
if len(skipped_images) > 0:
extracted_image_data += [(im,'','',[]) for im in skipped_images]
fid = 1
source2target = {}
source2AWS = {}
for item in extracted_image_data:
if not os.path.exists(item[0]) or not item[0].strip():
continue
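The block above keeps figures whose converted PNGs never matched any extracted caption or label, padding them with empty strings instead of dropping them. A small sketch of that filtering step in isolation (paths and tuple contents are hypothetical):

```python
converted_images = ['/tmp/bib/fg1.png', '/tmp/bib/fg2.png']          # hypothetical paths
extracted_image_data = [('/tmp/bib/fg1.png', 'ctx', 'label', [])]    # tuple shape is illustrative

# figures with no extracted metadata are kept, padded with empty strings
matched = [entry[0] for entry in extracted_image_data]
skipped = [im for im in converted_images if im not in matched]
extracted_image_data += [(im, '', '', []) for im in skipped]

print(skipped)  # ['/tmp/bib/fg2.png']
```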
@@ -443,6 +452,11 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr
subdir,
eprdir,
figure_id)
source2AWS[item[0]] = "seri/arXiv/%s/%s/%s/%s.png" % (
category,
subdir,
eprdir,
figure_id)
fig_data['figure_id'] = figure_id
try:
fig_data['figure_label'] = item[2].encode('ascii','ignore')
@@ -453,14 +467,10 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr
except:
fig_data['figure_caption'] = ''
image_url = "http://arxiv.org/abs/%s" % arx_id.replace('arXiv:','')
thumb_url = "%s/%s/%s/%s/%s.png/%s" % (
current_app.config.get('GRAPHICS_BASE_URL'),
category,
subdir,
eprdir,
figure_id,
current_app.config.get('GRAPHICS_THMB_PAR'),
)
thumb_url = "%s/%s/%s" % (
current_app.config.get('GRAPHICS_AWS_S3_URL'),
current_app.config.get('GRAPHICS_AWS_S3_BUCKET'),
source2AWS[item[0]])
fig_data['images'] = [
{
'image_id': fid,
@@ -473,15 +483,31 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr
fid += 1
# Now it is time to move the PNGs to their final location, renaming
# them in the process
# 1. Store them on a local server
# 2. Store them on AWS S3
# Create the S3 session and copy over the files
client = get_boto_session().client('s3')
# Currently we just process PNG files
mimetype = 'image/png'
bucket = current_app.config.get('GRAPHICS_AWS_S3_BUCKET')
for source, target in source2target.items():
# Copy image file from TMP location to final location on disk
target_dir, fname = os.path.split(target)
if not os.path.exists(target_dir):
cmmd = 'mkdir -p %s' % target_dir
commands.getoutput(cmmd)
shutil.copy(source, target)
# Upload image file to S3
key = source2AWS[source]
try:
data = open(source, 'rb')
except Exception, e:
sys.stderr.write('Error loading image data for %s: %s\n' % (source, str(e)))
continue
client.put_object(Key=key, Bucket=bucket, Body=data, ACL='public-read', ContentType=mimetype)
# Now it's time to clean up stuff we've extracted
TMP_DIR = current_app.config.get('GRAPHICS_TMP_DIR')
extract_dir = "%s/%s" % (TMP_DIR, os.path.basename(ft_file).split('.')[0])
extract_dir = "%s/%s" % (TMP_DIR, bibcode)
try:
shutil.rmtree(extract_dir)
except:
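The new upload loop copies each renamed PNG to its final location on disk and also pushes it to S3 with `put_object`; the thumbnail URL returned to clients is then assembled from `GRAPHICS_AWS_S3_URL`, the bucket name, and the object key stored in `source2AWS`. A minimal sketch of the S3 side with boto3 (bucket, base URL, key, and file path are placeholders, and the service obtains its session via `get_boto_session()` rather than creating one directly):

```python
import boto3

session = boto3.session.Session()      # the service wraps this in get_boto_session()
client = session.client('s3')

s3_base_url = 'https://s3.amazonaws.com'            # stand-in for GRAPHICS_AWS_S3_URL
bucket = 'my-graphics-bucket'                       # stand-in for GRAPHICS_AWS_S3_BUCKET
key = 'seri/arXiv/astro-ph/0611/0611123/fg1.png'    # same layout as source2AWS

with open('/tmp/fg1.png', 'rb') as data:            # hypothetical local PNG
    client.put_object(Key=key, Bucket=bucket, Body=data,
                      ACL='public-read', ContentType='image/png')

thumb_url = '%s/%s/%s' % (s3_base_url, bucket, key)  # URL of the kind stored in fig_data['images']
print(thumb_url)
```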
