diff --git a/CHANGES.md b/CHANGES.md index 097de3a..372ff80 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,7 @@ +### 1.0.11 + +* graphics for arXiv + ### 1.0.10 * maintenance update diff --git a/service/config.py b/service/config.py index 374cfa8..6e664fd 100644 --- a/service/config.py +++ b/service/config.py @@ -1,5 +1,5 @@ GRAPHICS_SECRET_KEY = 'this should be changed' -GRAPHICS_INCLUDE_ARXIV = False +GRAPHICS_INCLUDE_ARXIV = True SQLALCHEMY_BINDS = {} # Proper handling of database connections SQLALCHEMY_COMMIT_ON_TEARDOWN = True diff --git a/service/file_ops.py b/service/file_ops.py index ea8d003..cbdde62 100644 --- a/service/file_ops.py +++ b/service/file_ops.py @@ -5,6 +5,7 @@ ''' import sys import os +import re import tarfile import magic import commands @@ -13,7 +14,18 @@ from invenio_tools import get_converted_image_name from PIL import Image -def untar(tar_archive): +def atoi(text): + return int(text) if text.isdigit() else text + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [ atoi(c) for c in re.split('(\d+)', text) ] + +def untar(tar_archive, bibcode): ''' Check validity of TAR archive and unpack in temporary directory :param tar_archive: @@ -30,7 +42,7 @@ def untar(tar_archive): except: contents = [] TMP_DIR = current_app.config.get('GRAPHICS_TMP_DIR') - extract_dir = "%s/%s" % (TMP_DIR, os.path.basename(tar_archive).split('.')[0]) + extract_dir = "%s/%s" % (TMP_DIR, bibcode) t = tarfile.open(tar_archive, 'r:*') t.extractall(extract_dir) for f in contents: @@ -84,6 +96,8 @@ def convert_images(image_list): if os.path.exists(png_image): done_list.append(png_image) remainder.append(image) + + done_list.sort(key=natural_keys) return remainder, done_list @timeout_decorator.timeout(15) diff --git a/service/graphics.py b/service/graphics.py index 1573e2c..cc31b1d 100644 --- a/service/graphics.py +++ 
b/service/graphics.py @@ -46,6 +46,7 @@ def get_graphics(bibcode): else: results = { 'query': 'failed', 'error': 'PostgreSQL problem (%s)' % err} + print results if results and 'figures' in results: if len(results['figures']) == 0: # There are cases where an entry exists, but the 'figures' @@ -101,7 +102,7 @@ def get_graphics(bibcode): ADS_image_url % ( bibcode.replace('&', '%26'), figure['page'] - 1)) elif source.upper() == 'ARXIV' \ - and current_app.config.get('GRAPHICSINCLUDE_ARXIV'): + and current_app.config.get('GRAPHICS_INCLUDE_ARXIV'): results['header'] = 'Images extracted from the arXiv e-print' try: display_image = random.choice(display_figure['images']) @@ -115,7 +116,6 @@ def get_graphics(bibcode): for image in images: thumb_url = image['thumbnail'] highr_url = image['highres'] - lowrs_url = image['lowres'] elif source.upper() == 'TEST': results['pick'] = display_figure return results diff --git a/service/utils.py b/service/utils.py index 9292e8b..e08fe36 100644 --- a/service/utils.py +++ b/service/utils.py @@ -390,7 +390,7 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr else: graphic = None # First get lists of (La)TeX and image files - tex_files, img_files, xdir = file_ops.untar(ft_file) + tex_files, img_files, xdir = file_ops.untar(ft_file, bibcode) # If we didn't find any image files, skip if len(img_files) == 0: return @@ -423,8 +423,17 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr extracted_image_data.extend((extract_context(tex_file, cleaned_image_data))) extracted_image_data = remove_dups(extracted_image_data) + # For those images where no metadata was captured, keep them with + # empty strings + try: + skipped_images = [i for i in converted_images if i not in [e[0] for e in extracted_image_data]] + except: + skipped_images = converted_images + if len(skipped_images) > 0: + extracted_image_data += [(im,'','',[]) for im in skipped_images] fid = 1 source2target = {} + 
source2AWS = {} for item in extracted_image_data: if not os.path.exists(item[0]) or not item[0].strip(): continue @@ -443,6 +452,11 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr subdir, eprdir, figure_id) + source2AWS[item[0]] = "seri/arXiv/%s/%s/%s/%s.png" % ( + category, + subdir, + eprdir, + figure_id) fig_data['figure_id'] = figure_id try: fig_data['figure_label'] = item[2].encode('ascii','ignore') @@ -453,14 +467,10 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr except: fig_data['figure_caption'] = '' image_url = "http://arxiv.org/abs/%s" % arx_id.replace('arXiv:','') - thumb_url = "%s/%s/%s/%s/%s.png/%s" % ( - current_app.config.get('GRAPHICS_BASE_URL'), - category, - subdir, - eprdir, - figure_id, - current_app.config.get('GRAPHICS_THMB_PAR'), - ) + thumb_url = "%s/%s/%s" % ( + current_app.config.get('GRAPHICS_AWS_S3_URL'), + current_app.config.get('GRAPHICS_AWS_S3_BUCKET'), + source2AWS[item[0]]) fig_data['images'] = [ { 'image_id': fid, @@ -473,15 +483,31 @@ def manage_arXiv_graphics(ft_file, bibcode, arx_id, category, update=False, dryr fid += 1 # Now it is time to move the PNGs to their final location, renaming # them in the process + # 1. Store them on a local server + # 2. 
Store them on AWS S3 + # Create the S3 session and copy over the files + client = get_boto_session().client('s3') + # Currently we just process PNG files + mimetype = 'image/png' + bucket = current_app.config.get('GRAPHICS_AWS_S3_BUCKET') for source, target in source2target.items(): + # Copy image file from TMP location to final location on disk target_dir, fname = os.path.split(target) if not os.path.exists(target_dir): cmmd = 'mkdir -p %s' % target_dir commands.getoutput(cmmd) shutil.copy(source, target) + # Upload image file to S3 + key = source2AWS[source] + try: + data = open(source, 'rb') + except Exception, e: + sys.stderr.write('Error loading image data for %s: %s\n' % (source, str(e))) + continue + client.put_object(Key=key, Bucket=bucket ,Body=data, ACL='public-read', ContentType=mimetype) # Now it's time to clean up stuff we've extracted TMP_DIR = current_app.config.get('GRAPHICS_TMP_DIR') - extract_dir = "%s/%s" % (TMP_DIR, os.path.basename(ft_file).split('.')[0]) + extract_dir = "%s/%s" % (TMP_DIR, bibcode) try: shutil.rmtree(extract_dir) except: