diff --git a/CHANGES.md b/CHANGES.md
index 756dc75..5315019 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,7 @@
+### 1.0.9
+
+* graphics for A&A
+
 ### 1.0.8
 
 * Maintenance update
diff --git a/requirements.txt b/requirements.txt
index 5abbd2f..15dca8b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,3 +17,4 @@ libmagic==1.0
 python-magic==0.4.6
 Pillow==2.9.0
 timeout-decorator
+boto3
diff --git a/service/aws_tools.py b/service/aws_tools.py
new file mode 100644
index 0000000..6948a94
--- /dev/null
+++ b/service/aws_tools.py
@@ -0,0 +1,16 @@
+from boto3.session import Session
+from flask import current_app
+
+def get_boto_session():
+    """
+    Gets a boto3 session using credentials stored in app.config; assumes an
+    app context is active
+    NOTE(review): the keys read here are not GRAPHICS_-prefixed, unlike the
+    GRAPHICS_AWS_* settings in service/config.py -- confirm which is intended
+    :return: boto3.session instance
+    """
+    return Session(
+        aws_access_key_id=current_app.config.get('AWS_ACCESS_KEY'),
+        aws_secret_access_key=current_app.config.get('AWS_SECRET_KEY'),
+        region_name=current_app.config.get('AWS_REGION')
+    )
diff --git a/service/config.py b/service/config.py
index 574a7cd..3484351 100644
--- a/service/config.py
+++ b/service/config.py
@@ -8,6 +8,7 @@
 # The key defines the set and the values are journals (or categories, in the
 # case of arXiv)
 GRAPHICS_PUBSETS = {
+    'EDP':['A&A'],
     'IOP':['ApJ','ApJL','ApJS','AJ'],
     'Elsevier':['NewA'],
     'arXiv': ['arXiv', 'acc-phys', 'adap-org', 'alg-geom',
@@ -23,19 +24,24 @@
 GRAPHICS_EXTSOURCES = ['IOP', 'Elsevier']
 # Some info for the external site
 GRAPHICS_HEADER = {
+    'EDP':'Every image links to the article on Astronomy & Astrophysics',
     'IOP':'Every image links to the AAS "Astronomy Image Explorer" for more detail.',
     'Elsevier':'Every image links to the article on ScienceDirect'
 }
 # Define the mapping to help retrieve full text files for a given identifier
 GRAPHICS_FULLTEXT_MAPS = {
+    'EDP':'/path/to/EDP.map',
     'IOP':'/path/to/IOP.map',
     'arXiv':'/path/to/arXiv.map'
 }
+# Location of local graphics files
+GRAPHICS_GRAPHICS_LOCATION = {}
 # Define a file with backdata, if available
 GRAPHICS_BACK_DATA_FILE = {
 }
 # These are the values to be stored as "source" in the graphics database
 GRAPHICS_SOURCE_NAMES = {
+    'EDP': 'EDP',
     'IOP': 'IOP',
     'Elsevier':'Elsevier',
     'arXiv': 'arXiv',
@@ -53,6 +59,12 @@
 # This section configures this application to act as a client, for example
 # to query solr via adsws
 GRAPHICS_API_TOKEN = 'we will provide an api key token for this application'
+# To communicate with AWS
+GRAPHICS_AWS_ACCESS_KEY = 'this will be provided through local_config.py'
+GRAPHICS_AWS_SECRET_KEY = 'this will be provided through local_config.py'
+GRAPHICS_AWS_REGION = 'this will be provided through local_config.py'
+GRAPHICS_AWS_S3_URL = 'https://s3.amazonaws.com'
+GRAPHICS_AWS_S3_BUCKET = ''
 # Config for logging
 GRAPHICS_LOGGING = {
     'version': 1,
diff --git a/service/utils.py b/service/utils.py
index 7570ede..55af80c 100644
--- a/service/utils.py
+++ b/service/utils.py
@@ -1,8 +1,10 @@
 import os
 import re
 import sys
+import glob
 import shutil
 import commands
+import urllib
 from operator import itemgetter
 import requests
 from flask import current_app, request
@@ -12,6 +14,7 @@
 from datetime import datetime
 from invenio_tools import extract_captions, prepare_image_data,\
     extract_context, remove_dups
+from aws_tools import get_boto_session
 
 requests.packages.urllib3.disable_warnings()
 
@@ -620,3 +623,162 @@ def manage_Elsevier_graphics(record, update=False, dryrun=False):
         return len(figures)
     else:
         return figures
+
+def process_EDP_graphics(identifiers, force, dryrun=False):
+    """
+    For the set of identifiers supplied, retrieve the graphics data.
+    If force is false, skip a bibcode if already in the database. The list of
+    identifiers is a list of dictionaries because for all records we need the
+    bibcode (to check if a record already exists) and the DOI (to link each
+    image to the article)
+    :param identifiers: list of dictionaries with the keys 'bibcode' and 'doi'
+    :param force: if true, update records that are already in the database
+    :param dryrun: passed on to manage_EDP_graphics
+    :return: result of the last manage_EDP_graphics call, which currently is
+             always None
+    """
+    # Create the mapping from bibcode to full text location
+    bibcode2fulltext = {}
+    map_file = current_app.config.get('GRAPHICS_FULLTEXT_MAPS').get('EDP')
+    with open(map_file) as fh_map:
+        for line in fh_map:
+            try:
+                bibcode, ft_file, source = line.strip().split('\t')
+            except ValueError:
+                # Not a line with exactly three columns, skip
+                continue
+            if ft_file[-3:].lower() == 'xml':
+                bibcode2fulltext[bibcode] = ft_file
+    # Now process the records submitted
+    updates = []
+    new = []
+    for entry in identifiers:
+        resp = db.session.query(GraphicsModel).filter(
+            GraphicsModel.bibcode == entry['bibcode']).first()
+        if force and resp:
+            updates.append(entry)
+        elif not resp:
+            new.append(entry)
+    # First process the updates
+    nfigs = None
+    for paper in updates:
+        # Get the full text for this article
+        fulltext = bibcode2fulltext.get(paper['bibcode'], None)
+        if not fulltext:
+            # No full text file, skip
+            sys.stderr.write('No full text found for %s (update)\n' % paper['bibcode'])
+            continue
+        try:
+            nfigs = manage_EDP_graphics(paper, fulltext, update=True, dryrun=dryrun)
+        except Exception, e:
+            sys.stderr.write('Error processing update %s (%s)\n'%(paper['bibcode'], e))
+            continue
+    # Next, process the new records
+    for paper in new:
+        # Get the full text for this article
+        fulltext = bibcode2fulltext.get(paper['bibcode'], None)
+        if not fulltext:
+            # No full text file, skip
+            sys.stderr.write('No full text found for %s (new record)\n' % paper['bibcode'])
+            continue
+        try:
+            nfigs = manage_EDP_graphics(paper, fulltext, dryrun=dryrun)
+        except Exception, e:
+            sys.stderr.write('Error processing new %s (%s)\n'%(paper['bibcode'], e))
+            continue
+    return nfigs
+
+def manage_EDP_graphics(record, ft_file, update=False, dryrun=False):
+    """
+    Upload the thumbnails of one A&A article to S3 and store the figure data
+    in the graphics database.
+    :param record: dictionary with the keys 'bibcode' and 'doi'
+    :param ft_file: full text XML file; its base name (without '.xml') is the
+                    article identifier
+    :param update: if true, modify the existing database entry
+    :param dryrun: if true, nothing is written to the database
+                   NOTE(review): thumbnails are still uploaded to S3 in a dry
+                   run -- confirm that this is intended
+    :return: None
+    """
+    # If we're updating, grab the existing database entry
+    if update:
+        graphic = db.session.query(GraphicsModel).filter(
+            GraphicsModel.bibcode == record['bibcode']).first()
+    else:
+        graphic = None
+    # Get the article identifier from the full text file name
+    identifier = os.path.basename(ft_file).replace('.xml','')
+    # and get the location of the full text files
+    srcdir = current_app.config.get('GRAPHICS_GRAPHICS_LOCATION').get('EDP')
+    # Get the JPEG files in the source directory
+    thumbs = glob.glob('%s/%s/*.jpg'%(srcdir, identifier))
+    # Filter out any images with 'small' in the file name
+    # and that don't have 'fig' in the file name. Only the file name is
+    # tested, so 'fig' or 'small' in a directory name has no effect
+    fnames = [(t, os.path.basename(t).lower()) for t in thumbs]
+    thumbs = [t for t, f in fnames if 'fig' in f and 'small' not in f]
+    # On S3, thumbnails go to
+    #    [bucket]/seri/A+A/[volume]/[identifier]
+    bucket = current_app.config.get('GRAPHICS_AWS_S3_BUCKET')
+    # Characters 9-12 of the bibcode, '.' replaced by '0' ('.595' -> '0595')
+    volno = record['bibcode'][9:13].replace('.','0')
+    thumb_bucket = "seri/A+A/%s/%s" % (volno, identifier)
+    # Create the S3 session and copy over the files
+    client = get_boto_session().client('s3')
+    # Currently we just process JPEG files
+    mimetype = 'image/jpeg'
+    # Copy files over to S3
+    figures = []
+    for thumb in thumbs:
+        fname = os.path.basename(thumb)
+        # Try to distill the figure number from file name ('0' if none)
+        match = re.match(r'^.*fig(\d+)', fname, re.IGNORECASE)
+        fignr = match.group(1) if match else '0'
+        figure_id = re.sub('^(.*)\..*',r'\1',fname)
+        highres = "http://dx.doi.org/%s" % record['doi']
+        # S3 URL for thumbnail is:
+        # https://s3.amazonaws.com/adsabs-thumbnails/seri/A%2BA/0595/aa29175-16/aa29175-16-fig1.jpg
+        key = "%s/%s" % (thumb_bucket, fname)
+        thumbURL = "%s/%s/%s" % (current_app.config.get('GRAPHICS_AWS_S3_URL'), bucket, urllib.quote(key))
+        image = {'image_id': figure_id,
+                 'thumbnail': thumbURL,
+                 'format': mimetype.split('/')[1],
+                 'highres': highres}
+        fig_data = {'figure_id': figure_id,
+                    'figure_label': "Figure %s" % fignr,
+                    'figure_caption': '',
+                    'figure_number': fignr,
+                    'images': [image]}
+        # Upload the image to S3
+        try:
+            data = open(thumb, 'rb')
+        except Exception, e:
+            sys.stderr.write('Error loading image data for %s: %s\n' % (thumb, str(e)))
+            continue
+        # Close the file once it has been uploaded
+        with data:
+            client.put_object(Key=key, Bucket=bucket ,Body=data, ACL='public-read', ContentType=mimetype)
+        # Only keep figures for which the thumbnail was uploaded
+        figures.append(fig_data)
+    # Sort numerically, so that figure 10 comes after figure 2
+    figures = sorted(figures, key=lambda fig: int(fig['figure_number']))
+    if figures and not dryrun:
+        graph_src = current_app.config.get('GRAPHICS_SOURCE_NAMES').get('EDP')
+        if update:
+            sys.stderr.write('Updating %s\n'%record['bibcode'])
+            graphic.source = graph_src
+            graphic.figures = figures
+            graphic.modtime = datetime.now()
+        else:
+            sys.stderr.write('Creating new record for %s\n'%record['bibcode'])
+            graphic = GraphicsModel(
+                bibcode=record['bibcode'],
+                doi=record['doi'],
+                source=graph_src,
+                eprint=False,
+                figures=figures,
+                modtime=datetime.now()
+            )
+            db.session.add(graphic)
+        db.session.commit()