Added A&A thumbnails

adsabs · Nov 3, 2016 · 20e317a · 20e317a
1 parent 0df4141
commit 20e317a
Show file tree

Hide file tree

Showing 5 changed files with 178 additions and 0 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,3 +1,7 @@
+### 1.0.9
+
+* graphics for A&A
+
 ### 1.0.8
 
 * Maintenance update

diff --git a/requirements.txt b/requirements.txt
@@ -17,3 +17,4 @@ libmagic==1.0
 python-magic==0.4.6
 Pillow==2.9.0
 timeout-decorator
+boto3
diff --git a/service/aws_tools.py b/service/aws_tools.py
@@ -0,0 +1,14 @@
+from boto3.session import Session
+from flask import current_app
+
+def get_boto_session():
+    """
+    Build a boto3 session from the credentials stored in the Flask app config.
+
+    Reads 'AWS_ACCESS_KEY', 'AWS_SECRET_KEY' and 'AWS_REGION' from
+    current_app.config, so an application context must be active.
+    :return: boto3.session.Session instance
+    """
+    # NOTE(review): service/config.py declares the AWS settings with a
+    # 'GRAPHICS_' prefix (e.g. GRAPHICS_AWS_ACCESS_KEY); confirm that the
+    # un-prefixed names read here are really what the deployment provides.
+    settings = current_app.config
+    access_key = settings.get('AWS_ACCESS_KEY')
+    secret_key = settings.get('AWS_SECRET_KEY')
+    region = settings.get('AWS_REGION')
+    return Session(
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        region_name=region
+    )
diff --git a/service/config.py b/service/config.py
@@ -8,6 +8,7 @@
 # The key defines the set and the values are journals (or categories, in the
 # case of arXiv)
 GRAPHICS_PUBSETS = {
+                   'EDP':['A&A'],
                    'IOP':['ApJ','ApJL','ApJS','AJ'],
                    'Elsevier':['NewA'],
                    'arXiv': ['arXiv', 'acc-phys', 'adap-org', 'alg-geom',
@@ -23,19 +24,24 @@
 GRAPHICS_EXTSOURCES = ['IOP', 'Elsevier']
 # Some info for the external site
 GRAPHICS_HEADER = {
+                  'EDP':'Every image links to the article on <a href="http://www.aanda.org/" target="_new">Astronomy &amp; Astrophysics</a>',
                   'IOP':'Every image links to the <a href="http://www.astroexplorer.org/" target="_new">AAS "Astronomy Image Explorer"</a> for more detail.',
                   'Elsevier':'Every image links to the article on <a href="http://www.sciencedirect.com" target="_new">ScienceDirect</a>'
                   }
 # Define the mapping to help retrieve full text files for a given identifier
 GRAPHICS_FULLTEXT_MAPS = {
+    'EDP':'/path/to/EDP.map',
     'IOP':'/path/to/IOP.map',
     'arXiv':'/path/to/arXiv.map'
 }
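+# Location of local graphics files, keyed by publisher set (e.g. 'EDP'). Each
+# value is a directory holding one sub-directory of image files per article.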
+GRAPHICS_GRAPHICS_LOCATION = {}
 # Define a file with backdata, if available
 GRAPHICS_BACK_DATA_FILE = {
 }
 # These are the values to be stored as "source" in the graphics database
 GRAPHICS_SOURCE_NAMES = {
+    'EDP': 'EDP',
     'IOP': 'IOP',
     'Elsevier':'Elsevier',
     'arXiv': 'arXiv',
@@ -53,6 +59,12 @@
 # This section configures this application to act as a client, for example
 # to query solr via adsws
 GRAPHICS_API_TOKEN = 'we will provide an api key token for this application'
+# To communicate with AWS
+GRAPHICS_AWS_ACCESS_KEY = 'this will be provided through local_config.py'
+GRAPHICS_AWS_SECRET_KEY = 'this will be provided through local_config.py'
+GRAPHICS_AWS_REGION = 'this will be provided through local_config.py'
+GRAPHICS_AWS_S3_URL = 'https://s3.amazonaws.com'
+GRAPHICS_AWS_S3_BUCKET = ''
 # Config for logging
 GRAPHICS_LOGGING = {
     'version': 1,

diff --git a/service/utils.py b/service/utils.py
@@ -1,8 +1,10 @@
 import os
 import re
 import sys
+import glob
 import shutil
 import commands
+import urllib
 from operator import itemgetter
 import requests
 from flask import current_app, request
@@ -12,6 +14,7 @@
 from datetime import datetime
 from invenio_tools import extract_captions, prepare_image_data,\
     extract_context, remove_dups
+from aws_tools import get_boto_session
 
 requests.packages.urllib3.disable_warnings()
 
@@ -620,3 +623,147 @@ def manage_Elsevier_graphics(record, update=False, dryrun=False):
         return len(figures)
     else:
         return figures
+
+def process_EDP_graphics(identifiers, force, dryrun=False):
+    """
+    Retrieve the graphics data for a set of EDP (A&A) records.
+
+    Records without an entry in the graphics database are always processed;
+    records that already have one are only re-processed when force is true.
+    A record for which the EDP full text map lists no XML file is reported
+    on stderr and skipped, and so is a record whose processing raises.
+    :param identifiers: list of dictionaries, one per record; the 'bibcode'
+        entry is used here and the whole dictionary is handed on to
+        manage_EDP_graphics
+    :param force: if true, update records that are already in the database
+    :param dryrun: passed through to manage_EDP_graphics
+    :return: the result of the last manage_EDP_graphics call that did not
+        raise, or None if there was no such call
+    """
+    # Create the mapping from bibcode to full text location; only XML full
+    # text files are of interest
+    bibcode2fulltext = {}
+    map_file = current_app.config.get('GRAPHICS_FULLTEXT_MAPS').get('EDP')
+    with open(map_file) as fh_map:
+        for line in fh_map:
+            fields = line.strip().split('\t')
+            if len(fields) != 3:
+                # Not a "bibcode <tab> file <tab> source" line, skip it
+                continue
+            bibcode, ft_file = fields[0], fields[1]
+            if ft_file.lower().endswith('xml'):
+                bibcode2fulltext[bibcode] = ft_file
+    # Split the records submitted into existing ones (only touched when
+    # force is set) and new ones
+    updates = []
+    new = []
+    for entry in identifiers:
+        resp = db.session.query(GraphicsModel).filter(
+            GraphicsModel.bibcode == entry['bibcode']).first()
+        if not resp:
+            new.append(entry)
+        elif force:
+            updates.append(entry)
+    # First process the updates, then the new records. Each batch carries the
+    # labels used in its "skipped" and "error" messages.
+    batches = (
+        (updates, True, 'update', 'update'),
+        (new, False, 'new record', 'new'),
+    )
+    nfigs = None
+    for papers, is_update, skip_label, error_label in batches:
+        for paper in papers:
+            # Get the full text for this article
+            fulltext = bibcode2fulltext.get(paper['bibcode'], None)
+            if not fulltext:
+                # No full text file, skip
+                sys.stderr.write('No full text found for %s (%s)\n' %
+                                 (paper['bibcode'], skip_label))
+                continue
+            try:
+                nfigs = manage_EDP_graphics(paper, fulltext,
+                                            update=is_update, dryrun=dryrun)
+            except Exception as e:
+                # Best effort: one failing record must not stop the run
+                sys.stderr.write('Error processing %s %s (%s)\n' %
+                                 (error_label, paper['bibcode'], e))
+                continue
+    return nfigs
+
+def manage_EDP_graphics(record, ft_file, update=False, dryrun=False):
+    """
+    Upload the thumbnails of one EDP (A&A) article to S3 and store the
+    corresponding figure metadata in the graphics database.
+
+    The thumbnails are the '*.jpg' files in <EDP graphics location>/<article
+    identifier> whose file name contains 'fig' but not 'small'. A figure is
+    only recorded once its thumbnail has been uploaded; a file that cannot
+    be opened is reported on stderr and left out. Errors raised by the S3
+    upload itself are not caught here.
+    :param record: dictionary with (at least) the keys 'bibcode' and 'doi'
+    :param ft_file: path of the XML full text file; its base name without
+        the '.xml' extension is used as the article identifier
+    :param update: if true, modify the existing database entry for the
+        bibcode instead of creating a new one
+    :param dryrun: if true, nothing is written to the database
+    :return: None
+    """
+    # If we're updating, grab the existing database entry
+    if update:
+        graphic = db.session.query(GraphicsModel).filter(
+            GraphicsModel.bibcode == record['bibcode']).first()
+    else:
+        graphic = None
+    # Get the article identifier from the full text file name. The extension
+    # is matched case-insensitively, just like in the full text map filter.
+    identifier = os.path.basename(ft_file)
+    if identifier.lower().endswith('.xml'):
+        identifier = identifier[:-4]
+    # and get the location of the graphics files
+    srcdir = current_app.config.get('GRAPHICS_GRAPHICS_LOCATION').get('EDP')
+    if not srcdir:
+        # Without a configured location there is nothing to look for
+        sys.stderr.write('No graphics location configured for EDP\n')
+        return
+    # Get the JPEG files in the source directory, leaving out any images with
+    # 'small' in the file name and any that don't have 'fig' in the file
+    # name. Only the file name is tested, so the directory part of the path
+    # cannot influence the selection.
+    thumbs = []
+    for path in glob.glob('%s/%s/*.jpg' % (srcdir, identifier)):
+        lowered = os.path.basename(path).lower()
+        if 'fig' in lowered and 'small' not in lowered:
+            thumbs.append(path)
+    # On S3, thumbnails go to
+    #  <bucket>/seri/A+A/<volume>/<article ID>
+    bucket = current_app.config.get('GRAPHICS_AWS_S3_BUCKET')
+    s3_url = current_app.config.get('GRAPHICS_AWS_S3_URL')
+    volno = record['bibcode'][9:13].replace('.', '0')
+    thumb_bucket = "seri/A+A/%s/%s" % (volno, identifier)
+    # Create the S3 session and copy over the files
+    client = get_boto_session().client('s3')
+    # Currently we just process JPEG files
+    mimetype = 'image/jpeg'
+    # NOTE(review): the upload below also runs when dryrun is set; only the
+    # database write is skipped in that case. Confirm this is intended.
+    figures = []
+    for thumb in thumbs:
+        fname = os.path.basename(thumb)
+        # Try to distill the figure number from the file name; fall back to
+        # '0' when 'fig' is not followed by digits
+        match = re.match(r'^.*fig(\d+)', fname, re.IGNORECASE)
+        fignr = match.group(1) if match else '0'
+        # File name without its extension
+        figure_id = re.sub(r'^(.*)\..*', r'\1', fname)
+        highres = "http://dx.doi.org/%s" % record['doi']
+        # S3 URL for thumbnail is:
+        # https://s3.amazonaws.com/adsabs-thumbnails/seri/A%2BA/0595/aa29175-16/aa29175-16-fig1.jpg
+        key = "%s/%s" % (thumb_bucket, fname)
+        thumbURL = "%s/%s/%s" % (s3_url, bucket, urllib.quote(key))
+        image = {'image_id': figure_id,
+                 'thumbnail': thumbURL,
+                 'format': mimetype.split('/')[1],
+                 'highres': highres}
+        fig_data = {'figure_id': figure_id,
+                    'figure_label': "Figure %s" % fignr,
+                    'figure_caption': '',
+                    'figure_number': fignr,
+                    'images': [image]}
+        # Upload the image to S3
+        try:
+            data = open(thumb, 'rb')
+        except (IOError, OSError) as e:
+            sys.stderr.write('Error loading image data for %s: %s\n' % (thumb, str(e)))
+            continue
+        try:
+            client.put_object(Key=key, Bucket=bucket, Body=data,
+                              ACL='public-read', ContentType=mimetype)
+        finally:
+            # Always release the file handle, also when the upload fails
+            data.close()
+        # Only record figures whose thumbnail actually made it to S3
+        figures.append(fig_data)
+    # Order numerically, so that figure 10 comes after figure 2
+    figures.sort(key=lambda fig: int(fig['figure_number']))
+    if len(figures) > 0 and not dryrun:
+        graph_src = current_app.config.get('GRAPHICS_SOURCE_NAMES').get('EDP')
+        if update:
+            sys.stderr.write('Updating %s\n'%record['bibcode'])
+            graphic.source = graph_src
+            graphic.figures = figures
+            graphic.modtime = datetime.now()
+        else:
+            sys.stderr.write('Creating new record for %s\n'%record['bibcode'])
+            graphic = GraphicsModel(
+                bibcode=record['bibcode'],
+                doi=record['doi'],
+                source=graph_src,
+                eprint=False,
+                figures=figures,
+                modtime=datetime.now()
+            )
+            db.session.add(graphic)
+        db.session.commit()