diff --git a/CHANGES.md b/CHANGES.md
index 756dc75..5315019 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,7 @@
+### 1.0.9
+
+* Added graphics support for A&A (EDP), with thumbnails stored on AWS S3
+
### 1.0.8
* Maintenance update
diff --git a/requirements.txt b/requirements.txt
index 5abbd2f..15dca8b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,3 +17,4 @@ libmagic==1.0
python-magic==0.4.6
Pillow==2.9.0
timeout-decorator
+boto3
diff --git a/service/aws_tools.py b/service/aws_tools.py
new file mode 100644
index 0000000..6948a94
--- /dev/null
+++ b/service/aws_tools.py
@@ -0,0 +1,14 @@
+from boto3.session import Session
+from flask import current_app
+
+def get_boto_session():
+    """
+    Gets a boto3 session using the AWS_* credentials stored in app.config,
+    else the GRAPHICS_AWS_* ones; assumes an app context is active
+    :return: boto3.session.Session instance
+    """
+    conf = current_app.config
+    return Session(
+        aws_access_key_id=conf.get('AWS_ACCESS_KEY') or conf.get('GRAPHICS_AWS_ACCESS_KEY'),
+        aws_secret_access_key=conf.get('AWS_SECRET_KEY') or conf.get('GRAPHICS_AWS_SECRET_KEY'),
+        region_name=conf.get('AWS_REGION') or conf.get('GRAPHICS_AWS_REGION'))
diff --git a/service/config.py b/service/config.py
index 574a7cd..3484351 100644
--- a/service/config.py
+++ b/service/config.py
@@ -8,6 +8,7 @@
# The key defines the set and the values are journals (or categories, in the
# case of arXiv)
GRAPHICS_PUBSETS = {
+ 'EDP':['A&A'],
'IOP':['ApJ','ApJL','ApJS','AJ'],
'Elsevier':['NewA'],
'arXiv': ['arXiv', 'acc-phys', 'adap-org', 'alg-geom',
@@ -23,19 +24,24 @@
GRAPHICS_EXTSOURCES = ['IOP', 'Elsevier']
# Some info for the external site
GRAPHICS_HEADER = {
+ 'EDP':'Every image links to the article on Astronomy & Astrophysics',
'IOP':'Every image links to the AAS "Astronomy Image Explorer" for more detail.',
'Elsevier':'Every image links to the article on ScienceDirect'
}
# Define the mapping to help retrieve full text files for a given identifier
GRAPHICS_FULLTEXT_MAPS = {
+ 'EDP':'/path/to/EDP.map',
'IOP':'/path/to/IOP.map',
'arXiv':'/path/to/arXiv.map'
}
+# Location of local graphics files, keyed by publisher set (e.g. 'EDP')
+GRAPHICS_GRAPHICS_LOCATION = {}
# Define a file with backdata, if available
GRAPHICS_BACK_DATA_FILE = {
}
# These are the values to be stored as "source" in the graphics database
GRAPHICS_SOURCE_NAMES = {
+ 'EDP': 'EDP',
'IOP': 'IOP',
'Elsevier':'Elsevier',
'arXiv': 'arXiv',
@@ -53,6 +59,12 @@
# This section configures this application to act as a client, for example
# to query solr via adsws
GRAPHICS_API_TOKEN = 'we will provide an api key token for this application'
+# Settings for communicating with AWS (S3 storage of thumbnails)
+GRAPHICS_AWS_ACCESS_KEY = 'this will be provided through local_config.py'
+GRAPHICS_AWS_SECRET_KEY = 'this will be provided through local_config.py'
+GRAPHICS_AWS_REGION = 'this will be provided through local_config.py'
+GRAPHICS_AWS_S3_URL = 'https://s3.amazonaws.com'
+GRAPHICS_AWS_S3_BUCKET = ''
# Config for logging
GRAPHICS_LOGGING = {
'version': 1,
diff --git a/service/utils.py b/service/utils.py
index 7570ede..55af80c 100644
--- a/service/utils.py
+++ b/service/utils.py
@@ -1,8 +1,10 @@
import os
import re
import sys
+import glob
import shutil
import commands
+import urllib
from operator import itemgetter
import requests
from flask import current_app, request
@@ -12,6 +14,7 @@
from datetime import datetime
from invenio_tools import extract_captions, prepare_image_data,\
extract_context, remove_dups
+from aws_tools import get_boto_session
requests.packages.urllib3.disable_warnings()
@@ -620,3 +623,147 @@ def manage_Elsevier_graphics(record, update=False, dryrun=False):
return len(figures)
else:
return figures
+
+def process_EDP_graphics(identifiers, force, dryrun=False):
+    """
+    For the set of identifiers supplied, retrieve the graphics data.
+    If force is false, skip a bibcode if already in the database. The list of
+    identifiers is a list of dictionaries; the bibcode of each record is used
+    to check if a record already exists and to find the full text XML file.
+    The full text map has three tab-separated columns per line (bibcode,
+    full text file, source); only entries for XML files are used.
+    :param identifiers: list of dictionaries with (at least) a 'bibcode' key
+    :param force: if true, records already in the database are updated
+    :param dryrun: passed on to manage_EDP_graphics
+    :return: result of the last successful manage_EDP_graphics call, or None
+    """
+    # Create the mapping from bibcode to full text location
+    bibcode2fulltext = {}
+    map_file = current_app.config.get('GRAPHICS_FULLTEXT_MAPS').get('EDP')
+    with open(map_file) as fh_map:
+        for line in fh_map:
+            try:
+                bibcode, ft_file, source = line.strip().split('\t')
+            except ValueError:
+                # Not a line with exactly three columns, skip
+                continue
+            if ft_file[-3:].lower() == 'xml':
+                bibcode2fulltext[bibcode] = ft_file
+    # Now sort the records submitted into updates and new records;
+    # records already in the database are skipped unless force is set
+    updates = []
+    new = []
+    for entry in identifiers:
+        resp = db.session.query(GraphicsModel).filter(
+            GraphicsModel.bibcode == entry['bibcode']).first()
+        if force and resp:
+            updates.append(entry)
+        elif not resp:
+            new.append(entry)
+    # First process the updates
+    nfigs = None
+    for paper in updates:
+        # Get the full text for this article
+        fulltext = bibcode2fulltext.get(paper['bibcode'], None)
+        if not fulltext:
+            # No full text file, skip
+            sys.stderr.write('No full text found for %s (update)\n' % paper['bibcode'])
+            continue
+        try:
+            nfigs = manage_EDP_graphics(paper, fulltext, update=True, dryrun=dryrun)
+        except Exception as e:
+            # Log and move on: one failing article must not stop the run
+            sys.stderr.write('Error processing update %s (%s)\n' % (paper['bibcode'], e))
+    # Next, process the new records
+    for paper in new:
+        # Get the full text for this article
+        fulltext = bibcode2fulltext.get(paper['bibcode'], None)
+        if not fulltext:
+            # No full text file, skip
+            sys.stderr.write('No full text found for %s (new record)\n' % paper['bibcode'])
+            continue
+        try:
+            nfigs = manage_EDP_graphics(paper, fulltext, dryrun=dryrun)
+        except Exception as e:
+            # Log and move on: one failing article must not stop the run
+            sys.stderr.write('Error processing new %s (%s)\n' % (paper['bibcode'], e))
+    return nfigs
+
+def manage_EDP_graphics(record, ft_file, update=False, dryrun=False):
+    """
+    Upload the JPEG figures of one EDP (A&A) article to S3 and store the
+    figure metadata in the graphics database.
+    :param record: dictionary with the keys 'bibcode' and 'doi'
+    :param ft_file: full text XML path; base name minus extension is the identifier
+    :param update: if True, overwrite the existing database entry
+    :param dryrun: if True, nothing is uploaded and nothing is stored
+    :return: None
+    """
+    config = current_app.config
+    # If we're updating, grab the existing database entry
+    graphic = None
+    if update:
+        graphic = db.session.query(GraphicsModel).filter(
+            GraphicsModel.bibcode == record['bibcode']).first()
+    # Get the article identifier from the full text file name
+    identifier = os.path.splitext(os.path.basename(ft_file))[0]
+    # Get the JPEG files in the article's directory below the EDP location
+    srcdir = config.get('GRAPHICS_GRAPHICS_LOCATION').get('EDP')
+    thumbs = glob.glob('%s/%s/*.jpg' % (srcdir, identifier))
+    # Keep images with 'fig' and without 'small' in the *file name*; the
+    # directory part of the path must not influence the selection
+    names = [os.path.basename(t) for t in thumbs]
+    thumbs = [(t, n) for t, n in zip(thumbs, names)
+              if 'fig' in n.lower() and 'small' not in n.lower()]
+    # On S3, thumbnails go to seri/A+A/<volume>/<identifier>/<file name>
+    bucket = config.get('GRAPHICS_AWS_S3_BUCKET')
+    s3_url = config.get('GRAPHICS_AWS_S3_URL')
+    volno = record['bibcode'][9:13].replace('.', '0')
+    thumb_bucket = "seri/A+A/%s/%s" % (volno, identifier)
+    # Create the S3 client used to copy over the files
+    client = get_boto_session().client('s3')
+    # Currently we just process JPEG files
+    mimetype = 'image/jpeg'
+    highres = "http://dx.doi.org/%s" % record['doi']
+    figures = []
+    for thumb, name in thumbs:
+        # Distill the figure number from the file name (0 if there is none)
+        match = re.match(r'.*fig(\d+)', name, re.IGNORECASE)
+        fignr = match.group(1) if match else 0
+        figure_id = os.path.splitext(name)[0]
+        # S3 URL for thumbnail is:
+        # https://s3.amazonaws.com/adsabs-thumbnails/seri/A%2BA/0595/aa29175-16/aa29175-16-fig1.jpg
+        key = "%s/%s" % (thumb_bucket, name)
+        thumbURL = "%s/%s/%s" % (s3_url, bucket, urllib.quote(key))
+        if not dryrun:
+            # Upload first, so only thumbnails present on S3 get recorded
+            try:
+                data = open(thumb, 'rb')
+            except (IOError, OSError) as e:
+                sys.stderr.write('Error loading image data for %s: %s\n' % (thumb, str(e)))
+                continue
+            with data:
+                client.put_object(Key=key, Bucket=bucket, Body=data,
+                                  ACL='public-read', ContentType=mimetype)
+        image = {'image_id': figure_id, 'thumbnail': thumbURL,
+                 'format': mimetype.split('/')[1], 'highres': highres}
+        figures.append({'figure_id': figure_id, 'figure_caption': '',
+                        'figure_label': "Figure %s" % fignr,
+                        'figure_number': fignr, 'images': [image]})
+    # Sort numerically: the numbers are strings, and '10' < '2' as text
+    figures.sort(key=lambda fig: int(fig['figure_number']))
+    if figures and not dryrun:
+        graph_src = config.get('GRAPHICS_SOURCE_NAMES').get('EDP')
+        if graphic is not None:
+            sys.stderr.write('Updating %s\n' % record['bibcode'])
+            graphic.source = graph_src
+            graphic.figures = figures
+            graphic.modtime = datetime.now()
+        else:
+            sys.stderr.write('Creating new record for %s\n' % record['bibcode'])
+            graphic = GraphicsModel(bibcode=record['bibcode'], doi=record['doi'],
+                                    source=graph_src, eprint=False,
+                                    figures=figures, modtime=datetime.now())
+            db.session.add(graphic)
+        db.session.commit()