Skip to content

Commit

Permalink
Added A&A thumbnails
Browse files Browse the repository at this point in the history
  • Loading branch information
ehenneken committed Nov 3, 2016
1 parent 0df4141 commit 20e317a
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
### 1.0.9

* graphics for A&A

### 1.0.8

* Maintenance update
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ libmagic==1.0
python-magic==0.4.6
Pillow==2.9.0
timeout-decorator
boto3
14 changes: 14 additions & 0 deletions service/aws_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from boto3.session import Session
from flask import current_app

def get_boto_session():
    """
    Gets a boto3 session using credentials stored in app.config; assumes an
    app context is active.

    The unprefixed keys (AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION) are
    consulted first. When one of them is not set, the corresponding
    "GRAPHICS_"-prefixed key is used instead, since that is the name under
    which the service configuration defines these settings.

    :return: boto3.session.Session instance
    """
    config = current_app.config

    def _setting(name):
        # Keep honouring the unprefixed key; fall back to the prefixed key
        # that the service configuration actually defines.
        value = config.get(name)
        if value is None:
            value = config.get('GRAPHICS_%s' % name)
        return value

    return Session(
        aws_access_key_id=_setting('AWS_ACCESS_KEY'),
        aws_secret_access_key=_setting('AWS_SECRET_KEY'),
        region_name=_setting('AWS_REGION')
    )
12 changes: 12 additions & 0 deletions service/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# The key defines the set and the values are journals (or categories, in the
# case of arXiv)
GRAPHICS_PUBSETS = {
'EDP':['A&A'],
'IOP':['ApJ','ApJL','ApJS','AJ'],
'Elsevier':['NewA'],
'arXiv': ['arXiv', 'acc-phys', 'adap-org', 'alg-geom',
Expand All @@ -23,19 +24,24 @@
GRAPHICS_EXTSOURCES = [
    'IOP',
    'Elsevier',
]
# Text shown with the graphics of a source whose images link to an
# external site
GRAPHICS_HEADER = {
    'EDP': (
        'Every image links to the article on '
        '<a href="http://www.aanda.org/" target="_new">'
        'Astronomy &amp; Astrophysics</a>'
    ),
    'IOP': (
        'Every image links to the '
        '<a href="http://www.astroexplorer.org/" target="_new">'
        'AAS "Astronomy Image Explorer"</a> for more detail.'
    ),
    'Elsevier': (
        'Every image links to the article on '
        '<a href="http://www.sciencedirect.com" target="_new">'
        'ScienceDirect</a>'
    ),
}
# Per source: the map file used to find the full text file that goes with
# a given identifier
GRAPHICS_FULLTEXT_MAPS = {
    'EDP': '/path/to/EDP.map',
    'IOP': '/path/to/IOP.map',
    'arXiv': '/path/to/arXiv.map',
}
# Per source: the location of local graphics files
GRAPHICS_GRAPHICS_LOCATION = {}
# Per source: a file with back data, if available
GRAPHICS_BACK_DATA_FILE = {}
# These are the values to be stored as "source" in the graphics database
GRAPHICS_SOURCE_NAMES = {
'EDP': 'EDP',
'IOP': 'IOP',
'Elsevier':'Elsevier',
'arXiv': 'arXiv',
Expand All @@ -53,6 +59,12 @@
# This section configures this application to act as a client, for example
# to query solr via adsws
# The token value below is a placeholder, not a working token.
GRAPHICS_API_TOKEN = 'we will provide an api key token for this application'
# To communicate with AWS
# The access key, secret key and region below are placeholders; the real
# values are expected to come from local_config.py.
GRAPHICS_AWS_ACCESS_KEY = 'this will be provided through local_config.py'
GRAPHICS_AWS_SECRET_KEY = 'this will be provided through local_config.py'
GRAPHICS_AWS_REGION = 'this will be provided through local_config.py'
# Base URL and bucket name; thumbnail URLs are built as
# <GRAPHICS_AWS_S3_URL>/<GRAPHICS_AWS_S3_BUCKET>/<key>, and the bucket is
# also the upload target for the thumbnails.
GRAPHICS_AWS_S3_URL = 'https://s3.amazonaws.com'
GRAPHICS_AWS_S3_BUCKET = ''
# Config for logging
GRAPHICS_LOGGING = {
'version': 1,
Expand Down
147 changes: 147 additions & 0 deletions service/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import os
import re
import sys
import glob
import shutil
import commands
import urllib
from operator import itemgetter
import requests
from flask import current_app, request
Expand All @@ -12,6 +14,7 @@
from datetime import datetime
from invenio_tools import extract_captions, prepare_image_data,\
extract_context, remove_dups
from aws_tools import get_boto_session

requests.packages.urllib3.disable_warnings()

Expand Down Expand Up @@ -620,3 +623,147 @@ def manage_Elsevier_graphics(record, update=False, dryrun=False):
return len(figures)
else:
return figures

def process_EDP_graphics(identifiers, force, dryrun=False):
    """
    For the set of identifiers supplied, retrieve the graphics data.

    If force is false, a record is skipped when its bibcode is already in
    the database; if force is true, the existing entry is updated instead.
    Records without an XML entry in the EDP full text map are reported on
    stderr and skipped. A record whose processing raises an exception is
    reported on stderr and does not stop the remaining records.

    :param identifiers: list of dictionaries, one per record. Each needs a
        'bibcode' entry (used to check if a record already exists and to
        find the full text file); the dictionary as a whole is handed to
        manage_EDP_graphics.
    :param force: if true, update records that already exist
    :param dryrun: passed on to manage_EDP_graphics
    :return: the result of the last manage_EDP_graphics call that did not
        raise, or None if there was none
    """
    # Create the mapping from bibcode to full text location
    bibcode2fulltext = {}
    map_file = current_app.config.get('GRAPHICS_FULLTEXT_MAPS').get('EDP')
    with open(map_file) as fh_map:
        for line in fh_map:
            try:
                bibcode, ft_file, _source = line.strip().split('\t')
            except ValueError:
                # Not a three-column line: ignore it
                continue
            # Only XML full text files are of interest
            if ft_file.lower().endswith('xml'):
                bibcode2fulltext[bibcode] = ft_file
    # Split the submitted records into updates and new records
    updates = []
    new = []
    for entry in identifiers:
        resp = db.session.query(GraphicsModel).filter(
            GraphicsModel.bibcode == entry['bibcode']).first()
        if resp is None:
            new.append(entry)
        elif force:
            updates.append(entry)
    # First process the updates, then the new records. Each batch carries
    # the labels used in its "skipped" and "error" messages.
    batches = (
        (updates, True, 'update', 'update'),
        (new, False, 'new record', 'new'),
    )
    nfigs = None
    for papers, is_update, skip_label, error_label in batches:
        for paper in papers:
            # Get the full text for this article
            fulltext = bibcode2fulltext.get(paper['bibcode'])
            if not fulltext:
                # No full text file, skip
                sys.stderr.write('No full text found for %s (%s)\n' % (
                    paper['bibcode'], skip_label))
                continue
            try:
                nfigs = manage_EDP_graphics(
                    paper, fulltext, update=is_update, dryrun=dryrun)
            except Exception as e:
                # Deliberately broad: one failing record must not stop the
                # rest of the batch, so report it and carry on.
                sys.stderr.write('Error processing %s %s (%s)\n' % (
                    error_label, paper['bibcode'], e))
    return nfigs

def manage_EDP_graphics(record, ft_file, update=False, dryrun=False):
    """
    Upload the thumbnails of one EDP (A&A) article to S3 and store the
    figure data in the graphics database.

    The JPEG files are looked up in <EDP graphics location>/<article ID>,
    where the article ID is the base name of the full text file without
    '.xml'. Only files with 'fig' and without 'small' in their path are
    used. On S3 the thumbnails are stored under
    seri/A+A/<volume>/<article ID>/<file name>.

    NOTE(review): images are uploaded to S3 even when dryrun is set; only
    the database write is skipped. Confirm that this is intended.

    :param record: dictionary with at least 'bibcode' and 'doi' entries
    :param ft_file: location of the XML full text file of the article
    :param update: if true, modify the existing database entry for the
        bibcode instead of creating a new one
    :param dryrun: if true, nothing is written to the database
    :return: None
    """
    # If we're updating, grab the existing database entry
    if update:
        graphic = db.session.query(GraphicsModel).filter(
            GraphicsModel.bibcode == record['bibcode']).first()
    else:
        graphic = None
    # Get the article identifier from the full text file name
    identifier = os.path.basename(ft_file).replace('.xml', '')
    # and get the location of the graphics files
    srcdir = current_app.config.get('GRAPHICS_GRAPHICS_LOCATION').get('EDP')
    # Get the JPEG files in the source directory, keeping only those with
    # 'fig' and without 'small' in the path
    thumbs = [t for t in glob.glob('%s/%s/*.jpg' % (srcdir, identifier))
              if 'fig' in t.lower() and 'small' not in t.lower()]
    # On S3, thumbnails go to
    # <bucket>/seri/A+A/<volume>/<article ID>
    bucket = current_app.config.get('GRAPHICS_AWS_S3_BUCKET')
    s3_url = current_app.config.get('GRAPHICS_AWS_S3_URL')
    volno = record['bibcode'][9:13].replace('.', '0')
    thumb_bucket = "seri/A+A/%s/%s" % (volno, identifier)
    # Create the S3 client used to copy over the files
    client = get_boto_session().client('s3')
    # Currently we just process JPEG files
    mimetype = 'image/jpeg'
    figures = []
    for thumb in thumbs:
        filename = os.path.basename(thumb)
        # File name without its extension doubles as figure and image ID
        figure_id = os.path.splitext(filename)[0]
        # Try to distill the figure number from the file name; the greedy
        # prefix picks the last "fig<digits>" in the name. The match is
        # case-insensitive because the file filter above is as well.
        found = re.match(r'.*fig(\d+)', filename, re.IGNORECASE)
        fignr = found.group(1) if found else 0
        # S3 URL for thumbnail is:
        # https://s3.amazonaws.com/adsabs-thumbnails/seri/A%2BA/0595/aa29175-16/aa29175-16-fig1.jpg
        key = "%s/%s" % (thumb_bucket, filename)
        thumb_url = "%s/%s/%s" % (s3_url, bucket, urllib.quote(key))
        image = {'image_id': figure_id,
                 'thumbnail': thumb_url,
                 'format': mimetype.split('/')[1],
                 'highres': "http://dx.doi.org/%s" % record['doi']}
        fig_data = {'figure_id': figure_id,
                    'figure_label': "Figure %s" % fignr,
                    'figure_caption': '',
                    'figure_number': fignr,
                    'images': [image]}
        # Upload the image to S3
        try:
            data = open(thumb, 'rb')
        except (IOError, OSError) as e:
            sys.stderr.write('Error loading image data for %s: %s\n' % (thumb, str(e)))
            continue
        # Make sure the file handle is released once the upload is done
        with data:
            client.put_object(Key=key, Bucket=bucket, Body=data,
                              ACL='public-read', ContentType=mimetype)
        # Only record figures whose thumbnail actually made it to S3
        figures.append(fig_data)
    # Figure numbers are digit strings: sort on their numeric value so
    # that figure 10 does not end up before figure 2
    figures.sort(key=lambda fig: int(fig['figure_number']))
    if figures and not dryrun:
        graph_src = current_app.config.get('GRAPHICS_SOURCE_NAMES').get('EDP')
        if update:
            sys.stderr.write('Updating %s\n' % record['bibcode'])
            graphic.source = graph_src
            graphic.figures = figures
            graphic.modtime = datetime.now()
        else:
            sys.stderr.write('Creating new record for %s\n' % record['bibcode'])
            graphic = GraphicsModel(
                bibcode=record['bibcode'],
                doi=record['doi'],
                source=graph_src,
                eprint=False,
                figures=figures,
                modtime=datetime.now()
            )
            db.session.add(graphic)
        db.session.commit()

0 comments on commit 20e317a

Please sign in to comment.