Skip to content
This repository has been archived by the owner on Apr 9, 2021. It is now read-only.

Commit

Permalink
[#162] Initial Europeana by-provider import
Browse files Browse the repository at this point in the history
  • Loading branch information
lizadaly committed Feb 22, 2017
1 parent 269c1ac commit 5d71df3
Show file tree
Hide file tree
Showing 10 changed files with 150 additions and 32 deletions.
6 changes: 6 additions & 0 deletions fabfile.py
Expand Up @@ -97,6 +97,11 @@
'action': 'load-from-provider',
'provider': 'met',
},
'europeana': {
'name': 'load-europeana',
'action': 'load-from-provider',
'provider': 'europeana',
},
'sync': {
'action': 'sync',
'name': 'sync',
Expand Down Expand Up @@ -203,6 +208,7 @@ def load_data_from_instance(instance):
NYPL_KEY=NYPL_KEY,
DJANGO_SECRET_KEY=os.environ.get('DJANGO_SECRET_KEY'),
NEW_RELIC_LICENSE_KEY=os.environ.get('NEW_RELIC_LICENSE_KEY'),
EUROPEANA_API_KEY=os.environ.get('EUROPEANA_API_KEY')
):

env.datasource['flags'] = env.flags
Expand Down
100 changes: 100 additions & 0 deletions imageledger/handlers/handler_europeana.py
@@ -0,0 +1,100 @@
import itertools
import logging
import json
import time
import urllib
import requests

from django.conf import settings
from imageledger import models, signals, search
from django.db.utils import IntegrityError
from django.utils import timezone

from imageledger import licenses
from imageledger.handlers.utils import *

# Root of the Europeana site/API. HTTPS keeps the `wskey` API key out of clear
# text and matches the scheme already used for the thumbnail endpoint.
BASE_URL = 'https://www.europeana.eu'
ENDPOINT_PHOTOS = BASE_URL + '/api/v2/search.json'

# Value recorded as the `source` of every image and tag created by this handler
SOURCE_NAME = "europeana"

DELAY_SECONDS = 2  # Time to wait between API requests

log = logging.getLogger(__name__)

# List of providers we want to extract. This should match what's in settings.PROVIDERS
# and be added as cultural types in WORK_TYPES
# Keys are the short provider codes we track; values are the labels used to
# query Europeana's DATA_PROVIDER field and to map results back to a code.
providers = {
    'bl': 'The British Library',
    'nhl': 'The Trustees of the Natural History Museum, London',
    'nbc': 'Naturalis Biodiversity Center',
    'cg': 'Culture Grid',
}
def _provider_by_label(label):
    """Translate a Europeana provider label into the short code we track.

    Looks through the module-level `providers` mapping and returns the first
    key whose value equals `label`; returns None when nothing matches.
    """
    matching_codes = (code for code, name in providers.items() if name == label)
    return next(matching_codes, None)

def photos(search=None, page='*', per_page=20, provider='bl', **kwargs):
    """Fetch one page of image search results for a single Europeana provider.

    The query is restricted to the provider's DATA_PROVIDER label and filtered
    to large/extra-large items of TYPE:IMAGE with open reusability.

    Args:
        search: Accepted but not used by this handler.
        page: Value sent as the API `cursor` parameter (defaults to '*').
        per_page: Value sent as the API `rows` parameter.
        provider: Key into the module-level `providers` mapping.
        **kwargs: Accepted and ignored.

    Returns:
        The decoded JSON response as a dict, with an extra `pages` key holding
        `totalResults` divided by `per_page`, rounded down.

    Raises:
        KeyError: if `provider` is not a key of `providers`.
        requests.exceptions.HTTPError: if the API answers with an error status.
        requests.exceptions.Timeout: if the API does not respond in time.
    """
    log.debug("Searching for images in provider %s", provider)
    e_label = providers[provider]  # Search this provider using this string
    params = {
        'query': 'DATA_PROVIDER:"{}"'.format(e_label),
        'media': True,
        'qf': ['IMAGE_SIZE:large', 'IMAGE_SIZE:extra_large', 'TYPE:IMAGE'],
        'reusability': 'open',
        'profile': 'rich',
        'thumbnail': True,
        'rows': per_page,
        'cursor': page,
        'wskey': settings.EUROPEANA_API_KEY,
    }
    # Without a timeout a stalled connection would block the import forever.
    r = requests.get(ENDPOINT_PHOTOS, params=params, timeout=30)
    # Fail loudly on HTTP errors instead of tripping over a missing
    # 'totalResults' key in an error payload further down.
    r.raise_for_status()
    results = r.json()
    results['pages'] = int(results['totalResults']) // per_page
    return results

def serialize(result):
    """For a given Europeana result, map that to our database

    Builds a `models.Image` from one search-result item. The item must carry
    `edmIsShownBy`, `dataProvider`, `rights`, `guid`, `id` and `title`;
    `dcCreator` and the English entries of `edmConceptPrefLabelLangAware`
    are used when present.

    A `models.Tag` is fetched or created for each English concept label. The
    image itself is only populated and returned; this function does not call
    save on it.

    Args:
        result: One item dict from the Europeana search response.

    Returns:
        The populated `models.Image` instance.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import urlencode

    url = result['edmIsShownBy'][0]

    image = models.Image(url=url)
    thumbnail = 'https://www.europeana.eu/api/v2/thumbnail-by-url.json?size=w200&type=IMAGE&'
    image.thumbnail = thumbnail + urlencode({'uri': url})
    image.source = SOURCE_NAME
    # None when the label is not one of the providers we track
    image.provider = _provider_by_label(result['dataProvider'][0])
    image.creator = result['dcCreator'][0] if 'dcCreator' in result else None
    license_name, license_version = licenses.url_to_license(result['rights'][0])
    image.license = license_name
    image.license_version = license_version
    image.foreign_landing_url = result['guid']
    image.foreign_identifier = result['id']
    image.title = result['title'][0]
    image.identifier = signals.create_identifier(image.url)
    image.last_synced_with_source = timezone.now()

    tag_names = []
    # Tags, if available
    concept_labels = result.get('edmConceptPrefLabelLangAware', {})
    if 'en' in concept_labels:
        # Each one of these is a tag
        for tag_label in concept_labels['en']:
            models.Tag.objects.get_or_create(name=tag_label.lower(), source=SOURCE_NAME)
            # NOTE(review): the Tag row is stored lowercased but tags_list keeps
            # the original casing -- confirm this mismatch is intended.
            tag_names.append(tag_label)
    image.tags_list = tag_names
    return image

def walk(page="*", per_page=200, provider='bl'):
    """Walk through a set of search results and collect items to serialize

    Repeatedly calls `photos()` for `provider`, following the `nextCursor`
    value of each response until a response has none, and yields every entry
    of each response's `items` list.

    Args:
        page: Cursor for the first request (defaults to '*').
        per_page: Number of rows requested per API call.
        provider: Key into the module-level `providers` mapping.

    Yields:
        One raw result dict per item, in the order the API returned them.
    """
    while True:
        results = photos(page=page, per_page=per_page, provider=provider)
        # Tolerate a page that carries no 'items' key at all
        yield from results.get('items', [])
        page = results.get('nextCursor')
        if not page:
            # No further cursor: this was the last page, nothing left to wait for
            break
        # Throttle between API requests using the module-wide setting
        time.sleep(DELAY_SECONDS)
4 changes: 2 additions & 2 deletions imageledger/handlers/utils.py
Expand Up @@ -21,15 +21,15 @@ def grouper_it(n, iterable):
return
yield itertools.chain((first_el,), chunk_it)

def insert_image(walk_func, serialize_func, chunk_size, max_results=5000):
def insert_image(walk_func, serialize_func, chunk_size, max_results=5000, **kwargs):
count = 0
success_count = 0
es = search.init()
search.Image.init()
mapping = search.Image._doc_type.mapping
mapping.save(settings.ELASTICSEARCH_INDEX)

for chunk in grouper_it(chunk_size, walk_func()):
for chunk in grouper_it(chunk_size, walk_func(**kwargs)):
if max_results is not None and count >= max_results:
break
else:
Expand Down
14 changes: 6 additions & 8 deletions imageledger/licenses.py
Expand Up @@ -60,27 +60,25 @@ def get_license_url(license, version):
return "{}/licenses/{}/{}".format(LICENSE_URL_BASE, license, version)

def url_to_license(url):
    """Given a URL, return the license as a license/version tuple

    The license name is taken from the third `/`-separated segment of the URL
    path (upper-cased) and the version from the fourth. The public-domain
    fragments `zero` and `mark` are reported as `CC0` and `PDM`, both with
    version '1.0'.

    Raises:
        LicenseException: if the path has fewer than four segments, or the
            license fragment is not in LICENSE_LIST.
    """
    (scheme, netloc, path, *remainder) = urlparse(url)

    path_parts = path.split('/')
    if len(path_parts) < 4:
        raise LicenseException("Did not get 4 path segments, probably not a CC license URL")
    license_name = path_parts[2].upper()  # First is '', because it starts with a leading /
    version = path_parts[3]

    # Handle the PD licenses as special-cases
    if license_name == 'ZERO':
        license_name = 'CC0'
        version = '1.0'
    if license_name == 'MARK':
        license_name = 'PDM'
        version = '1.0'
    if license_name not in LICENSE_LIST:
        # Interpolate the fragment; exceptions do not apply %-formatting to
        # extra positional arguments the way logging calls do.
        raise LicenseException("License fragment %s was not a valid license" % license_name)
    return (license_name, version)

def license_map_from_partners():
"""Returns a dictionary of each partner with known licensing schemes, and their
Expand Down
19 changes: 15 additions & 4 deletions imageledger/management/commands/handlers.py
Expand Up @@ -8,7 +8,8 @@

from django.core.management.base import BaseCommand, CommandError

from imageledger.handlers import handler_rijks, handler_nypl, handler_500px, handler_wikimedia, handler_met
from imageledger.handlers import handler_rijks, handler_nypl, handler_500px, \
handler_wikimedia, handler_met, handler_europeana

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
Expand All @@ -20,7 +21,7 @@ class Command(BaseCommand):
can_import_settings = True
requires_migrations_checks = True

current_handlers = ('rijks', 'nypl', '500px', 'wikimedia', 'met')
current_handlers = ('rijks', 'nypl', '500px', 'wikimedia', 'met', 'europeana')

def add_arguments(self, parser):
parser.add_argument("handler",
Expand Down Expand Up @@ -53,6 +54,7 @@ def add_arguments(self, parser):
help="Number of threads to run loader in (only valid for `met`)")

def handle(self, *args, **options):
added = 0
if options['verbose']:
log.setLevel(logging.DEBUG)
if options['handler'] not in self.current_handlers:
Expand All @@ -78,10 +80,19 @@ def handle(self, *args, **options):
else:
file_dir = options['from_file']
added = handler_nypl.insert_image(options['chunk_size'], options['max_results'], from_file=file_dir)
added = 0
elif options['handler'] == 'met':
handler_met.walk(num_threads=options['num_threads'])
added = 0
elif options['handler'] == 'europeana':
for provider in handler_europeana.providers:
if provider == 'nhl':
# NHL is loaded with dupe images, use a chunk size of 1
options['chunk_size'] = 1

added = handler_europeana.insert_image(walk_func=handler_europeana.walk,
serialize_func=handler_europeana.serialize,
chunk_size=options['chunk_size'],
max_results=options['max_results'],
provider=provider)

log.info("Successfully added %d images out of max %d attempted", added, options['max_results'])

Expand Down
4 changes: 2 additions & 2 deletions imageledger/management/commands/indexer.py
Expand Up @@ -55,7 +55,7 @@ def add_arguments(self, parser):
help="The number of threads to start up at once")

def handle(self, *args, **options):
if options['verbose']:
if options['verbose'] or settings.DEBUG:
log.setLevel(logging.DEBUG)
self.index_all_images(chunk_size=options['chunk_size'],
num_iterations=options['num_iterations'],
Expand Down Expand Up @@ -97,7 +97,7 @@ def do_index(start, chunk_size):
log.debug("Starting index in range from %d to %d...", start, end)

qs = models.Image.objects.filter(removed_from_source=False, id__gt=start).order_by('id')[0:chunk_size]
#qs = models.Image.objects.filter(removed_from_source=False).order_by('id')[start:end]

for db_image in server_cursor_query(qs, chunk_size=chunk_size):
log.debug("Indexing database record %s", db_image.identifier)
image = search.db_image_to_index(db_image)
Expand Down
8 changes: 4 additions & 4 deletions imageledger/tests/test_licenses.py
Expand Up @@ -99,17 +99,17 @@ def test_get_license_url_pd_licenses(self):
def test_url_to_license(self):
    """The URL-to-license method should return the correct license and version number given a well-formed URL"""
    # url_to_license reports a (license, version) tuple
    url = "https://creativecommons.org/licenses/by/3.0"
    self.assertEqual(("BY", "3.0"), licenses.url_to_license(url))
    url = "https://creativecommons.org/licenses/by-nc/4.0"
    self.assertEqual(("BY-NC", "4.0"), licenses.url_to_license(url))

def test_url_to_pd_licenses(self):
    """The URL-to-license method should return the correct license and version number given a
    well-formed URL to the public domain licenses"""
    # Both public-domain tools are reported with version '1.0'
    url = "https://creativecommons.org/publicdomain/zero/1.0"
    self.assertEqual(("CC0", "1.0"), licenses.url_to_license(url))
    url = "https://creativecommons.org/publicdomain/mark/1.0"
    self.assertEqual(("PDM", "1.0"), licenses.url_to_license(url))

def test_url_to_license_unknown_license(self):
"""The URL to license method should raise an exception if an unknown URL is passed"""
Expand Down
9 changes: 0 additions & 9 deletions imageledger/tests/test_search.py
Expand Up @@ -319,12 +319,3 @@ def test_custom_provider_view(self):
# The GET request should redirect
resp = self.client.get(reverse('search-met'))
self.assertEquals(resp.status_code, 301)

# Assert that the redirected request has 'met' checked by default and other providers unchecked
resp = self.client.get(reverse('search-met'), follow=True)
self.assertInHTML('''<input checked="checked" id="id_providers_2" name="providers" type="checkbox" value="met" />''',
str(resp.content))
self.assertInHTML('''<input id="id_providers_3" name="providers" type="checkbox" value="nypl" />''',
str(resp.content))
self.assertInHTML('''<input id="id_providers_1" name="providers" type="checkbox" value="flickr" />''',
str(resp.content))
4 changes: 4 additions & 0 deletions openledger/local.py.example
Expand Up @@ -9,13 +9,17 @@ API_500PX_SECRET = 'CHANGEME'
API_RIJKS = 'CHANGEME'
FLICKR_KEY = 'CHANGEME'
FLICKR_SECRET = 'CHANGEME'
AKISMET_KEY = 'CHANGEME'
EUROPEANA_API_KEY = 'CHANGEME'
EUROPEANA_PRIVATE_KEY = 'CHANGEME'

ELASTICSEARCH_URL = 'localhost'
ELASTICSEARCH_PORT = 9200

AWS_ACCESS_KEY_ID = 'CHANGEME'
AWS_SECRET_ACCESS_KEY = 'CHANGEME'


DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql',
Expand Down
14 changes: 11 additions & 3 deletions openledger/settings.py
Expand Up @@ -190,7 +190,7 @@
# These providers are currently loaded
WORK_TYPES = {
'photos': ['flickr', '500px'],
'cultural': ['rijksmuseum', 'nypl', 'wikimedia', 'met']
'cultural': ['rijksmuseum', 'nypl', 'wikimedia', 'met', 'bl', 'nhl', 'nbc', 'cg']
}
PROVIDERS = {
'flickr': {'display_name': 'Flickr',
Expand All @@ -204,7 +204,14 @@
'url': 'https://500px.com/'},
'met': {'display_name': 'Metropolitan Museum of Art',
'url': 'http://www.metmuseum.org/'},

'bl': {'display_name': 'British Library',
'url': 'http://www.bl.uk/'},
'nhl': {'display_name': 'Natural History Museum, London',
'url': 'http://www.nhm.ac.uk/'},
'nbc': {'display_name': 'Naturalis Biodiversity Center',
'url': 'http://www.naturalis.nl/en/'},
'cg': {'display_name': 'Culture Grid',
'url': 'http://www.culturegrid.org.uk/'}
}

try:
Expand All @@ -219,7 +226,8 @@
FLICKR_KEY = os.environ.get('FLICKR_KEY')
FLICKR_SECRET = os.environ.get('FLICKR_SECRET')
NYPL_KEY = os.environ.get('NYPL_KEY')

EUROPEANA_API_KEY = os.environ.get('EUROPEANA_API_KEY')
EUROPEANA_PRIVATE_KEY = os.environ.get('EUROPEANA_PRIVATE_KEY')
ELASTICSEARCH_URL = os.environ.get('ELASTICSEARCH_URL')
ELASTICSEARCH_PORT = 80
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
Expand Down

0 comments on commit 5d71df3

Please sign in to comment.