From 8815f4c2083ca007988441ef7d21893ddaa9cc55 Mon Sep 17 00:00:00 2001 From: Liza Daly Date: Tue, 21 Feb 2017 16:38:58 -0500 Subject: [PATCH] [#162] Initial Europeana by-provider import --- imageledger/handlers/handler_europeana.py | 100 ++++++++++++++++++++ imageledger/handlers/utils.py | 4 +- imageledger/licenses.py | 14 ++- imageledger/management/commands/handlers.py | 19 +++- imageledger/management/commands/indexer.py | 4 +- imageledger/tests/test_licenses.py | 8 +- imageledger/tests/test_search.py | 9 -- openledger/local.py.example | 4 + openledger/settings.py | 14 ++- 9 files changed, 144 insertions(+), 32 deletions(-) create mode 100644 imageledger/handlers/handler_europeana.py diff --git a/imageledger/handlers/handler_europeana.py b/imageledger/handlers/handler_europeana.py new file mode 100644 index 0000000..47950ef --- /dev/null +++ b/imageledger/handlers/handler_europeana.py @@ -0,0 +1,100 @@ +import itertools +import logging +import json +import time +import urllib +import requests + +from django.conf import settings +from imageledger import models, signals, search +from django.db.utils import IntegrityError +from django.utils import timezone + +from imageledger import licenses +from imageledger.handlers.utils import * + +BASE_URL = 'http://www.europeana.eu' +ENDPOINT_PHOTOS = BASE_URL + '/api/v2/search.json' + +SOURCE_NAME = "europeana" + +DELAY_SECONDS = 2 # Time to wait between API requests + +log = logging.getLogger(__name__) + +# List of providers we want to extract. This should match what's in settings.PROVIDERS +# and be added as cultural types in WORK_TYPES +providers = { +# 'bl': 'The British Library', +# 'nhl': 'The Trustees of the Natural History Museum, London', + 'nbc': 'Naturalis Biodiversity Center', + 'cg': 'Culture Grid', +} +def _provider_by_label(label): + """Given a Europeana label, return the provider string we'll track""" + for k, v in providers.items(): + if v == label: + return k + +def photos(search=None, page='*', per_page=20, provider='bl', **kwargs): + log.debug("Searching for images in provider %s", provider) + e_label = providers[provider] # Search this provider using this string + params = { + 'query': 'DATA_PROVIDER:"{}"'.format(e_label), + 'media': True, + 'qf': ['IMAGE_SIZE:large', 'IMAGE_SIZE:extra_large', 'TYPE:IMAGE',], + 'reusability': 'open', + 'profile': 'rich', + 'thumbnail': True, + 'rows': per_page, + 'cursor': page, + 'wskey': settings.EUROPEANA_API_KEY, + } + r = requests.get(ENDPOINT_PHOTOS, params=params) + results = r.json() + results['pages'] = int(int(results['totalResults']) / per_page) + return results + +def serialize(result): + """For a given Europeana result, map that to our database""" + url = result['edmIsShownBy'][0] + + image = models.Image(url=url) + thumbnail = 'https://www.europeana.eu/api/v2/thumbnail-by-url.json?size=w200&type=IMAGE&' + image.thumbnail = thumbnail + urllib.parse.urlencode({'uri': url}) + image.source = SOURCE_NAME + image.provider = _provider_by_label(result['dataProvider'][0]) + image.creator = result['dcCreator'][0] if 'dcCreator' in result else None + license, version = licenses.url_to_license(result['rights'][0]) + image.license = license + image.license_version = version + image.foreign_landing_url = result['guid'] + image.foreign_identifier = result['id'] + image.title = result['title'][0] + image.identifier = signals.create_identifier(image.url) + image.last_synced_with_source = timezone.now() + + tag_names = [] + # Tags, if available + if 'edmConceptPrefLabelLangAware' in result and 'en' in result['edmConceptPrefLabelLangAware']: + # Each one of these is a tag + for tag_label in result['edmConceptPrefLabelLangAware']['en']: + #log.debug("Adding tag %s", tag_label) + models.Tag.objects.get_or_create(name=tag_label.lower(), source=SOURCE_NAME) + tag_names.append(tag_label) + image.tags_list = tag_names + #log.debug("'%s' from %s", image.title, image.provider) + return image + +def walk(page="*", per_page=200, provider='bl'): + """Walk through a set of search results and collect items to serialize""" + has_more = True + + while has_more: + results = photos(page=page, per_page=per_page, provider=provider) + page = results.get('nextCursor') + if not page: + has_more = False + for result in results['items']: + yield result + time.sleep(2) diff --git a/imageledger/handlers/utils.py b/imageledger/handlers/utils.py index 39a71aa..c03c3b6 100644 --- a/imageledger/handlers/utils.py +++ b/imageledger/handlers/utils.py @@ -21,7 +21,7 @@ def grouper_it(n, iterable): return yield itertools.chain((first_el,), chunk_it) -def insert_image(walk_func, serialize_func, chunk_size, max_results=5000): +def insert_image(walk_func, serialize_func, chunk_size, max_results=5000, **kwargs): count = 0 success_count = 0 es = search.init() @@ -29,7 +29,7 @@ def insert_image(walk_func, serialize_func, chunk_size, max_results=5000): mapping = search.Image._doc_type.mapping mapping.save(settings.ELASTICSEARCH_INDEX) - for chunk in grouper_it(chunk_size, walk_func()): + for chunk in grouper_it(chunk_size, walk_func(**kwargs)): if max_results is not None and count >= max_results: break else: diff --git a/imageledger/licenses.py b/imageledger/licenses.py index c72a837..ac73f87 100644 --- a/imageledger/licenses.py +++ b/imageledger/licenses.py @@ -60,10 +60,11 @@ def get_license_url(license, version): return "{}/licenses/{}/{}".format(LICENSE_URL_BASE, license, version) def url_to_license(url): - """Given a URL, return the license""" + """Given a URL, return the license as a license/version tuple""" (scheme, netloc, path, *remainder) = urlparse(url) + path_parts = path.split('/') - if len(path_parts) != 4: + if len(path_parts) < 4: raise LicenseException("Did not get 4 path segments, probably not a CC license URL") license = path_parts[2].upper() # First is '', because it starts with a leading / version = path_parts[3] @@ -71,16 +72,13 @@ def url_to_license(url): # Handle the PD licenses as special-cases if license == 'ZERO': license = 'CC0' - version = None + version = '1.0' if license == 'MARK': license = 'PDM' - version = None + version = '1.0' if license not in LICENSE_LIST: raise LicenseException("License fragment %s was not a valid license", license) - if version: - return "{} {}".format(license, version) - else: - return license + return (license, version) def license_map_from_partners(): """Returns a dictionary of each partner with known licensing schemes, and their diff --git a/imageledger/management/commands/handlers.py b/imageledger/management/commands/handlers.py index e4b1b2f..fc42614 100644 --- a/imageledger/management/commands/handlers.py +++ b/imageledger/management/commands/handlers.py @@ -8,7 +8,8 @@ from django.core.management.base import BaseCommand, CommandError -from imageledger.handlers import handler_rijks, handler_nypl, handler_500px, handler_wikimedia, handler_met +from imageledger.handlers import handler_rijks, handler_nypl, handler_500px, \ + handler_wikimedia, handler_met, handler_europeana log = logging.getLogger(__name__) log.setLevel(logging.INFO) @@ -20,7 +21,7 @@ class Command(BaseCommand): can_import_settings = True requires_migrations_checks = True - current_handlers = ('rijks', 'nypl', '500px', 'wikimedia', 'met') + current_handlers = ('rijks', 'nypl', '500px', 'wikimedia', 'met', 'europeana') def add_arguments(self, parser): parser.add_argument("handler", @@ -53,6 +54,7 @@ def add_arguments(self, parser): help="Number of threads to run loader in (only valid for `met`)") def handle(self, *args, **options): + added = 0 if options['verbose']: log.setLevel(logging.DEBUG) if options['handler'] not in self.current_handlers: @@ -78,10 +80,19 @@ def handle(self, *args, **options): else: file_dir = options['from_file'] added = handler_nypl.insert_image(options['chunk_size'], options['max_results'], from_file=file_dir) - added = 0 elif options['handler'] == 'met': handler_met.walk(num_threads=options['num_threads']) - added = 0 + elif options['handler'] == 'europeana': + for provider in handler_europeana.providers: + if provider == 'nhl': + # NHL is loaded with dupe images, use a chunk size of 1 + options['chunk_size'] = 1 + + added = handler_europeana.insert_image(walk_func=handler_europeana.walk, + serialize_func=handler_europeana.serialize, + chunk_size=options['chunk_size'], + max_results=options['max_results'], + provider=provider) log.info("Successfully added %d images out of max %d attempted", added, options['max_results']) diff --git a/imageledger/management/commands/indexer.py b/imageledger/management/commands/indexer.py index d942b9d..c297277 100644 --- a/imageledger/management/commands/indexer.py +++ b/imageledger/management/commands/indexer.py @@ -55,7 +55,7 @@ def add_arguments(self, parser): help="The number of threads to start up at once") def handle(self, *args, **options): - if options['verbose']: + if options['verbose'] or settings.DEBUG: log.setLevel(logging.DEBUG) self.index_all_images(chunk_size=options['chunk_size'], num_iterations=options['num_iterations'], @@ -97,7 +97,7 @@ def do_index(start, chunk_size): log.debug("Starting index in range from %d to %d...", start, end) qs = models.Image.objects.filter(removed_from_source=False, id__gt=start).order_by('id')[0:chunk_size] - #qs = models.Image.objects.filter(removed_from_source=False).order_by('id')[start:end] + for db_image in server_cursor_query(qs, chunk_size=chunk_size): log.debug("Indexing database record %s", db_image.identifier) image = search.db_image_to_index(db_image) diff --git a/imageledger/tests/test_licenses.py b/imageledger/tests/test_licenses.py index 016443c..8bfceb2 100644 --- a/imageledger/tests/test_licenses.py +++ b/imageledger/tests/test_licenses.py @@ -99,17 +99,17 @@ def test_get_license_url_pd_licenses(self): def test_url_to_license(self): """The URL-to-license method should return the correct license and version number given a well-formed URL""" url = "https://creativecommons.org/licenses/by/3.0" - self.assertEquals("BY 3.0", licenses.url_to_license(url)) + self.assertEquals(("BY", "3.0"), licenses.url_to_license(url)) url = "https://creativecommons.org/licenses/by-nc/4.0" - self.assertEquals("BY-NC 4.0", licenses.url_to_license(url)) + self.assertEquals(("BY-NC", "4.0"), licenses.url_to_license(url)) def test_url_to_pd_licenses(self): """The URL-to-license method should return the correct license and version number given a well-formed URL to the public domain licenses""" url = "https://creativecommons.org/publicdomain/zero/1.0" - self.assertEquals("CC0", licenses.url_to_license(url)) + self.assertEquals(("CC0", "1.0"), licenses.url_to_license(url)) url = "https://creativecommons.org/publicdomain/mark/1.0" - self.assertEquals("PDM", licenses.url_to_license(url)) + self.assertEquals(("PDM", "1.0"), licenses.url_to_license(url)) def test_url_to_license_unknown_license(self): """The URL to license method should raise an exception if an unknown URL is passed""" diff --git a/imageledger/tests/test_search.py b/imageledger/tests/test_search.py index fdd47da..6002c38 100644 --- a/imageledger/tests/test_search.py +++ b/imageledger/tests/test_search.py @@ -319,12 +319,3 @@ def test_custom_provider_view(self): # The GET request should redirect resp = self.client.get(reverse('search-met')) self.assertEquals(resp.status_code, 301) - - # Assert that the redirected request has 'met' checked by default and other providers unchecked - resp = self.client.get(reverse('search-met'), follow=True) - self.assertInHTML('''''', - str(resp.content)) - self.assertInHTML('''''', - str(resp.content)) - self.assertInHTML('''''', - str(resp.content)) diff --git a/openledger/local.py.example b/openledger/local.py.example index 4276f41..a8805e3 100644 --- a/openledger/local.py.example +++ b/openledger/local.py.example @@ -9,6 +9,9 @@ API_500PX_SECRET = 'CHANGEME' API_RIJKS = 'CHANGEME' FLICKR_KEY = 'CHANGEME' FLICKR_SECRET = 'CHANGEME' +AKISMET_KEY = 'CHANGEME' +EUROPEANA_API_KEY = 'CHANGEME' +EUROPEANA_PRIVATE_KEY = 'CHANGEME' ELASTICSEARCH_URL = 'localhost' ELASTICSEARCH_PORT = 9200 @@ -16,6 +19,7 @@ ELASTICSEARCH_PORT = 9200 AWS_ACCESS_KEY_ID = 'CHANGEME' AWS_SECRET_ACCESS_KEY = 'CHANGEME' + DATABASES = { 'default': { 'ENGINE': 'django.db.backends.postgresql', diff --git a/openledger/settings.py b/openledger/settings.py index 58157ca..0df9bc3 100644 --- a/openledger/settings.py +++ b/openledger/settings.py @@ -190,7 +190,7 @@ # These providers are current loaded WORK_TYPES = { 'photos': ['flickr', '500px'], - 'cultural': ['rijksmuseum', 'nypl', 'wikimedia', 'met'] + 'cultural': ['rijksmuseum', 'nypl', 'wikimedia', 'met', 'bl', 'nhl', 'nbc', 'cg'] } PROVIDERS = { 'flickr': {'display_name': 'Flickr', @@ -204,7 +204,14 @@ 'url': 'https://500px.com/'}, 'met': {'display_name': 'Metropolitan Museum of Art', 'url': 'http://www.metmuseum.org/'}, - + 'bl': {'display_name': 'British Library', + 'url': 'http://www.bl.uk/'}, + 'nhl': {'display_name': 'Natural History Museum, London', + 'url': 'http://www.nhm.ac.uk/'}, + 'nbc': {'display_name': 'Naturalis Biodiversity Center', + 'url': 'http://www.naturalis.nl/en/'}, + 'cg': {'display_name': 'Culture Grid', + 'url': 'http://www.culturegrid.org.uk/'} } try: @@ -219,7 +226,8 @@ FLICKR_KEY = os.environ.get('FLICKR_KEY') FLICKR_SECRET = os.environ.get('FLICKR_SECRET') NYPL_KEY = os.environ.get('NYPL_KEY') - + EUROPEANA_API_KEY = os.environ.get('EUROPEANA_API_KEY') + EUROPEANA_PRIVATE_KEY = os.environ.get('EUROPEANA_PRIVATE_KEY') ELASTICSEARCH_URL = os.environ.get('ELASTICSEARCH_URL') ELASTICSEARCH_PORT = 80 AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')