Skip to content
This repository has been archived by the owner on Jan 13, 2022. It is now read-only.

Retrieve sub providers within Smithsonian #455

Merged
merged 6 commits into from Jul 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
27 changes: 23 additions & 4 deletions src/cc_catalog_airflow/dags/provider_api_scripts/smithsonian.py
Expand Up @@ -15,6 +15,8 @@

from common.storage import image
from common import requester
from util.loader import provider_details as prov

logger = logging.getLogger(__name__)

API_KEY = os.getenv('DATA_GOV_API_KEY')
Expand All @@ -24,7 +26,9 @@
API_ROOT = 'https://api.si.edu/openaccess/api/v1.0/'
SEARCH_ENDPOINT = API_ROOT + 'search'
UNITS_ENDPOINT = API_ROOT + 'terms/unit_code'
PROVIDER = 'smithsonian'
PROVIDER = prov.SMITHSONIAN_DEFAULT_PROVIDER
# SUB_PROVIDERS maps each sub-provider within Smithsonian to its unit codes
SUB_PROVIDERS = prov.SMITHSONIAN_SUB_PROVIDERS
ZERO_URL = 'https://creativecommons.org/publicdomain/zero/1.0/'
DEFAULT_PARAMS = {
'api_key': API_KEY,
Expand Down Expand Up @@ -218,13 +222,16 @@ def _process_response_json(response_json):
for row in rows:
image_list = _get_image_list(row)
if image_list:
meta_data = _extract_meta_data(row)
source = _extract_source(meta_data)
total_images = _process_image_list(
image_list,
_get_foreign_landing_url(row),
_get_title(row),
_get_creator(row),
_extract_meta_data(row),
_extract_tags(row)
meta_data,
_extract_tags(row),
source
)
return total_images

Expand Down Expand Up @@ -313,6 +320,16 @@ def _extract_meta_data(row, description_types=DESCRIPTION_TYPES):
return {k: v for (k, v) in meta_data.items() if v is not None}


def _extract_source(meta_data, sub_providers=SUB_PROVIDERS):
    """
    Determine which sub-provider a record belongs to from its unit code.

    The ``unit_code`` entry of ``meta_data`` is stripped of surrounding
    whitespace and looked up in ``sub_providers``; the first key whose
    value contains the unit code is returned.

    Required Arguments:

    meta_data:      dict of meta data for a single record; must contain
                    a ``unit_code`` entry.

    Optional Arguments:

    sub_providers:  mapping whose keys are source names and whose values
                    contain the unit codes belonging to that source.

    Raises an Exception when the unit code is missing, or when it does
    not belong to any of the given sub-providers.
    """
    unit_code = meta_data.get('unit_code')
    if unit_code is None:
        # Fail with an explicit message rather than an opaque
        # AttributeError from calling .strip() on None.
        raise Exception(
            "No unit code found in the meta data; cannot determine source")
    unit_code = unit_code.strip()
    source = next(
        (name for name, codes in sub_providers.items()
         if unit_code in codes),
        None
    )
    if source is None:
        raise Exception(
            f"An unknown unit code value {unit_code} encountered ")
    return source


def _extract_tags(row, tag_types=TAG_TYPES):
indexed_structured = _get_indexed_structured_dict(row)
tag_lists_generator = (
Expand Down Expand Up @@ -387,6 +404,7 @@ def _process_image_list(
creator,
meta_data,
tags,
source,
license_url=ZERO_URL
):
total_images = None
Expand All @@ -404,7 +422,8 @@ def _process_image_list(
title=title,
creator=creator,
meta_data=meta_data,
raw_tags=tags
raw_tags=tags,
source=source
)
return total_images

Expand Down
Expand Up @@ -20,6 +20,12 @@
)


def _get_resource_json(json_name):
    """Load and return the parsed JSON file ``json_name`` from RESOURCES."""
    resource_path = os.path.join(RESOURCES, json_name)
    with open(resource_path) as resource_file:
        return json.load(resource_file)


def test_get_hash_prefixes_with_len_one():
expect_prefix_list = [
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd',
Expand Down Expand Up @@ -247,6 +253,7 @@ def test_process_response_json_uses_required_getters():
creator_list = ['creator0', 'creator1']
metadata_list = ['metadata0', 'metadata1']
tags_list = ['tags0', 'tags1']
source_list = ['source0', 'source1']

get_row_list = patch.object(si, '_get_row_list', return_value=row_list)
process_image_list = patch.object(
Expand All @@ -264,15 +271,19 @@ def test_process_response_json_uses_required_getters():
si, '_extract_meta_data', side_effect=metadata_list
)
ext_tags = patch.object(si, '_extract_tags', side_effect=tags_list)
ext_source = patch.object(
si, '_extract_source', side_effect=source_list
)

with\
get_row_list as mock_get_row_list,\
get_image_list as mock_get_image_list,\
get_flu as mock_get_foreign_landing_url,\
get_title as mock_get_title,\
get_creator as mock_get_creator,\
ext_meta_data as mock_extract_meta_data,\
ext_tags as mock_extract_tags,\
ext_meta_data as mock_extract_meta_data, \
ext_tags as mock_extract_tags, \
ext_source as mock_extract_source, \
process_image_list as mock_process_image_list:
si._process_response_json(response_json)

Expand All @@ -284,15 +295,17 @@ def test_process_response_json_uses_required_getters():
title_list[0],
creator_list[0],
metadata_list[0],
tags_list[0]
tags_list[0],
source_list[0]
),
call(
image_lists[1],
flu_list[1],
title_list[1],
creator_list[1],
metadata_list[1],
tags_list[1]
tags_list[1],
source_list[1]
)
]
mock_get_row_list.assert_called_once_with(response_json)
Expand All @@ -303,6 +316,7 @@ def test_process_response_json_uses_required_getters():
assert mock_get_creator.mock_calls == getter_calls_list
assert mock_extract_meta_data.mock_calls == getter_calls_list
assert mock_extract_tags.mock_calls == getter_calls_list
assert mock_extract_source.mock_calls == [call(m) for m in metadata_list]


def test_get_row_list_with_no_rows():
Expand Down Expand Up @@ -898,8 +912,9 @@ def test_check_type_with_bad_inputs(required_type, good_indices, default):
foreign_identifier='id_one',
title='The Title',
creator='Alice',
meta_data={'meta': 'data'},
meta_data={'unit_code': 'NMNHBOTANY'},
raw_tags=['tag', 'list'],
source='smithsonian_national_museum_of_natural_history',
),
call(
foreign_landing_url='https://foreignlanding.url',
Expand All @@ -909,8 +924,9 @@ def test_check_type_with_bad_inputs(required_type, good_indices, default):
foreign_identifier='id_two',
title='The Title',
creator='Alice',
meta_data={'meta': 'data'},
meta_data={'unit_code': 'NMNHBOTANY'},
raw_tags=['tag', 'list'],
source='smithsonian_national_museum_of_natural_history',
)
]
),
Expand Down Expand Up @@ -946,8 +962,9 @@ def test_check_type_with_bad_inputs(required_type, good_indices, default):
foreign_identifier='id_two',
title='The Title',
creator='Alice',
meta_data={'meta': 'data'},
meta_data={'unit_code': 'NMNHBOTANY'},
raw_tags=['tag', 'list'],
source='smithsonian_national_museum_of_natural_history',
)
]
),
Expand Down Expand Up @@ -983,8 +1000,9 @@ def test_check_type_with_bad_inputs(required_type, good_indices, default):
foreign_identifier='id_one',
title='The Title',
creator='Alice',
meta_data={'meta': 'data'},
meta_data={'unit_code': 'NMNHBOTANY'},
raw_tags=['tag', 'list'],
source='smithsonian_national_museum_of_natural_history',
)
]
)
Expand All @@ -999,8 +1017,39 @@ def test_process_image_list(input_media, expect_calls):
foreign_landing_url='https://foreignlanding.url',
title='The Title',
creator='Alice',
meta_data={'meta': 'data'},
meta_data={'unit_code': 'NMNHBOTANY'},
tags=['tag', 'list'],
source='smithsonian_national_museum_of_natural_history',
license_url='https://license.url'
)
assert expect_calls == mock_add_item.mock_calls


def test_process_image_data_with_sub_provider():
    """
    Feed the stored 'sub_provider_example.json' response through
    ``_process_response_json`` and check that the image store receives
    exactly one item, tagged with the expected sub-provider source.
    """
    response_json = _get_resource_json('sub_provider_example.json')
    expected_kwargs = dict(
        foreign_landing_url=None,
        image_url='https://ids.si.edu/ids/deliveryService?id=SIA-SIA2010-2358',
        thumbnail_url='https://ids.si.edu/ids/deliveryService?id=SIA-SIA2010-2358&max=150',
        license_url='https://creativecommons.org/publicdomain/zero/1.0/',
        foreign_identifier='SIA-SIA2010-2358',
        creator='Gruber, Martin A',
        title='Views of the National Zoological Park in Washington, DC, showing Elephant',
        meta_data={
            'unit_code': 'SIA',
            'data_source': 'Smithsonian Institution Archives'
        },
        raw_tags=[
            '1920s', '1910s', 'Archival materials', 'Photographs', 'Animals'],
        source='smithsonian_institution_archives'
    )

    # The patched add_item reports 100 images, which the function under
    # test is expected to hand back as its total.
    add_item_patch = patch.object(
        si.image_store, 'add_item', return_value=100
    )
    with add_item_patch as mock_add_item:
        actual_total = si._process_response_json(response_json)

    mock_add_item.assert_called_once_with(**expected_kwargs)
    assert actual_total == 100
@@ -0,0 +1,147 @@
{
"status": 200,
"responseCode": 1,
"response": {
"rows": [
{
"id": "edanmdm-siris_arc_291918",
"title": "Views of the National Zoological Park in Washington, DC, showing Elephant",
"unitCode": "SIA",
"type": "edanmdm",
"url": "edanmdm:siris_arc_291918",
"content": {
"descriptiveNonRepeating": {
"record_ID": "siris_arc_291918",
"online_media": {
"mediaCount": 1,
"media": [
{
"thumbnail": "https://ids.si.edu/ids/deliveryService?id=SIA-SIA2010-2358&max=150",
"idsId": "SIA-SIA2010-2358",
"usage": {
"access": "CC0",
"text": ""
},
"type": "Images",
"content": "https://ids.si.edu/ids/deliveryService?id=SIA-SIA2010-2358"
}
]
},
"unit_code": "SIA",
"title_sort": "VIEWS OF THE NATIONAL ZOOLOGICAL PARK IN WASHINGTON DC SHOWING ELEPHANT",
"title": {
"label": "Title",
"content": "Views of the National Zoological Park in Washington, DC, showing Elephant"
},
"metadata_usage": {
"access": "CC0"
},
"data_source": "Smithsonian Institution Archives"
},
"indexedStructured": {
"date": [
"1920s",
"1910s"
],
"object_type": [
"Archival materials",
"Photographs"
],
"name": [
{
"type": "personal_main",
"content": "Gruber, Martin A"
},
{
"type": "corporate_subj",
"content": "National Zoological Park (U.S.)"
}
],
"topic": [
"Animals"
],
"usage_flag": [
"Personal Paper Deposit"
],
"online_media_type": [
"Images"
]
},
"freetext": {
"date": [
{
"label": "Date",
"content": "1919"
},
{
"label": "Date",
"content": "C. 1920-1924"
}
],
"identifier": [
{
"label": "Local number",
"content": "SIA RU007355 [SIA2010-2358]"
}
],
"notes": [
{
"label": "Cite as",
"content": "Smithsonian Institution Archives, Record Unit 7355, Martin A. Gruber Photograph Collection, Image No. SIA2010-2358"
},
{
"label": "Repository Loc.",
"content": "Smithsonian Institution Archives, Capital Gallery, Suite 3000, MRC 507; 600 Maryland Avenue, SW; Washington, DC 20024-2520"
}
],
"name": [
{
"label": "Creator",
"content": "Gruber, Martin A"
},
{
"label": "Subject",
"content": "National Zoological Park (U.S.)"
}
],
"topic": [
{
"label": "Topic",
"content": "Animals"
}
],
"dataSource": [
{
"label": "Data Source",
"content": "Smithsonian Institution Archives"
}
],
"objectRights": [
{
"label": "Restrictions & Rights",
"content": "No access restrictions Many of SIA's holdings are located off-site, and advance notice is recommended to consult a collection. Please email the SIA Reference Team at osiaref@si.edu"
},
{
"label": "Restrictions & Rights",
"content": "No Copyright - United States"
}
],
"objectType": [
{
"label": "Type",
"content": "Black-and-white photographs"
}
]
}
},
"hash": "fffe9e6a7103a3449d84f3acf75ae7260e1c0386",
"docSignature": "b943f314acfdbcd3ed740a1c4bb2774f1e0abc24_c58eacf430afa83b10f6a76fc64459d9",
"timestamp": "1580116221",
"lastTimeUpdated": "1580116213",
"version": ""
}
],
"rowCount": 734,
"message": "content found"
}
}