cc-archive · mathemancer · Jul 29, 2020 · Jul 6, 2020 · Jul 7, 2020 · Jul 7, 2020
diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/smithsonian.py b/src/cc_catalog_airflow/dags/provider_api_scripts/smithsonian.py
@@ -15,6 +15,8 @@
 
 from common.storage import image
 from common import requester
+from util.loader import provider_details as prov
+
 logger = logging.getLogger(__name__)
 
 API_KEY = os.getenv('DATA_GOV_API_KEY')
@@ -24,7 +26,9 @@
 API_ROOT = 'https://api.si.edu/openaccess/api/v1.0/'
 SEARCH_ENDPOINT = API_ROOT + 'search'
 UNITS_ENDPOINT = API_ROOT + 'terms/unit_code'
-PROVIDER = 'smithsonian'
+PROVIDER = prov.SMITHSONIAN_DEFAULT_PROVIDER
+# SUB_PROVIDERS maps each Smithsonian sub-provider to its unit codes
+SUB_PROVIDERS = prov.SMITHSONIAN_SUB_PROVIDERS
 ZERO_URL = 'https://creativecommons.org/publicdomain/zero/1.0/'
 DEFAULT_PARAMS = {
     'api_key': API_KEY,
@@ -218,13 +222,16 @@ def _process_response_json(response_json):
     for row in rows:
         image_list = _get_image_list(row)
         if image_list:
+            meta_data = _extract_meta_data(row)
+            source = _extract_source(meta_data)
             total_images = _process_image_list(
                 image_list,
                 _get_foreign_landing_url(row),
                 _get_title(row),
                 _get_creator(row),
-                _extract_meta_data(row),
-                _extract_tags(row)
+                meta_data,
+                _extract_tags(row),
+                source
             )
     return total_images
 
@@ -313,6 +320,16 @@ def _extract_meta_data(row, description_types=DESCRIPTION_TYPES):
     return {k: v for (k, v) in meta_data.items() if v is not None}
 
 
+def _extract_source(meta_data, sub_providers=SUB_PROVIDERS):
+    """Return the sub_providers key whose value holds unit_code, or raise."""
+    unit_code = (meta_data.get('unit_code') or '').strip() or None
+    source = next((s for s in sub_providers if unit_code is not None
+                   and unit_code in sub_providers[s]), None)
+    if source is None:  # also covers a missing or blank unit_code
+        raise Exception(f"An unknown unit code value {unit_code} encountered ")
+    return source
+
+
 def _extract_tags(row, tag_types=TAG_TYPES):
     indexed_structured = _get_indexed_structured_dict(row)
     tag_lists_generator = (
@@ -387,6 +404,7 @@ def _process_image_list(
         creator,
         meta_data,
         tags,
+        source,
         license_url=ZERO_URL
 ):
     total_images = None
@@ -404,7 +422,8 @@ def _process_image_list(
                 title=title,
                 creator=creator,
                 meta_data=meta_data,
-                raw_tags=tags
+                raw_tags=tags,
+                source=source
             )
     return total_images
 

diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/test_smithsonian.py b/src/cc_catalog_airflow/dags/provider_api_scripts/test_smithsonian.py
@@ -20,6 +20,12 @@
 )
 
 
+def _get_resource_json(json_name):
+    """Load and return the parsed JSON resource named json_name."""
+    with open(os.path.join(RESOURCES, json_name)) as resource_file:
+        return json.load(resource_file)
+
+
 def test_get_hash_prefixes_with_len_one():
     expect_prefix_list = [
         '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd',
@@ -247,6 +253,7 @@ def test_process_response_json_uses_required_getters():
     creator_list = ['creator0', 'creator1']
     metadata_list = ['metadata0', 'metadata1']
     tags_list = ['tags0', 'tags1']
+    source_list = ['source0', 'source1']
 
     get_row_list = patch.object(si, '_get_row_list', return_value=row_list)
     process_image_list = patch.object(
@@ -264,15 +271,19 @@ def test_process_response_json_uses_required_getters():
         si, '_extract_meta_data', side_effect=metadata_list
     )
     ext_tags = patch.object(si, '_extract_tags', side_effect=tags_list)
+    ext_source = patch.object(
+        si, '_extract_source', side_effect=source_list
+    )
 
     with\
             get_row_list as mock_get_row_list,\
             get_image_list as mock_get_image_list,\
             get_flu as mock_get_foreign_landing_url,\
             get_title as mock_get_title,\
             get_creator as mock_get_creator,\
-            ext_meta_data as mock_extract_meta_data,\
-            ext_tags as mock_extract_tags,\
+            ext_meta_data as mock_extract_meta_data, \
+            ext_tags as mock_extract_tags, \
+            ext_source as mock_extract_source, \
             process_image_list as mock_process_image_list:
         si._process_response_json(response_json)
 
@@ -284,15 +295,17 @@ def test_process_response_json_uses_required_getters():
             title_list[0],
             creator_list[0],
             metadata_list[0],
-            tags_list[0]
+            tags_list[0],
+            source_list[0]
         ),
         call(
             image_lists[1],
             flu_list[1],
             title_list[1],
             creator_list[1],
             metadata_list[1],
-            tags_list[1]
+            tags_list[1],
+            source_list[1]
         )
     ]
     mock_get_row_list.assert_called_once_with(response_json)
@@ -303,6 +316,7 @@ def test_process_response_json_uses_required_getters():
     assert mock_get_creator.mock_calls == getter_calls_list
     assert mock_extract_meta_data.mock_calls == getter_calls_list
     assert mock_extract_tags.mock_calls == getter_calls_list
+    assert mock_extract_source.mock_calls == [call(m) for m in metadata_list]
 
 
 def test_get_row_list_with_no_rows():
@@ -898,8 +912,9 @@ def test_check_type_with_bad_inputs(required_type, good_indices, default):
                     foreign_identifier='id_one',
                     title='The Title',
                     creator='Alice',
-                    meta_data={'meta': 'data'},
+                    meta_data={'unit_code': 'NMNHBOTANY'},
                     raw_tags=['tag', 'list'],
+                    source='smithsonian_national_museum_of_natural_history',
                 ),
                 call(
                     foreign_landing_url='https://foreignlanding.url',
@@ -909,8 +924,9 @@ def test_check_type_with_bad_inputs(required_type, good_indices, default):
                     foreign_identifier='id_two',
                     title='The Title',
                     creator='Alice',
-                    meta_data={'meta': 'data'},
+                    meta_data={'unit_code': 'NMNHBOTANY'},
                     raw_tags=['tag', 'list'],
+                    source='smithsonian_national_museum_of_natural_history',
                 )
             ]
         ),
@@ -946,8 +962,9 @@ def test_check_type_with_bad_inputs(required_type, good_indices, default):
                     foreign_identifier='id_two',
                     title='The Title',
                     creator='Alice',
-                    meta_data={'meta': 'data'},
+                    meta_data={'unit_code': 'NMNHBOTANY'},
                     raw_tags=['tag', 'list'],
+                    source='smithsonian_national_museum_of_natural_history',
                 )
             ]
         ),
@@ -983,8 +1000,9 @@ def test_check_type_with_bad_inputs(required_type, good_indices, default):
                     foreign_identifier='id_one',
                     title='The Title',
                     creator='Alice',
-                    meta_data={'meta': 'data'},
+                    meta_data={'unit_code': 'NMNHBOTANY'},
                     raw_tags=['tag', 'list'],
+                    source='smithsonian_national_museum_of_natural_history',
                 )
             ]
         )
@@ -999,8 +1017,39 @@ def test_process_image_list(input_media, expect_calls):
             foreign_landing_url='https://foreignlanding.url',
             title='The Title',
             creator='Alice',
-            meta_data={'meta': 'data'},
+            meta_data={'unit_code': 'NMNHBOTANY'},
             tags=['tag', 'list'],
+            source='smithsonian_national_museum_of_natural_history',
             license_url='https://license.url'
         )
     assert expect_calls == mock_add_item.mock_calls
+
+
+def test_process_image_data_with_sub_provider():
+    """An 'SIA' fixture row is stored with the archives sub-provider source."""
+    fixture = _get_resource_json('sub_provider_example.json')
+    add_item_patch = patch.object(si.image_store, 'add_item', return_value=100)
+    with add_item_patch as mock_add_item:
+        image_count = si._process_response_json(fixture)
+
+    # Every keyword image_store.add_item should receive for the fixture row.
+    expected_kwargs = {
+        'foreign_landing_url': None,
+        'image_url': 'https://ids.si.edu/ids/deliveryService?id=SIA-SIA2010-2358',
+        'thumbnail_url': 'https://ids.si.edu/ids/deliveryService?id=SIA-SIA2010-2358&max=150',
+        'license_url': 'https://creativecommons.org/publicdomain/zero/1.0/',
+        'foreign_identifier': 'SIA-SIA2010-2358',
+        'creator': 'Gruber, Martin A',
+        'title': 'Views of the National Zoological Park in Washington, DC, showing Elephant',
+        'meta_data': {
+            'unit_code': 'SIA',
+            'data_source': 'Smithsonian Institution Archives',
+        },
+        'raw_tags': [
+            '1920s', '1910s', 'Archival materials', 'Photographs', 'Animals'],
+        'source': 'smithsonian_institution_archives',
+    }
+
+    mock_add_item.assert_called_once_with(**expected_kwargs)
+    # The total should be the value returned by the patched add_item.
+    assert image_count == 100
diff --git a/...g_airflow/dags/provider_api_scripts/tests/resources/smithsonian/sub_provider_example.json b/...g_airflow/dags/provider_api_scripts/tests/resources/smithsonian/sub_provider_example.json
@@ -0,0 +1,147 @@
+{
+  "status": 200,
+  "responseCode": 1,
+  "response": {
+    "rows": [
+      {
+        "id": "edanmdm-siris_arc_291918",
+        "title": "Views of the National Zoological Park in Washington, DC, showing Elephant",
+        "unitCode": "SIA",
+        "type": "edanmdm",
+        "url": "edanmdm:siris_arc_291918",
+        "content": {
+          "descriptiveNonRepeating": {
+            "record_ID": "siris_arc_291918",
+            "online_media": {
+              "mediaCount": 1,
+              "media": [
+                {
+                  "thumbnail": "https://ids.si.edu/ids/deliveryService?id=SIA-SIA2010-2358&max=150",
+                  "idsId": "SIA-SIA2010-2358",
+                  "usage": {
+                    "access": "CC0",
+                    "text": ""
+                  },
+                  "type": "Images",
+                  "content": "https://ids.si.edu/ids/deliveryService?id=SIA-SIA2010-2358"
+                }
+              ]
+            },
+            "unit_code": "SIA",
+            "title_sort": "VIEWS OF THE NATIONAL ZOOLOGICAL PARK IN WASHINGTON DC SHOWING ELEPHANT",
+            "title": {
+              "label": "Title",
+              "content": "Views of the National Zoological Park in Washington, DC, showing Elephant"
+            },
+            "metadata_usage": {
+              "access": "CC0"
+            },
+            "data_source": "Smithsonian Institution Archives"
+          },
+          "indexedStructured": {
+            "date": [
+              "1920s",
+              "1910s"
+            ],
+            "object_type": [
+              "Archival materials",
+              "Photographs"
+            ],
+            "name": [
+              {
+                "type": "personal_main",
+                "content": "Gruber, Martin A"
+              },
+              {
+                "type": "corporate_subj",
+                "content": "National Zoological Park (U.S.)"
+              }
+            ],
+            "topic": [
+              "Animals"
+            ],
+            "usage_flag": [
+              "Personal Paper Deposit"
+            ],
+            "online_media_type": [
+              "Images"
+            ]
+          },
+          "freetext": {
+            "date": [
+              {
+                "label": "Date",
+                "content": "1919"
+              },
+              {
+                "label": "Date",
+                "content": "C. 1920-1924"
+              }
+            ],
+            "identifier": [
+              {
+                "label": "Local number",
+                "content": "SIA RU007355 [SIA2010-2358]"
+              }
+            ],
+            "notes": [
+              {
+                "label": "Cite as",
+                "content": "Smithsonian Institution Archives, Record Unit 7355, Martin A. Gruber Photograph Collection, Image No. SIA2010-2358"
+              },
+              {
+                "label": "Repository Loc.",
+                "content": "Smithsonian Institution Archives, Capital Gallery, Suite 3000, MRC 507; 600 Maryland Avenue, SW; Washington, DC 20024-2520"
+              }
+            ],
+            "name": [
+              {
+                "label": "Creator",
+                "content": "Gruber, Martin A"
+              },
+              {
+                "label": "Subject",
+                "content": "National Zoological Park (U.S.)"
+              }
+            ],
+            "topic": [
+              {
+                "label": "Topic",
+                "content": "Animals"
+              }
+            ],
+            "dataSource": [
+              {
+                "label": "Data Source",
+                "content": "Smithsonian Institution Archives"
+              }
+            ],
+            "objectRights": [
+              {
+                "label": "Restrictions & Rights",
+                "content": "No access restrictions Many of SIA's holdings are located off-site, and advance notice is recommended to consult a collection. Please email the SIA Reference Team at osiaref@si.edu"
+              },
+              {
+                "label": "Restrictions & Rights",
+                "content": "No Copyright - United States"
+              }
+            ],
+            "objectType": [
+              {
+                "label": "Type",
+                "content": "Black-and-white photographs"
+              }
+            ]
+          }
+        },
+        "hash": "fffe9e6a7103a3449d84f3acf75ae7260e1c0386",
+        "docSignature": "b943f314acfdbcd3ed740a1c4bb2774f1e0abc24_c58eacf430afa83b10f6a76fc64459d9",
+        "timestamp": "1580116221",
+        "lastTimeUpdated": "1580116213",
+        "version": ""
+      }
+    ],
+    "rowCount": 734,
+    "message": "content found"
+  }
+}