# Extracting and Transforming Metadata

In [None]:
import json

In [4]:
# load one NYPL item record from disk to use as the working sample
nypl_sample_path = 'C:\\Users\\abbys\\OneDrive\\Desktop\\si-676\\assignment-1\\nypl-item-metadata\\0a3cf940-c5d3-012f-013f-58d385a7bc34.json'
with open(nypl_sample_path, 'r', encoding='utf-8') as fh:
    metadata = json.load(fh)

# sanity check: show only the first 100 characters of the pretty-printed record
nypl_preview = json.dumps(metadata, indent=2)
print(nypl_preview[:100])

{
  "nyplAPI": {
    "request": {
      "uuid": {
        "$": "0a3cf940-c5d3-012f-013f-58d385a7bc34


In [6]:
# walk the top-level fields of the record's "mods" section and show each value

for field_name, field_value in metadata['nyplAPI']['response']['mods'].items():
    print(field_name, ':\t', field_value)

version :	 3.4
schemaLocation :	 http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-4.xsd
titleInfo :	 {'type': '', 'authority': '', 'usage': 'primary', 'lang': 'eng', 'script': '', 'title': {'$': 'Buddhist priests at their shrine.'}}
typeOfResource :	 {'$': 'still image'}
genre :	 {'authority': 'lctgm', 'valueURI': 'http://id.loc.gov/vocabulary/graphicMaterials/tgm008103', '$': 'Postcards'}
originInfo :	 {'dateIssued': [{'encoding': 'w3cdtf', 'point': 'start', 'keyDate': 'yes', '$': '1907'}, {'encoding': 'w3cdtf', 'qualifier': 'approximate', 'point': 'end', '$': '1918'}], 'place': {'placeTerm': {'type': 'text', '$': 'Ceylon; printed in Germany'}}}
subject :	 [{'topic': {'authority': 'lcsh', 'valueURI': 'http://id.loc.gov/authorities/subjects/sh87004041', '$': 'Sri Lankans'}}, {'geographic': {'authority': 'naf', 'valueURI': 'http://id.loc.gov/authorities/names/n80061039', '$': 'Sri Lanka'}}, {'topic': {'authority': 'lctgm', 'valueURI': 'http://id.loc.gov/vocabulary/

In [81]:
# for reusability, write each top-level "mods" field and its value to a
# tab-separated text file (one header row, then one row per field)

metadata_fields_file = 'C:\\Users\\abbys\\OneDrive\\Desktop\\si-676\\assignment-1\\nypl-metadata-fields.txt'

# The encoding is set explicitly: the record was read as UTF-8, and without it
# open() falls back to the platform default, which may fail on (or mangle)
# non-ASCII characters in the values.
with open(metadata_fields_file, 'w', encoding='utf-8') as f:
    f.write('attribute\tvalue\n')
    for attribute, value in metadata['nyplAPI']['response']['mods'].items():
        # nested dicts/lists are written as their Python string form
        f.write(f'{attribute}\t{value}\n')

## Choosing attributes for MAP

In [9]:
# list every "mods" field beside its value to judge what needs crosswalking
for field_name, field_value in metadata['nyplAPI']['response']['mods'].items():
    print(field_name, ":", field_value)

version : 3.4
schemaLocation : http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-4.xsd
titleInfo : {'type': '', 'authority': '', 'usage': 'primary', 'lang': 'eng', 'script': '', 'title': {'$': 'Buddhist priests at their shrine.'}}
typeOfResource : {'$': 'still image'}
genre : {'authority': 'lctgm', 'valueURI': 'http://id.loc.gov/vocabulary/graphicMaterials/tgm008103', '$': 'Postcards'}
originInfo : {'dateIssued': [{'encoding': 'w3cdtf', 'point': 'start', 'keyDate': 'yes', '$': '1907'}, {'encoding': 'w3cdtf', 'qualifier': 'approximate', 'point': 'end', '$': '1918'}], 'place': {'placeTerm': {'type': 'text', '$': 'Ceylon; printed in Germany'}}}
subject : [{'topic': {'authority': 'lcsh', 'valueURI': 'http://id.loc.gov/authorities/subjects/sh87004041', '$': 'Sri Lankans'}}, {'geographic': {'authority': 'naf', 'valueURI': 'http://id.loc.gov/authorities/names/n80061039', '$': 'Sri Lanka'}}, {'topic': {'authority': 'lctgm', 'valueURI': 'http://id.loc.gov/vocabulary/graphic

In [73]:
# build nypl data structure
# everything below is read from the "mods" section unless noted otherwise
starting_point = metadata['nyplAPI']['response']['mods']

# get all dates: the '$' text of every dateIssued entry
dates = [date['$'] for date in starting_point['originInfo']['dateIssued']]

# get all topics: each subject entry is a dict whose values hold the
# display text under '$' (the entry's key, e.g. 'topic', is not kept)
topics = [
    term['$']
    for subject in starting_point['subject']
    for term in subject.values()
]

# get all physical descriptions, collected the same way as the topics
phys_descriptions = [
    detail['$']
    for description in starting_point['physicalDescription']
    for detail in description.values()
]

source_base_url = 'https://digitalcollections.nypl.org/items/'

nypl_item_data = {
    # mods
    'title': starting_point['titleInfo']['title']['$'],
    'genre': starting_point['genre']['$'],
    'placeOfOrigin': starting_point['originInfo']['place']['placeTerm']['$'],
    'subject': topics,
    'format': phys_descriptions,

    # dublin core
    'language': starting_point['titleInfo']['lang'],
    'date': dates,
    'type': starting_point['typeOfResource']['$'],
    'contributor': starting_point['name']['namePart']['$'],
    # rights and the uuid sit outside "mods", elsewhere in the API response/request
    'rights': metadata['nyplAPI']['response']['rightsStatement']['$'],
    'source': source_base_url + metadata['nyplAPI']['request']['uuid']['$'],
}

# show the assembled record, one term per line
for item, value in nypl_item_data.items():
    print(item, ":", value)

title : Buddhist priests at their shrine.
genre : Postcards
placeOfOrigin : Ceylon; printed in Germany
subject : ['Sri Lankans', 'Sri Lanka', 'Buddhism', 'Buddhist temples', 'Religious articles']
format : ['Collotypes', '14 x 9 cm.', 'Photomechanical prints']
language : eng
date : ['1907', '1918']
type : still image
contributor : A.W.A. Plâté & Co.
rights : The New York Public Library believes that this item is in the public domain under the laws of the United States, but did not make a determination as to its copyright status under the copyright laws of other countries. This item may not be in the public domain under the laws of other countries. Though not required, if you want to credit us as the source, please use the following statement, "From The New York Public Library," and provide a link back to the item on our Digital Collections site. Doing so helps us track how our collection is used and helps justify freely releasing even more content in the future.
source : https://digital

## Now match LOC metadata to chosen terms

In [74]:
# load one LOC item record from disk to use as the working sample
loc_sample_path = 'C:\\Users\\abbys\\OneDrive\\Desktop\\si-676\\assignment-1\\loc-item-metadata\\jpd-00139.json'
with open(loc_sample_path, 'r', encoding='utf-8') as fh:
    loc_metadata = json.load(fh)

# sanity check: show only the first 100 characters of the pretty-printed record
loc_preview = json.dumps(loc_metadata, indent=2)
print(loc_preview[:100])

{
  "articles_and_essays": null,
  "cite_this": {
    "apa": "Suzuki, H. (1765) <cite>Semitori</cite


In [76]:
# walk the fields nested under item -> item and show each value
for field_name, field_value in loc_metadata['item']['item'].items():
    print(field_name, ':\t', field_value)

access_advisory :	 Restricted access; material extremely fragile; please use online digital image.
call_number :	 FP 2 - JPD, no. 136 (A size) [P&P]
contributors :	 ['Suzuki, Harunobu, 1725?-1770, artist.']
control_number :	 2009615132
created :	 2016-04-20 16:38:11
created_published :	 1765.
created_published_date :	 1765.
creators :	 [{'link': '//www.loc.gov/pictures/related/?fi=name&q=Suzuki%2C%20Harunobu%2C%201725%3F-1770&co=jpd', 'role': 'artist', 'title': 'Suzuki, Harunobu, 1725?-1770'}]
date :	 1765.
digital_id :	 ['jpd 00139 //hdl.loc.gov/loc.pnp/jpd.00139']
display_offsite :	 True
format :	 ['still image']
formats :	 [{'link': '//www.loc.gov/pictures/related/?fi=format&q=Ukiyo-e--Japanese--1760-1770.&co=jpd', 'title': 'Ukiyo-e--Japanese--1760-1770.'}, {'link': '//www.loc.gov/pictures/related/?fi=format&q=Woodcuts--Japanese--Color--1760-1770.&co=jpd', 'title': 'Woodcuts--Japanese--Color--1760-1770.'}]
genre :	 ['Ukiyo-e--Japanese--1760-1770', 'Woodcuts--Japanese--Color--1760-17

In [80]:
# for reusability, write each field under item -> item and its value to a
# tab-separated text file (one header row, then one row per field)

loc_metadata_fields_file = 'C:\\Users\\abbys\\OneDrive\\Desktop\\si-676\\assignment-1\\loc-metadata-fields.txt'

# The encoding is set explicitly: the record was read as UTF-8, and without it
# open() falls back to the platform default, which may fail on (or mangle)
# non-ASCII characters in the values.
with open(loc_metadata_fields_file, 'w', encoding='utf-8') as f:
    f.write('attribute\tvalue\n')
    for attribute, value in loc_metadata['item']['item'].items():
        # nested dicts/lists are written as their Python string form
        f.write(f'{attribute}\t{value}\n')

In [None]:
# the descriptive fields are read from the nested item -> item section
starting_point = loc_metadata['item']['item']

source_base_url = 'https://www.loc.gov/item/'

# map the LOC fields onto the chosen term names
loc_item_data = dict(
    # mods
    title=starting_point['title'],
    genre=starting_point['genre'],
    placeOfOrigin=starting_point['location'],
    subject=starting_point['subjects'],
    format=starting_point['medium'],

    # dublin core
    language=starting_point['language'],
    date=starting_point['date'],
    type=starting_point['format'],
    contributor=starting_point['contributors'],
    rights=starting_point['rights_information'],
    # the item link is the base URL followed by the record's control number
    source=source_base_url + starting_point['control_number'],
    alternativeTitle=starting_point['other_title'],
)

# show the assembled record, one term per line
for term, term_value in loc_item_data.items():
    print(term, ":", term_value)

title : Semitori
genre : ['Ukiyo-e--Japanese--1760-1770', 'Woodcuts--Japanese--Color--1760-1770']
placeOfOrigin : ['Japan']
subject : ['Women--Domestic life--Japan--1760-1770', 'Children--Japan--1760-1770', 'Recreation--Japan--1760-1770', 'Crickets--Japan--1760-1770']
format : ['1 print : woodcut, color ; 27.7 x 19.1 cm.']
language : ['jpn']
date : 1765.
type : ['still image']
contributor : ['Suzuki, Harunobu, 1725?-1770, artist.']
rights : No known restrictions on publication.
source : https://www.loc.gov/item/2009615132
alternativeTitle : ['Catching crickets.']
