# Extracting and Transforming Metadata

In [74]:
import csv
import json

# for working with local files
import glob
import os
from os.path import join

## LOC Items

In [75]:
# Record the directory the kernel is running in; the relative data folders
# used by the following cells are resolved against it.
current_loc = os.getcwd()
print(current_loc)

c:\Users\abbys\OneDrive\Desktop\si-676\assignment-1


In [76]:
# Folder (relative to the working directory) holding the LOC item JSON files.
# A one-component path needs no joining, so the literal is used directly.
metadata_file_path = 'loc-item-metadata'
print(metadata_file_path)

loc-item-metadata


In [77]:
# Check that the expected LOC metadata files are present.
# The pattern is built with os.path.join so the separator matches the host OS;
# a hard-coded backslash only matches on Windows.
loc_json_pattern = os.path.join(metadata_file_path, '*.json')

file_count = 0
for file in glob.glob(loc_json_pattern):
    file_count += 1
    print(file)

print('found', file_count, 'files')

loc-item-metadata\cph-3g08479.json
loc-item-metadata\cph-3g10372.json
loc-item-metadata\cph-3g10434.json
loc-item-metadata\cph-3g10521.json
loc-item-metadata\cph-3g10539.json
loc-item-metadata\jpd-00046.json
loc-item-metadata\jpd-00139.json
loc-item-metadata\jpd-00154.json
loc-item-metadata\jpd-00181.json
loc-item-metadata\jpd-00278.json
loc-item-metadata\jpd-00501.json
loc-item-metadata\jpd-00507.json
loc-item-metadata\jpd-00790.json
loc-item-metadata\jpd-01197.json
loc-item-metadata\jpd-01271.json
loc-item-metadata\jpd-01307.json
loc-item-metadata\jpd-01317.json
loc-item-metadata\jpd-01328.json
loc-item-metadata\jpd-01361.json
loc-item-metadata\jpd-01581.json
loc-item-metadata\jpd-01803.json
loc-item-metadata\jpd-01822.json
loc-item-metadata\jpd-02015.json
loc-item-metadata\jpd-02148.json
loc-item-metadata\jpd-02457.json
loc-item-metadata\jpd-02467.json
loc-item-metadata\jpd-02608.json
loc-item-metadata\jpd-02925.json
found 28 files


In [78]:
# Collect the LOC metadata file paths into a list.
# os.path.join keeps the pattern portable (a literal backslash only works on
# Windows), and sorted() gives a stable order because glob makes no ordering promise.
list_of_item_metadata_files = sorted(
    glob.glob(os.path.join(metadata_file_path, '*.json'))
)

# last expression of the cell: shows how many files were collected
len(list_of_item_metadata_files)

28

In [79]:
# Quick duplicate check by eye: sort the list in place so identical or
# near-identical paths end up on adjacent lines of the listing below.
list_of_item_metadata_files.sort()

for metadata_path in list_of_item_metadata_files:
    print(metadata_path)

loc-item-metadata\cph-3g08479.json
loc-item-metadata\cph-3g10372.json
loc-item-metadata\cph-3g10434.json
loc-item-metadata\cph-3g10521.json
loc-item-metadata\cph-3g10539.json
loc-item-metadata\jpd-00046.json
loc-item-metadata\jpd-00139.json
loc-item-metadata\jpd-00154.json
loc-item-metadata\jpd-00181.json
loc-item-metadata\jpd-00278.json
loc-item-metadata\jpd-00501.json
loc-item-metadata\jpd-00507.json
loc-item-metadata\jpd-00790.json
loc-item-metadata\jpd-01197.json
loc-item-metadata\jpd-01271.json
loc-item-metadata\jpd-01307.json
loc-item-metadata\jpd-01317.json
loc-item-metadata\jpd-01328.json
loc-item-metadata\jpd-01361.json
loc-item-metadata\jpd-01581.json
loc-item-metadata\jpd-01803.json
loc-item-metadata\jpd-01822.json
loc-item-metadata\jpd-02015.json
loc-item-metadata\jpd-02148.json
loc-item-metadata\jpd-02457.json
loc-item-metadata\jpd-02467.json
loc-item-metadata\jpd-02608.json
loc-item-metadata\jpd-02925.json


In [85]:
# Crosswalk every LOC item JSON file into one row of the collection CSV.
#
# This cell CREATES the CSV: the file is opened once in write mode, so
# re-running the cell starts from a clean file instead of appending a second
# header and duplicate rows. Files that cannot be read, parsed, or that lack a
# required field are counted in error_count and skipped.

# file for csv to read out
collection_info_csv = 'collection_items_data.csv'
file_count = 0
items_written = 0
error_count = 0

# set up a list for the columns in your csv;
# your goal should be to automate this, but . . .
# it works for demonstration as you set up the crosswalk
headers = ['item_type', 'date_uploaded', 'source_file', 'item_id', 'title', 'genre', 'source', 'date', 'place_of_origin', 'subject', 'item_format', 'language', 'contributor', 'rights', 'alternative_title']

# value recorded when an optional field is absent from an item's metadata
NOT_FOUND = 'Not found'

# date uploaded: the same fixed value is stamped on every row
date_uploaded = '2024-12-08'


def build_loc_row(item_data, source_file):
    """Map one parsed LOC item JSON document onto the CSV columns in `headers`.

    Parameters:
        item_data: the object returned by json.load for one metadata file.
        source_file: path of that file, recorded in the row for checking purposes.

    Returns:
        dict with exactly the keys listed in `headers`. Values are copied from
        the JSON unchanged, so they may be strings or lists.

    Raises:
        KeyError / TypeError / AttributeError / IndexError when a required
        piece is missing or has an unexpected shape: item_data['item']['item'],
        the title, or the item url.
    """
    record = item_data['item']
    starting_point = record['item']

    def field(key, default=NOT_FOUND):
        # optional field: fall back to the default instead of raising
        return starting_point.get(key, default)

    # make sure there's some unique and stable identifier: use the control
    # number when the record has one, otherwise the second-to-last '/'-separated
    # piece of the item url
    if 'library_of_congress_control_number' in record:
        item_id = record['library_of_congress_control_number']
    else:
        item_id = record['url'].split('/')[-2]

    return {
        # item type is read from 'format', or from 'formats' when 'format' is absent
        'item_type': field('format', field('formats')),
        'date_uploaded': date_uploaded,
        # for checking purposes, add in the source of the info
        'source_file': source_file,
        'item_id': item_id,
        'title': starting_point['title'],
        'genre': field('genre'),
        # link
        'source': record['url'],
        'date': field('date'),
        'place_of_origin': field('location'),
        'subject': field('subjects'),
        'item_format': field('medium'),
        'language': field('language'),
        'contributor': field('contributors'),
        'rights': field('rights_information', 'Undetermined'),
        'alternative_title': field('other_title'),
    }


# open the csv once for the whole run; 'w' truncates any previous output
with open(collection_info_csv, 'w', encoding='utf-8', newline='') as fout:
    writer = csv.DictWriter(fout, fieldnames=headers)
    writer.writeheader()

    for file in list_of_item_metadata_files:
        file_count += 1
        print('opening', file)

        # load the item data
        try:
            with open(file, 'r', encoding='utf-8') as data:
                item_data = json.load(data)
        except (OSError, ValueError) as err:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError
            print('error loading', file, '-', repr(err))
            error_count += 1
            continue

        # extract/name the data you want
        try:
            row_dict = build_loc_row(item_data, str(file))
        except (KeyError, TypeError, AttributeError, IndexError) as err:
            print('error extracting', file, '-', repr(err))
            error_count += 1
            continue

        print('created row dictionary:', row_dict)

        # write to the csv
        writer.writerow(row_dict)
        items_written += 1
        print('adding', row_dict['item_id'])

print('\n\n--- LOG ---')
print('wrote',collection_info_csv)
print('with',items_written,'items')
print(error_count,'errors (info not written)')

opening loc-item-metadata\cph-3g08479.json
created row dictionary: {'item_type': ['still image'], 'date_uploaded': '2024-12-08', 'source_file': 'loc-item-metadata\\cph-3g08479.json', 'item_id': '2002700239', 'title': 'Gokakoku ... gankirō ni oite sakamori no zu', 'genre': ['Triptychs--Japanese--1860', 'Woodcuts--Japanese--Color--1860'], 'source': 'https://www.loc.gov/item/2002700239/', 'date': '1860.', 'place_of_origin': ['Japan--Yokohama-shi'], 'subject': ['Foreign visitors--Japan--Yokohama-shi--1860', 'Entertainment--Japan--Yokohama-shi--1860', 'Courtesans--Japan--Yokohama-shi--1860', 'Eating & drinking--Japan--Yokohama-shi--1860', 'Teahouses--Japan--Yokohama-shi--1860-1870'], 'item_format': ['1 print on hōsho paper (3 sheets) : woodcut, color ; 36.5 x 24.5 cm. (each block), 37 x 24.8 cm. (each sheet)'], 'language': ['jpn'], 'contributor': ['Ochiai, Yoshiiku, 1833-1904, artist.'], 'rights': 'No known restrictions on publication.', 'alternative_title': ['Five nations - merrymaking at

## NYPL Items

In [86]:
# Check that the expected NYPL metadata files are present.
# The pattern is built with os.path.join so the separator matches the host OS;
# a hard-coded backslash only matches on Windows.
nypl_json_pattern = os.path.join('nypl-item-metadata', '*.json')

file_count = 0
for file in glob.glob(nypl_json_pattern):
    file_count += 1
    print(file)

print('found', file_count, 'files')

nypl-item-metadata\00121360-c5d2-012f-3258-58d385a7bc34.json
nypl-item-metadata\0032a710-c5d3-012f-18d7-58d385a7bc34.json
nypl-item-metadata\005b5350-c5d2-012f-c31b-58d385a7bc34.json
nypl-item-metadata\007fbda0-c5d3-012f-dd3b-58d385a7bc34.json
nypl-item-metadata\058cdb20-c5d2-012f-089d-58d385a7bc34.json
nypl-item-metadata\05c1e9e0-c5d2-012f-a038-58d385a7bc34.json
nypl-item-metadata\0608ad80-c5d2-012f-7d97-58d385a7bc34.json
nypl-item-metadata\063b3b00-c5d2-012f-0b79-58d385a7bc34.json
nypl-item-metadata\06831e40-c5d2-012f-1705-58d385a7bc34.json
nypl-item-metadata\06a999f0-c5d3-012f-19e0-58d385a7bc34.json
nypl-item-metadata\06b8ad90-c5d2-012f-5f9e-58d385a7bc34.json
nypl-item-metadata\06dfc390-c5d3-012f-bbf9-58d385a7bc34.json
nypl-item-metadata\070969e0-c5d2-012f-a3f2-58d385a7bc34.json
nypl-item-metadata\072e50a0-c5d3-012f-2ba1-58d385a7bc34.json
nypl-item-metadata\073c37f0-c5d2-012f-42ca-58d385a7bc34.json
nypl-item-metadata\07804990-c5d2-012f-8ff7-58d385a7bc34.json
nypl-item-metadata\07b62

In [87]:
# Collect the NYPL metadata file paths into a list.
# os.path.join keeps the pattern portable (a literal backslash only works on
# Windows), and sorted() gives a stable order because glob makes no ordering promise.
list_of_nypl_item_metadata_files = sorted(
    glob.glob(os.path.join('nypl-item-metadata', '*.json'))
)

# last expression of the cell: shows how many files were collected
len(list_of_nypl_item_metadata_files)

693

In [88]:
# Crosswalk every NYPL item JSON file into one row of the collection CSV.
#
# Rows are APPENDED to the CSV so they follow whatever is already in it; the
# header is written only when the file does not exist yet or is empty. Files
# that cannot be read, parsed, or that lack a required piece are counted in
# error_count and skipped.
#
# NOTE(review): the JSON appears to hold a bare object where an element occurs
# once and a list where it repeats (the genre handling always had to cope with
# both) - confirm against the NYPL API documentation. The helpers below accept
# either shape for every field, so a single date/subject/physical description
# or a repeated titleInfo is no longer recorded as missing.

# file for csv to read out (same file the LOC rows are written to)
collection_info_csv = 'collection_items_data.csv'
file_count = 0
items_written = 0
error_count = 0

# set up a list for the columns in your csv;
# your goal should be to automate this, but . . .
# it works for demonstration as you set up the crosswalk
headers = ['item_type', 'date_uploaded', 'source_file', 'item_id', 'title', 'genre', 'source', 'date', 'place_of_origin', 'subject', 'item_format', 'language', 'contributor', 'rights', 'alternative_title']

# value recorded when an optional field is absent from an item's metadata
NOT_FOUND = 'Not found'

# date uploaded: the same fixed value is stamped on every row
date_uploaded = '2024-12-08'

# the item uuid is appended to this to form the 'source' link
source_base_url = 'https://digitalcollections.nypl.org/items/'


def _as_list(node):
    """Return [] for None, a list unchanged, and any other value wrapped in a list."""
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


def _text(node):
    """Return the '$' value of a dict element, or None when there is none."""
    if isinstance(node, dict):
        return node.get('$')
    return None


def _dig(node, *keys):
    """Follow `keys` through nested dicts; return None as soon as a step is missing."""
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return None
        node = node[key]
    return node


def _first_text(node, *keys):
    """Return the first '$' value found at `keys` below any entry of `node`, else None."""
    for entry in _as_list(node):
        for element in _as_list(_dig(entry, *keys)):
            text = _text(element)
            if text is not None:
                return text
    return None


def _child_texts(node):
    """Collect the '$' value of every child element of every entry of `node`.

    Children without a '$' value (plain strings, nested structures) are skipped
    rather than discarding the whole field.
    """
    texts = []
    for entry in _as_list(node):
        if not isinstance(entry, dict):
            continue
        for child in entry.values():
            for element in _as_list(child):
                text = _text(element)
                if text is not None:
                    texts.append(text)
    return texts


def build_nypl_row(item_data, source_file):
    """Map one parsed NYPL item JSON document onto the CSV columns in `headers`.

    Parameters:
        item_data: the object returned by json.load for one metadata file.
        source_file: path of that file, recorded in the row for checking purposes.

    Returns:
        dict with exactly the keys listed in `headers`. title, date, subject
        and item_format are lists of strings when found; genre is a list when
        the record holds a list and a string when it holds a single object;
        the remaining fields are single values. Missing optional fields are
        'Not found' ('Undetermined' for rights).

    Raises:
        KeyError / TypeError / AttributeError when the mods block or the
        request uuid is missing or has an unexpected shape.
    """
    starting_point = item_data['nyplAPI']['response']['mods']
    # make sure there's some unique and stable identifier
    item_id = item_data['nyplAPI']['request']['uuid']['$']

    title_entries = _as_list(starting_point.get('titleInfo'))
    origin_info = starting_point.get('originInfo')

    # item type
    item_type = _first_text(starting_point.get('typeOfResource'))

    # get all titles (one per titleInfo entry that carries a title)
    titles = []
    for entry in title_entries:
        found = _first_text(entry, 'title')
        if found is not None:
            titles.append(found)

    # get all genres; a single genre object stays a plain string, as before
    genre_node = starting_point.get('genre')
    if isinstance(genre_node, list):
        genre = [text for text in map(_text, genre_node) if text is not None]
        if not genre:
            genre = None
    else:
        genre = _text(genre_node)

    # get all dates
    dates = [
        text
        for origin in _as_list(origin_info)
        for text in map(_text, _as_list(_dig(origin, 'dateIssued')))
        if text is not None
    ]

    # language is read from the 'lang' entry of titleInfo, as the value stored there
    language = next(
        (entry['lang'] for entry in title_entries
         if isinstance(entry, dict) and 'lang' in entry),
        None,
    )

    # contributor: a name part when there is one, otherwise the publisher
    contributor = _first_text(starting_point.get('name'), 'namePart')
    if contributor is None:
        contributor = _first_text(origin_info, 'publisher')

    rights = _text(_dig(item_data, 'nyplAPI', 'response', 'rightsStatement'))

    def or_default(value, default=NOT_FOUND):
        # empty lists and None both count as "nothing found"
        return default if value is None or value == [] else value

    return {
        'item_type': or_default(item_type),
        'date_uploaded': date_uploaded,
        # for checking purposes, add in the source of the info
        'source_file': source_file,
        'item_id': item_id,
        'title': or_default(titles),
        'genre': or_default(genre),
        # link
        'source': source_base_url + item_id,
        'date': or_default(dates),
        'place_of_origin': or_default(_first_text(origin_info, 'place', 'placeTerm')),
        # get all topics
        'subject': or_default(_child_texts(starting_point.get('subject'))),
        # get all physical descriptions
        'item_format': or_default(_child_texts(starting_point.get('physicalDescription'))),
        'language': or_default(language),
        'contributor': or_default(contributor),
        'rights': or_default(rights, 'Undetermined'),
        # no alternative title is extracted for these items
        'alternative_title': NOT_FOUND,
    }


# only write the header when this cell is the first writer of the file
needs_header = (
    not os.path.exists(collection_info_csv)
    or os.path.getsize(collection_info_csv) == 0
)

# open the csv once for the whole run instead of once per item
with open(collection_info_csv, 'a', encoding='utf-8', newline='') as fout:
    writer = csv.DictWriter(fout, fieldnames=headers)
    if needs_header:
        writer.writeheader()

    for file in list_of_nypl_item_metadata_files:
        file_count += 1
        print('opening', file)

        # load the item data
        try:
            with open(file, 'r', encoding='utf-8') as data:
                item_data = json.load(data)
        except (OSError, ValueError) as err:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError
            print('error loading', file, '-', repr(err))
            error_count += 1
            continue

        # extract/name the data you want
        try:
            row_dict = build_nypl_row(item_data, str(file))
        except (KeyError, TypeError, AttributeError) as err:
            print('error extracting', file, '-', repr(err))
            error_count += 1
            continue

        print('created row dictionary:', row_dict)

        # write to the csv
        writer.writerow(row_dict)
        items_written += 1
        print('adding', row_dict['item_id'])

print('\n\n--- LOG ---')
print('wrote',collection_info_csv)
print('with',items_written,'items')
print(error_count,'errors (info not written)')

opening nypl-item-metadata\00121360-c5d2-012f-3258-58d385a7bc34.json
['14 x 9 cm.']
created row dictionary: {'item_type': 'still image', 'date_uploaded': '2024-12-08', 'source_file': 'nypl-item-metadata\\00121360-c5d2-012f-3258-58d385a7bc34.json', 'item_id': '00121360-c5d2-012f-3258-58d385a7bc34', 'title': ['Girl with parasol carrying infant on back.'], 'genre': ['Postcards', 'Genre photographs'], 'source': 'https://digitalcollections.nypl.org/items/00121360-c5d2-012f-3258-58d385a7bc34', 'date': ['1907', '1918'], 'place_of_origin': 'Made in Japan', 'subject': ['Japanese', 'Children', 'Parasols', 'Clothing & dress -- Japan', 'Infants'], 'item_format': ['14 x 9 cm.'], 'language': 'eng', 'contributor': 's.n.', 'rights': 'The New York Public Library believes that this item is in the public domain under the laws of the United States, but did not make a determination as to its copyright status under the copyright laws of other countries. This item may not be in the public domain under the la