# Extracting, Transforming, & Ingesting Metadata

#### Load Libraries

In [387]:
import requests 
import json 
import csv
import os

## Extracting Information

### Collection Metadata

*This endpoint searches items in the map format that have the subject "pictorial maps", returning up to 150 results.*

In [388]:
# Search URL, split into its base path and query parameters for readability:
# subject facet "pictorial maps", list view, up to 150 results.
endpoint_collection = (
    'https://www.loc.gov/maps/'
    '?fa=subject%3Apictorial+maps'
    '&st=list'
    '&c=150'
)

In [389]:
# Request the search results as JSON; `fo=json` is added to the query string
# that endpoint_collection already carries.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports an HTTP error here instead of letting it show
# up later as a JSON decoding failure.
r = requests.get(endpoint_collection, params={'fo': 'json'}, timeout=30)
r.raise_for_status()

In [390]:
# Decode the JSON body of the search response into Python objects.
pictorial_set = r.json()

In [391]:
# Save the raw search response so later cells can work from the file on disk.
# The encoding is given explicitly, matching the other open() calls in this notebook.
with open('pictorial-set.json', 'w', encoding='utf-8') as f:
    json.dump(pictorial_set, f, indent=2)

In [392]:
# Flatten the saved search results into a CSV with one row per resource:
# the parent item's title followed by that resource's image and url.
with open('pictorial-set.json', 'r', encoding='utf-8') as json_file:
    pictorial_set = json.load(json_file)

csv_file_path = 'pictorial_collection.csv'

extract_keys_resources = ["image", "url"]
extract_keys_item = ["title"]

headers = extract_keys_item + extract_keys_resources

with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(headers)

    for result in pictorial_set.get("results", []):
        # The item-level values are the same for every resource of a result,
        # so look them up once per result rather than once per resource.
        item_value = result.get("item", {})
        if not isinstance(item_value, dict):
            # A non-dict 'item' has no fields to read; fall back to blanks.
            item_value = {}
        row_item = [item_value.get(key, '') for key in extract_keys_item]

        for resource in result.get("resources", []):
            row_resources = [resource.get(key, '') for key in extract_keys_resources]
            csv_writer.writerow(row_item + row_resources)


### Item Metadata

In [383]:
# Download the metadata record for every resource listed in the collection CSV
# and save each record's 'item' object as its own JSON file under data/.
error_count = 0   # rows that could not be turned into a file
file_count = 0    # JSON files written
item_count = 0    # requests attempted

# os.makedirs() returns None, so the directory name is kept in its own variable.
data_directory = 'data'
os.makedirs(data_directory, exist_ok=True)
item_metadata_file_start = 'item_metadata'
json_suffix = '.json'

endpoint_item = 'https://www.loc.gov/resource/'
# Read the collection CSV by its literal name: csv_file_path is rebound to a
# different file by a later cell, so relying on it breaks when cells are re-run
# out of order.
collection_csv_path = 'pictorial_collection.csv'

with open(collection_csv_path, 'r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        url_value = row['url'] or ''
        # The resource id is the fifth '/'-separated part of the URL,
        # e.g. https://www.loc.gov/resource/<id>/
        url_parts = url_value.split('/')
        if len(url_parts) < 5 or not url_parts[4]:
            print('skipped row without a resource id:', repr(url_value))
            error_count += 1
            continue
        resource_id = url_parts[4]

        item_count += 1
        try:
            item_metadata = requests.get(endpoint_item + resource_id, params={'fo': 'json'}, timeout=30)
        except requests.RequestException as exc:
            # One failed connection should not abort the whole download run.
            print('request failed', endpoint_item + resource_id, exc)
            error_count += 1
            continue
        print('requested', item_metadata.url, item_metadata.status_code)
        if item_metadata.status_code != 200:
            error_count += 1
            continue
        try:
            # Parse the body once; a body that is not JSON raises ValueError,
            # and a payload without an 'item' object raises KeyError/TypeError.
            item_record = item_metadata.json()['item']
        except (ValueError, KeyError, TypeError):
            error_count += 1
            print('no json found')
            continue
        fout = os.path.join(data_directory, item_metadata_file_start + '-' + resource_id + json_suffix)
        with open(fout, 'w', encoding='utf-8') as json_file:
            json.dump(item_record, json_file)
        file_count += 1
        print('wrote', fout)

print('items requested:',item_count)
print('errors:',error_count)
print('files written:',file_count)

requested https://www.loc.gov/resource/g7823g.ct003370?fo=json 200
wrote data/item_metadata-g7823g.ct003370.json
requested https://www.loc.gov/resource/g3803o.la000541?fo=json 200
wrote data/item_metadata-g3803o.la000541.json
requested https://www.loc.gov/resource/gmdtitlecolfold.pa-184?fo=json 200
wrote data/item_metadata-gmdtitlecolfold.pa-184.json
requested https://www.loc.gov/resource/gmdtitlecolmaps.hi-013?fo=json 200
wrote data/item_metadata-gmdtitlecolmaps.hi-013.json
requested https://www.loc.gov/resource/gmdtitlecolfold.hi-013?fo=json 200
wrote data/item_metadata-gmdtitlecolfold.hi-013.json
requested https://www.loc.gov/resource/g7824h.ct003399?fo=json 200
wrote data/item_metadata-g7824h.ct003399.json
requested https://www.loc.gov/resource/g7823x.ct003373?fo=json 200
wrote data/item_metadata-g7823x.ct003373.json
requested https://www.loc.gov/resource/g3851a.ct004563?fo=json 200
wrote data/item_metadata-g3851a.ct004563.json
requested https://www.loc.gov/resource/gmdtitlecolfold

In [393]:
# Write one CSV row per search result: every field found in any result's
# 'item' dict, followed by the result-level image_url.
pictorial_results = pictorial_set.get('results', [])

csv_file_path = 'pictorial_metadata.csv'

extract_key = ["image_url"]

# Collect the union of keys across all 'item' dicts.
header_values = set()
for result in pictorial_results:
    item_value = result.get("item", "")
    if isinstance(item_value, dict):
        header_values.update(item_value.keys())

# Set iteration order can differ between runs; sorting gives a stable column order.
item_columns = sorted(header_values)
header = item_columns + extract_key

with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(header)

    for result in pictorial_results:
        item_value = result.get("item", {})
        if not isinstance(item_value, dict):
            # Same guard as the header pass: a non-dict 'item' contributes blanks.
            item_value = {}
        row = [item_value.get(key, '') for key in item_columns] + [result.get(key, '') for key in extract_key]
        csv_writer.writerow(row)

In [394]:
# Combine the per-item JSON files in data/ into a single CSV, one row per file.
csv_output_file = 'pictorial_metadata.csv'

json_data_list = []

# os.listdir() returns names in arbitrary order; sort so rows are reproducible.
for filename in sorted(os.listdir('data')):
    if not filename.endswith('.json'):
        continue
    json_file_path = os.path.join('data', filename)
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    item_data = data.get('item', {})
    item_data['image_url'] = data.get('image_url', '')

    # Flatten created_published into one comma-separated string. A bare string
    # is wrapped first so join() does not split it into single characters, and
    # a missing/None value becomes an empty string.
    created_published = item_data.get('created_published') or []
    if isinstance(created_published, str):
        created_published = [created_published]
    item_data['created_published'] = ', '.join(str(part) for part in created_published)

    json_data_list.append(item_data)

all_keys = set().union(*(item.keys() for item in json_data_list))
# Sorted so the column order does not depend on set iteration order.
fieldnames = sorted(all_keys)

with open(csv_output_file, 'w', newline='', encoding='utf-8') as csv_output:
    # restval='' fills in any column a given record does not have.
    csv_writer = csv.DictWriter(csv_output, fieldnames=fieldnames, restval='')

    csv_writer.writeheader()
    csv_writer.writerows(json_data_list)



## Transforming

| source field name | source field path/dict name | target        | target namespace | notes |
|-------------------|-----------------------------|---------------|------------------|-------|
| Title | item['title'] | dc:title | DC Element | notes |
| Date | item['date'] | dc:date | DC Element | notes |
| Creator | item['contributors'] | dc:creator | DC Element | notes |
| Type | item['format'] | dc:type | DC Element | notes |
| Subject | item['subjects'] | dc:subject | DC Element | notes |
| Coverage | item['location'] | dc:coverage | DC Element | notes |
| Extent | item['medium'] | dcterms:extent | DC Terms | notes |
| Description | item['notes'] | dc:description | DC Element | notes |
| Language | item['language'] | dc:language | DC Element | notes |
| Publisher | item['created_published'] | dc:publisher | DC Element | notes |
| Copyright Date | item['created_published'] | dc:publisher  | DC Element | notes |
| Identifier | item['digital_id'] | dc:identifier | DC Element | notes |


In [395]:
# Step 1: keep only the mapped source columns and rename them to the target headers.
# Step 2: copy the selected CSV to the "transform" file (values are currently
# written through unchanged).
input_csv_path = 'pictorial_metadata.csv'
output_csv_path = 'pictorial_metadata_select.csv'
transformed_csv_path = 'pictorial_metadata_transform.csv'

# source column name -> output header
header_mapping = {
    'format': 'Type',
    'medium': 'Extent',
    'date': 'Date',
    'image_url': 'Image_URL',
    'notes': 'Description',
    'digital_id': 'Identifier',
    'contributors': 'Creator',
    'created_published': 'Publisher',
    'language': 'Language',
    'location': 'Location',
    'subjects': 'Subject',
    'title': 'Title'
}

# DictWriter documents fieldnames as a sequence, so pass a list rather than a dict view.
selected_headers = list(header_mapping.values())

# newline='' on the readers as well: the csv module needs it to handle
# line breaks embedded in quoted fields correctly.
with open(input_csv_path, 'r', newline='', encoding='utf-8') as input_csv, open(output_csv_path, 'w', newline='', encoding='utf-8') as output_csv:
    reader = csv.DictReader(input_csv)
    writer = csv.DictWriter(output_csv, fieldnames=selected_headers)

    writer.writeheader()

    for row in reader:
        # .get(): a source column missing from the input yields a blank cell
        # instead of a KeyError.
        mapped_row = {new_header: row.get(old_header, '') for old_header, new_header in header_mapping.items()}
        writer.writerow(mapped_row)

print(f"New CSV file with selected headers saved at: {output_csv_path}")

with open(output_csv_path, 'r', newline='', encoding='utf-8') as input_csv, open(transformed_csv_path, 'w', newline='', encoding='utf-8') as output_csv:
    reader = csv.DictReader(input_csv)

    new_headers = list(header_mapping.values())
    writer = csv.DictWriter(output_csv, fieldnames=new_headers)

    writer.writeheader()

    for row in reader:
        mapped_row = {new_header: row[new_header] for new_header in new_headers}
        writer.writerow(mapped_row)

print(f"New CSV file with updated headers saved at: {transformed_csv_path}")


New CSV file with selected headers saved at: pictorial_metadata_select.csv
New CSV file with updated headers saved at: pictorial_metadata_transform.csv
