In [1]:
import requests
import json
import csv
import os
import ast

In [2]:
# loc.gov maps search URL, split into its query parameters for readability:
# search term "transit", facet filter online-format:image, list view, and
# c=50 (presumably the number of results per page -- confirm against the API docs).
endpoint_collection = (
    'https://www.loc.gov/maps/'
    '?q=transit'
    '&fa=online-format%3Aimage'
    '&st=list'
    '&c=50'
)

In [3]:
# Fetch the search results, asking for a JSON response via fo=json.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting it show up
# later as a confusing JSON-decoding failure.
r = requests.get(endpoint_collection, params={'fo': 'json'}, timeout=60)
r.raise_for_status()

In [4]:
# Decode the response body as JSON; later cells read its "results" list.
transit_set = r.json()

In [5]:
# Keep a pretty-printed snapshot of the raw search response on disk, then read
# that snapshot back so the CSV below is built from the saved file.
with open('transit-set.json', 'w') as snapshot_out:
    json.dump(transit_set, snapshot_out, indent=2)

with open('transit-set.json', 'r') as snapshot_in:
    pictorial_set = json.load(snapshot_in)

csv_file_path = 'transit_collection.csv'

# Columns: fields taken from each result's "item", then fields from each resource.
extract_keys_item = ["title"]
extract_keys_resources = ["image", "url"]
headers = [*extract_keys_item, *extract_keys_resources]

# One CSV row per (result, resource) pair; results without resources yield no rows.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as collection_csv:
    collection_writer = csv.writer(collection_csv)
    collection_writer.writerow(headers)

    for hit in pictorial_set.get("results", []):
        for res in hit.get("resources", []):
            res_values = [res.get(field, '') for field in extract_keys_resources]
            hit_item = hit.get("item", {})
            item_values = [hit_item.get(field, '') for field in extract_keys_item]
            collection_writer.writerow(item_values + res_values)

In [6]:
# Download the JSON metadata of every resource listed in the collection CSV and
# store each response's "item" object as data/item_metadata-<resource id>.json.
error_count = 0
file_count = 0
item_count = 0

data_directory = 'data'
item_metadata_file_start = 'item_metadata'
json_suffix = '.json'

endpoint_item = 'https://www.loc.gov/resource/'

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before the first write.
os.makedirs(data_directory, exist_ok=True)

with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        url_value = row['url']
        # The resource id is the 5th '/'-separated piece of the url, e.g.
        # https://www.loc.gov/resource/<id>/ -> index 4. Rows with an empty or
        # unexpected url are counted as errors instead of raising IndexError.
        url_parts = (url_value or '').split('/')
        if len(url_parts) < 5 or not url_parts[4]:
            print('skipping row, no resource id in url:', repr(url_value))
            error_count += 1
            continue
        resource_id = url_parts[4]  # not named `id`, which would shadow the builtin

        # Count every attempted request so "items requested" equals
        # errors + files written for rows that had a usable url.
        item_count += 1
        try:
            item_metadata = requests.get(endpoint_item + resource_id, params={'fo': 'json'}, timeout=60)
        except requests.RequestException as exc:
            # Network-level failure (timeout, DNS, connection reset): record it
            # and move on rather than aborting the whole download loop.
            print('request failed', endpoint_item + resource_id, exc)
            error_count += 1
            continue
        print('requested',item_metadata.url,item_metadata.status_code)
        if item_metadata.status_code != 200:
            error_count += 1
            continue
        try:
            # Parse once and reuse; invalid JSON raises a ValueError subclass.
            payload = item_metadata.json()
        except ValueError:
            error_count += 1
            print('no json found')
            continue
        # Check for the "item" object before opening the output file so a
        # response without it does not leave an empty file behind.
        if not isinstance(payload, dict) or 'item' not in payload:
            error_count += 1
            print('no item found')
            continue
        fout = os.path.join(data_directory, item_metadata_file_start + '-' + resource_id + json_suffix)
        with open(fout, 'w', encoding='utf-8') as json_file:
            json_file.write(json.dumps(payload['item']))
        file_count += 1
        print('wrote', fout)

print('items requested:',item_count)
print('errors:',error_count)
print('files written:',file_count)

requested https://www.loc.gov/resource/g3804n.ct002210?fo=json 200
wrote data/item_metadata-g3804n.ct002210.json
requested https://www.loc.gov/resource/g3931p.rr004080?fo=json 200
wrote data/item_metadata-g3931p.rr004080.json
requested https://www.loc.gov/resource/g3804n.ct006628?fo=json 200
wrote data/item_metadata-g3804n.ct006628.json
requested https://www.loc.gov/resource/g3893r.cwh00324?fo=json 200
wrote data/item_metadata-g3893r.cwh00324.json
requested https://www.loc.gov/resource/g3804n.ct006631?fo=json 200
wrote data/item_metadata-g3804n.ct006631.json
requested https://www.loc.gov/resource/g3804n.ct006633?fo=json 200
wrote data/item_metadata-g3804n.ct006633.json
requested https://www.loc.gov/resource/g4852n.ma001019?fo=json 200
wrote data/item_metadata-g4852n.ma001019.json
requested https://www.loc.gov/resource/g3764b.ct006009?fo=json 200
wrote data/item_metadata-g3764b.ct006009.json
requested https://www.loc.gov/resource/g3764b.ct006008?fo=json 200
wrote data/item_metadata-g376

In [7]:
# Flatten the "item" object of every search result into one CSV row, with the
# result-level image_url appended as the last column.
transit_results = transit_set.get('results', [])

# NOTE: this rebinds csv_file_path, which the item-download cell also reads;
# re-running that cell after this one would read this file instead of the
# collection CSV.
csv_file_path = 'transit_metadata.csv'

extract_key = ["image_url"]

with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # First pass: collect every key that appears in any result's "item" dict.
    header_values = set()
    for result in transit_results:
        item_value = result.get("item", "")
        if isinstance(item_value, dict):
            header_values.update(item_value.keys())

    # Sort the keys so the column order is the same on every run; iterating the
    # set directly gives an order that can change between kernel sessions.
    item_columns = sorted(header_values)
    header = item_columns + extract_key
    csv_writer.writerow(header)

    # Second pass: one row per result, blank where a key is missing.
    for result in transit_results:
        item_value = result.get("item", {})
        if not isinstance(item_value, dict):
            # Same guard as the first pass: a non-dict "item" contributes no
            # item columns instead of raising AttributeError on .get().
            item_value = {}
        row = [item_value.get(key, '') for key in item_columns] + [result.get(key, '') for key in extract_key]
        csv_writer.writerow(row)

In [8]:
# Combine the saved per-item JSON files into a single CSV: one row per file,
# one column per key found in any file's nested "item" object.
csv_output_file = 'transit_metadata.csv'

json_data_list = []

# os.listdir returns names in arbitrary order; sort for a reproducible row order.
for filename in sorted(os.listdir('data')):
    if filename.endswith('.json'):
        json_file_path = os.path.join('data', filename)
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        item_data = data.get('item', {})
        # image_url sits next to "item" in the file, so copy it into the row.
        item_data['image_url'] = data.get('image_url', '')
        # created_published is joined into one string. Wrap a bare string in a
        # list first (joining a str directly would put ', ' between every
        # character) and treat a missing/null value as empty.
        created_published = item_data.get('created_published') or []
        if isinstance(created_published, str):
            created_published = [created_published]
        item_data['created_published'] = ', '.join(map(str, created_published))
        json_data_list.append(item_data)

all_keys = set().union(*(item.keys() for item in json_data_list))
# DictWriter expects a sequence of field names; a sorted list also keeps the
# column order stable between runs, which iterating a set does not guarantee.
fieldnames = sorted(all_keys)

with open(csv_output_file, 'w', newline='', encoding='utf-8') as csv_output:
    csv_writer = csv.DictWriter(csv_output, fieldnames=fieldnames)

    csv_writer.writeheader()

    for data in json_data_list:
        csv_writer.writerow({key: data.get(key, '') for key in fieldnames})

In [9]:
# Reduce the full metadata CSV to the columns listed in header_mapping and
# rename them, then copy the result to the "transform" file used downstream.
input_csv_path = 'transit_metadata.csv'
output_csv_path = 'transit_metadata_select.csv'
transformed_csv_path = 'transit_metadata_transform.csv'

# source column name -> output column name
header_mapping = {
    'format': 'Type',
    'medium': 'Extent',
    'date': 'Date',
    'image_url': 'Image_URL',
    'notes': 'Description',
    'digital_id': 'Identifier',
    'contributors': 'Creator',
    'created_published': 'Publisher',
    'language': 'Language',
    'location': 'Location',
    'subjects': 'Subject',
    'title': 'Title'
}

# Pass 1: keep only the mapped columns, written under their new names.
# newline='' on the input as well, as the csv module requires, so line breaks
# inside quoted fields are read back unchanged.
with open(input_csv_path, 'r', newline='', encoding='utf-8') as input_csv, open(output_csv_path, 'w', newline='', encoding='utf-8') as output_csv:
    reader = csv.DictReader(input_csv)
    writer = csv.DictWriter(output_csv, fieldnames=list(header_mapping.values()))

    writer.writeheader()

    for row in reader:
        # .get() leaves the cell blank when the input has no such column
        # (no item carried that key) instead of raising KeyError.
        mapped_row = {new_header: row.get(old_header, '') for old_header, new_header in header_mapping.items()}
        writer.writerow(mapped_row)

print(f"New CSV file with selected headers saved at: {output_csv_path}")

# Pass 2: the input and output headers are identical here, so this writes an
# unchanged copy of the selected columns to transformed_csv_path.
with open(output_csv_path, 'r', newline='', encoding='utf-8') as input_csv, open(transformed_csv_path, 'w', newline='', encoding='utf-8') as output_csv:
    reader = csv.DictReader(input_csv)

    new_headers = list(header_mapping.values())
    writer = csv.DictWriter(output_csv, fieldnames=new_headers)

    writer.writeheader()

    for row in reader:
        mapped_row = {new_header: row[new_header] for new_header in new_headers}
        writer.writerow(mapped_row)

print(f"New CSV file with updated headers saved at: {transformed_csv_path}")

New CSV file with selected headers saved at: transit_metadata_select.csv
New CSV file with updated headers saved at: transit_metadata_transform.csv


In [10]:
def convert_value_to_string(key, value):
    """Turn a CSV cell holding the text form of a Python list into plain text.

    Args:
        key: Column name of the cell. 'Image_URL' is treated specially.
        value: Cell content, normally a string.

    Returns:
        str: For a string that parses as a list/tuple literal:
            - column 'Image_URL': the first element, or the original text when
              the sequence is empty;
            - any other column: the elements joined with '/'.
        Every other value (plain text, numbers, dicts, unparsable text,
        non-string objects) is returned as ``str(value)``.
    """
    if not isinstance(value, str):
        return str(value)

    try:
        # literal_eval only evaluates Python literals; it never executes code.
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Besides ValueError/SyntaxError, literal_eval is documented to raise
        # TypeError (e.g. '{[1]: 2}', an unhashable dict key), MemoryError and
        # RecursionError on malformed input; such cells are ordinary text.
        return value

    if not isinstance(parsed, (list, tuple)):
        return value
    if key == 'Image_URL':
        # Only the first URL is kept; an empty list is left as written.
        return str(parsed[0]) if parsed else value
    return '/'.join(map(str, parsed))

# Rewrite the transformed CSV with every list-valued cell flattened to plain
# text by convert_value_to_string; headers are copied from the input file.
input_csv_path = 'transit_metadata_transform.csv'
output_csv_path = 'transit_metadata_transform_extract.csv'

# newline='' on the input too, as the csv module requires, so line breaks
# inside quoted fields are read back exactly as they were written.
with open(input_csv_path, 'r', newline='', encoding='utf-8') as input_csv, open(output_csv_path, 'w', newline='', encoding='utf-8') as output_csv:
    reader = csv.DictReader(input_csv)
    writer = csv.DictWriter(output_csv, fieldnames=reader.fieldnames)

    writer.writeheader()

    for row in reader:
        transformed_row = {key: convert_value_to_string(key, value) for key, value in row.items()}
        writer.writerow(transformed_row)

print(f"New CSV file with extracted string from list: {output_csv_path}")

New CSV file with extracted string from list: transit_metadata_transform_extract.csv


In [11]:
def get_credential(json_file, key, sub_key):
    """Look up ``data[key][sub_key]`` in a JSON file.

    Args:
        json_file: Path of the JSON file to read.
        key: Top-level key in the file.
        sub_key: Key inside the object stored under ``key``.

    Returns:
        The stored value, or None when anything goes wrong (missing file,
        invalid JSON, missing key); in that case the error is printed.
    """
    try:
        with open(json_file) as handle:
            secrets = json.load(handle)
        return secrets[key][sub_key]
    except Exception as e:
        print("Error: ", e)
        return None

# Root of the Omeka S REST API; resource names are appended to it.
baseURL = 'http://amsklar.projectst.si.umich.edu/omekas/omeka-s/api/'

# Both API keys are stored under "omeka" in the local secrets file, so they are
# not hardcoded in the notebook. Either value is None if its lookup fails.
key_identity, key_credential = (
    get_credential('data/json/secrets.json', 'omeka', secret_name)
    for secret_name in ('key_identity', 'key_credential')
)

# Passed as query parameters on every API request.
omekas_credentials = dict(key_identity=key_identity, key_credential=key_credential)

In [12]:
# Create a new item set titled "Transit Maps" through the Omeka S API.
resource_type = 'item_sets'
headers = {'Content-type': 'application/json'}

# The request body is built from a Python structure; with json.dumps' default
# separators this serializes to the same text as the equivalent hand-written
# JSON string with these keys in this order.
item_set_payload = {
    "dcterms:title": [
        {
            "type": "literal",
            "property_label": "Title",
            "@value": "Transit Maps",
            "property_id": 1,
        }
    ]
}
json_item_set = json.dumps(item_set_payload)

item_create = requests.post(
    baseURL + resource_type,
    params=omekas_credentials,
    headers=headers,
    data=json_item_set,
)

# Show the HTTP status and the raw response body returned by the server.
print(item_create.status_code)
print(item_create.text)

200
{"@context":"http:\/\/amsklar.projectst.si.umich.edu\/omekas\/omeka-s\/api-context","@id":"http:\/\/amsklar.projectst.si.umich.edu\/omekas\/omeka-s\/api\/item_sets\/1145","@type":"o:ItemSet","o:id":1145,"o:is_public":true,"o:owner":{"@id":"http:\/\/amsklar.projectst.si.umich.edu\/omekas\/omeka-s\/api\/users\/1","o:id":1},"o:resource_class":null,"o:resource_template":null,"o:thumbnail":null,"o:title":"Transit Maps","thumbnail_display_urls":{"large":null,"medium":null,"square":null},"o:created":{"@value":"2023-12-11T15:33:53+00:00","@type":"http:\/\/www.w3.org\/2001\/XMLSchema#dateTime"},"o:modified":{"@value":"2023-12-11T15:33:53+00:00","@type":"http:\/\/www.w3.org\/2001\/XMLSchema#dateTime"},"o:is_open":false,"o:items":{"@id":"http:\/\/amsklar.projectst.si.umich.edu\/omekas\/omeka-s\/api\/items?item_set_id=1145"},"dcterms:title":[{"type":"literal","property_id":1,"property_label":"Title","is_public":true,"@value":"Transit Maps"}]}
