In [1]:
import os

import click
import requests

In [2]:
def get_all_packages(base_url, token, page_size=100, timeout=30):
    """Retrieve every public data package listed by the ESS-DIVE API.

    Pages through ``{base_url}/packages`` using the ``rowStart`` / ``pageSize``
    query parameters (with ``isPublic=true``) and accumulates the ``result``
    entries of each response.

    Args:
        base_url: Root URL of the API, without a trailing slash.
        token: Bearer token. If empty or None, the user is prompted for one.
        page_size: Number of rows requested per page.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list holding the ``result`` entries of all pages, in retrieval order.

    Raises:
        click.Abort: If any request fails (the error is echoed to stderr first).
    """
    packages_endpoint = f"{base_url}/packages"
    row_start = 1
    all_packages = []
    total_packages = None

    # Prompt for token if not provided
    if not token:
        token = click.prompt('Please enter your ESS-DIVE bearer token', hide_input=True)

    headers = {'Authorization': f'Bearer {token}'}

    try:
        while True:
            params = {
                'rowStart': row_start,
                'pageSize': page_size,
                'isPublic': 'true'
            }
            # Without a timeout a stalled connection would block this loop forever.
            response = requests.get(packages_endpoint, params=params, headers=headers, timeout=timeout)
            response.raise_for_status()

            data = response.json()
            page = data.get('result')

            # An empty page means there is nothing left to fetch.
            if not page:
                break

            # Remember the reported total from the first response that carries one.
            # A missing total stays None so that pagination continues until an
            # empty page instead of stopping after the first page.
            if total_packages is None:
                total_packages = data.get('total')

            all_packages.extend(page)
            total_label = '?' if total_packages is None else total_packages
            click.echo(f"Retrieved {len(page)} packages (total progress: {len(all_packages)}/{total_label})")

            # Stop as soon as the reported total has been reached.
            if total_packages is not None and len(all_packages) >= total_packages:
                break

            # Move to next page
            row_start += page_size

    except requests.exceptions.RequestException as e:
        click.echo(f"Error querying ESS-DIVE API: {e}", err=True)
        raise click.Abort() from e
    return all_packages

In [3]:
# Sandbox API root, with an empty placeholder for its bearer token.
token = ""
url = "https://api-sandbox.ess-dive.lbl.gov"

In [4]:
# Root URL of the production API.
prod_url = "https://api.ess-dive.lbl.gov"

# Bearer token passed to the API helpers by the cells below. It is read from
# the ESS_DIVE_TOKEN environment variable so the secret is never stored in the
# notebook; the value is an empty string when the variable is unset.
prod_token = os.environ.get("ESS_DIVE_TOKEN", "")


In [5]:
# Fetch the full package listing from the production API.
pkg_list = get_all_packages(base_url=prod_url, token=prod_token)

Retrieved 100 packages (total progress: 100/1200)
Retrieved 100 packages (total progress: 200/1200)
Retrieved 100 packages (total progress: 300/1200)
Retrieved 100 packages (total progress: 400/1200)
Retrieved 100 packages (total progress: 500/1200)
Retrieved 100 packages (total progress: 600/1200)
Retrieved 100 packages (total progress: 700/1200)
Retrieved 100 packages (total progress: 800/1200)
Retrieved 100 packages (total progress: 900/1200)
Retrieved 100 packages (total progress: 1000/1200)
Retrieved 100 packages (total progress: 1100/1200)
Retrieved 100 packages (total progress: 1200/1200)


In [7]:
# prompt: write a function to query ess-dive package API by id and return result

def get_package_by_id(base_url, package_id, token, timeout=30):
    """Query the ESS-DIVE API for a specific package by ID.

    Args:
        base_url: Root URL of the API, without a trailing slash.
        package_id: Identifier appended to ``{base_url}/packages/``.
        token: Bearer token sent in the ``Authorization`` header.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response, or None if the request failed,
        returned an error status, or did not contain valid JSON (the error is
        printed in that case).
    """
    packages_endpoint = f"{base_url}/packages/{package_id}"
    headers = {'Authorization': f'Bearer {token}'}

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(packages_endpoint, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        return response.json()

    # ValueError covers JSON decoding failures on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Error querying ESS-DIVE API: {e}")
        return None


In [8]:
# prompt: given a bounding box with northwest and southeast coordinates, return centroid. input is in the form: [{'@type': 'GeoCoordinates', 'name': 'Northwest', 'latitude': 66.952, 'longitude': -168.14}, {'@type': 'GeoCoordinates', 'name': 'Southeast', 'latitude': 64.03, 'longitude': -159.19}]. Check for name Northwest and Southeast on input. If only one coordinate is provided, use that as centroid. Returned object should be a tuple

def get_centroid(bounding_box):
    """
    Calculates the centroid of a bounding box defined by northwest and southeast coordinates.

    Args:
        bounding_box: A list of dictionaries, where each dictionary represents a GeoCoordinate.
                      It should contain items with 'name' as 'Northwest' and 'Southeast'.
                      A list with a single GeoCoordinate is treated as a point.

    Returns:
        A tuple representing the centroid coordinates (latitude, longitude), or None if input is invalid.
        For a box whose Northwest longitude is greater than its Southeast longitude
        (i.e. it crosses the antimeridian) the longitude is averaged across the
        180 degree line and wrapped to be no greater than 180.
    """
    if not isinstance(bounding_box, list):
        return None

    # Entries that are not dictionaries make the input invalid; return None
    # rather than failing on `.get` below.
    if not all(isinstance(coord, dict) for coord in bounding_box):
        return None

    if len(bounding_box) == 1:
        # Use the single coordinate as the centroid
        coordinate = bounding_box[0]
        latitude = coordinate.get('latitude')
        longitude = coordinate.get('longitude')
        if coordinate.get('@type') == 'GeoCoordinates' and latitude is not None and longitude is not None:
            return (latitude, longitude)
        return None

    northwest = None
    southeast = None
    for coord in bounding_box:
        if coord.get('name') == 'Northwest':
            northwest = coord
        elif coord.get('name') == 'Southeast':
            southeast = coord

    if not (northwest and southeast):
        return None
    if northwest.get('@type') != 'GeoCoordinates' or southeast.get('@type') != 'GeoCoordinates':
        return None

    try:
        centroid_lat = (northwest['latitude'] + southeast['latitude']) / 2

        west = northwest['longitude']
        east = southeast['longitude']
        if west > east:
            # The western edge lies east of the eastern edge only when the box
            # wraps around the antimeridian; unwrap before averaging.
            east += 360
        centroid_lon = (west + east) / 2
        if centroid_lon > 180:
            centroid_lon -= 360
        return (centroid_lat, centroid_lon)
    except (KeyError, TypeError):
        return None  # Handle cases where latitude/longitude are missing or not numbers


In [13]:
# Spot-check a single package; the returned value is displayed as the cell output.
get_package_by_id(
    base_url=prod_url,
    package_id='ess-dive-19e50064aae312c-20250304T004733888',
    token=prod_token,
)

{'id': 'ess-dive-19e50064aae312c-20250304T004733888',
 'viewUrl': 'https://data.ess-dive.lbl.gov/view/doi:10.15485/2526687',
 'url': 'https://api.ess-dive.lbl.gov/packages/ess-dive-19e50064aae312c-20250304T004733888',
 'next': None,
 'previous': 'https://api.ess-dive.lbl.gov/packages/ess-dive-c2f4ccc77b5ecb8-20250226T194451991',
 'dateUploaded': '2025-03-04T00:47:34.917Z',
 'dateModified': '2025-03-07T00:00:43.422Z',
 'isPublic': True,
 'citation': 'Carrero S; Fox P; Nico P (2025): Mineralogy of floodplain sediments from Meanders C, O, and Z in the East River Watershed, CO, USA. Watershed Function SFA. Dataset. doi:10.15485/2526687',
 'dataset': {'@context': 'http://schema.org/',
  '@type': 'Dataset',
  '@id': 'doi:10.15485/2526687',
  'name': 'Mineralogy of floodplain sediments from Meanders C, O, and Z in the East River Watershed, CO, USA',
  'description': ['This dataset includes bulk X-ray diffraction data from floodplain sediments collected as a part of the Watershed Function Scie

In [14]:
# prompt: for each package in pkg_list get package data and get centroids for all bounding boxes. create a dataframe with package_id, centroid latitude, centroid longitude

import pandas as pd

# Assuming pkg_list, get_package_by_id, get_centroid, prod_url, and prod_token are defined as in the provided code.

MAX_PACKAGES = 20  # Only the first 20 packages are processed, for demonstration


def _as_dicts(value):
    """Return the dict entries of a field that may be missing, a single object, or a list."""
    if isinstance(value, dict):
        return [value]
    if isinstance(value, (list, tuple)):
        return [item for item in value if isinstance(item, dict)]
    return []


def _description_text(description):
    """Join a list-valued description with newlines; pass a plain string through unchanged."""
    if description is None:
        return ''
    if isinstance(description, str):
        # Joining a string would insert a newline between every character.
        return description
    if isinstance(description, (list, tuple)):
        return '\n'.join(str(part) for part in description)
    return str(description)


def _researcher_record(person, role=None):
    """Build one associated-researcher entry; when role is None the person's 'jobTitle' is used."""
    name_parts = (person.get('givenName', ''), person.get('familyName', ''))
    return {
        'name': ' '.join(str(part) for part in name_parts if part),
        'email': person.get('email', ''),
        'affiliation': person.get('affiliation', ''),
        'role': person.get('jobTitle', '') if role is None else role,
    }


locations = []
datasets = []
for package in pkg_list[:MAX_PACKAGES]:
    package_id = package['id']
    print(f"Fetching {package_id}")

    package_details = get_package_by_id(prod_url, package_id, prod_token)
    if not package_details:
        continue

    dataset = package_details.get('dataset')
    if not isinstance(dataset, dict) or not dataset:
        continue

    # Named dataset_id so the builtin `id` is not shadowed.
    dataset_id = package_details.get('id', '')

    # Researchers are collected in the order: provider member, editor, creators.
    # Any of these may be absent, in which case it contributes no entries.
    # TODO (carried over from the original cell): avoid duplication -- the same
    # person may be listed under more than one role.
    provider = dataset.get('provider')
    provider_member = provider.get('member') if isinstance(provider, dict) else None
    associated_researchers = (
        [_researcher_record(person) for person in _as_dicts(provider_member)]
        + [_researcher_record(person, 'Editor') for person in _as_dicts(dataset.get('editor'))]
        + [_researcher_record(person, 'Creator') for person in _as_dicts(dataset.get('creator'))]
    )

    # The record is built inline so no loop-level `url` overwrites the
    # notebook-level `url` variable.
    datasets.append({
        'id': dataset_id,
        'data_source': "ESS-DIVE",
        'url': package_details.get('viewUrl', ''),
        'alternate_ids': dataset.get('alternateName', ''),
        'name': dataset.get('name', ''),
        'description': _description_text(dataset.get('description', '')),
        'date_uploaded': package_details.get('dateUploaded', ''),
        'award': dataset.get('award', ''),
        'citation': dataset.get('citation', ''),
        'doi': dataset.get('@id', ''),
        'associated_researchers': associated_researchers,
    })

    # One location row per spatial-coverage entry that yields a centroid.
    for place in _as_dicts(dataset.get('spatialCoverage')):
        centroid = get_centroid(place.get('geo', []))
        if centroid:
            locations.append({
                'dataset_id': dataset_id,
                'latitude': centroid[0],
                'longitude': centroid[1]
            })





Fetching ess-dive-a66bd8e860fd5d2-20250307T000457833
Fetching ess-dive-b8764d466d15255-20250307T000152589
Fetching ess-dive-3e3ca22cd60ddff-20250306T002220912
Fetching ess-dive-4d4964c3c447a6b-20250304T233919265
Fetching ess-dive-5492d7b0cb4dc5f-20250304T214840948
Fetching ess-dive-123affc5bf658e5-20250304T212031404
Fetching ess-dive-01e8ab563a78be1-20250304T211352752
Fetching ess-dive-8375485eb34dd97-20250304T073447486
Fetching ess-dive-19e50064aae312c-20250304T004733888
Fetching ess-dive-939db401757e797-20250303T155212496
Fetching ess-dive-5d67cc07bc39073-20250228T161607740
Fetching ess-dive-f7b4f8a695ba8e3-20250227T225501084
Fetching ess-dive-f265f7af37e0107-20250227T224411589
Fetching ess-dive-39f4b4f961f5e3a-20250227T223242161
Fetching ess-dive-aaa4e94dbaf22d1-20250227T184220226478
Fetching ess-dive-b2d3f6f16e89545-20250226T143322810527
Fetching ess-dive-0ca64974c5b9ef0-20250224T230909587
Fetching ess-dive-e976198fe417dbb-20250224T215718432
Fetching ess-dive-0c4579ec3e4bbad-202502

In [15]:

import json

# `datasets` is the list of dataset records built by the extraction cell above.

# Serialize the records as JSON text indented by four spaces
datasets_json = json.dumps(datasets, indent=4)

# Save the serialized text to datasets.json in the working directory
with open('datasets.json', mode='w') as datasets_file:
    datasets_file.write(datasets_json)


In [16]:

import json

# `locations` is the list of centroid records built by the extraction cell above.

# Serialize the centroid records as JSON text indented by four spaces
geodata_json = json.dumps(locations, indent=4)

# Save the serialized text to geodata.json in the working directory
with open('geodata.json', mode='w') as geodata_file:
    geodata_file.write(geodata_json)
