In [16]:
import json
from os.path import basename, join

from smart_open import open as smart_open
import boto3
from tqdm import tqdm

# Code from https://alexwlchan.net/2017/07/listing-s3-keys/
def get_matching_s3_objects(bucket, prefix='', suffix='',
                            request_payer='None'):
    """
    Generate objects in an S3 bucket.

    Pages through ``list_objects_v2`` and yields the metadata dict of every
    listed object whose key starts with ``prefix`` and ends with ``suffix``.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with
        this prefix (optional). A single string is also sent to S3 so the
        filtering happens server-side; a tuple of strings is filtered
        client-side only.
    :param suffix: Only fetch objects whose keys end with
        this suffix (optional). May be a string or a tuple of strings.
    :param request_payer: Value for the ``RequestPayer`` request parameter
        (for example ``'requester'``). The default ``'None'``, as well as
        ``None`` and ``''``, mean "do not send the parameter".
    :return: Generator over the object dicts found under ``'Contents'``.
    """
    s3 = boto3.client('s3')
    kwargs = {'Bucket': bucket}

    # The legacy default is the *string* 'None', which is not a meaningful
    # RequestPayer value. Only forward the parameter when the caller
    # actually supplied one.
    if request_payer not in (None, '', 'None'):
        kwargs['RequestPayer'] = request_payer

    # If the prefix is a single string (not a tuple of strings), we can
    # do the filtering directly in the S3 API.
    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix

    while True:

        # The S3 API response is a large blob of metadata.
        # 'Contents' contains information about the listed objects;
        # it is absent when nothing was listed, in which case we stop.
        resp = s3.list_objects_v2(**kwargs)
        if 'Contents' not in resp:
            return

        for obj in resp['Contents']:
            key = obj['Key']
            # str.startswith / str.endswith accept a tuple of candidates,
            # so this also covers the tuple-prefix case.
            if key.startswith(prefix) and key.endswith(suffix):
                yield obj

        # The S3 API is paginated, returning up to 1000 keys at a time.
        # Pass the continuation token into the next request, until we
        # reach the final page (when this field is missing).
        if 'NextContinuationToken' not in resp:
            break
        kwargs['ContinuationToken'] = resp['NextContinuationToken']

def get_json(uri):
    """Open ``uri`` with smart_open and return its parsed JSON content."""
    with smart_open(uri) as stream:
        parsed = json.loads(stream.read())
    return parsed

def save_json(json_dict, uri):
    """Serialize ``json_dict`` as JSON and write it to ``uri`` via smart_open."""
    with smart_open(uri, 'w') as stream:
        stream.write(json.dumps(json_dict))

In [14]:
# List every HUC8 extract file on S3 whose name starts with "02".
# That is the set lying within HUC2 02 (which is in the Mid-Atlantic region).

huc8_root_uri = 's3://azavea-noaa-hydro-data/noaa/huc8-extracts/transformed/'
huc8_objects = get_matching_s3_objects(
    'azavea-noaa-hydro-data', 'noaa/huc8-extracts/transformed/02', 'json')
huc8_fns = [basename(obj['Key']) for obj in huc8_objects]

# Pull the 'comids' list out of the first feature of each extract and
# accumulate them into one flat list.
reach_ids = []
for huc8_fn in tqdm(huc8_fns):
    huc8_dict = get_json(join(huc8_root_uri, huc8_fn))
    first_feature = huc8_dict['features'][0]
    reach_ids += first_feature['properties']['comids']

reach_ids = sorted(reach_ids)

100%|██████████| 88/88 [00:33<00:00,  2.65it/s]


[1748535,
 1748537,
 1748539,
 1748541,
 1748543,
 1748545,
 1748547,
 1748549,
 1748551,
 1748553,
 1748555,
 1748557,
 1748559,
 1748561,
 1748563,
 1748565,
 1748567,
 1748569,
 1748571,
 1748573,
 1748575,
 1748577,
 1748579,
 1748581,
 1748583,
 1748585,
 1748587,
 1748589,
 1748591,
 1748593,
 1748595,
 1748597,
 1748599,
 1748601,
 1748603,
 1748605,
 1748607,
 1748609,
 1748611,
 1748613,
 1748615,
 1748617,
 1748619,
 1748621,
 1748623,
 1748625,
 1748627,
 1748629,
 1748631,
 1748633,
 1748635,
 1748637,
 1748639,
 1748641,
 1748643,
 1748645,
 1748647,
 1748649,
 1748651,
 1748707,
 1748709,
 1748711,
 1748713,
 1748715,
 1748717,
 1748719,
 1748723,
 1748725,
 1748727,
 1748729,
 1748731,
 1748733,
 1748735,
 1748737,
 1748739,
 1748741,
 1748743,
 1748745,
 1748747,
 1748749,
 1748751,
 1748753,
 1748755,
 1748757,
 1748759,
 1748761,
 1748763,
 1748765,
 1748767,
 1748769,
 1748771,
 1748773,
 1748775,
 1748777,
 1748779,
 1748781,
 1748783,
 1748785,
 1748787,
 1749159,


In [17]:
# Write the collected reach ids to S3 as a JSON object keyed by 'comids'.
out_uri = 's3://azavea-noaa-hydro-data/noaa/huc2-comids.json'
comids_payload = {'comids': reach_ids}
save_json(comids_payload, out_uri)

TypeError: save_json() missing 1 required positional argument: 'uri'