In [1]:
import urllib.request
import urllib.parse
import json
import time
import gzip


National Archives explains the following:

For example, if you would like to construct a very specific search across all series for records with a "Top Secret" security classification, and the term "Army" in the description's title:

https://catalog.archives.gov/api/v1/?description.series.accessRestriction.specificAccessRestrictionArray.specificAccessRestriction.securityClassification.termName=top%20secret&description.series.title=army


In [2]:
my_fields = ['naId', 'scopeAndContentNote', 'title', 'recordHistory', 
             'physicalOccurrenceArray', 'localIdentifier',
             'variantControlNumberArray']

def record_yielder(step = 20, skip = 0, start_with = None):
    """
    Using the national Archives 
    """
    cursorMark = '*'
    completed = 0

    if start_with is not None:
        # Allow picking up from a certain point.
        cursorMark = [g.replace("cursorMark=", "") for g in start_with.split("&") if g.startswith("cursorMark")][0]
        completed = int(cursorMark.split("-")[-1])
    
    url = "https://catalog.archives.gov/api/v1/?"
    # I have determined that I only care about these fields--especially 'scopeAndContentNote'

    values = {
        'description.fileUnit.parentSeries.naId': '4488912', 
        "cursorMark": cursorMark,
        "rows": str(step)
    }
    not_done_yet = True
    round = 1
    while not_done_yet:
        data = urllib.parse.urlencode(values)
        my_fields_detailed_path = ["description.fileUnit." + f for f in my_fields]
        
        fast_forward = completed + step < skip
        if fast_forward:
            # As described in the API documentation
            my_fields_detailed_path = ["num"]
            
        data = data + "&resultFields=" + ",".join(my_fields_detailed_path)
        print(url + data)
        try:
            with urllib.request.urlopen(url + data) as response:
                page = json.loads(response.read())
        except urllib.error.HTTPError as e:
            ResponseData = e.read().decode("utf8", 'ignore')
            print(ResponseData)
            raise

        completed += step
        results = page['opaResponse']['results']['result']
        for record in results:
            if not fast_forward:
                yield record
        round += 1
        # NARA doesn't specify their rate limits.
        time.sleep(2)
        values['cursorMark'] = page['opaResponse']['results']['nextCursorMark']
        not_done_yet = page['opaResponse']['results']['total'] > page['opaResponse']['results']['offset'] + page['opaResponse']['results']['rows']


In [3]:
urllib.error.HTTPError

urllib.error.HTTPError

In [4]:
def parse_record(record):
    record_information = {}
    description = record['description']['fileUnit']
    for key in my_fields:
        record_information[key] = description[key]
    return record_information

In [8]:
with gzip.open("all_records_4488912.txt.gz", "at") as fout:
    for record in record_yielder(3000, skip = 581000):
        fout.write(json.dumps(record) + "\n")

https://catalog.archives.gov/api/v1/?description.fileUnit.parentSeries.naId=4488912&cursorMark=%2A&rows=3000&resultFields=num
https://catalog.archives.gov/api/v1/?description.fileUnit.parentSeries.naId=4488912&cursorMark=AoMIP4AAAHjavIDW8gIuZGVzYy0xNTg1NTA5MjA%3D-3000&rows=3000&resultFields=num
https://catalog.archives.gov/api/v1/?description.fileUnit.parentSeries.naId=4488912&cursorMark=AoMIP4AAAHi6nYDW8gIuZGVzYy0xNTg1NDgwMDU%3D-6000&rows=3000&resultFields=num
https://catalog.archives.gov/api/v1/?description.fileUnit.parentSeries.naId=4488912&cursorMark=AoMIP4AAAHid%2Ff%2FV8gIuZGVzYy0xNTg1NDQ5NDU%3D-9000&rows=3000&resultFields=num
https://catalog.archives.gov/api/v1/?description.fileUnit.parentSeries.naId=4488912&cursorMark=AoMIP4AAAHiM2f%2FV8gIuZGVzYy0xNTg1NDE5NjA%3D-12000&rows=3000&resultFields=num
https://catalog.archives.gov/api/v1/?description.fileUnit.parentSeries.naId=4488912&cursorMark=AoMIP4AAAHjyt%2F%2FV8gIuZGVzYy0xNTg1Mzg5Mjg%3D-15000&rows=3000&resultFields=num
https://cata

KeyError: 'nextCursorMark'