## Parse ArchivesSpace Resources via the API

### Import packages
- configparser: Implements a basic configuration language which provides a structure you can use to write Python programs which can be customized by end users.
- json: Exposes an API for JSON (JavaScript Object Notation).
- requests: An HTTP library.
- pandas: An open source data analysis and manipulation tool, built on top of the Python programming language.

In [5]:
import configparser
import json
import requests
import pandas as pd 

### Read Configuration File

In order to authenticate to ArchivesSpace and use the API, you need to supply a separate "config.ini" file (ignored by git) in the home directory that looks like this:

```
[ARCHIVESSPACE]
BaseURL = 
User = 
Password = 
RepositoryID = 
```

In [6]:
print('Reading Configuration File')
config = configparser.ConfigParser()

# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means config.ini was not found.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "Could not read 'config.ini'. Create it with an [ARCHIVESSPACE] "
        "section containing BaseURL, User, Password and RepositoryID.")

# Connection settings used by every later cell.
archivesspace_settings = config['ARCHIVESSPACE']
base_url = archivesspace_settings['BaseURL']
user = archivesspace_settings['User']
password = archivesspace_settings['Password']
repository_id = archivesspace_settings['RepositoryID']

Reading Configuration File


### Authenticate to ArchivesSpace

In [7]:
print('Authenticating to ArchivesSpace')
endpoint = '/users/' + user + '/login'
# NOTE(review): the password is sent as a URL query parameter, so it can end
# up in server or proxy logs. Consider sending it in the request body instead
# -- confirm the ArchivesSpace login endpoint accepts that before changing.
params = {'password': password}
response = requests.post(base_url + endpoint, params=params)
print(response.status_code)

# Stop here with a clear HTTP error on a failed login instead of failing
# later with a KeyError on the missing 'session' key.
response.raise_for_status()

response = response.json()
# Session token sent as the X-ArchivesSpace-Session header on later requests.
session_key = response['session']

Authenticating to ArchivesSpace
200


### Get Resource IDs

You can either get a list of _all_ resource IDs in an ArchivesSpace Repository, or supply a separate "resource_ids.txt" file in the home directory with one line for each Resource ID you want to parse.

In [8]:
resource_ids = []

# Keep asking until the answer is recognisable, so bool_value is always
# defined before it is used. An empty answer defaults to 'False'.
while True:
    value = input('I want to parse all Resource IDs in ArchivesSpace. Enter True or False: ') or 'False'
    answer = value.strip().lower()
    if answer in ('true', 'false'):
        bool_value = answer == 'true'
        break
    print("Invalid input. Please enter True or False.")

# Use the boolean value
if bool_value:
    print('Parsing all Resource Ids in ArchivesSpace.')
    print('  - GETing Resource IDs')
    endpoint = '/repositories/' + str(repository_id) + '/resources'
    headers = {'X-ArchivesSpace-Session': session_key}
    params = {'all_ids': True}
    response = requests.get(base_url + endpoint, headers=headers, params=params)
    print(response.status_code)

    # Do not treat an error payload as a list of IDs.
    response.raise_for_status()

    # Normalise to strings so both branches produce the same kind of ID
    # (the JSON response presumably holds integers -- confirm against the API).
    resource_ids = [str(resource_id) for resource_id in response.json()]

else:
    print('Parsing Resource IDs from text file.')
    with open('resource_ids.txt', mode='r') as f:
        # One ID per line; drop surrounding whitespace and skip blank lines.
        resource_ids = [line.strip() for line in f if line.strip()]

Parsing Resource IDs from text file.


### Parse Resources

In [None]:
results = []

# Every request in this cell is authenticated with the same session header.
headers = {'X-ArchivesSpace-Session': session_key}

# Subject term types that get their own id / name / source column group,
# in the order those groups appear in each result row.
subject_term_types = ('topical', 'genre_form', 'geographic')

# Agent record types that get their own column group (same ordering rule),
# mapped to the label used in the progress message.
agent_labels = {'people': 'Person Agent',
                'corporate_entities': 'Coporate Entity Agent',
                'families': 'Family Agent'}


def _first_subnote_content(note):
    """Return the 'content' of a note's first subnote, or '' if there is none."""
    subnotes = note.get('subnotes') or [{}]
    return subnotes[0].get('content', '')


for resource_id in resource_ids:
    # IDs may be strings with a trailing newline (text file) or non-string
    # values from a JSON response, so normalise before using them.
    resource_id = str(resource_id).strip()
    if not resource_id:
        # A blank ID would request the resource listing endpoint instead.
        continue

    print('  - GETing Resource ' + resource_id)
    endpoint = '/repositories/' + str(repository_id) + '/resources/' + resource_id
    response = requests.get(base_url + endpoint, headers=headers)
    print(response.status_code)
    if response.status_code != 200:
        # One bad ID should not abort the whole run.
        print('  - Skipping Resource ' + resource_id)
        continue

    resource = response.json()

    ## extract id
    eadid = resource.get('ead_id', '')

    # Extract titleproper
    # Drops the first 16 characters of the finding aid title -- presumably a
    # fixed "Finding aid for " prefix; TODO confirm against the data.
    titleproper = resource.get('finding_aid_title', '')[16:]

    ## Extract language
    language = resource.get('finding_aid_language_note', '').replace('<language encodinganalog="Language" langcode="eng">English.</language>', 'English.')

    ## Extract abstract, scopecontent and bioghist in a single pass.
    ## If a note type occurs more than once, the last occurrence wins.
    abstract = ''
    scopecontent = ''
    bioghist = ''
    for note in resource.get('notes', []):
        note_type = note.get('type')
        if note_type == 'abstract':
            abstract = (note.get('content') or [''])[0]
        elif note_type == 'scopecontent':
            scopecontent = _first_subnote_content(note)
        elif note_type == 'bioghist':
            bioghist = _first_subnote_content(note)

    ## Extract custodhist
    ## (not extracted yet)

    ## Extract controlaccess
    # One (ids, names, sources) triple of lists per column group.
    columns = {group: ([], [], [])
               for group in (*subject_term_types, *agent_labels)}

    for subject_ref in resource.get('subjects', []):
        subject_id = subject_ref['ref'].split('/')[-1]

        print('  - GETing Subject ' + str(subject_id))
        endpoint = '/subjects/' + str(subject_id)
        response = requests.get(base_url + endpoint, headers=headers)
        print(response.status_code)
        if response.status_code != 200:
            continue

        subject = response.json()

        # Only the first term decides which column group the subject joins.
        first_term = subject['terms'][0]
        if first_term['term_type'] in subject_term_types:
            ids, names, sources = columns[first_term['term_type']]
            ids.append(subject_id)
            names.append(first_term['term'])
            sources.append(subject.get('source', 'No Source'))

    for linked_agent in resource.get('linked_agents', []):
        # The last two path segments of the ref are used as the agent type
        # and the agent ID (e.g. ".../people/<id>").
        agent_type, linked_agent_id = linked_agent['ref'].split('/')[-2:]
        if agent_type not in agent_labels:
            continue

        print('  - GETing ' + agent_labels[agent_type] + ' ' + str(linked_agent_id))
        endpoint = '/agents/' + agent_type + '/' + str(linked_agent_id)
        response = requests.get(base_url + endpoint, headers=headers)
        print(response.status_code)
        if response.status_code != 200:
            continue

        # The first entry in 'names' supplies the sort name and source.
        first_name = response.json()['names'][0]
        ids, names, sources = columns[agent_type]
        ids.append(linked_agent_id)
        names.append(first_name['sort_name'])
        sources.append(first_name.get('source', 'No Source'))

    # Row layout: seven scalar fields, then ids / names / sources for each
    # subject group and each agent group, multi-values joined with '; '.
    result = [resource_id,
              eadid,
              titleproper,
              abstract,
              language,
              scopecontent,
              bioghist]
    for group in (*subject_term_types, *agent_labels):
        for values in columns[group]:
            result.append('; '.join(values))
    results.append(result)

# Assemble the column headers: seven scalar fields, followed by an
# "<entity>_ids", "<entity>s", "<entity>s_source" triple for every
# subject and agent category, in the same order as the result rows.
scalar_columns = ['resource_id', 'ead_id', 'titleproper', 'abstract',
                  'language', 'scopecontent', 'bioghist']
entity_prefixes = ['subject', 'genreform', 'geogname',
                   'persname', 'corpname', 'famname']

column_names = list(scalar_columns)
for prefix in entity_prefixes:
    column_names.extend([prefix + '_ids', prefix + 's', prefix + 's_source'])

# Load the collected rows into a pandas DataFrame
df = pd.DataFrame(results, columns=column_names)

print("Alright, we're done!")

### Write Results to CSV file

In [None]:
print('Writing Results to CSV file')

# The file name records which mode produced the rows: every resource in the
# repository, or only the IDs listed in the text file.
output_name = 'results-allIDs.csv' if bool_value else 'results-fromTextFile.csv'
df.to_csv(output_name, encoding='utf-8', index=False)