In [1]:
from pathlib import Path

import pandas as pd
import requests
from IPython.display import display

## **IMPORT DATA VIA API**

This notebook connects to the NWO API, extracts the data for projects, members and products, and saves them to separate pickle files.

## **HOW TO USE** <a class="anchor" id="1"></a>
1. Set the number of pages below or keep it at 0 to collect all available data.
2. Run the entire notebook to produce the pkl files.

In [None]:
# Number of pages to fetch (set to 0 to fetch all pages).
# Passed to fetch_all_data as `nr_of_pages`; any non-zero value stops the
# download once that page number has been retrieved.
NR_OF_PAGES = 0

In [21]:
# Function to fetch data from NWOpen-API
def fetch_data_from_nwopen_api(base_url, params, timeout=30):
    """Send a GET request to the NWOpen-API and return the decoded JSON body.

    Args:
        base_url: URL of the API endpoint.
        params: Query parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The parsed JSON payload of a 200 response.

    Raises:
        requests.HTTPError: On a 4xx/5xx response, and on any other non-200
            status, so the caller never receives ``None`` instead of data.
        requests.Timeout: When the server does not answer within ``timeout``.
    """
    response = requests.get(base_url, params=params, timeout=timeout)

    # Raises for 4xx/5xx responses.
    response.raise_for_status()

    # raise_for_status() lets other non-200 codes (e.g. 204) through; fail
    # loudly here instead of implicitly returning None.
    if response.status_code != 200:
        raise requests.HTTPError(
            f'Unexpected status code {response.status_code} from {response.url}',
            response=response,
        )

    return response.json()

# Field names copied from each NWOpen-API record into the flat row dicts.
# The tuple order is the column order of the resulting DataFrames.
_PROJECT_FIELDS = (
    'project_id',         # String (100) File number of the project; a unique combination of numbers, letters and/or punctuation marks identifying the file.
    'grant_id',           # String (100)
    'parent_project_id',  # String (100) Programme, when there is a programme with underlying projects.
    'title',              # String (255) Title of the project.
    'funding_scheme_id',  # String (50) ID of the call under which the project falls.
    'funding_scheme',     # String (255) Name of the call under which the project falls.
    'department',         # String (255) NWO Domain under which the funding has been allocated to the project.
    'sub_department',     # String (255) NWO Sub-domain under which the funding has been allocated to the project.
    'start_date',         # Date (YYYY-MM-DD) Actual start date of the project.
    'end_date',           # Date (YYYY-MM-DD) Actual completion date of the project.
    'summary_nl',         # String (8000) Scientific summary of the project in Dutch.
    'summary_en',         # String (8000) Scientific summary of the project in English.
    'summary_update',     # Array of summary updates, stored as returned by the API.
    'project_members',    # Array with details of the project members, stored as returned by the API.
    'products',           # Array with details of products originated from the project, stored as returned by the API.
)

_MEMBER_FIELDS = (
    'role',                 # String (255) Role within the project: Main Applicant, Co-applicant, Project leader, Researcher.
    'member_id',            # Integer ID for the project member.
    'orcid',                # String (255) ORCID of the project member (available once NWO's data is enriched with ORCID).
    'last_name',            # String (75) Last name of the project member.
    'degree_pre_nominal',   # String (500) Pre nominal titles.
    'degree_post_nominal',  # String (500) Post nominal titles.
    'initials',             # String (16) Initials of the project member.
    'first_name',           # String (75) First name of the project member.
    'prefix',               # String (75) Insert before the name.
    'dai',                  # String (255) Digital Author Identifier.
    'organisation',         # String (500) Organisation where the project member is working for the project.
    'organisation_id',      # Integer Identifier for the organisation.
    'ror',                  # String (255) ROR-id of the member's organisation (available once NWO's data is enriched with ROR).
    'active',               # String (3) Indication whether the project member is still working on the project.
)

_PRODUCT_FIELDS = (
    'isbn',             # String (20) ISBN of the product.
    'doi',              # String (255) DOI of the product (available once NWO's data is enriched with DOIs).
    'title',            # String (8000) Title of publication.
    'sub_title',        # String (8000) Subtitle of the publication.
    'year',             # Integer Year of publication.
    'city',             # String (50) City of publication.
    'edition',          # String (30) Edition of the publication.
    'start',            # Integer Page in medium where the publication starts.
    'end',              # Integer Page in medium with the last page of the publication.
    'type',             # String (50) Type of the publication.
    'url_open_access',  # String (1000) Open access link to publication.
    'publisher',        # String (255) Publisher.
    'journal_title',    # String (255) Title of the journal of publication.
    'authors',          # Array with details of the authors of the product, stored as returned by the API.
)

_AUTHOR_FIELDS = (
    'last_name',            # String (75) Last name of the author.
    'degree_pre_nominal',   # String (500) Pre nominal titles.
    'degree_post_nominal',  # String (500) Post nominal titles.
    'initials',             # String (16) Initials of the author.
    'first_name',           # String (75) First name of the author.
    'prefix',               # String (75) Insert before the name.
    'dai',                  # String (255) Digital Author Identifier.
)


def _pick_fields(record, fields):
    """Return a dict keyed by `fields`, with None for keys missing from `record`."""
    return {field: record.get(field) for field in fields}


# Function to process the API response and structure it into DataFrames
def process_response(data, projects, members, products, authors=None):
    """Flatten one page of API output into row dicts for projects, members and products.

    Args:
        data: Decoded JSON of one response page; its 'projects' entry is read.
        projects: List that receives one row dict per project (extended in place).
        members: List that receives one row dict per project member (extended in place).
        products: List that receives one row dict per product (extended in place).
        authors: Optional list. When given, one row dict per product author is
            appended to it. When omitted, authors are only available through
            the nested 'authors' value stored on each product row.

    Returns:
        The tuple ``(projects, members, products)`` - the same list objects
        that were passed in.
    """
    # `or []` also covers keys that are present but hold a JSON null.
    for project in data.get('projects') or []:
        projects.append(_pick_fields(project, _PROJECT_FIELDS))

        for member in project.get('project_members') or []:
            members.append(_pick_fields(member, _MEMBER_FIELDS))

        for product in project.get('products') or []:
            products.append(_pick_fields(product, _PRODUCT_FIELDS))

            # Author rows must not be mixed into `products`: that list backs a
            # table with one row per product and product-only columns.
            if authors is not None:
                for author in product.get('authors') or []:
                    authors.append(_pick_fields(author, _AUTHOR_FIELDS))

    return projects, members, products

# Function to fetch and process all pages of data
def fetch_all_data(base_url, initial_params, nr_of_pages):
    """Download pages from the API one by one and collect them into DataFrames.

    Paging starts at page 1. The loop ends after page `nr_of_pages` has been
    processed (when `nr_of_pages` is not 0) or after a page that contains no
    projects, whichever happens first.

    Args:
        base_url: URL of the API endpoint.
        initial_params: Query parameters sent with every request; a 'page'
            entry is added to a copy for each request.
        nr_of_pages: Maximum number of pages to retrieve, 0 for no limit.

    Returns:
        Tuple ``(df_projects, df_members, df_products)`` of pandas DataFrames.
    """
    project_rows, member_rows, product_rows = [], [], []
    page_limit_set = nr_of_pages != 0

    current_page = 1
    while True:
        print(f'Retrieving page: {current_page}')

        # Build the per-request parameters without touching the caller's dict.
        request_params = {**initial_params, 'page': current_page}
        payload = fetch_data_from_nwopen_api(base_url, request_params)
        project_rows, member_rows, product_rows = process_response(
            payload, project_rows, member_rows, product_rows
        )

        # Stop at the requested page limit, otherwise at the first empty page.
        if (page_limit_set and current_page >= nr_of_pages) or len(payload.get('projects', [])) == 0:
            break

        current_page += 1

    return (
        pd.DataFrame(project_rows),
        pd.DataFrame(member_rows),
        pd.DataFrame(product_rows),
    )

# Endpoint of the NWOpen-API that serves the project records
base_url = 'https://nwopen-api.nwo.nl/NWOpen-API/api/Projects'

# Query parameters sent with every page request (adjust as needed)
initial_params = dict(per_page=100)

# Download the pages (limited by NR_OF_PAGES) and build the three DataFrames
df_projects, df_members, df_products = fetch_all_data(
    base_url,
    initial_params,
    NR_OF_PAGES,
)


Retrieving page: 1
Retrieving page: 2
Retrieving page: 3
Retrieving page: 4
Retrieving page: 5
Retrieving page: 6
Retrieving page: 7
Retrieving page: 8
Retrieving page: 9
Retrieving page: 10
Retrieving page: 11
Retrieving page: 12
Retrieving page: 13
Retrieving page: 14
Retrieving page: 15
Retrieving page: 16
Retrieving page: 17
Retrieving page: 18
Retrieving page: 19
Retrieving page: 20
Retrieving page: 21
Retrieving page: 22
Retrieving page: 23
Retrieving page: 24
Retrieving page: 25
Retrieving page: 26
Retrieving page: 27
Retrieving page: 28
Retrieving page: 29
Retrieving page: 30
Retrieving page: 31
Retrieving page: 32
Retrieving page: 33
Retrieving page: 34
Retrieving page: 35
Retrieving page: 36
Retrieving page: 37
Retrieving page: 38
Retrieving page: 39
Retrieving page: 40
Retrieving page: 41
Retrieving page: 42
Retrieving page: 43
Retrieving page: 44
Retrieving page: 45
Retrieving page: 46
Retrieving page: 47
Retrieving page: 48
Retrieving page: 49
Retrieving page: 50
Retrievin

In [22]:
# Save data to pickle files
DATA_DIR = Path("../data")

# Create the output folder first: to_pickle does not create missing
# directories, and failing here would throw away a long download.
DATA_DIR.mkdir(parents=True, exist_ok=True)

df_projects.to_pickle(DATA_DIR / "df_projects.pkl")
df_members.to_pickle(DATA_DIR / "df_members.pkl")
df_products.to_pickle(DATA_DIR / "df_products.pkl")

In [None]:
# Show the shape and the first two rows of each DataFrame.
for heading, frame in (
    ('projects:', df_projects),
    ('\nmembers:', df_members),
    ('\nproducts:', df_products),
):
    print(heading)
    print(frame.shape)
    display(frame.head(2))