In [4]:
import numpy as np
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [3]:
def get_html_table_api(
    api_url, 
    slug, 
    columnList=[],
    _token="",
):
    '''
    Arguments
    ----------------------
    api_url : CEB's api - https://cebs-ext.niehs.nih.gov/datasets/api/dataset/data/fetch
    slug : dataset identifier
    draw : used by API to identify which
    columnList : list of columns to request from API
    
    Returns
    ----------------------
    Pandas dataframe
    '''
    # Headers (adjust as needed based on the actual request)
    headers = {
        ""
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "en-US, en;q=0.9",
        "Connection":"keep-alive",
        "Content-length":"3005",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Cookie": "ncbi_sid=90C64A6173B90DC1_0000SID; _ga=GA1.3.13396594.1731956957; _ga_CNVCG2ERJ0=GS1.1.1732028775.1.1.1732028818.17.0.0; _ga_7SNXGJ5KCC=GS1.2.1732045042.1.1.1732045128.0.0.0; _ga_1EBN0LJRZK=GS1.2.1732045042.1.1.1732045128.0.0.0; _ga_DP2X732JSX=GS1.1.1736367127.5.0.1736367127.0.0.0; _gid=GA1.2.2013118334.1736367218; _ga_Z8SS5QJCLH=GS1.1.1736370157.1.0.1736370157.0.0.0; XSRF-TOKEN=eyJpdiI6IjFkd0EvZkorei9oRlRnZHM2QVgyZWc9PSIsInZhbHVlIjoiYTdzSnhrcjg0KzE4aGJUNjZRZE9sRHp4WHJJUkhlQlQ3TGowRnZHY2pxTGtUZE5ncXJCMHNKTFFCMXhtK2xHd1ZIck9odjJETnlVMjkxRUF6N1ZPakVIZTMzaTBmMDV4eElnUGg2dHdQUDlpT0RxRkxoM2ZidUhGRkxweExZN2ciLCJtYWMiOiI2NWU2Y2EwNDUwZWRhNWJmODdhOTNkOWI0MTBlMjljMjEwMjE3YTY1MWEwYjVkN2NmYjIyMjVkNDRjMTQ5MGNiIiwidGFnIjoiIn0%3D; dtt_data_collections_guided_search_session=eyJpdiI6Inc5d0FaT3RmeDNmQ1ZzS3JBYlBNUVE9PSIsInZhbHVlIjoiRGdRcmNDUktxRFAwMnFMRlg5R1kwZ1ZGMlR6UU50SzI2UWEwakYrWWxzT29ZZEx6ZXUreEhhTkJ2dERraGlCVmdUVndIMUF3d1pHTDJDV1hDYlZTMTlFYzFzb3packZTb0dtWlRjbWtkWUpwd1lTeHVCdEtCRGNJNFdDZ0NIZm0iLCJtYWMiOiIxNTc0ZDllYWM0MGY1ZTg0NDQwODgwNzhlMmY0Y2Q0ZWY0ZmMzY2JlZjI1ZWRmMGQyNDliM2E3M2Y0OTExYTBlIiwidGFnIjoiIn0%3D; _ga=GA1.2.13396594.1731956957; _ga_38SLQ135G0=GS1.2.1736440261.10.1.1736442742.0.0.0; _ga_CSLL4ZEK4L=GS1.1.1736440261.15.1.1736442748.0.0.0",
        "Host": "cebs-ext.niehs.nih.gov",
        "Origin": "https://cebs-ext.niehs.nih.gov",
        "Referer": "https://cebs-ext.niehs.nih.gov/datasets/search/tgx-ddi-tox21-pos",
        "Sec-CH-UA": '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        "Sec-CH-UA-Mobile": "?0",
        "Sec-CH-UA-Platform": '"macOS"',
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    # ******************************** Initialize parameters *************************** 
    draw = 1
    all_data = []

    start = 0
    batch_size = 1_000

    # ************************ First get N_iterations to loop through ******************* 
    payload = {
            "draw": draw,                        # Increment with each request if needed
            "start": start,                      # Starting index (pagination)
            "length": batch_size,                # Number of rows to fetch (adjust as needed)
            "search[value]": "",                 # Global search query (empty for no search)
            "search[regex]": "false",            # Regex for global search (false)
            "slug": slug,                        # Dataset identifier
            "_token": _token,                    # CSRF token
            "searchFilters": f"_token={_token}",
            "columnList[]": columnList,          # Columns to fetch
            "previewMode": "false",              # Preview mode
        }

    response = requests.post(api_url, data=payload, headers=headers)
    if response.status_code == 200:
        data = response.json()
        records_total = data.get("recordsTotal", 0)

        N_iterations = int(records_total / batch_size) + 1
        print('# records total: ', records_total)
    else: 
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return


    # ****************************** Now start iterating ********************************
    for i in tqdm(range(N_iterations)): 
        # Payload for the POST request (this should match the request payload from the network tab)
        payload = {
            "draw": draw,                        # Increment with each request if needed
            "start": start,                      # Starting index (pagination)
            "length": batch_size,                # Number of rows to fetch (adjust as needed)
            "search[value]": "",                 # Global search query (empty for no search)
            "search[regex]": "false",            # Regex for global search (false)
            "slug": slug,                        # Dataset identifier
            "_token": _token,                    # CSRF token
            "searchFilters": f"_token={_token}",
            "columnList[]": columnList,          # Columns to fetch
            "previewMode": "false",              # Preview mode
        }
        
        response = requests.post(api_url, data=payload, headers=headers)
        
        if response.status_code == 200:
            data = response.json()
            batch_data = data.get("data", [])
            all_data.extend(batch_data)
            start += batch_size
    
        else:
            print(f"Failed to fetch data. Status code: {response.status_code}")
            break

    # ****************************** Return dataframe ********************************
    df = pd.DataFrame(all_data)
    print('# records fetched: ', len(df))

    return df


In [5]:
api_url = "https://cebs-ext.niehs.nih.gov/datasets/"
get_html_table_api(api_url, slug='trf')

Failed to fetch data. Status code: 404
