# BHL Data Extraction for a RAG implementation using requests and XML
---

API Methods docs: https://www.biodiversitylibrary.org/docs/api3.html#methods


## Use of threads through the `threading` library

#### Several API keys were requested from the BHL server in order to improve time-efficiency by parallelizing the queries sent to the API. These keys are stored in a `list` object, which is then used by the class `ThreadedSpeciesProcessor` when it is initialized.

##### The class below calls the function `process_species_list` from the file `aux_functions`, with the purpose of making this main notebook more readable.

In [1]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import re # regular expressions
from tqdm.notebook import tqdm  # Progress bars
import time # delays
#from google.colab import files uncomment when running from Google Colab remote runtime
import threading
from aux_functions import process_species_list

In [2]:
# BHL API keys consumed by ThreadedSpeciesProcessor: worker thread i uses the
# i-th key, and when `num_threads` is not given one thread is started per key.
API_KEYS = [
    # your own set of BHL API keys (not exposing locally-used ones for privacy & security)
]

class ThreadedSpeciesProcessor:
    """Fetch BHL data for a list of species using one worker thread per API key.

    The species names are read from the ``default_name`` column of a CSV file,
    split into contiguous chunks, and each chunk is handled by its own thread
    with its own API key. The per-thread DataFrames are concatenated into a
    single DataFrame by :meth:`process`.
    """

    # Pause (in seconds) taken by a worker before each request, for rate-limiting.
    REQUEST_DELAY = 0.35

    def __init__(self, csv_name, total_species, lang="Ambas", num_threads=None):
        """Store the run configuration and pick the API keys to use.

        Args:
            csv_name: Path of the CSV file; it must contain a ``default_name`` column.
            total_species: Maximum number of unique species to process
                (``None`` processes all of them).
            lang: Language filter forwarded unchanged to ``process_species_list``.
            num_threads: Number of worker threads. ``None`` (or 0) uses one
                thread per key in ``API_KEYS``; larger values are capped at the
                number of available keys.

        Raises:
            ValueError: If ``num_threads`` is negative.
        """
        if num_threads is not None and num_threads < 0:
            raise ValueError("num_threads must be a positive integer")

        self.csv_name = csv_name
        self.total_species = total_species
        self.lang = lang

        requested_threads = num_threads if num_threads else len(API_KEYS)
        # Every worker looks up its key by thread index, so running more
        # threads than keys would crash the surplus workers with IndexError.
        self.num_threads = min(requested_threads, len(API_KEYS))
        self.results = [None] * self.num_threads
        self.api_keys = API_KEYS[:self.num_threads]  # Use only needed keys if there are more than the amount of threads

    @staticmethod
    def _split_evenly(items, num_chunks):
        """Split ``items`` into ``num_chunks`` contiguous chunks whose sizes differ by at most one."""
        base_size, remainder = divmod(len(items), num_chunks)
        chunks = []
        start = 0
        for chunk_idx in range(num_chunks):
            # The first `remainder` chunks take one extra item each.
            end = start + base_size + (1 if chunk_idx < remainder else 0)
            chunks.append(items[start:end])
            start = end
        return chunks

    def worker(self, thread_idx, species_chunk):
        """Process one chunk of species and store the result in ``self.results[thread_idx]``.

        Args:
            thread_idx: Zero-based index of this worker; selects the API key,
                the progress-bar position and the slot in ``self.results``.
            species_chunk: List of species names handled by this worker.

        A failure on one species is reported through ``tqdm.write`` and the
        worker moves on to the next one, so one bad name does not abort the chunk.
        """
        api_key = self.api_keys[thread_idx]

        thread_dfs = []
        with tqdm(total=len(species_chunk),
                  desc=f"Thread {thread_idx+1}/{self.num_threads} (API {api_key[-4:]})",  # show last 4 chars that identify the key being used
                  position=thread_idx,
                  leave=True) as pbar:

            for species in species_chunk:
                time.sleep(self.REQUEST_DELAY)  # Rate-limiting
                try:
                    species_df = process_species_list([species], self.lang, api_key)
                    # Keep only non-empty frames; None is treated as "no data".
                    if species_df is not None and not species_df.empty:
                        thread_dfs.append(species_df)
                except Exception as e:
                    # Deliberate best-effort: log the failure and keep going.
                    tqdm.write(f"Thread {thread_idx+1} error on {species}: {e}")
                finally:
                    pbar.update(1)

        if thread_dfs:
            self.results[thread_idx] = pd.concat(thread_dfs, ignore_index=True)

    def process(self):
        """Run the extraction on all worker threads and return the combined DataFrame.

        Returns:
            A single DataFrame with the rows produced by every worker.

        Raises:
            ValueError: If no API keys are available, the CSV is empty, or no
                worker produced any data.
        """
        if self.num_threads < 1:
            raise ValueError("No API keys available. Add at least one key to API_KEYS")

        df = pd.read_csv(self.csv_name)
        if df.empty:
            raise ValueError("No species data found in the CSV")

        # Missing names (NaN) are dropped so they are not sent to the API.
        unique_species = df["default_name"].dropna().unique().tolist()[:self.total_species]

        # Reset the slots so frames from an earlier run are never mixed in.
        self.results = [None] * self.num_threads

        chunks = self._split_evenly(unique_species, self.num_threads)

        threads = []
        for thread_idx, chunk in enumerate(chunks):
            if not chunk:
                # Fewer species than threads: do not start idle workers.
                continue
            thread = threading.Thread(target=self.worker, args=(thread_idx, chunk))
            threads.append(thread)
            thread.start()

        for thread in threads:
            thread.join()

        valid_results = [
            result for result in self.results
            if result is not None and not result.empty
        ]
        if not valid_results:
            raise ValueError("No valid results produced. Check API keys and filters")
        return pd.concat(valid_results, ignore_index=True)

## Running the data extraction loop
* If the class is instantiated with the `num_threads` argument, it will only use the first `num_threads` API keys.
* If it is instantiated without that argument, it will use all of the API keys in the `API_KEYS` list.

In [3]:
# Take at most 3 unique species from the CSV; `num_threads` is left out, so
# one worker thread is started for each key in API_KEYS.
processor = ThreadedSpeciesProcessor(
    csv_name="inbio_data.csv",
    total_species=3,
    lang="Ambas",
)
df = processor.process()

Thread 1/3 (API d322):   0%|          | 0/1 [00:00<?, ?it/s]

Thread 3/3 (API 2ef8):   0%|          | 0/1 [00:00<?, ?it/s]

Thread 2/3 (API 8114):   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Save the combined results; with default arguments pandas also writes the
# DataFrame index as the first column.
output_path = "sample-output.csv"
df.to_csv(output_path)
# files.download("eng-esp-400-species.csv")  # only used when running directly from Google Colab
# NOTE(review): the file name in the Colab line differs from the CSV written above — confirm.