<a href="https://colab.research.google.com/github/amien1410/amien-scrapers/blob/main/ifa_substances_scaper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests

In [None]:
import requests
import pandas as pd

# Scrape the paginated substance listing and save it to an Excel file.

# Accumulates one dict per substance across all pages
data = []

page = 0
pageSize = 48

# Keep requesting pages until the API returns fewer than `pageSize` results,
# which marks the last page.
while True:
    print(page)
    # Fetch one page of the listing from the API
    url = f"https://ilv-api.ifa.dguv.de/api/substance?searchValue=&pageNr={page}&pageSize={pageSize}"
    # Timeout (seconds) so a stalled connection cannot hang the notebook forever
    res = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON
    res.raise_for_status()
    result = res.json()

    # Extract the 'content' list from the JSON result (treat a null as empty)
    content = result.get("content") or []

    # Keep only the fields needed for the overview table
    for substance in content:
        data.append({
            "id": substance.get("id"),
            "name": substance.get("name"),
            # `or []` guards against an explicit JSON null for casNrs
            "casNrs": ", ".join(substance.get("casNrs") or []),
            "remark": substance.get("remark", ""),
        })

    # A short page means there is nothing left to fetch; compare against
    # pageSize itself so the check stays correct if the page size changes.
    if len(content) < pageSize:
        break

    # Move to the next page
    page += 1

# Convert the data into a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to an Excel file
df.to_excel("substances_data.xlsx", index=False)

# Report where the data was written
print("Data saved to substances_data.xlsx")

In [None]:
# Display the DataFrame currently bound to `df`
df

Unnamed: 0,name,casNrs,remark
0,2-Monochlornaphthaline,91-58-7,
1,Acephate,30560-19-1,
2,Acetaldehyde,75-07-0,
3,Acetamide,60-35-5,
4,Acetanilide,103-84-4,
...,...,...,...
2295,Zirconium,7440-67-7,
2296,Zirconium compounds,7440-67-7,"as Zr,\r\nexcept zirconium tetrachloride"
2297,Zirconium dioxide,"1314-23-4, 12036-23-6",
2298,"Zirconium powder, nonstabilized",7440-67-7,


In [None]:
# Fetch a single substance record to inspect the shape of the detail endpoint's JSON.
# Timeout (seconds) keeps a stalled connection from hanging the kernel.
res = requests.get("https://ilv-api.ifa.dguv.de/api/substance/69814", timeout=30)
# Surface HTTP errors directly rather than as a confusing JSON decode failure
res.raise_for_status()
res.json()

{'id': 69814,
 'name': '2-Monochlornaphthaline',
 'searchName': 'monochlornaphthaline',
 'remark': None,
 'casNrs': ['91-58-7'],
 'limitValueRows': [{'id': 74681,
   'partner': {'id': 8, 'name': 'Austria', 'abbreviation': 'A'},
   'remarks': [],
   'limitValues': [{'id': 74682,
     'value': 0.03,
     'ltvStv': 'LongTermValue',
     'unit': 'ppm',
     'font': 'None',
     'changeState': None,
     'remarks': ['Skin']},
    {'id': 74683,
     'value': 0.09,
     'ltvStv': 'ShortTermValue',
     'unit': 'ppm',
     'font': 'None',
     'changeState': None,
     'remarks': ['Skin', '15 minutes average value']},
    {'id': 74684,
     'value': 0.2,
     'ltvStv': 'LongTermValue',
     'unit': 'mg_m3',
     'font': 'None',
     'changeState': None,
     'remarks': ['Skin']},
    {'id': 74685,
     'value': 0.6,
     'ltvStv': 'ShortTermValue',
     'unit': 'mg_m3',
     'font': 'None',
     'changeState': None,
     'remarks': ['Skin', '15 minutes average value']}],
   'createDateTime': '

In [None]:
# Function to clean up the remarks string
def clean_string(remark):
    """Flatten a remark onto a single line.

    Args:
        remark: Remark text, or a falsy value (``None`` / ``""``).

    Returns:
        The text with every ``"\\r\\n"`` sequence replaced by a single space
        and surrounding whitespace stripped, or ``None`` when ``remark`` is
        falsy.
    """
    # Guard clause: nothing to clean for None / empty input
    if not remark:
        return None

    # Splitting on CRLF and re-joining with spaces swaps each line break for a space
    single_line = " ".join(remark.split("\r\n"))
    return single_line.strip()

# Function to extract relevant fields from each JSON result
def extract_data(substance):
    """Flatten one substance record into one row per country.

    Args:
        substance: Decoded JSON object for a single substance. The keys read
            are ``id``, ``name``, ``casNrs`` and ``limitValueRows``; each
            limit row is read for ``partner`` and ``limitValues``, and each
            limit value for ``ltvStv``, ``unit``, ``value`` and ``remarks``.
            Missing or null keys are tolerated.

    Returns:
        A list with one dict per entry of ``limitValueRows``. Each dict has
        the keys ``id``, ``name``, ``casNrs``, ``remark``, ``Country`` and the
        six limit columns ``TWA-ppm``, ``TWA-mg/m3``, ``TWA-f/cm3``,
        ``STEL-ppm``, ``STEL-mg/m3``, ``STEL-f/cm3`` (``None`` when the row
        has no value for that combination). ``remark`` is the comma-joined,
        de-duplicated set of cleaned remarks from every limit value of that
        row, in first-seen order, or ``""`` when there are none.
    """
    # (ltvStv, unit) -> output column. LongTermValue feeds the TWA columns,
    # ShortTermValue the STEL columns. Insertion order fixes the column order.
    column_for = {
        ('LongTermValue', 'ppm'): 'TWA-ppm',
        ('LongTermValue', 'mg_m3'): 'TWA-mg/m3',
        ('LongTermValue', 'f_cm3'): 'TWA-f/cm3',
        ('ShortTermValue', 'ppm'): 'STEL-ppm',
        ('ShortTermValue', 'mg_m3'): 'STEL-mg/m3',
        ('ShortTermValue', 'f_cm3'): 'STEL-f/cm3',
    }

    # Substance-level fields are identical for every row, so compute them once.
    substance_id = substance.get('id')
    substance_name = substance.get('name')
    cas_numbers = ", ".join(substance.get('casNrs') or [])

    data_list = []

    # Loop through each limitValueRow (each country)
    for limit_row in substance.get('limitValueRows') or []:
        # `or {}` covers both a missing key and an explicit JSON null
        partner = limit_row.get('partner') or {}

        # Per-row state: reset here so nothing leaks in from the previous
        # country and rows without any limit values still get defined fields.
        limits = dict.fromkeys(column_for.values())
        remarks = []

        for limit in limit_row.get('limitValues') or []:
            # Collect remarks from every limit value, not just the last one.
            for raw_remark in limit.get('remarks') or []:
                cleaned = clean_string(raw_remark)
                # clean_string returns None for empty input; skip those and
                # avoid repeating a remark shared by several limit values.
                if cleaned and cleaned not in remarks:
                    remarks.append(cleaned)

            # Combinations outside the mapping are ignored.
            column = column_for.get((limit.get('ltvStv'), limit.get('unit')))
            if column is not None:
                limits[column] = limit.get('value')

        # Append the extracted information for each country to the data list
        data_list.append({
            'id': substance_id,
            'name': substance_name,
            'casNrs': cas_numbers,
            'remark': ", ".join(remarks),
            'Country': partner.get('name', ''),
            **limits,
        })

    return data_list

# Fetch the detail record for each listed substance and build one table of
# per-country limit values.

# Input array of substance IDs
substance_ids = [668, 69814, 6, 13632, 28987, 2754]  # Replace with actual IDs

# Accumulates the per-country rows of every substance
all_data = []

# Loop through each substance ID and fetch the data
for substance_id in substance_ids:
    url = f"https://ilv-api.ifa.dguv.de/api/substance/{substance_id}"
    # Timeout (seconds) so one stalled request cannot hang the whole loop
    res = requests.get(url, timeout=30)
    # Stop on HTTP errors (e.g. an unknown ID) instead of parsing an error body
    res.raise_for_status()
    substance = res.json()

    # Extract the relevant data from the response
    extracted_data = extract_data(substance)
    all_data.extend(extracted_data)

# Convert the data into a DataFrame
df = pd.DataFrame(all_data)
df

Unnamed: 0,id,name,casNrs,remark,Country,TWA-ppm,TWA-mg/m3,TWA-f/cm3,STEL-ppm,STEL-mg/m3,STEL-f/cm3
0,668,Acetophenone,98-86-2,,Belgium,10.0,50.00,,,,
1,668,Acetophenone,98-86-2,,Canada - Québec,10.0,49.00,,,,
2,668,Acetophenone,98-86-2,,Canada - Ontario,10.0,,,,,
3,668,Acetophenone,98-86-2,,Denmark,10.0,49.00,,20.0,98.0,
4,668,Acetophenone,98-86-2,,Spain,10.0,50.00,,,,
...,...,...,...,...,...,...,...,...,...,...,...
70,28987,Flour dust,,,Sweden,,3.00,,,,
71,28987,Flour dust,,15 minutes average value,United Kingdom,,10.00,,,30.0,
72,28987,Flour dust,,,South Africa Mining,,3.00,,,,
73,28987,Flour dust,,Inhalable fraction,Israel,,0.50,,,,
