In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

In [2]:
base_url = "https://www.osha.gov/ords/imis/accidentsearch.search"

In [3]:
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'priority': 'u=0, i',
    'referer': 'https://www.osha.gov/ords/imis/accidentsearch.search?sic=&sicgroup=&naics=&acc_description=heat&acc_abstract=&acc_keyword=&inspnr=&fatal=&officetype=All&office=All&startmonth=10&startday=31&startyear=2024&endmonth=01&endday=01&endyear=2000&keyword_list=&p_start=&p_finish=80&p_sort=&p_desc=DESC&p_direction=Next&p_show=20',
    'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
}

cookies = {
    '_ga': 'GA1.1.685929613.1730142385',
    '_ga_CSLL4ZEK4L': 'GS1.1.1732388998.32.1.1732392494.0.0.0',
}

params = {
    'sic': '',
    'sicgroup': '',
    'naics': '',
    'acc_description': 'heat',
    'acc_abstract': '',
    'acc_keyword': '',
    'inspnr': '',
    'fatal': '',
    'officetype': 'All',
    'office': 'All',
    'startmonth': '10',
    'startday': '31',
    'startyear': '2024',
    'endmonth': '01',
    'endday': '01',
    'endyear': '2000',
    'keyword_list': '',
    'p_sort': '',
    'p_desc': 'DESC',
    'p_direction': 'Next',
    'p_show': '20',
}

In [4]:
summary_numbers = []

In [5]:
for page in range(0, 61 * 20, 20):
    print(f"Scraping page with p_finish={page}")
    params['p_finish'] = str(page)
    
    response = requests.get(base_url, params=params, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    page_numbers = [
        a_tag.text.strip()
        for a_tag in soup.find_all('a', href=True)
        if "accidentsearch.accident_detail" in a_tag['href']
    ]

    if not page_numbers:
        print("No more pages. Stopped.")
        break

    summary_numbers.extend(page_numbers)

    time.sleep(5)

print(f"Collected {len(summary_numbers)} summary numbers from all pages.")
print(summary_numbers)

Scraping page with p_finish=0
Collected 20 summary numbers from all pages.
['167601.015', '171242.015', '167481.015', '167213.015', '166832.015', '171335.015', '165828.015', '162263.015', '160090.015', '160100.015', '159481.015', '159448.015', '159496.015', '159249.015', '159560.015', '159454.015', '159272.015', '159294.015', '159031.015', '159516.015']


In [6]:
base_url = "https://www.osha.gov/ords/imis/accidentsearch.accident_detail"

In [7]:
inspection_numbers = []

In [8]:
for summary_number in summary_numbers:
    print(f"Scraping inspection number for summary ID: {summary_number}")
    params = {'id': summary_number}
    
    response = requests.get(base_url, params=params, headers=headers, cookies=cookies)

    if response.status_code != 200:
        print(f"Error fetching page for {summary_number}: HTTP {response.status_code}")
        print(response.text)
        continue
    
    soup = BeautifulSoup(response.text, 'html.parser')

    found = False
    for link in soup.find_all('a', href=True):
        if "establishment.inspection_detail" in link['href']:
            inspection_number = link.text.strip()
            inspection_numbers.append(inspection_number)
            print(f"Found inspection number: {inspection_number}")
            found = True
            break
    if not found:
        print(f"No inspection number found for summary ID: {summary_number}")
    
    time.sleep(5)

print(f"Collected {len(inspection_numbers)} inspection numbers:")
print(inspection_numbers)

Scraping inspection number for summary ID: 167601.015
Found inspection number: 1756686.015
Scraping inspection number for summary ID: 171242.015
Found inspection number: 1754345.015
Scraping inspection number for summary ID: 167481.015
Found inspection number: 1755843.015
Scraping inspection number for summary ID: 167213.015
Found inspection number: 1753424.015
Scraping inspection number for summary ID: 166832.015
Found inspection number: 1750189.015
Scraping inspection number for summary ID: 171335.015
Found inspection number: 1750111.015
Scraping inspection number for summary ID: 165828.015
Found inspection number: 1742027.015
Scraping inspection number for summary ID: 162263.015
Found inspection number: 1715319.015
Scraping inspection number for summary ID: 160090.015
Found inspection number: 1698609.015
Scraping inspection number for summary ID: 160100.015
Found inspection number: 1698692.015
Scraping inspection number for summary ID: 159481.015
Found inspection number: 1694606.015

In [9]:
base_url = "https://www.osha.gov/ords/imis/establishment.inspection_detail?id="

In [10]:
def extract_text(tag):
    return tag.text.strip() if tag else None

In [11]:
def fetch_with_retries(url, headers, retries=3, backoff_factor=2):
    for attempt in range(retries):
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response
        elif response.status_code == 403:
            print(f"403 Forbidden on attempt {attempt + 1}. Retrying...")
        else:
            print(f"HTTP {response.status_code} on attempt {attempt + 1}. Retrying...")
        time.sleep(backoff_factor ** attempt)
    print(f"Failed to fetch {url} after {retries} attempts.")
    return None

In [12]:
OSHA_data = []

In [13]:
total = len(inspection_numbers)

In [14]:
for index, inspection_id in enumerate(inspection_numbers, start=1):
    print(f"Scraping details for Inspection ID: {inspection_id} ({index}/{total})...")

    response = fetch_with_retries(base_url + inspection_id, headers)
    if not response:
        OSHA_data.append({'Inspection Number': inspection_id})
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    try:
        case_status = extract_text(soup.find('div', class_='well well-small').find('strong'))
        if case_status:
            case_status = case_status.replace("Case Status:", "").strip()

        establishment_name_tag = soup.find('h4', id=inspection_id)
        establishment_name = extract_text(establishment_name_tag).split("-", 1)[-1].strip() if establishment_name_tag else None

        report_id = None
        try:
            report_id = soup.find('strong', string="Report ID").next_sibling.strip().replace(": ", "")
        except AttributeError:
            pass

        site_address = None
        try:
            span4_divs = soup.find_all('div', class_='span4')
            for div in span4_divs:
                paragraphs = div.find_all('p')
                for paragraph in paragraphs:
                    strong_tag = paragraph.find('strong')
                    if strong_tag and "Site Address" in strong_tag.text:
                        address_lines = [line.strip() for line in paragraph.stripped_strings]
                        site_address = "\n".join(line.lstrip(":").strip() for line in address_lines[3:])
                        break
                if site_address:
                    break
        except Exception as e:
            print(f"Error extracting Site Address: {e}")

        union_status = None
        try:
            union_status = soup.find('strong', string="Union Status").next_sibling.strip().replace(": ", "")
        except AttributeError:
            pass

        incident_title = None
        try:
            abstract_div = soup.find('div', class_='span4', text=lambda t: t and "Employee" in t)
            incident_title = extract_text(abstract_div)
        except AttributeError:
            pass

        summary = None
        try:
            summary_tag = soup.find('p', string=lambda s: s and (s.startswith("At") or s.startswith("On")))
            summary = extract_text(summary_tag)
        except AttributeError:
            pass

        naics = None
        try:
            naics_tag = soup.find('strong', string="NAICS").next_sibling.strip()
            if naics_tag:
                naics = naics_tag.lstrip(":").strip()
        except AttributeError:
            pass

        event_date = None
        try:
            event_date_div = soup.find('div', class_='span4')
            for div in soup.find_all('div', class_='span4'):
                strong_tag = div.find('strong')
                if strong_tag and "Event" in strong_tag.text:
                    event_date = div.text.replace("Event", "").replace(":", "").strip()
                    break
        except AttributeError:
            pass

        age = extract_text(soup.find('td', headers='a1 col2'))
        sex = extract_text(soup.find('td', headers='a1 col3'))
        degree_of_injury = extract_text(soup.find('td', headers='a1 col4'))
        occupation = extract_text(soup.find('td', headers='a1 col6'))

        OSHA_data.append({
            'Inspection Number': inspection_id,
            'Case Status': case_status,
            'Establishment Name': establishment_name,
            'Report ID': report_id,
            'Site Address': site_address,
            'Union Status': union_status,
            'Incident Title': incident_title,
            'Summary': summary,
            'NAICS': naics,
            'Event Date': event_date,
            'Age': age,
            'Sex': sex,
            'Degree of Injury': degree_of_injury,
            'Occupation': occupation,
        })

        print(f"Details collected for Inspection ID: {inspection_id} ({index}/{total})")

    except Exception as e:
        print(f"Error parsing data for {inspection_id}: {e}")
        OSHA_data.append({'Inspection Number': inspection_id})

    time.sleep(5)

df = pd.DataFrame(OSHA_data)
df.to_csv("inspection_details.csv", index=False)
print("Data saved to inspection_details.csv")


Scraping details for Inspection ID: 1756686.015 (1/20)...


  abstract_div = soup.find('div', class_='span4', text=lambda t: t and "Employee" in t)


Details collected for Inspection ID: 1756686.015 (1/20)
Scraping details for Inspection ID: 1754345.015 (2/20)...
Details collected for Inspection ID: 1754345.015 (2/20)
Scraping details for Inspection ID: 1755843.015 (3/20)...
Details collected for Inspection ID: 1755843.015 (3/20)
Scraping details for Inspection ID: 1753424.015 (4/20)...
Details collected for Inspection ID: 1753424.015 (4/20)
Scraping details for Inspection ID: 1750189.015 (5/20)...
Details collected for Inspection ID: 1750189.015 (5/20)
Scraping details for Inspection ID: 1750111.015 (6/20)...
Details collected for Inspection ID: 1750111.015 (6/20)
Scraping details for Inspection ID: 1742027.015 (7/20)...
Details collected for Inspection ID: 1742027.015 (7/20)
Scraping details for Inspection ID: 1715319.015 (8/20)...
Details collected for Inspection ID: 1715319.015 (8/20)
Scraping details for Inspection ID: 1698609.015 (9/20)...
Details collected for Inspection ID: 1698609.015 (9/20)
Scraping details for Inspection 

In [15]:
for inspection_id in inspection_numbers:
    print(f"Scraping details for Inspection ID: {inspection_id}...")

    response = fetch_with_retries(base_url + inspection_id, headers)
    if not response:
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    try:
        case_status = extract_text(soup.find('div', class_='well well-small').find('strong'))
        if case_status:
            case_status = case_status.replace("Case Status:", "").strip()

        inspection_number = inspection_id

        establishment_name_tag = soup.find('h4', id=inspection_id)
        establishment_name = extract_text(establishment_name_tag).split("-", 1)[-1].strip() if establishment_name_tag else None

        report_id = None
        try:
            report_id = soup.find('strong', string="Report ID").next_sibling.strip().replace(": ", "")
        except AttributeError:
            pass

        site_address = None
        try:
            span4_divs = soup.find_all('div', class_='span4')
        
            for div in span4_divs:
                paragraphs = div.find_all('p')
                for paragraph in paragraphs:
                    strong_tag = paragraph.find('strong')
                    if strong_tag and "Site Address" in strong_tag.text:
                        address_lines = [line.strip() for line in paragraph.stripped_strings]
                        site_address = "\n".join(line.lstrip(":").strip() for line in address_lines[3:])
                        break
                if site_address:
                    break
        except Exception as e:
            print(f"Error extracting Site Address: {e}")

        union_status = None
        try:
            union_status = soup.find('strong', string="Union Status").next_sibling.strip().replace(": ", "")
        except AttributeError:
            pass

        incident_title = None
        try:
            abstract_div = soup.find('div', class_='span4', text=lambda t: t and "Employee" in t)
            incident_title = extract_text(abstract_div)
        except AttributeError:
            pass

        summary = None
        try:
            summary_tag = soup.find('p', string=lambda s: s and (s.startswith("At") or s.startswith("On")))
            summary = extract_text(summary_tag)
        except AttributeError:
            pass

        naics = None
        try:
            naics_tag = soup.find('strong', string="NAICS").next_sibling.strip()
            if naics_tag:
                naics = naics_tag.lstrip(":").strip()
        except AttributeError:
            pass

        age = extract_text(soup.find('td', headers='a1 col2'))
        sex = extract_text(soup.find('td', headers='a1 col3'))
        degree_of_injury = extract_text(soup.find('td', headers='a1 col4'))
        occupation = extract_text(soup.find('td', headers='a1 col6'))

        OSHA_data.append({
            'Case Status': case_status,
            'Inspection Number': inspection_number,
            'Establishment Name': establishment_name,
            'Report ID': report_id,
            'Site Address': site_address,
            'Union Status': union_status,
            'Incident Title': incident_title,
            'Summary': summary,
            'NAICS': naics,
            'Age': age,
            'Sex': sex,
            'Degree of Injury': degree_of_injury,
            'Occupation': occupation,
        })

        print(f"OSHA_data collected for Inspection ID: {inspection_id}")

    except Exception as e:
        print(f"Error parsing OSHA_data for {inspection_id}: {e}")

    time.sleep(5)

df = pd.DataFrame(OSHA_data)
df.to_csv("inspection_details.csv", index=False)
print("OSHA_data saved to inspection_details.csv")

Scraping details for Inspection ID: 1756686.015...


  abstract_div = soup.find('div', class_='span4', text=lambda t: t and "Employee" in t)


OSHA_data collected for Inspection ID: 1756686.015
Scraping details for Inspection ID: 1754345.015...
OSHA_data collected for Inspection ID: 1754345.015
Scraping details for Inspection ID: 1755843.015...
OSHA_data collected for Inspection ID: 1755843.015
Scraping details for Inspection ID: 1753424.015...
OSHA_data collected for Inspection ID: 1753424.015
Scraping details for Inspection ID: 1750189.015...
OSHA_data collected for Inspection ID: 1750189.015
Scraping details for Inspection ID: 1750111.015...
OSHA_data collected for Inspection ID: 1750111.015
Scraping details for Inspection ID: 1742027.015...
OSHA_data collected for Inspection ID: 1742027.015
Scraping details for Inspection ID: 1715319.015...
OSHA_data collected for Inspection ID: 1715319.015
Scraping details for Inspection ID: 1698609.015...
OSHA_data collected for Inspection ID: 1698609.015
Scraping details for Inspection ID: 1698692.015...
OSHA_data collected for Inspection ID: 1698692.015
Scraping details for Inspection