In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re

In [None]:
# Function to scrape a webpage and check for search terms
def scrape_page(url, search_terms, timeout=10):
    """Fetch ``url`` and report which search terms occur in its text.

    Matching is a case-insensitive plain substring test on the text
    extracted from the page, so a short term such as ``'DEI'`` also matches
    when it appears inside a longer word.

    Args:
        url: Address of the page to download.
        search_terms: Iterable of strings to look for.
        timeout: Value forwarded to ``requests.get(..., timeout=...)`` so a
            stalled server cannot block the crawl indefinitely.

    Returns:
        dict mapping each term to ``True`` if it occurs in the page text and
        ``False`` otherwise. Every value is ``False`` when the request fails
        or the response is not HTML.
    """
    # Materialise once so a generator argument can be iterated on any path.
    terms = list(search_terms)
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Treat 4xx/5xx responses as failures

        # Media types are case-insensitive, so normalise before comparing.
        content_type = response.headers.get('Content-Type', '')
        if 'text/html' not in content_type.lower():
            print(f"Skipping non-HTML content at: {url}")
            return {term: False for term in terms}

        soup = BeautifulSoup(response.text, 'html.parser')
        page_text = soup.get_text().lower()

        # A literal substring test; no regex machinery is needed for that.
        return {term: term.lower() in page_text for term in terms}
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return {term: False for term in terms}

In [None]:
# Function to get all URLs from a webpage
def get_all_urls(url, domain, timeout=10):
    """Collect the same-domain links found in the anchors of ``url``.

    Args:
        url: Page whose ``<a href=...>`` links are extracted; relative links
            are resolved against it.
        domain: Network location (``netloc``) a link must have to be kept.
        timeout: Value forwarded to ``requests.get(..., timeout=...)``.

    Returns:
        set of absolute URLs, with any ``#fragment`` removed, whose netloc
        equals ``domain`` and whose path does not end in one of the excluded
        file extensions. An empty set is returned when the page cannot be
        fetched, so one bad page does not abort the caller's crawl.
    """
    # File extensions to exclude; a tuple so str.endswith can test them all.
    exclude_extensions = ('.pdf', '.docx', '.xlsx', '.csv', '.jpg', '.jpeg',
                          '.png', '.gif', '.zip', '.rar')

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching links from {url}: {e}")
        return set()

    soup = BeautifulSoup(response.text, 'html.parser')
    urls = set()

    for link in soup.find_all('a', href=True):
        try:
            parsed = urlparse(urljoin(url, link['href']))
        except ValueError:
            # Malformed href (e.g. a broken bracketed host); skip the link.
            continue
        if parsed.netloc != domain:
            continue
        # Test the path only, case-insensitively, so 'x.PDF' and
        # 'x.csv?download=1' are excluded as well.
        if parsed.path.lower().endswith(exclude_extensions):
            continue
        # 'page#a' and 'page#b' are the same document; drop the fragment so
        # the page is not returned more than once.
        urls.add(parsed._replace(fragment='').geturl())

    return urls

In [None]:
# Main function to scrape the whole domain
def scrape_website(domain, search_terms, max_pages=None):
    """Crawl every reachable same-domain page and check it for search terms.

    Starting from ``domain``, each page is passed to ``scrape_page`` and the
    links reported by ``get_all_urls`` are queued until none are left.

    Args:
        domain: Start URL of the crawl; its netloc defines which links are
            followed.
        search_terms: Iterable of strings handed to ``scrape_page``.
        max_pages: Optional upper bound on the number of pages scraped.
            ``None`` (the default) crawls until no unvisited links remain.

    Returns:
        dict mapping each visited URL to the result of ``scrape_page`` for
        that URL.
    """
    # The netloc never changes during the crawl, so parse it only once.
    netloc = urlparse(domain).netloc
    # A generator would be exhausted by the first page; keep a reusable list.
    search_terms = list(search_terms)

    seen = {domain}      # every URL ever queued, visited or not
    pending = [domain]   # URLs queued but not scraped yet
    results = {}

    while pending:
        if max_pages is not None and len(results) >= max_pages:
            break
        current_url = pending.pop()
        print(f"Visiting: {current_url}")  # Print URL being visited
        results[current_url] = scrape_page(current_url, search_terms)

        # Queue only links not seen before, so a page is never announced or
        # fetched twice. Sorting makes the order independent of set hashing.
        for link in sorted(get_all_urls(current_url, netloc)):
            if link not in seen:
                seen.add(link)
                pending.append(link)

    return results

In [None]:
# Usage
if __name__ == "__main__":
    # Start URL of the crawl; only links on this host are followed.
    domain = "https://ohss.dhs.gov/"

    # Phrases to look for on every page. One entry per line so duplicates
    # are easy to spot (a repeated 'Diversity Equity and Inclusion' entry
    # was removed).
    search_terms = [
        'Affirmative Action',
        'DEI',
        'DEIA',
        'Diversity & Inclusion',
        'Diversity and Inclusion',
        'Diversity Equity & Inclusion',
        'Diversity Equity and Inclusion',
        'Diversity Equity Inclusion',
        'Diversity Inclusion',
        'Diversity, Equity & Inclusion',
        'Diversity, Equity and Inclusion',
        'Diversity, Equity Inclusion',
        'Diversity, Equity, & Inclusion',
        'Diversity, Equity, and Inclusion',
        'Diversity, Equity, Inclusion',
        'Equity & Diversity',
        'Equity and Diversity',
        'Gender Equality',
        'Gender Equity',
        'IDDP',
        'Implicit Bias',
        'Inclusion Diversity and Equity',
        'Inclusion, Diversity and Equity',
        'Inclusion, Diversity, Equity',
        'Inclusive Diversity',
        'LGBT',
        'LGBTQ',
        'LGBTQ+',
        'LGBTQI',
        'LGBTQIA',
        'Racial Equality',
        'Racial Equity',
        'Social Justice',
        'STEER',
        'STRIDE',
        'Unconscious Bias',
    ]
    results = scrape_website(domain, search_terms)

    # Report, per visited URL, whether each term was found on the page.
    for url, matches in results.items():
        print(f"URL: {url}")
        for term, found in matches.items():
            print(f"  {term}: {'Present' if found else 'Not Found'}")