# California License Extraction

## Notebook Configuration

In [1]:
!pip install webdriver-manager






In [2]:
!pip install selenium pandas 





In [3]:
"""
Imports for the selenium portion 
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.support.ui import Select  # Import Select for dropdowns

"""
Imports for the beautiful soup portion
"""

'\nImports for the beautiful soup portion\n'

## Extract DOM from California BOP Search

In [4]:
def scrape_dom():
    # Set up Selenium WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    
    # 1. Navigate to the URL
    driver.get("https://search.dca.ca.gov/advanced")
    
    # 2. Wait until you see the ID "srchSubmitHome"
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "srchSubmitHome")))
    
    # 3. Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")
    
    # 4. Enter "Sterile Compounding Pharmacy" in the licenseType field
    license_type_field = driver.find_element(By.ID, "licenseType")
    license_type_field.send_keys("Sterile Compounding Pharmacy")
    
    # 5. Click the search button by ID "srchSubmitHome"
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()
    
    # 6. Allow the page to load and scroll until "You have reached the end of your results"
    time.sleep(5)  # Let the page load
    
    # Scroll to the bottom until the "You have reached the end of your results" text is found
    while True:
        page_end_element = driver.find_elements(By.XPATH, "//*[text()='You have reached the end of your results']")
        if page_end_element:
            break
        # Scroll down
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
    
    # 7. Get the entire HTML of the page
    html = driver.page_source
    
    # Close the browser
    driver.quit()
    
    return html

In [5]:
html = scrape_dom()
html

'<html lang="en"><head>\n<script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=G-69TD0KNT0F&amp;l=dataLayer&amp;cx=c&amp;gtm=45He4cc1za200"></script><script async="" src="https://www.googletagmanager.com/gtm.js?id=GTM-KDHT6K"></script><script async="" src="https://www.google-analytics.com/analytics.js"></script><script type="text/javascript">\n(function(){\nwindow["loaderConfig"] = "/TSPD/?type=21";\n})();\n\n</script>\n\n<script type="text/javascript" src="/TSPD/?type=18"></script>\n\n\t\t<title>Search - DCA</title>\n\t\t<!-- results template -->\n\t\t<meta charset="utf-8">\n\t\t<meta name="viewport" content="width=device-width, initial-scale=1">\n\t\t<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n\t\t<!--[if lte IE 8]><script src="/assets/js/ie/html5shiv.js"></script><![endif]-->\n\t\t<link rel="stylesheet" href="/assets/css/bootstrap.css">\n\t\t<link rel="stylesheet" href="/assets/css/main.css">\n\t\t<link rel="stylesheet" href="/a

## Extract Licenses From DOM

In [7]:
def parse_dom(html: str) -> pd.DataFrame:
    soup = BeautifulSoup(html, "html.parser")
    articles = []
        
    for article in soup.find_all("article"):
        actions_element = article.find("ul", class_="actions")
        if actions_element:
            article_data = {}
    
            # Extract pharmacy name (h3 within the <ul>)
            h3_element = actions_element.find("h3")
            if h3_element:
                article_data["Pharmacy Name"] = h3_element.text.strip()
    
            # Extract other data fields
            for li in actions_element.find_all("li"):
                # Skip <li> that contains <h3>
                if li.find("h3"):
                    continue
                
                text = li.text.strip()
                key_value = text.split(": ", 1)
                if len(key_value) == 2:
                    key, value = key_value
                    article_data[key] = value.strip()
    
            articles.append(article_data)
    
    # Convert the data to a pandas DataFrame
    df = pd.DataFrame(articles)
    
    # Drop the last row with the end of results string 
    df = df[~df.iloc[:, 0].str.contains("You have reached the end of your results", na=False)]

    return df

## Write to CSV

In [8]:
def write_to_csv(df: pd.DataFrame, filename: str): 
    # Save the DataFrame to a CSV file
    df.to_csv(f"{filename}.csv", index=False)
    
    return None


## Testing Pipeline

In [9]:
html = scrape_dom()
pharmacies = parse_dom(html)
write_to_csv(pharmacies, "all_pharmacies_01.03.2025")
