In [2]:
# Install the scraper's dependencies into the environment of the running kernel.
# The explicit %pip magic is used so the cell does not rely on IPython automagic.
%pip install selenium pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\bvlma\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options.
# With "detach" enabled Chrome is only closed when the session is quit; the
# finally block below always quits, so the browser still closes with the cell.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/?BD=7200&TP=107"
    driver.get(url)

    # Step 2: Wait until the "Search" button can be clicked, then click it.
    # The wait returns the element, so no second lookup is needed.
    search_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "srchSubmitHome"))
    )
    search_button.click()

    # Step 3: Wait for the first result card to appear.
    # "post yes" is two classes, so a CSS selector is used rather than
    # By.CLASS_NAME (which is meant for a single class name).
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "article.post.yes"))
    )

    # Step 4: Keep scrolling to the bottom until the page stops growing.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Allow time for new content to load

        # An unchanged height after the pause means nothing more was loaded.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Collect the result cards once, after scrolling has finished. Querying
    # inside the loop and extending a list would add every already-loaded card
    # again on each pass and produce duplicate rows.
    articles = driver.find_elements(By.CSS_SELECTOR, "article.post.yes")

    # Step 5: Extract Information from Each Article
    # Output column -> label text inside the <strong> of the matching <li>.
    # The full text of the <li> is stored, so each value keeps its label prefix.
    labelled_fields = {
        "License Type": "License Type:",
        "License Status": "License Status:",
        "Expiration Date": "Expiration Date:",
        "City": "City:",
        "State": "State:",
        "County": "County:",
        "Zip": "Zip:",
    }
    results = []
    for article in articles:
        record = {
            "Title": article.find_element(By.CSS_SELECTOR, "h3").text,
            "License Number": article.find_element(By.CSS_SELECTOR, "span[id^='lic']").text,
        }
        for column, label in labelled_fields.items():
            xpath = f".//li[strong[contains(text(), '{label}')]]"
            record[column] = article.find_element(By.XPATH, xpath).text
        results.append(record)

    # Step 6: Save to a DataFrame
    df = pd.DataFrame(results)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacy_data.csv", index=False)

finally:
    # Close the browser even if any step above raised.
    driver.quit()


                                  Title License Number  \
0                  3RD ST. YOUTH CLINIC       CLE 4375   
1     5TH AVENUE PUBLIC HEALTH SERVICES       CLE 1391   
2               77TH STREET JAIL CLINIC       CLE 1310   
3             ABAJIAN, HOURY BARSOUMIAN      APH 10319   
4                    ABASI, RABIE NIDAL      APH 11380   
...                                 ...            ...   
1995            ADULT MEDICATION CLINIC       CLE 4743   
1996           ADULT URGENT CARE CENTER       CLE 3421   
1997                    AFSHAR, ANAHITA      APH 11429   
1998              AFZAL, SHAMA PELWASHA      APH 11026   
1999               AGHAJANIAN, BERSABEH      APH 11323   

                                    License Type              License Status  \
0        LICENSE TYPE: CLINIC (GOVERNMENT OWNED)       LICENSE STATUS: CLEAR   
1        LICENSE TYPE: CLINIC (GOVERNMENT OWNED)   LICENSE STATUS: CANCELLED   
2        LICENSE TYPE: CLINIC (GOVERNMENT OWNED)       LICENSE 

In [22]:
# Preview the first five scraped records to sanity-check the columns.
df.head(n=5)

Unnamed: 0,Title,License Number,License Type,License Status,Expiration Date,City,State,County,Zip
0,3RD ST. YOUTH CLINIC,CLE 4375,LICENSE TYPE: CLINIC (GOVERNMENT OWNED),LICENSE STATUS: CLEAR,"EXPIRATION DATE: AUGUST 1, 2025",CITY: SAN FRANCISCO,STATE: CALIFORNIA,COUNTY: SAN FRANCISCO,ZIP: 94124
1,5TH AVENUE PUBLIC HEALTH SERVICES,CLE 1391,LICENSE TYPE: CLINIC (GOVERNMENT OWNED),LICENSE STATUS: CANCELLED,"EXPIRATION DATE: NOVEMBER 1, 2006",CITY: SAN DIEGO,STATE: CALIFORNIA,COUNTY: SAN DIEGO,ZIP: 92103
2,77TH STREET JAIL CLINIC,CLE 1310,LICENSE TYPE: CLINIC (GOVERNMENT OWNED),LICENSE STATUS: CLEAR,"EXPIRATION DATE: NOVEMBER 1, 2025",CITY: LOS ANGELES,STATE: CALIFORNIA,COUNTY: LOS ANGELES,ZIP: 90003
3,"ABAJIAN, HOURY BARSOUMIAN",APH 10319,LICENSE TYPE: ADVANCED PRACTICE PHARMACIST,LICENSE STATUS: CLEAR,"EXPIRATION DATE: DECEMBER 31, 2026",CITY: BEVERLY HILLS,STATE: CALIFORNIA,COUNTY: LOS ANGELES,ZIP: 90210
4,"ABASI, RABIE NIDAL",APH 11380,LICENSE TYPE: ADVANCED PRACTICE PHARMACIST,LICENSE STATUS: CLEAR,"EXPIRATION DATE: OCTOBER 31, 2025",CITY: SANTEE,STATE: CALIFORNIA,COUNTY: SAN DIEGO,ZIP: 92071


In [23]:
# Second attempt: the first attempt did not search specifically for sterile compounding pharmacies, so it returned the wrong records, and the resulting table was in the wrong format.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pandas as pd
import time

# Configure Chrome options.
# With "detach" enabled Chrome is only closed when the session is quit; the
# finally block below always quits, so the browser still closes with the cell.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/?BD=7200&TP=107"
    driver.get(url)

    # Step 2: Wait for the License Type dropdown to be present.
    # The wait returns the element, so no second lookup is needed.
    license_type_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "licenseType"))
    )

    # Step 3: Select "Sterile Compounding Pharmacy" from the dropdown
    license_dropdown = Select(license_type_element)
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 4: Wait until the "Search" button can be clicked, then click it
    search_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "srchSubmitHome"))
    )
    search_button.click()

    # Step 5: Wait for the first filtered result card to appear.
    # "post yes" is two classes, so a CSS selector is used rather than
    # By.CLASS_NAME (which is meant for a single class name).
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "article.post.yes"))
    )

    # Step 6: Keep scrolling to the bottom until the page stops growing.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Allow time for new content to load

        # An unchanged height after the pause means nothing more was loaded.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Collect the result cards once, after scrolling has finished. Querying
    # inside the loop and extending a list would add every already-loaded card
    # again on each pass and produce duplicate rows.
    articles = driver.find_elements(By.CSS_SELECTOR, "article.post.yes")

    # Step 7: Extract Information from Each Article
    # Fields read from the whole <li> have their "Label: " prefix removed by
    # splitting on the first ": " only, so a value that itself contains ": "
    # is kept intact.
    results = []
    for article in articles:
        title = article.find_element(By.XPATH, ".//li/h3").text
        license_number = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Number:')]]/a/span").text
        license_type = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Type:')]]").text.split(": ", 1)[-1]
        license_status = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Status:')]]").text.split(": ", 1)[-1]
        expiration_date = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Expiration Date:')]]").text.split(": ", 1)[-1]
        city = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'City:')]]/span").text
        state = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'State:')]]/span").text
        county = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'County:')]]").text.split(": ", 1)[-1]
        zip_code = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Zip:')]]").text.split(": ", 1)[-1]

        results.append({
            "Title": title,
            "License Number": license_number,
            "License Type": license_type,
            "License Status": license_status,
            "Expiration Date": expiration_date,
            "City": city,
            "State": state,
            "County": county,
            "Zip": zip_code,
        })

    # Step 8: Save to a DataFrame
    df = pd.DataFrame(results)

    # Print or save the DataFrame
    print(df)
    df.to_csv("sterile_compounding_pharmacy_data.csv", index=False)

finally:
    # Close the browser even if any step above raised.
    driver.quit()


                             Title License Number  \
0       2222 PRESCRIPTION PHARMACY      LSC 99000   
1     986 INFUSION PHARMACY #1 INC     LSC 101414   
2            A & O CLINIC PHARMACY      LSC 99259   
3         A & O SPECIALTY PHARMACY     LSC 101817   
4         A & O SPECIALTY PHARMACY      LSC 99382   
...                            ...            ...   
1995               ACCESS I.V. LLC      LSC 99867   
1996                     ACCESS IV     LSC 100164   
1997                     ACCESS IV     LSC 100742   
1998                     ACCESS IV      LSC 99866   
1999                 ACCESS IV INC      LSC 99403   

                      License Type License Status     Expiration Date  \
0     STERILE COMPOUNDING PHARMACY      CANCELLED  SEPTEMBER 19, 2016   
1     STERILE COMPOUNDING PHARMACY          CLEAR        JULY 1, 2025   
2     STERILE COMPOUNDING PHARMACY      CANCELLED      AUGUST 1, 2006   
3     STERILE COMPOUNDING PHARMACY          CLEAR       MARCH 1, 2025  

In [25]:
# Third attempt: use the advanced search and limit the results to active pharmacies, to avoid returning so many old and cancelled licenses.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pandas as pd
import time

# Configure Chrome options.
# With "detach" enabled Chrome is only closed when the session is quit; the
# finally block below always quits, so the browser still closes with the cell.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the advanced search page
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the Primary Status dropdown to be present.
    # The wait returns the element, so no second lookup is needed.
    primary_status_element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Step 3: Set the Primary Status to "Active"
    status_dropdown = Select(primary_status_element)
    status_dropdown.select_by_visible_text("Active")

    # Step 4: Select "Sterile Compounding Pharmacy" from the License Type dropdown
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 5: Wait until the "Search" button can be clicked, then click it
    search_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "srchSubmitHome"))
    )
    search_button.click()

    # Step 6: Wait for the first filtered result card to appear.
    # "post yes" is two classes, so a CSS selector is used rather than
    # By.CLASS_NAME (which is meant for a single class name).
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "article.post.yes"))
    )

    # Step 7: Keep scrolling to the bottom until the page stops growing.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Allow time for new content to load

        # An unchanged height after the pause means nothing more was loaded.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Collect the result cards once, after scrolling has finished. Querying
    # inside the loop and extending a list would add every already-loaded card
    # again on each pass and produce duplicate rows.
    articles = driver.find_elements(By.CSS_SELECTOR, "article.post.yes")

    # Step 8: Extract Information from Each Article
    # Fields read from the whole <li> have their "Label: " prefix removed by
    # splitting on the first ": " only, so a value that itself contains ": "
    # is kept intact.
    results = []
    for article in articles:
        title = article.find_element(By.XPATH, ".//li/h3").text
        license_number = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Number:')]]/a/span").text
        license_type = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Type:')]]").text.split(": ", 1)[-1]
        license_status = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Status:')]]").text.split(": ", 1)[-1]
        expiration_date = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Expiration Date:')]]").text.split(": ", 1)[-1]
        city = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'City:')]]/span").text
        state = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'State:')]]/span").text
        county = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'County:')]]").text.split(": ", 1)[-1]
        zip_code = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Zip:')]]").text.split(": ", 1)[-1]

        results.append({
            "Title": title,
            "License Number": license_number,
            "License Type": license_type,
            "License Status": license_status,
            "Expiration Date": expiration_date,
            "City": city,
            "State": state,
            "County": county,
            "Zip": zip_code,
        })

    # Step 9: Save to a DataFrame
    df = pd.DataFrame(results)

    # Print or save the DataFrame
    print(df)
    df.to_csv("sterile_compounding_pharmacy_data_2.csv", index=False)

finally:
    # Close the browser even if any step above raised.
    driver.quit()


                                       Title License Number  \
0               986 INFUSION PHARMACY #1 INC     LSC 101414   
1                   A & O SPECIALTY PHARMACY     LSC 101817   
2                   ACCREDO HEALTH GROUP INC     NSC 101280   
3                   ACCREDO HEALTH GROUP INC     NSC 101279   
4                      ADVANCE CARE PHARMACY      LSC 99611   
...                                      ...            ...   
1675            ADVENTIST HEALTH SIMI VALLEY     LSC 101218   
1676            ADVENTIST HEALTH SIMI VALLEY     LSC 101217   
1677                 ADVENTIST HEALTH SONORA     LSC 101199   
1678        ADVENTIST HEALTH SONORA-PHARMACY      LSC 99918   
1679  ADVENTIST HEALTH SPECIALTY BAKERSFIELD     LSC 101888   

                                  License Type License Status  \
0                 STERILE COMPOUNDING PHARMACY          CLEAR   
1                 STERILE COMPOUNDING PHARMACY          CLEAR   
2     NONRESIDENT STERILE COMPOUNDING PHARMACY  