In [2]:
pip install selenium pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\bvlma\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/?BD=7200&TP=107"
    driver.get(url)

    # Step 2: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 3: Wait for the first results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "post.yes"))
    )

    # Step 4: Scroll and Load More Results
    last_height = driver.execute_script("return document.body.scrollHeight")
    articles = []
    while True:
        # Find all articles
        new_articles = driver.find_elements(By.CSS_SELECTOR, "article.post.yes")
        articles.extend(new_articles)

        # Scroll down
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Allow time for new content to load

        # Check if the page height has changed
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Step 5: Extract Information from Each Article
    results = []
    for article in articles:
        title = article.find_element(By.CSS_SELECTOR, "h3").text
        license_number = article.find_element(By.CSS_SELECTOR, "span[id^='lic']").text
        license_type = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Type:')]]").text
        license_status = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Status:')]]").text
        expiration_date = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Expiration Date:')]]").text
        city = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'City:')]]").text
        state = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'State:')]]").text
        county = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'County:')]]").text
        zip_code = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Zip:')]]").text

        results.append({
            "Title": title,
            "License Number": license_number,
            "License Type": license_type,
            "License Status": license_status,
            "Expiration Date": expiration_date,
            "City": city,
            "State": state,
            "County": county,
            "Zip": zip_code,
        })

    # Step 6: Save to a DataFrame
    df = pd.DataFrame(results)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacy_data.csv", index=False)

finally:
    # Close the browser
    driver.quit()


                                  Title License Number  \
0                  3RD ST. YOUTH CLINIC       CLE 4375   
1     5TH AVENUE PUBLIC HEALTH SERVICES       CLE 1391   
2               77TH STREET JAIL CLINIC       CLE 1310   
3             ABAJIAN, HOURY BARSOUMIAN      APH 10319   
4                    ABASI, RABIE NIDAL      APH 11380   
...                                 ...            ...   
1995            ADULT MEDICATION CLINIC       CLE 4743   
1996           ADULT URGENT CARE CENTER       CLE 3421   
1997                    AFSHAR, ANAHITA      APH 11429   
1998              AFZAL, SHAMA PELWASHA      APH 11026   
1999               AGHAJANIAN, BERSABEH      APH 11323   

                                    License Type              License Status  \
0        LICENSE TYPE: CLINIC (GOVERNMENT OWNED)       LICENSE STATUS: CLEAR   
1        LICENSE TYPE: CLINIC (GOVERNMENT OWNED)   LICENSE STATUS: CANCELLED   
2        LICENSE TYPE: CLINIC (GOVERNMENT OWNED)       LICENSE 

In [22]:
df.head()

Unnamed: 0,Title,License Number,License Type,License Status,Expiration Date,City,State,County,Zip
0,3RD ST. YOUTH CLINIC,CLE 4375,LICENSE TYPE: CLINIC (GOVERNMENT OWNED),LICENSE STATUS: CLEAR,"EXPIRATION DATE: AUGUST 1, 2025",CITY: SAN FRANCISCO,STATE: CALIFORNIA,COUNTY: SAN FRANCISCO,ZIP: 94124
1,5TH AVENUE PUBLIC HEALTH SERVICES,CLE 1391,LICENSE TYPE: CLINIC (GOVERNMENT OWNED),LICENSE STATUS: CANCELLED,"EXPIRATION DATE: NOVEMBER 1, 2006",CITY: SAN DIEGO,STATE: CALIFORNIA,COUNTY: SAN DIEGO,ZIP: 92103
2,77TH STREET JAIL CLINIC,CLE 1310,LICENSE TYPE: CLINIC (GOVERNMENT OWNED),LICENSE STATUS: CLEAR,"EXPIRATION DATE: NOVEMBER 1, 2025",CITY: LOS ANGELES,STATE: CALIFORNIA,COUNTY: LOS ANGELES,ZIP: 90003
3,"ABAJIAN, HOURY BARSOUMIAN",APH 10319,LICENSE TYPE: ADVANCED PRACTICE PHARMACIST,LICENSE STATUS: CLEAR,"EXPIRATION DATE: DECEMBER 31, 2026",CITY: BEVERLY HILLS,STATE: CALIFORNIA,COUNTY: LOS ANGELES,ZIP: 90210
4,"ABASI, RABIE NIDAL",APH 11380,LICENSE TYPE: ADVANCED PRACTICE PHARMACIST,LICENSE STATUS: CLEAR,"EXPIRATION DATE: OCTOBER 31, 2025",CITY: SANTEE,STATE: CALIFORNIA,COUNTY: SAN DIEGO,ZIP: 92071


In [23]:
# Second Attempt; Issue with the above is that it is not searching for specifically sterile compound pharmacies so we are getting the wrong info back and i have the wrong table format.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/?BD=7200&TP=107"
    driver.get(url)

    # Step 2: Wait for the License Type dropdown to be present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "licenseType"))
    )
    
    # Step 3: Select "Sterile Compounding Pharmacy" from the dropdown
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")
    
    # Step 4: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 5: Wait for the filtered results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "post.yes"))
    )

    # Step 6: Scroll and Load More Results
    last_height = driver.execute_script("return document.body.scrollHeight")
    articles = []
    while True:
        # Find all articles
        new_articles = driver.find_elements(By.CSS_SELECTOR, "article.post.yes")
        articles.extend(new_articles)

        # Scroll down
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Allow time for new content to load

        # Check if the page height has changed
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Step 7: Extract Information from Each Article
    results = []
    for article in articles:
        title = article.find_element(By.XPATH, ".//li/h3").text
        license_number = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Number:')]]/a/span").text
        license_type = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Type:')]]").text.split(": ")[-1]
        license_status = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Status:')]]").text.split(": ")[-1]
        expiration_date = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Expiration Date:')]]").text.split(": ")[-1]
        city = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'City:')]]/span").text
        state = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'State:')]]/span").text
        county = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'County:')]]").text.split(": ")[-1]
        zip_code = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Zip:')]]").text.split(": ")[-1]

        results.append({
            "Title": title,
            "License Number": license_number,
            "License Type": license_type,
            "License Status": license_status,
            "Expiration Date": expiration_date,
            "City": city,
            "State": state,
            "County": county,
            "Zip": zip_code,
        })

    # Step 8: Save to a DataFrame
    df = pd.DataFrame(results)

    # Print or save the DataFrame
    print(df)
    df.to_csv("sterile_compounding_pharmacy_data.csv", index=False)

finally:
    # Close the browser
    driver.quit()


                             Title License Number  \
0       2222 PRESCRIPTION PHARMACY      LSC 99000   
1     986 INFUSION PHARMACY #1 INC     LSC 101414   
2            A & O CLINIC PHARMACY      LSC 99259   
3         A & O SPECIALTY PHARMACY     LSC 101817   
4         A & O SPECIALTY PHARMACY      LSC 99382   
...                            ...            ...   
1995               ACCESS I.V. LLC      LSC 99867   
1996                     ACCESS IV     LSC 100164   
1997                     ACCESS IV     LSC 100742   
1998                     ACCESS IV      LSC 99866   
1999                 ACCESS IV INC      LSC 99403   

                      License Type License Status     Expiration Date  \
0     STERILE COMPOUNDING PHARMACY      CANCELLED  SEPTEMBER 19, 2016   
1     STERILE COMPOUNDING PHARMACY          CLEAR        JULY 1, 2025   
2     STERILE COMPOUNDING PHARMACY      CANCELLED      AUGUST 1, 2006   
3     STERILE COMPOUNDING PHARMACY          CLEAR       MARCH 1, 2025  

In [25]:
#This is a third attempt; I am now going to use the advanced search and try and limit to just active pharmacies to stop returning so many that are old and cancelled. 

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the new webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the License Type dropdown to be present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Step 3: Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Step 4: Select "Sterile Compounding Pharmacy" from the License Type dropdown
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 5: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 6: Wait for the filtered results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "post.yes"))
    )

    # Step 7: Scroll and Load More Results
    last_height = driver.execute_script("return document.body.scrollHeight")
    articles = []
    while True:
        # Find all articles
        new_articles = driver.find_elements(By.CSS_SELECTOR, "article.post.yes")
        articles.extend(new_articles)

        # Scroll down
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Allow time for new content to load

        # Check if the page height has changed
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Step 8: Extract Information from Each Article
    results = []
    for article in articles:
        title = article.find_element(By.XPATH, ".//li/h3").text
        license_number = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Number:')]]/a/span").text
        license_type = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Type:')]]").text.split(": ")[-1]
        license_status = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Status:')]]").text.split(": ")[-1]
        expiration_date = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Expiration Date:')]]").text.split(": ")[-1]
        city = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'City:')]]/span").text
        state = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'State:')]]/span").text
        county = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'County:')]]").text.split(": ")[-1]
        zip_code = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Zip:')]]").text.split(": ")[-1]

        results.append({
            "Title": title,
            "License Number": license_number,
            "License Type": license_type,
            "License Status": license_status,
            "Expiration Date": expiration_date,
            "City": city,
            "State": state,
            "County": county,
            "Zip": zip_code,
        })

    # Step 9: Save to a DataFrame
    df = pd.DataFrame(results)

    # Print or save the DataFrame
    print(df)
    df.to_csv("sterile_compounding_pharmacy_data_2.csv", index=False)

finally:
    # Close the browser
    driver.quit()


                                       Title License Number  \
0               986 INFUSION PHARMACY #1 INC     LSC 101414   
1                   A & O SPECIALTY PHARMACY     LSC 101817   
2                   ACCREDO HEALTH GROUP INC     NSC 101280   
3                   ACCREDO HEALTH GROUP INC     NSC 101279   
4                      ADVANCE CARE PHARMACY      LSC 99611   
...                                      ...            ...   
1675            ADVENTIST HEALTH SIMI VALLEY     LSC 101218   
1676            ADVENTIST HEALTH SIMI VALLEY     LSC 101217   
1677                 ADVENTIST HEALTH SONORA     LSC 101199   
1678        ADVENTIST HEALTH SONORA-PHARMACY      LSC 99918   
1679  ADVENTIST HEALTH SPECIALTY BAKERSFIELD     LSC 101888   

                                  License Type License Status  \
0                 STERILE COMPOUNDING PHARMACY          CLEAR   
1                 STERILE COMPOUNDING PHARMACY          CLEAR   
2     NONRESIDENT STERILE COMPOUNDING PHARMACY  

In [28]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the new webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the License Type dropdown to be present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Step 3: Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Step 4: Select "Sterile Compounding Pharmacy" from the License Type dropdown
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 5: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 6: Wait for the filtered results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "post.yes"))
    )

    # Step 7: Scroll and Load More Results
    last_height = driver.execute_script("return document.body.scrollHeight")
    articles = []
    while True:
        # Find all articles with a class 'post' and a specific id
        article_elements = driver.find_elements(By.CSS_SELECTOR, "article.post")

        # Loop through each article and extract the data
        for article in article_elements:
            article_id = article.get_attribute("id")  # Extract article id

            title = article.find_element(By.XPATH, ".//li/h3").text

            # Wait until the License Number element is found (with the dynamic 'lic' id)
            try:
                license_number = WebDriverWait(article, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, ".//li[strong[contains(text(), 'License Number:')]]/a/span[starts-with(@id, 'lic')]")
                    )
                ).text
            except:
                license_number = "Not Available"

            license_type = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Type:')]]").text.split(": ")[-1]
            license_status = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Status:')]]").text.split(": ")[-1]
            expiration_date = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Expiration Date:')]]").text.split(": ")[-1]
            city = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'City:')]]/span").text
            state = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'State:')]]/span").text
            county = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'County:')]]").text.split(": ")[-1]
            zip_code = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Zip:')]]").text.split(": ")[-1]

            # Add the data to the results list
            articles.append({
                "Article ID": article_id,
                "Title": title,
                "License Number": license_number,
                "License Type": license_type,
                "License Status": license_status,
                "Expiration Date": expiration_date,
                "City": city,
                "State": state,
                "County": county,
                "Zip": zip_code,
            })

        # Scroll down to load more results
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Allow time for new content to load

        # Check if the page height has changed (to detect if more articles are available)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Step 8: Save the results to a DataFrame
    df = pd.DataFrame(articles)

    # Print or save the DataFrame
    print(df)
    df.to_csv("sterile_compounding_pharmacy_data.csv", index=False)

finally:
    # Close the browser
    driver.quit()


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//li[strong[contains(text(), 'License Type:')]]"}
  (Session info: chrome=131.0.6778.86); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6F88A6CB5+28821]
	(No symbol) [0x00007FF6F8813840]
	(No symbol) [0x00007FF6F86B578A]
	(No symbol) [0x00007FF6F87091BE]
	(No symbol) [0x00007FF6F87094AC]
	(No symbol) [0x00007FF6F86FC52C]
	(No symbol) [0x00007FF6F872F33F]
	(No symbol) [0x00007FF6F86FC3F6]
	(No symbol) [0x00007FF6F872F510]
	(No symbol) [0x00007FF6F874F412]
	(No symbol) [0x00007FF6F872F0A3]
	(No symbol) [0x00007FF6F86FA778]
	(No symbol) [0x00007FF6F86FB8E1]
	GetHandleVerifier [0x00007FF6F8BDFCAD+3408013]
	GetHandleVerifier [0x00007FF6F8BF741F+3504127]
	GetHandleVerifier [0x00007FF6F8BEB5FD+3455453]
	GetHandleVerifier [0x00007FF6F896BDBB+835995]
	(No symbol) [0x00007FF6F881EB5F]
	(No symbol) [0x00007FF6F881A814]
	(No symbol) [0x00007FF6F881A9AD]
	(No symbol) [0x00007FF6F880A199]
	BaseThreadInitThunk [0x00007FF96D0E259D+29]
	RtlUserThreadStart [0x00007FF96D72AF38+40]


## Next Attempt 

In [29]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import pandas as pd
import time
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the new webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the License Type dropdown to be present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Step 3: Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Step 4: Select "Sterile Compounding Pharmacy" from the License Type dropdown
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 5: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 6: Wait for the filtered results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "post.yes"))
    )

    # Step 7: Scroll and Load More Results
    articles = []
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    while True:
        # Find all articles with a numeric id
        article_elements = driver.find_elements(By.CSS_SELECTOR, "article.post")
        
        for article in article_elements:
            # Extract the numeric ID from the article's `id` attribute
            article_id = article.get_attribute("id")
            if not re.match(r'^\d+$', article_id):
                continue  # Skip non-numeric IDs
            
            try:
                # Extract details from the <li> elements within the article
                li_elements = article.find_elements(By.XPATH, ".//li")
                article_data = {"Article ID": article_id}
    
                for li in li_elements:
                    text = li.text.strip()
                    
                    if "License Number:" in text:
                        # Extract License Number
                        article_data["License Number"] = li.find_element(By.XPATH, ".//a/span[starts-with(@id, 'lic')]").text
                    
                    elif "License Type:" in text:
                        # Extract License Type
                        article_data["License Type"] = text.split(": ")[-1]
                    
                    elif "License Status:" in text:
                        # Extract License Status
                        article_data["License Status"] = text.split(": ")[-1]
                    
                    elif "Expiration Date:" in text:
                        # Extract Expiration Date
                        article_data["Expiration Date"] = text.split(": ")[-1]
                    
                    elif "City:" in text:
                        # Extract City
                        article_data["City"] = li.find_element(By.XPATH, ".//span").text
                    
                    elif "State:" in text:
                        # Extract State
                        article_data["State"] = li.find_element(By.XPATH, ".//span").text
                    
                    elif "County:" in text:
                        # Extract County
                        article_data["County"] = text.split(": ")[-1]
                    
                    elif "Zip:" in text:
                        # Extract Zip Code
                        article_data["Zip"] = text.split(": ")[-1]
                
                # Add the collected data to the articles list
                if article_data not in articles:  # Avoid duplicates
                    articles.append(article_data)
    
            except Exception as e:
                print(f"Error processing article ID {article_id}: {e}")
        
        # Scroll down to load more results
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Allow time for new content to load
    
        # Check if the page height has changed (to detect if more articles are available)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    
    # Step 8: Save the results to a DataFrame
    df = pd.DataFrame(articles)

    # Print or save the DataFrame
    print(df)
    df.to_csv("sterile_compounding_pharmacy_data.csv", index=False)

finally:
    # Close the browser
    driver.quit()


    Article ID
0            0
1            1
2            2
3            3
4            4
..         ...
840        840
841        841
842        842
843        843
844        844

[845 rows x 1 columns]


In [30]:
df.head()

Unnamed: 0,Article ID
0,0
1,1
2,2
3,3
4,4


## New Strategy:  Use UL when Class = "Actions" to locate LIs 

In [31]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the License Type dropdown to be present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Step 3: Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Step 4: Select "Sterile Compounding Pharmacy" from the License Type dropdown
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 5: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 6: Wait for the initial results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "actions"))
    )

    # Step 7: Scroll and load more results
    articles = []
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_pause_time = 2  # Adjust based on network speed

    while True:
        # Find all <ul> elements with class "actions"
        ul_elements = driver.find_elements(By.CSS_SELECTOR, "ul.actions")
        
        for ul in ul_elements:
            try:
                # Initialize a dictionary to store data
                article_data = {}

                # Extract the pharmacy name (h3 within the <ul>)
                try:
                    article_data["Pharmacy Name"] = ul.find_element(By.XPATH, "./li/h3").text
                except:
                    article_data["Pharmacy Name"] = "Not Available"

                # Extract data for each relevant field
                for li in ul.find_elements(By.XPATH, "./li"):
                    text = li.text.strip()

                    if "License Number:" in text:
                        article_data["License Number"] = li.find_element(By.XPATH, ".//a/span").text

                    elif "License Type:" in text:
                        article_data["License Type"] = text.split(": ")[-1]

                    elif "License Status:" in text:
                        article_data["License Status"] = text.split(": ")[-1]

                    elif "Expiration Date:" in text:
                        article_data["Expiration Date"] = text.split(": ")[-1]

                    elif "Secondary Status:" in text:
                        article_data["Secondary Status"] = text.split(": ")[-1]

                    elif "City:" in text:
                        article_data["City"] = li.find_element(By.XPATH, ".//span").text

                    elif "State:" in text:
                        article_data["State"] = li.find_element(By.XPATH, ".//span").text

                    elif "County:" in text:
                        article_data["County"] = text.split(": ")[-1]

                    elif "Zip:" in text:
                        article_data["Zip"] = text.split(": ")[-1]

                # Avoid duplicates in the results
                if article_data not in articles:
                    articles.append(article_data)

            except Exception as e:
                print(f"Error processing <ul> element: {e}")

        # Scroll down to load more results
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)

        # Check if new content has loaded by comparing page height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:  # Break if no new content is loaded
            break
        last_height = new_height

    # Step 8: Save the results to a DataFrame
    df = pd.DataFrame(articles)

    # Print or save the DataFrame
    print(df)
    df.to_csv("active_sterile_pharmacy_licenses_california.csv", index=False)

finally:
    # Close the browser
    driver.quit()


                                        Pharmacy Name
0                        986 INFUSION PHARMACY #1 INC
1                                       Not Available
2                            A & O SPECIALTY PHARMACY
3                            ACCREDO HEALTH GROUP INC
4                               ADVANCE CARE PHARMACY
5         ADVENTIST HEALTH AIS CANCER CENTER PHARMACY
6                        ADVENTIST HEALTH AND RIDEOUT
7    ADVENTIST HEALTH BAKERSFIELD - HOSPITAL PHARMACY
8                             ADVENTIST HEALTH DELANO
9                           ADVENTIST HEALTH GLENDALE
10                           ADVENTIST HEALTH HANFORD
11                   ADVENTIST HEALTH HOWARD MEMORIAL
12  ADVENTIST HEALTH LODI MEMORIAL HOSPITAL PHARMA...
13                   ADVENTIST HEALTH MENDOCINO COAST
14                       ADVENTIST HEALTH SIMI VALLEY
15                            ADVENTIST HEALTH SONORA
16                   ADVENTIST HEALTH SONORA-PHARMACY
17             ADVENTIST HEA

## New Attempt

In [37]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the License Type dropdown to be present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Step 3: Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Step 4: Select "Sterile Compounding Pharmacy" from the License Type dropdown
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 5: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 6: Wait for the initial results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "actions"))
    )

    # Step 7: Loop through IDs and scrape data
    articles = []

    for article_id in range(845):  # IDs from 0 to 844
        try:
            # Construct the article selector dynamically using XPath
            article_selector = f"//article[@id='{article_id}']"
            
            # Locate the article element by XPath
            article_element = driver.find_element(By.XPATH, article_selector)
            
            # Locate the <ul> element with class "actions" within the article
            ul_element = article_element.find_element(By.CSS_SELECTOR, "ul.actions")
            
            # Initialize a dictionary to store data
            article_data = {}

            # Extract the pharmacy name (h3 within the <ul>)
            try:
                article_data["Pharmacy Name"] = ul_element.find_element(By.XPATH, "./li/h3").text
            except:
                article_data["Pharmacy Name"] = "Not Available"

            # Extract data for each relevant field
            for li in ul_element.find_elements(By.XPATH, "./li"):
                text = li.text.strip()

                if "License Number:" in text:
                    article_data["License Number"] = li.find_element(By.XPATH, ".//a/span").text

                elif "License Type:" in text:
                    article_data["License Type"] = text.split(": ")[-1]

                elif "License Status:" in text:
                    article_data["License Status"] = text.split(": ")[-1]

                elif "Expiration Date:" in text:
                    article_data["Expiration Date"] = text.split(": ")[-1]

                elif "Secondary Status:" in text:
                    article_data["Secondary Status"] = text.split(": ")[-1]

                elif "City:" in text:
                    article_data["City"] = li.find_element(By.XPATH, ".//span").text

                elif "State:" in text:
                    article_data["State"] = li.find_element(By.XPATH, ".//span").text

                elif "County:" in text:
                    article_data["County"] = text.split(": ")[-1]

                elif "Zip:" in text:
                    article_data["Zip"] = text.split(": ")[-1]

            # Avoid duplicates in the results
            if article_data not in articles:
                articles.append(article_data)

        except Exception as e:
            print(f"Error processing article with ID {article_id}: {e}")

    # At the end of the loop, `articles` will contain all the scraped data.
    print(f"Scraped {len(articles)} articles.")

    # Step 8: Save the results to a DataFrame
    df = pd.DataFrame(articles)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacies_attempt5.csv", index=False)

finally:
    # Close the browser
    driver.quit()


Scraped 18 articles.
                                        Pharmacy Name
0                        986 INFUSION PHARMACY #1 INC
1                            A & O SPECIALTY PHARMACY
2                            ACCREDO HEALTH GROUP INC
3                               ADVANCE CARE PHARMACY
4         ADVENTIST HEALTH AIS CANCER CENTER PHARMACY
5                        ADVENTIST HEALTH AND RIDEOUT
6    ADVENTIST HEALTH BAKERSFIELD - HOSPITAL PHARMACY
7                             ADVENTIST HEALTH DELANO
8                           ADVENTIST HEALTH GLENDALE
9                            ADVENTIST HEALTH HANFORD
10                   ADVENTIST HEALTH HOWARD MEMORIAL
11  ADVENTIST HEALTH LODI MEMORIAL HOSPITAL PHARMA...
12                   ADVENTIST HEALTH MENDOCINO COAST
13                       ADVENTIST HEALTH SIMI VALLEY
14                            ADVENTIST HEALTH SONORA
15                   ADVENTIST HEALTH SONORA-PHARMACY
16             ADVENTIST HEALTH SPECIALTY BAKERSFIELD
17     

## Attempt 6

In [38]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the filters and set them
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Set the License Type to "Sterile Compounding Pharmacy"
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 3: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 4: Wait for the first results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "post.yes"))
    )

    # Step 5: Scroll and Load More Results
    last_height = driver.execute_script("return document.body.scrollHeight")
    articles = []
    while True:
        # Find all articles
        new_articles = driver.find_elements(By.CSS_SELECTOR, "article.post.yes")
        articles.extend(new_articles)

        # Scroll down
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Allow time for new content to load

        # Check if the page height has changed
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Step 6: Extract Information from Each Article
    results = []
    for article in articles:
        try:
            title = article.find_element(By.CSS_SELECTOR, "h3").text
            license_number = article.find_element(By.CSS_SELECTOR, "span[id^='lic']").text
            license_type = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Type:')]]").text
            license_status = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Status:')]]").text
            expiration_date = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Expiration Date:')]]").text
            city = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'City:')]]").text
            state = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'State:')]]").text
            county = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'County:')]]").text
            zip_code = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Zip:')]]").text

            results.append({
                "Title": title,
                "License Number": license_number,
                "License Type": license_type,
                "License Status": license_status,
                "Expiration Date": expiration_date,
                "City": city,
                "State": state,
                "County": county,
                "Zip": zip_code,
            })
        except Exception as e:
            print(f"Error processing article: {e}")

    # Step 7: Save to a DataFrame
    df = pd.DataFrame(results)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacies_attempt6.csv", index=False)

finally:
    # Close the browser
    driver.quit()


                                       Title License Number  \
0               986 INFUSION PHARMACY #1 INC     LSC 101414   
1                   A & O SPECIALTY PHARMACY     LSC 101817   
2                   ACCREDO HEALTH GROUP INC     NSC 101280   
3                   ACCREDO HEALTH GROUP INC     NSC 101279   
4                      ADVANCE CARE PHARMACY      LSC 99611   
...                                      ...            ...   
1675            ADVENTIST HEALTH SIMI VALLEY     LSC 101218   
1676            ADVENTIST HEALTH SIMI VALLEY     LSC 101217   
1677                 ADVENTIST HEALTH SONORA     LSC 101199   
1678        ADVENTIST HEALTH SONORA-PHARMACY      LSC 99918   
1679  ADVENTIST HEALTH SPECIALTY BAKERSFIELD     LSC 101888   

                                           License Type  \
0            LICENSE TYPE: STERILE COMPOUNDING PHARMACY   
1            LICENSE TYPE: STERILE COMPOUNDING PHARMACY   
2     LICENSE TYPE: NONRESIDENT STERILE COMPOUNDING ...   
3     L

##  Attempt 7:
Last attempt rendered the correct fields but gave duplicates for the first 20 values available, over and over, and ocne removed, we only had 0 through 19 accounted for.  

In [39]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the filters and set them (Primary Status and License Type)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Set the License Type to "Sterile Compounding Pharmacy"
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 3: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 4: Wait for the first results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "post.yes"))
    )

    # Step 5: Scroll and Load More Results
    # We'll iterate through article IDs instead of scrolling indefinitely to avoid repetition.
    articles = []
    for article_id in range(845):  # Assuming article IDs go from 0 to 844
        try:
            # Construct the XPath for each article dynamically using the article_id
            article_selector = f"//article[@id='{article_id}']"
            
            # Locate the article element using the dynamically generated XPath
            article_element = driver.find_element(By.XPATH, article_selector)
            
            # Extract the <ul> with class "actions" within the article
            ul_element = article_element.find_element(By.CSS_SELECTOR, "ul.actions")
            
            # Initialize a dictionary to store data for the current article
            article_data = {}

            # Extract information from the article's <ul> element
            try:
                article_data["Pharmacy Name"] = ul_element.find_element(By.XPATH, "./li/h3").text
            except:
                article_data["Pharmacy Name"] = "Not Available"

            for li in ul_element.find_elements(By.XPATH, "./li"):
                text = li.text.strip()

                # Extract specific information based on the li content
                if "License Number:" in text:
                    article_data["License Number"] = li.find_element(By.XPATH, ".//a/span").text
                elif "License Type:" in text:
                    article_data["License Type"] = text.split(": ")[-1]
                elif "License Status:" in text:
                    article_data["License Status"] = text.split(": ")[-1]
                elif "Expiration Date:" in text:
                    article_data["Expiration Date"] = text.split(": ")[-1]
                elif "Secondary Status:" in text:
                    article_data["Secondary Status"] = text.split(": ")[-1]
                elif "City:" in text:
                    article_data["City"] = li.find_element(By.XPATH, ".//span").text
                elif "State:" in text:
                    article_data["State"] = li.find_element(By.XPATH, ".//span").text
                elif "County:" in text:
                    article_data["County"] = text.split(": ")[-1]
                elif "Zip:" in text:
                    article_data["Zip"] = text.split(": ")[-1]

            # Append the article data to the list, avoiding duplicates
            if article_data not in articles:
                articles.append(article_data)

        except Exception as e:
            print(f"Error processing article with ID {article_id}: {e}")
            continue  # Skip to the next article if an error occurs

    # At the end of the loop, `articles` will contain all the scraped data.
    print(f"Scraped {len(articles)} articles.")

    # Step 6: Save the results to a DataFrame
    df = pd.DataFrame(articles)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacies_attempt7.csv", index=False)

finally:
    # Close the browser
    driver.quit()


Scraped 18 articles.
                                        Pharmacy Name
0                        986 INFUSION PHARMACY #1 INC
1                            A & O SPECIALTY PHARMACY
2                            ACCREDO HEALTH GROUP INC
3                               ADVANCE CARE PHARMACY
4         ADVENTIST HEALTH AIS CANCER CENTER PHARMACY
5                        ADVENTIST HEALTH AND RIDEOUT
6    ADVENTIST HEALTH BAKERSFIELD - HOSPITAL PHARMACY
7                             ADVENTIST HEALTH DELANO
8                           ADVENTIST HEALTH GLENDALE
9                            ADVENTIST HEALTH HANFORD
10                   ADVENTIST HEALTH HOWARD MEMORIAL
11  ADVENTIST HEALTH LODI MEMORIAL HOSPITAL PHARMA...
12                   ADVENTIST HEALTH MENDOCINO COAST
13                       ADVENTIST HEALTH SIMI VALLEY
14                            ADVENTIST HEALTH SONORA
15                   ADVENTIST HEALTH SONORA-PHARMACY
16             ADVENTIST HEALTH SPECIALTY BAKERSFIELD
17     

## Attempt 8
Attempt 7 was a regression; go back and try again 

In [40]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the filters to load and set them (Primary Status and License Type)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Set the License Type to "Sterile Compounding Pharmacy"
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 3: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 4: Wait for the results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "post.yes"))
    )

    # Step 5: Scroll and Load More Results
    # Scroll through the page to load all articles before scraping
    last_height = driver.execute_script("return document.body.scrollHeight")
    articles = []

    while True:
        # Find all article elements that match the criteria (using CSS class 'post.yes')
        new_articles = driver.find_elements(By.CSS_SELECTOR, "article.post.yes")
        articles.extend(new_articles)

        # Scroll down the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Wait for the new content to load

        # Check if the page height has changed after scrolling. If no new height, stop scrolling.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Step 6: Extract Information from Each Article
    results = []
    for article in articles:
        try:
            # Get the ul with class "actions" to extract data from the article
            ul_element = article.find_element(By.CSS_SELECTOR, "ul.actions")
            article_data = {}

            # Extract Pharmacy Name
            try:
                article_data["Pharmacy Name"] = ul_element.find_element(By.XPATH, "./li/h3").text
            except Exception as e:
                article_data["Pharmacy Name"] = "Not Available"

            # Loop through each <li> in the <ul> and extract relevant data
            for li in ul_element.find_elements(By.XPATH, "./li"):
                text = li.text.strip()

                # Extract specific data based on the li's content
                if "License Number:" in text:
                    article_data["License Number"] = li.find_element(By.XPATH, ".//a/span").text
                elif "License Type:" in text:
                    article_data["License Type"] = text.split(": ")[-1]
                elif "License Status:" in text:
                    article_data["License Status"] = text.split(": ")[-1]
                elif "Expiration Date:" in text:
                    article_data["Expiration Date"] = text.split(": ")[-1]
                elif "Secondary Status:" in text:
                    article_data["Secondary Status"] = text.split(": ")[-1]
                elif "City:" in text:
                    article_data["City"] = li.find_element(By.XPATH, ".//span").text
                elif "State:" in text:
                    article_data["State"] = li.find_element(By.XPATH, ".//span").text
                elif "County:" in text:
                    article_data["County"] = text.split(": ")[-1]
                elif "Zip:" in text:
                    article_data["Zip"] = text.split(": ")[-1]

            # Append article data to results list
            if article_data not in results:
                results.append(article_data)

        except Exception as e:
            print(f"Error processing article: {e}")
            continue  # Skip to the next article if there's an error

    # At the end of the loop, `results` will contain all the scraped data
    print(f"Scraped {len(results)} articles.")

    # Step 7: Save the results to a DataFrame
    df = pd.DataFrame(results)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacies_attempt8.csv", index=False)

finally:
    # Close the browser
    driver.quit()


Scraped 17 articles.
                                        Pharmacy Name
0                        986 INFUSION PHARMACY #1 INC
1                            A & O SPECIALTY PHARMACY
2                            ACCREDO HEALTH GROUP INC
3                               ADVANCE CARE PHARMACY
4         ADVENTIST HEALTH AIS CANCER CENTER PHARMACY
5                        ADVENTIST HEALTH AND RIDEOUT
6    ADVENTIST HEALTH BAKERSFIELD - HOSPITAL PHARMACY
7                             ADVENTIST HEALTH DELANO
8                           ADVENTIST HEALTH GLENDALE
9                            ADVENTIST HEALTH HANFORD
10                   ADVENTIST HEALTH HOWARD MEMORIAL
11  ADVENTIST HEALTH LODI MEMORIAL HOSPITAL PHARMA...
12                   ADVENTIST HEALTH MENDOCINO COAST
13                       ADVENTIST HEALTH SIMI VALLEY
14                            ADVENTIST HEALTH SONORA
15                   ADVENTIST HEALTH SONORA-PHARMACY
16             ADVENTIST HEALTH SPECIALTY BAKERSFIELD


## Attempt 9:
Explicitly try loading all values first, and then return and loop through them.  

In [41]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the filters to load and set them (Primary Status and License Type)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Set the License Type to "Sterile Compounding Pharmacy"
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 3: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 4: Scroll to the bottom of the page to load all articles
    last_height = driver.execute_script("return document.body.scrollHeight")

    # Scroll to load all articles
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Wait for the new content to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Step 5: Loop through all articles by ID (from 0 to 844)
    results = []
    for article_id in range(845):  # ID range from 0 to 844
        try:
            # Construct the article selector dynamically using XPath
            article_selector = f"//article[@id='{article_id}']"
            
            # Locate the article element by XPath
            article_element = driver.find_element(By.XPATH, article_selector)
            
            # Locate the <ul> element with class "actions" within the article
            ul_element = article_element.find_element(By.CSS_SELECTOR, "ul.actions")
            
            # Initialize a dictionary to store data
            article_data = {}

            # Extract the pharmacy name (h3 within the <ul>)
            try:
                article_data["Pharmacy Name"] = ul_element.find_element(By.XPATH, "./li/h3").text
            except:
                article_data["Pharmacy Name"] = "Not Available"

            # Extract data for each relevant field
            for li in ul_element.find_elements(By.XPATH, "./li"):
                text = li.text.strip()

                if "License Number:" in text:
                    article_data["License Number"] = li.find_element(By.XPATH, ".//a/span").text

                elif "License Type:" in text:
                    article_data["License Type"] = text.split(": ")[-1]

                elif "License Status:" in text:
                    article_data["License Status"] = text.split(": ")[-1]

                elif "Expiration Date:" in text:
                    article_data["Expiration Date"] = text.split(": ")[-1]

                elif "Secondary Status:" in text:
                    article_data["Secondary Status"] = text.split(": ")[-1]

                elif "City:" in text:
                    article_data["City"] = li.find_element(By.XPATH, ".//span").text

                elif "State:" in text:
                    article_data["State"] = li.find_element(By.XPATH, ".//span").text

                elif "County:" in text:
                    article_data["County"] = text.split(": ")[-1]

                elif "Zip:" in text:
                    article_data["Zip"] = text.split(": ")[-1]

            # Avoid duplicates in the results
            if article_data not in results:
                results.append(article_data)

        except Exception as e:
            print(f"Error processing article with ID {article_id}: {e}")

    # At the end of the loop, `results` will contain all the scraped data
    print(f"Scraped {len(results)} articles.")

    # Step 6: Save the results to a DataFrame
    df = pd.DataFrame(results)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacies_attempt9.csv", index=False)

finally:
    # Close the browser
    driver.quit()


Scraped 767 articles.
                                         Pharmacy Name
0                         986 INFUSION PHARMACY #1 INC
1                             A & O SPECIALTY PHARMACY
2                             ACCREDO HEALTH GROUP INC
3                                ADVANCE CARE PHARMACY
4          ADVENTIST HEALTH AIS CANCER CENTER PHARMACY
..                                                 ...
762                        WEST ANAHEIM MEDICAL CENTER
763  WHITE MEMORIAL MEDICAL CENTER DBA ADVENTIST HE...
764                   WHITTIER HOSPITAL MEDICAL CENTER
765  WOMENS INTERNATIONAL PHARMACY, INC. DBA BELMAR...
766  WOODLAND MEMORIAL HOSPITAL STERILE COMPOUNDING...

[767 rows x 1 columns]


## Attempt 10 
Last attempt generated almost all of the pharmacies, but title only.  
Continue trying to pick up all of the items on the page.  

In [42]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the filters to load and set them (Primary Status and License Type)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Set the License Type to "Sterile Compounding Pharmacy"
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 3: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 4: Implement infinite scrolling
    last_height = driver.execute_script("return document.body.scrollHeight")
    articles = []

    while True:
        # Scroll to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for new content to load
        time.sleep(2)

        # Get new height
        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == last_height:
            break

        last_height = new_height

        # Extract data from all articles on the page
        article_elements = driver.find_elements(By.CSS_SELECTOR, ".post.yes")
        
        for element in article_elements:
            try:
                article_data = {}

                # Extract pharmacy name (h3 within the <ul>)
                h3_element = element.find_element(By.XPATH, ".//li/h3")
                article_data["Pharmacy Name"] = h3_element.text.strip()

                # Extract other data fields
                for li in element.find_elements(By.CSS_SELECTOR, ".actions li"):
                    text = li.text.strip()
                    key_value = text.split(": ", 1)
                    if len(key_value) == 2:
                        key, value = key_value
                        article_data[key] = value.strip()

                articles.append(article_data)
            except Exception as e:
                print(f"Error processing article: {e}")

    # Step 5: Save the results to a DataFrame
    df = pd.DataFrame(articles)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacies_attempt10.csv", index=False)

finally:
    # Close the browser
    driver.quit()


                               Pharmacy Name LICENSE NUMBER  \
0               986 INFUSION PHARMACY #1 INC     LSC 101414   
1                   A & O SPECIALTY PHARMACY     LSC 101817   
2                   ACCREDO HEALTH GROUP INC     NSC 101280   
3                   ACCREDO HEALTH GROUP INC     NSC 101279   
4                      ADVANCE CARE PHARMACY      LSC 99611   
...                                      ...            ...   
1655            ADVENTIST HEALTH SIMI VALLEY     LSC 101218   
1656            ADVENTIST HEALTH SIMI VALLEY     LSC 101217   
1657                 ADVENTIST HEALTH SONORA     LSC 101199   
1658        ADVENTIST HEALTH SONORA-PHARMACY      LSC 99918   
1659  ADVENTIST HEALTH SPECIALTY BAKERSFIELD     LSC 101888   

                                  LICENSE TYPE LICENSE STATUS  \
0                 STERILE COMPOUNDING PHARMACY          CLEAR   
1                 STERILE COMPOUNDING PHARMACY          CLEAR   
2     NONRESIDENT STERILE COMPOUNDING PHARMACY  

## Attempt 11
Last one got all the items but looped repeatedly through the first 20 or so results, we still need to capture the totality of the results on the page. 

In [43]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the filters to load and set them (Primary Status and License Type)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Set the License Type to "Sterile Compounding Pharmacy"
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 3: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 4: Implement infinite scrolling
    last_height = driver.execute_script("return document.body.scrollHeight")
    articles = []

    while True:
        # Scroll to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for new content to load
        time.sleep(2)

        # Get new height
        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == last_height:
            break

        last_height = new_height

        # Extract data from all articles on the page
        article_elements = driver.find_elements(By.CSS_SELECTOR, ".post.yes")
        
        for element in article_elements:
            try:
                article_data = {}

                # Extract pharmacy name (h3 within the <ul>)
                h3_element = element.find_element(By.XPATH, ".//li/h3")
                article_data["Pharmacy Name"] = h3_element.text.strip()

                # Extract other data fields
                for li in element.find_elements(By.CSS_SELECTOR, ".actions li"):
                    text = li.text.strip()
                    key_value = text.split(": ", 1)
                    if len(key_value) == 2:
                        key, value = key_value
                        article_data[key] = value.strip()

                articles.append(article_data)
            except Exception as e:
                print(f"Error processing article: {e}")

    # Step 5: Save the results to a DataFrame
    df = pd.DataFrame(articles)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacies_attempt11.csv", index=False)

finally:
    # Close the browser
    driver.quit()


                               Pharmacy Name LICENSE NUMBER  \
0               986 INFUSION PHARMACY #1 INC     LSC 101414   
1                   A & O SPECIALTY PHARMACY     LSC 101817   
2                   ACCREDO HEALTH GROUP INC     NSC 101280   
3                   ACCREDO HEALTH GROUP INC     NSC 101279   
4                      ADVANCE CARE PHARMACY      LSC 99611   
...                                      ...            ...   
1655            ADVENTIST HEALTH SIMI VALLEY     LSC 101218   
1656            ADVENTIST HEALTH SIMI VALLEY     LSC 101217   
1657                 ADVENTIST HEALTH SONORA     LSC 101199   
1658        ADVENTIST HEALTH SONORA-PHARMACY      LSC 99918   
1659  ADVENTIST HEALTH SPECIALTY BAKERSFIELD     LSC 101888   

                                  LICENSE TYPE LICENSE STATUS  \
0                 STERILE COMPOUNDING PHARMACY          CLEAR   
1                 STERILE COMPOUNDING PHARMACY          CLEAR   
2     NONRESIDENT STERILE COMPOUNDING PHARMACY  

## Attempt 12
Still having issues with the data repeating, we will try again to grab beyond the first 20 results.  

In [44]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the filters to load and set them (Primary Status and License Type)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Set the License Type to "Sterile Compounding Pharmacy"
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 3: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 4: Extract data from all ul elements with class "actions"
    actions_elements = driver.find_elements(By.CSS_SELECTOR, ".post.yes ul.actions")

    articles = []

    for element in actions_elements:
        try:
            article_data = {}

            # Extract pharmacy name (h3 within the <ul>)
            h3_element = element.find_element(By.XPATH, ".//li/h3")
            article_data["Pharmacy Name"] = h3_element.text.strip()

            # Extract other data fields
            for li in element.find_elements(By.CSS_SELECTOR, ".actions li"):
                text = li.text.strip()
                key_value = text.split(": ", 1)
                if len(key_value) == 2:
                    key, value = key_value
                    article_data[key] = value.strip()

            articles.append(article_data)
        except Exception as e:
            print(f"Error processing article: {e}")

    # Step 5: Save the results to a DataFrame
    df = pd.DataFrame(articles)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacies_attempt12.csv", index=False)

except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Close the browser
    driver.quit()


Error processing article: Message: no such element: Unable to locate element: {"method":"xpath","selector":".//li/h3"}
  (Session info: chrome=131.0.6778.86); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6F88A6CB5+28821]
	(No symbol) [0x00007FF6F8813840]
	(No symbol) [0x00007FF6F86B578A]
	(No symbol) [0x00007FF6F87091BE]
	(No symbol) [0x00007FF6F87094AC]
	(No symbol) [0x00007FF6F86FC52C]
	(No symbol) [0x00007FF6F872F33F]
	(No symbol) [0x00007FF6F86FC3F6]
	(No symbol) [0x00007FF6F872F510]
	(No symbol) [0x00007FF6F874F412]
	(No symbol) [0x00007FF6F872F0A3]
	(No symbol) [0x00007FF6F86FA778]
	(No symbol) [0x00007FF6F86FB8E1]
	GetHandleVerifier [0x00007FF6F8BDFCAD+3408013]
	GetHandleVerifier [0x00007FF6F8BF741F+3504127]
	GetHandleVerifier [0x00007FF6F8BEB5FD+3455453]
	GetHandleVerifier [0x00007FF6F896BDBB+835995]
	(No symbol) [0x00007FF6F881EB5F]
	(No 

## Attempt 13
Still trying - only got first 20 records again 

In [46]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/advanced"
    driver.get(url)

    # Step 2: Wait for the filters to load and set them (Primary Status and License Type)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "primaryStatusCodes"))
    )

    # Set the Primary Status to "Active"
    status_dropdown = Select(driver.find_element(By.ID, "primaryStatusCodes"))
    status_dropdown.select_by_visible_text("Active")

    # Set the License Type to "Sterile Compounding Pharmacy"
    license_dropdown = Select(driver.find_element(By.ID, "licenseType"))
    license_dropdown.select_by_visible_text("Sterile Compounding Pharmacy")

    # Step 3: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Implement a more robust infinite scrolling mechanism
    SCROLL_PAUSE_TIME = 5

    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

        # Extract data after each scroll
        actions_elements = driver.find_elements(By.CSS_SELECTOR, ".post.yes ul.actions")
        
        for element in actions_elements:
            try:
                article_data = {}

                # Extract pharmacy name (h3 within the <ul>)
                h3_element = element.find_elements(By.CSS_SELECTOR, "li > h3")
                if h3_element:
                    article_data["Pharmacy Name"] = h3_element[0].text.strip()

                # Extract other data fields
                for li in element.find_elements(By.CSS_SELECTOR, ".actions li"):
                    text = li.text.strip()
                    key_value = text.split(": ", 1)
                    if len(key_value) == 2:
                        key, value = key_value
                        article_data[key] = value.strip()

                articles.append(article_data)
            except Exception as e:
                print(f"Error processing article: {e}")

    # Step 5: Save the results to a DataFrame
    df = pd.DataFrame(articles)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacies.csv", index=False)

except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Close the browser
    driver.quit()


                               Pharmacy Name LICENSE NUMBER  \
0               986 INFUSION PHARMACY #1 INC     LSC 101414   
1                   A & O SPECIALTY PHARMACY     LSC 101817   
2                   ACCREDO HEALTH GROUP INC     NSC 101280   
3                   ACCREDO HEALTH GROUP INC     NSC 101279   
4                      ADVANCE CARE PHARMACY      LSC 99611   
...                                      ...            ...   
3355                                     NaN            NaN   
3356        ADVENTIST HEALTH SONORA-PHARMACY      LSC 99918   
3357                                     NaN            NaN   
3358  ADVENTIST HEALTH SPECIALTY BAKERSFIELD     LSC 101888   
3359                                     NaN            NaN   

                                  LICENSE TYPE LICENSE STATUS  \
0                 STERILE COMPOUNDING PHARMACY          CLEAR   
1                 STERILE COMPOUNDING PHARMACY          CLEAR   
2     NONRESIDENT STERILE COMPOUNDING PHARMACY  

## Attempt 14
If the above fails I will try this which was combined from the results that got most of the names with the one that got all the fields for the first 20 items 