In [2]:
pip install selenium pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: C:\Users\bvlma\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("detach", True)
driver = webdriver.Chrome(options=chrome_options)

try:
    # Step 1: Open the webpage
    url = "https://search.dca.ca.gov/?BD=7200&TP=107"
    driver.get(url)

    # Step 2: Wait for the "Search" button and click it
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "srchSubmitHome"))
    )
    search_button = driver.find_element(By.ID, "srchSubmitHome")
    search_button.click()

    # Step 3: Wait for the first results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "post.yes"))
    )

    # Step 4: Scroll and Load More Results
    last_height = driver.execute_script("return document.body.scrollHeight")
    articles = []
    while True:
        # Find all articles
        new_articles = driver.find_elements(By.CSS_SELECTOR, "article.post.yes")
        articles.extend(new_articles)

        # Scroll down
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Allow time for new content to load

        # Check if the page height has changed
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Step 5: Extract Information from Each Article
    results = []
    for article in articles:
        title = article.find_element(By.CSS_SELECTOR, "h3").text
        license_number = article.find_element(By.CSS_SELECTOR, "span[id^='lic']").text
        license_type = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Type:')]]").text
        license_status = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'License Status:')]]").text
        expiration_date = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Expiration Date:')]]").text
        city = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'City:')]]").text
        state = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'State:')]]").text
        county = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'County:')]]").text
        zip_code = article.find_element(By.XPATH, ".//li[strong[contains(text(), 'Zip:')]]").text

        results.append({
            "Title": title,
            "License Number": license_number,
            "License Type": license_type,
            "License Status": license_status,
            "Expiration Date": expiration_date,
            "City": city,
            "State": state,
            "County": county,
            "Zip": zip_code,
        })

    # Step 6: Save to a DataFrame
    df = pd.DataFrame(results)

    # Print or save the DataFrame
    print(df)
    df.to_csv("pharmacy_data.csv", index=False)

finally:
    # Close the browser
    driver.quit()


                                  Title License Number  \
0                  3RD ST. YOUTH CLINIC       CLE 4375   
1     5TH AVENUE PUBLIC HEALTH SERVICES       CLE 1391   
2               77TH STREET JAIL CLINIC       CLE 1310   
3             ABAJIAN, HOURY BARSOUMIAN      APH 10319   
4                    ABASI, RABIE NIDAL      APH 11380   
...                                 ...            ...   
1995            ADULT MEDICATION CLINIC       CLE 4743   
1996           ADULT URGENT CARE CENTER       CLE 3421   
1997                    AFSHAR, ANAHITA      APH 11429   
1998              AFZAL, SHAMA PELWASHA      APH 11026   
1999               AGHAJANIAN, BERSABEH      APH 11323   

                                    License Type              License Status  \
0        LICENSE TYPE: CLINIC (GOVERNMENT OWNED)       LICENSE STATUS: CLEAR   
1        LICENSE TYPE: CLINIC (GOVERNMENT OWNED)   LICENSE STATUS: CANCELLED   
2        LICENSE TYPE: CLINIC (GOVERNMENT OWNED)       LICENSE 

In [22]:
df.head()

Unnamed: 0,Title,License Number,License Type,License Status,Expiration Date,City,State,County,Zip
0,3RD ST. YOUTH CLINIC,CLE 4375,LICENSE TYPE: CLINIC (GOVERNMENT OWNED),LICENSE STATUS: CLEAR,"EXPIRATION DATE: AUGUST 1, 2025",CITY: SAN FRANCISCO,STATE: CALIFORNIA,COUNTY: SAN FRANCISCO,ZIP: 94124
1,5TH AVENUE PUBLIC HEALTH SERVICES,CLE 1391,LICENSE TYPE: CLINIC (GOVERNMENT OWNED),LICENSE STATUS: CANCELLED,"EXPIRATION DATE: NOVEMBER 1, 2006",CITY: SAN DIEGO,STATE: CALIFORNIA,COUNTY: SAN DIEGO,ZIP: 92103
2,77TH STREET JAIL CLINIC,CLE 1310,LICENSE TYPE: CLINIC (GOVERNMENT OWNED),LICENSE STATUS: CLEAR,"EXPIRATION DATE: NOVEMBER 1, 2025",CITY: LOS ANGELES,STATE: CALIFORNIA,COUNTY: LOS ANGELES,ZIP: 90003
3,"ABAJIAN, HOURY BARSOUMIAN",APH 10319,LICENSE TYPE: ADVANCED PRACTICE PHARMACIST,LICENSE STATUS: CLEAR,"EXPIRATION DATE: DECEMBER 31, 2026",CITY: BEVERLY HILLS,STATE: CALIFORNIA,COUNTY: LOS ANGELES,ZIP: 90210
4,"ABASI, RABIE NIDAL",APH 11380,LICENSE TYPE: ADVANCED PRACTICE PHARMACIST,LICENSE STATUS: CLEAR,"EXPIRATION DATE: OCTOBER 31, 2025",CITY: SANTEE,STATE: CALIFORNIA,COUNTY: SAN DIEGO,ZIP: 92071
