<a href="https://colab.research.google.com/github/amien1410/amien-scrapers/blob/main/checkchildcare_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

In [None]:
# Fetch the detail page of a single provider and parse it into `soup`.
url = "https://checkchildcare.com/Daycare/Index/001t0000007q98sAAA"
# Without a timeout `requests` can wait forever on an unresponsive server.
res = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.content, "html.parser")
soup

In [11]:
# Extract the provider-level fields from the detail page held in `soup`.
# Requires `decode_cfemail` to be defined already (run its cell first).
DATE_FORMAT = "%b %d, %Y"  # e.g. "Sep 13, 2024"


def _to_iso_date(raw):
    """Return *raw* as YYYY-MM-DD, or unchanged if it is not in DATE_FORMAT."""
    try:
        return datetime.strptime(raw, DATE_FORMAT).strftime("%Y-%m-%d")
    except ValueError:
        return raw


data = {}

# --- 1. Daycare Title/Name ---
title_tag = soup.select_one(".daycare-title")
data["daycare_title"] = title_tag.get_text(strip=True) if title_tag else None

# --- 2. Address ---
# NOTE(review): the recorded run of this cell produced 'address': None, so
# this selector appears not to match the page markup -- verify against the HTML.
addr_tag = soup.select_one(".contact-item .fa-location-dot ~ .contact-content span")
data["address"] = addr_tag.get_text(strip=True) if addr_tag else None

# --- 3. Last Update (convert to date format) ---
update_tag = soup.find("span", string=lambda t: t and "Updated:" in t)
if not update_tag:
    update_tag = soup.select_one(".fa-calendar-check")  # backup
last_update = None
if update_tag:
    # The label and the date live in the same parent element.
    update_text = update_tag.parent.get_text(strip=True).replace("Updated:", "").strip()
    last_update = _to_iso_date(update_text)
data["last_update"] = last_update

# --- 4. License Status ---
license_tag = soup.find("span", string=lambda t: t and "License:" in t)
if not license_tag:
    license_tag = soup.select_one(".fa-certificate")  # backup
license_text = None
if license_tag:
    license_text = license_tag.parent.get_text(strip=True).replace("License:", "").strip()
data["license"] = license_text

# --- 5. Inspections (list of dicts + url) ---
inspections = []
for row in soup.select("#inspectionstbl tbody tr"):
    cols = row.find_all("td")
    if len(cols) < 3:
        continue
    # Use a dedicated name here: assigning to `url` would overwrite the page
    # URL defined by the fetch cell.
    link_tag = row.select_one("a.btn-outline-primary")
    inspection_url = link_tag["href"] if link_tag and link_tag.has_attr("href") else None
    inspections.append({
        "date": _to_iso_date(cols[0].get_text(strip=True)),
        "risk_level": cols[1].get_text(strip=True),
        "observation": cols[2].get_text(" ", strip=True),
        "url": inspection_url,  # href exactly as found in the page
    })
data["inspections"] = inspections

# --- 6. Complaints (list or empty) ---
complaints = []
complaints_section = soup.select_one("#complaintstbl")
if complaints_section:
    complaint_rows = complaints_section.select("tbody tr")
    if complaint_rows:
        complaints = [row.get_text(strip=True) for row in complaint_rows]
    else:
        # No table rows: keep the section text unless it is the
        # "no complaints" placeholder message.
        section_text = complaints_section.get_text(strip=True)
        if "No provider complaints" not in section_text:
            complaints = [section_text]
data["complaints"] = complaints

# --- 7. Program Information ---
program_info = {}
for item in soup.select("#programinfo .info-item"):
    label = item.select_one(".info-label")
    value = item.select_one(".info-value")
    if label and value:
        program_info[label.get_text(strip=True)] = value.get_text(strip=True)
data["program_information"] = program_info

# --- 8. Contact Information ---
contact_info = {}
for item in soup.select(".contact-card .contact-item"):
    label_tag = item.select_one("h5")
    value_tag = item.select_one("a, span")
    if not (label_tag and value_tag):
        continue

    # A Cloudflare-obfuscated email carries its payload in `data-cfemail`.
    # The attribute may sit on the matched element itself or on any
    # descendant, so check both rather than only `span.__cf_email__` children.
    if value_tag.has_attr("data-cfemail"):
        cf_tag = value_tag
    else:
        cf_tag = value_tag.select_one("[data-cfemail]")

    if cf_tag:
        value = decode_cfemail(cf_tag["data-cfemail"])
    else:
        value = value_tag.get_text(strip=True)

    contact_info[label_tag.get_text(strip=True)] = value
data["contact_information"] = contact_info
data

{'daycare_title': 'Lil Peoples Bellevue – Infant Building',
 'address': None,
 'last_update': '2025-09-16',
 'license': 'Open',
 'inspections': [{'date': '2024-09-13',
   'risk_level': 'Serious',
   'observation': '110-300-0106(12) – It was observed that three staff members had expired Food Handler Permits. 110-30 ... Read More',
   'url': '/Inspection/Index/18239/Lil%20Peoples%20Bellevue%20%E2%80%93%20Infant%20Building'},
  {'date': '2024-09-13',
   'risk_level': 'Serious',
   'observation': '110-300-0260(1)(a) - The laundry detergent was observed stored under the sink in the bathroom. This ... Read More',
   'url': '/Inspection/Index/18242/Lil%20Peoples%20Bellevue%20%E2%80%93%20Infant%20Building'},
  {'date': '2024-09-13',
   'risk_level': 'Serious',
   'observation': 'The tree house on the playground was observed to be installed directly on open earth, without the re ... Read More',
   'url': '/Inspection/Index/18240/Lil%20Peoples%20Bellevue%20%E2%80%93%20Infant%20Building'},
  {'da

In [10]:
def decode_cfemail(encoded_hex: str) -> str:
    """Decode a Cloudflare-obfuscated email (`data-cfemail` attribute value).

    The value is a hex string: the first byte is an XOR key and every
    following byte is one byte of the address XOR-ed with that key.

    Args:
        encoded_hex: Hex payload taken from a `data-cfemail` attribute.

    Returns:
        The decoded address; an empty string if only the key byte is present.

    Raises:
        ValueError: If *encoded_hex* is empty or is not valid hexadecimal.
    """
    raw = bytes.fromhex(encoded_hex)
    if not raw:
        raise ValueError("encoded_hex must contain at least the key byte")
    key, payload = raw[0], raw[1:]
    decoded = bytes(byte ^ key for byte in payload)
    try:
        # The payload is UTF-8: decoding byte-by-byte with chr() would garble
        # any non-ASCII character in the address.
        return decoded.decode("utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to one character per byte.
        return decoded.decode("latin-1")


# Sanity check with a `data-cfemail` value taken from the page HTML.
encoded = "49282b3b286733202722092e24282025672a2624"
decoded_email = decode_cfemail(encoded)
print(decoded_email)


abra.zink@gmail.com


In [None]:
# Fetch one inspection detail page and parse it into `soup`.
url2 = "https://checkchildcare.com/Inspection/Index/18239/Lil%20Peoples%20Bellevue%20%E2%80%93%20Infant%20Building"
# Without a timeout `requests` can wait forever on an unresponsive server.
res = requests.get(url2, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.content, "html.parser")
soup

In [13]:
# Collect the label/value pairs shown on the inspection detail card.
inspection_data = {}

for info_row in soup.select(".detail-info-card .info-row"):
    label_el = info_row.select_one(".detail-label")
    value_el = info_row.select_one(".detail-value")
    # Skip rows that lack either half of the pair.
    if label_el is None or value_el is None:
        continue
    # get_text() also picks up text nested inside spans or badges.
    inspection_data[label_el.get_text(strip=True)] = value_el.get_text(strip=True)

# Add the link to the official document when the card provides one.
official_doc = soup.select_one(".detail-info-card .document-link")
if official_doc is not None and official_doc.has_attr("href"):
    inspection_data["Official Document"] = official_doc["href"]

# Display the structured result.
inspection_data

{'Observation': '110-300-0106(12) – It was observed that three staff members had expired Food Handler Permits. 110-300-0106(8) – It was observed that three staff members had expired Safe Sleep training.',
 'Risk Level': 'Serious',
 'Inspection Date': 'Sep 13, 2024',
 'Correction Date': '',
 'Disputed': '',
 'Official Document': 'https://wa-del.my.salesforce.com/sfc/p/t00000008aPL/a/cs000005xtC6/crvMH4xpH5takBLTNpLn.eNB4EZYdRtv3mlAl6BcP0M'}

In [None]:
# Fetch one page of search results and parse it into `soup`.
url3 = "https://checkchildcare.com/Search/UnifiedSearch?SearchText=Seattle,WA&Type=locationSearch&Page=2&ComplaintFilter=All&InspectionFilter=All&LicenseStatusFilter=All"
# Without a timeout `requests` can wait forever on an unresponsive server.
res = requests.get(url3, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.content, "html.parser")
soup

In [16]:
# Parse every provider listed on a single search-results page.
url3 = "https://checkchildcare.com/Search/UnifiedSearch?SearchText=Seattle,WA&Type=locationSearch&Page=2&ComplaintFilter=All&InspectionFilter=All&LicenseStatusFilter=All"
# Without a timeout `requests` can wait forever on an unresponsive server.
res = requests.get(url3, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.content, "html.parser")
providers_data = []

# Loop through all provider items
for item in soup.select(".provider-list-item"):
    provider = {}

    # Basic info
    name_tag = item.select_one(".provider-name a")
    address_tag = item.select_one(".provider-address")

    provider["Name"] = name_tag.get_text(strip=True) if name_tag else None
    provider["Address"] = address_tag.get_text(strip=True) if address_tag else None
    provider["Details URL"] = (
        name_tag["href"] if name_tag and name_tag.has_attr("href") else None
    )

    # Extract provider info rows (like Early Achiever, License Status).
    # Keys are stored exactly as rendered, including any trailing colon.
    for info_row in item.select(".provider-info-row"):
        label = info_row.select_one(".provider-info-label")
        value = info_row.select_one(".provider-info-value")
        if label and value:
            provider[label.get_text(strip=True)] = value.get_text(strip=True)

    # Extract complaints and inspection data
    for concern in item.select(".concerns-row"):
        concern_type = concern.select_one(".concerns-label a")
        if not concern_type:
            continue
        section_title = concern_type.get_text(strip=True).replace(":", "")
        stats = {}
        for badge in concern.select(".status-badge"):
            # Split on the first colon only, so a value that itself contains
            # a colon is kept whole instead of being truncated.
            badge_label, colon, badge_value = badge.get_text(strip=True).partition(":")
            if colon:
                stats[badge_label] = badge_value.strip()
        provider[section_title] = stats

    providers_data.append(provider)

# --- OUTPUT RESULTS ---
for p in providers_data:
    print(p)
    print("-" * 100)

{'Name': 'Queen Anne Community Center (Queen Anne Community Center)', 'Address': '1901 1st Ave W, Seattle, WA 98119, USA', 'Details URL': '/Daycare/Index/001t0000007qHuVAAU', 'Early Achiever:': 'Not Enrolled', 'License Status:': 'Open', 'Complaints': {'Immediate': '0', 'Serious': '0', 'Short': '0', 'Long': '0'}, 'Inspections': {'Immediate': '0', 'Serious': '2', 'Short': '1', 'Long': '0'}}
----------------------------------------------------------------------------------------------------
{'Name': 'Yousuf Rukia (Suhaima childcare)', 'Address': 'Seattle, WA 98108, USA', 'Details URL': '/Daycare/Index/0018y000007NFCQAA4', 'Early Achiever:': 'Quality Level 3', 'License Status:': 'Open', 'Complaints': {'Immediate': '0', 'Serious': '0', 'Short': '0', 'Long': '0'}, 'Inspections': {'Immediate': '0', 'Serious': '0', 'Short': '0', 'Long': '0'}}
----------------------------------------------------------------------------------------------------
{'Name': 'Rainier Community Center (Rainier Communit

In [17]:
import time

# Scrape every page of the Seattle search results into `providers_data`.
BASE_URL = "https://checkchildcare.com/Search/UnifiedSearch"
params = {
    "SearchText": "Seattle,WA",
    "Type": "locationSearch",
    "ComplaintFilter": "All",
    "InspectionFilter": "All",
    "LicenseStatusFilter": "All",
}
REQUEST_TIMEOUT = 30  # seconds; without it a stalled request blocks forever


def _parse_provider(item):
    """Build the provider dict for one `.provider-list-item` element."""
    provider = {}

    # Basic info
    name_tag = item.select_one(".provider-name a")
    address_tag = item.select_one(".provider-address")

    provider["Name"] = name_tag.get_text(strip=True) if name_tag else None
    provider["Address"] = address_tag.get_text(strip=True) if address_tag else None
    provider["Details URL"] = (
        name_tag["href"] if name_tag and name_tag.has_attr("href") else None
    )

    # Provider info rows (like Early Achiever, License Status); keys are
    # stored exactly as rendered on the page.
    for info_row in item.select(".provider-info-row"):
        label = info_row.select_one(".provider-info-label")
        value = info_row.select_one(".provider-info-value")
        if label and value:
            provider[label.get_text(strip=True)] = value.get_text(strip=True)

    # Complaints and inspection counters
    for concern in item.select(".concerns-row"):
        concern_type = concern.select_one(".concerns-label a")
        if not concern_type:
            continue
        section_title = concern_type.get_text(strip=True).replace(":", "")
        stats = {}
        for badge in concern.select(".status-badge"):
            # Split on the first colon only so values containing a colon
            # are kept whole.
            badge_label, colon, badge_value = badge.get_text(strip=True).partition(":")
            if colon:
                stats[badge_label] = badge_value.strip()
        provider[section_title] = stats

    return provider


providers_data = []
page = 1
previous_page_keys = None  # identity of the providers on the previous page

while True:
    print(f"Scraping page {page}...")
    params["Page"] = page

    try:
        res = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Stop cleanly and keep what has been scraped so far.
        print(f"⚠️ Request failed on page {page} ({exc})")
        break
    if res.status_code != 200:
        print(f"⚠️ Request failed on page {page} (status {res.status_code})")
        break

    soup = BeautifulSoup(res.content, "html.parser")
    items = soup.select(".provider-list-item")

    if not items:
        print("✅ No more provider data found — scraping complete.")
        break

    page_providers = [_parse_provider(item) for item in items]

    # Guard against an endless loop: if the server keeps answering
    # out-of-range page numbers with the same listing, stop instead of
    # collecting duplicates forever.
    page_keys = [
        (p["Name"], p["Address"], p["Details URL"]) for p in page_providers
    ]
    if page_keys == previous_page_keys:
        print(f"✅ Page {page} repeats the previous page — scraping complete.")
        break
    previous_page_keys = page_keys

    providers_data.extend(page_providers)

    # Wait a bit to avoid hammering the server
    time.sleep(1)
    page += 1

print(f"\n✅ Total providers scraped: {len(providers_data)}\n")

# --- OUTPUT RESULTS ---
for p in providers_data:
    print(p)
    print("-" * 100)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 