In [6]:
from __future__ import annotations

import re
from dataclasses import replace
from typing import Iterable, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

try:
    from puppyping.models import DogMedia, DogProfile
    from puppyping.db import get_cached_links, store_cached_links
except ImportError:  # Allows running as a script: python puppyping/puppy_scraper.py
    from models import DogMedia, DogProfile
    from db import get_cached_links, store_cached_links


_SHELTER_URL = "https://wright-wayrescue.org/adoptable-pets"

In [12]:
headers = {
    "User-Agent": "paws-scraper/1.0 (+respectful; non-commercial)",
    "Accept-Language": "en-US,en;q=0.9",
}
r = requests.get(_SHELTER_URL, headers=headers, timeout=30)

In [13]:
r.raise_for_status()

In [14]:
x = BeautifulSoup(r.text, "html.parser")

In [15]:
x

<!DOCTYPE html>

<html lang="en-US" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="initial-scale=1" name="viewport"/>
<!-- This is Squarespace. --><!-- wrightwayrescue -->
<base href=""/>
<meta charset="utf-8">
<title>Adoptable Puppies — Wright-Way Rescue</title>
<meta content="Sec-CH-UA-Platform-Version, Sec-CH-UA-Model" http-equiv="Accept-CH"><link href="https://images.squarespace-cdn.com/content/v1/5719016a27d4bd6dafffd678/1592358719866-B61XFI0VSNAG8BCH397X/favicon.ico" rel="icon" type="image/x-icon"/>
<link href="https://wright-wayrescue.org/adoptable-pets" rel="canonical"/>
<meta content="Wright-Way Rescue" property="og:site_name"/>
<meta content="Adoptable Puppies — Wright-Way Rescue" property="og:title"/>
<meta content="https://wright-wayrescue.org/adoptable-pets" property="og:url"/>
<meta content="website" property="og:type"/>
<meta content="Adopt

In [8]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

START_URL = "https://wright-wayrescue.org/adoptable-pets"
PETANGO_BASE = "https://ws.petango.com/webservices/adoptablesearch/"

def fetch_puppy_urls() -> set[str]:
    session = requests.Session()

    # 1) Fetch Squarespace page
    resp = session.get(START_URL, timeout=30)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    # 2) Extract Petango iframe src
    iframe = soup.find("iframe")
    if not iframe or not iframe.get("src"):
        raise RuntimeError("Petango iframe not found")

    iframe_src = iframe["src"]

    # 3) Fetch Petango listing page
    resp = session.get(iframe_src, timeout=30)
    resp.raise_for_status()

    petango = BeautifulSoup(resp.text, "html.parser")

    # 4) Collect per-puppy detail URLs
    urls = {
        urljoin(PETANGO_BASE, a["href"])
        for a in petango.select('a[href*="wsAdoptableAnimalDetails.aspx"]')
        if a.get("href")
    }

    return urls


# Example usage
if __name__ == "__main__":
    puppy_urls = fetch_puppy_urls()
    for url in sorted(puppy_urls):
        print(url)


https://ws.petango.com/webservices/adoptablesearch/wsAdoptableAnimalDetails.aspx?id=58414683&css=http://ws.petango.com/WebServices/adoptablesearch/css/styles.css&authkey=io53xfw8b0k2ocet3yb83666507n2168taf513lkxrqe681kf8
https://ws.petango.com/webservices/adoptablesearch/wsAdoptableAnimalDetails.aspx?id=58646757&css=http://ws.petango.com/WebServices/adoptablesearch/css/styles.css&authkey=io53xfw8b0k2ocet3yb83666507n2168taf513lkxrqe681kf8
https://ws.petango.com/webservices/adoptablesearch/wsAdoptableAnimalDetails.aspx?id=59065951&css=http://ws.petango.com/WebServices/adoptablesearch/css/styles.css&authkey=io53xfw8b0k2ocet3yb83666507n2168taf513lkxrqe681kf8
https://ws.petango.com/webservices/adoptablesearch/wsAdoptableAnimalDetails.aspx?id=59256217&css=http://ws.petango.com/WebServices/adoptablesearch/css/styles.css&authkey=io53xfw8b0k2ocet3yb83666507n2168taf513lkxrqe681kf8
https://ws.petango.com/webservices/adoptablesearch/wsAdoptableAnimalDetails.aspx?id=59614886&css=http://ws.petango.c

In [9]:
len(puppy_urls)

130

In [10]:
puppy_urls

{'https://ws.petango.com/webservices/adoptablesearch/wsAdoptableAnimalDetails.aspx?id=58414683&css=http://ws.petango.com/WebServices/adoptablesearch/css/styles.css&authkey=io53xfw8b0k2ocet3yb83666507n2168taf513lkxrqe681kf8',
 'https://ws.petango.com/webservices/adoptablesearch/wsAdoptableAnimalDetails.aspx?id=58646757&css=http://ws.petango.com/WebServices/adoptablesearch/css/styles.css&authkey=io53xfw8b0k2ocet3yb83666507n2168taf513lkxrqe681kf8',
 'https://ws.petango.com/webservices/adoptablesearch/wsAdoptableAnimalDetails.aspx?id=59065951&css=http://ws.petango.com/WebServices/adoptablesearch/css/styles.css&authkey=io53xfw8b0k2ocet3yb83666507n2168taf513lkxrqe681kf8',
 'https://ws.petango.com/webservices/adoptablesearch/wsAdoptableAnimalDetails.aspx?id=59256217&css=http://ws.petango.com/WebServices/adoptablesearch/css/styles.css&authkey=io53xfw8b0k2ocet3yb83666507n2168taf513lkxrqe681kf8',
 'https://ws.petango.com/webservices/adoptablesearch/wsAdoptableAnimalDetails.aspx?id=59614886&css=h

In [23]:
from __future__ import annotations

import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
from urllib.parse import urljoin, urlparse, parse_qs

import requests
from bs4 import BeautifulSoup


# ============================================================================
# DATA MODELS (unchanged)
# ============================================================================

@dataclass(frozen=True)
class DogMedia:
    images: list[str] = field(default_factory=list)
    videos: list[str] = field(default_factory=list)
    embeds: list[str] = field(default_factory=list)

    def summary(self) -> str:
        return f"{len(self.images)} images, {len(self.videos)} videos, {len(self.embeds)} embeds"


@dataclass(frozen=True)
class DogProfile:
    dog_id: int
    url: str

    name: Optional[str] = None
    breed: Optional[str] = None
    gender: Optional[str] = None
    age_raw: Optional[str] = None
    age_months: Optional[float] = None
    weight_lbs: Optional[float] = None

    location: Optional[str] = None
    status: Optional[str] = None

    ratings: dict[str, Optional[int]] = field(default_factory=dict)
    description: Optional[str] = None
    media: DogMedia = field(default_factory=DogMedia)

    scraped_at_utc: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )


# ============================================================================
# CONSTANTS
# ============================================================================

START_URL = "https://wright-wayrescue.org/adoptable-pets"
PETANGO_BASE = "https://ws.petango.com/webservices/adoptablesearch/"

LABEL_MAP = {
    "Animal ID": "dog_id",
    "Breed": "breed",
    "Gender": "gender",
    "Age": "age_raw",
    "Location": "location",
    "Stage": "status",
}

NAME_NOISE = "Click a number to change picture or play to see a video"


# ============================================================================
# UTILITIES
# ============================================================================

def _clean(s: str) -> str:
    return re.sub(r"\s+", " ", s).strip()


def _extract_query_id(url: str) -> Optional[int]:
    try:
        qs = parse_qs(urlparse(url).query)
        if "id" in qs:
            return int(qs["id"][0])
    except Exception:
        pass
    return None


# ============================================================================
# STEP 1: GET PUPPY DETAIL URLS
# ============================================================================

def fetch_puppy_urls() -> set[str]:
    session = requests.Session()

    resp = session.get(START_URL, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    iframe = soup.find("iframe")
    if not iframe or not iframe.get("src"):
        raise RuntimeError("Petango iframe not found")

    iframe_src = iframe["src"]

    resp = session.get(iframe_src, timeout=30)
    resp.raise_for_status()
    petango = BeautifulSoup(resp.text, "html.parser")

    return {
        urljoin(PETANGO_BASE, a["href"])
        for a in petango.select('a[href*="wsAdoptableAnimalDetails.aspx"]')
        if a.get("href")
    }


# ============================================================================
# STEP 2: SCRAPE INDIVIDUAL DOG PAGE
# ============================================================================

def extract_description(soup: BeautifulSoup) -> Optional[str]:
    """
    Petango descriptions are usually the longest free-text block on the page.
    """
    blocks = [
        _clean(el.get_text(" ", strip=True))
        for el in soup.select("p, div")
        if len(_clean(el.get_text(" ", strip=True))) >= 120
    ]
    return blocks[0] if blocks else None


def extract_name_from_description(description: Optional[str]) -> Optional[str]:
    """
    Extract name from description text starting with:
      'Meet <Name> ...'

    Also removes known Petango boilerplate and trims whitespace.
    """
    if not description:
        return None

    # Remove known noise first
    cleaned = description.replace(NAME_NOISE, "")
    cleaned = _clean(cleaned)

    m = re.search(
        r"\bMeet\s+(.+?)(?:[.!—–-]|$)",
        cleaned,
        flags=re.IGNORECASE,
    )
    if not m:
        return None

    name = m.group(1)
    return _clean(name)


def extract_label_values(soup: BeautifulSoup) -> dict[str, str]:
    data: dict[str, str] = {}

    for tr in soup.select("tr"):
        tds = tr.find_all(["td", "th"])
        if len(tds) >= 2:
            key = _clean(tds[0].get_text()).rstrip(":")
            val = _clean(tds[1].get_text())
            if key and val:
                data[key] = val

    text = soup.get_text("\n", strip=True)
    for label in LABEL_MAP:
        if label not in data:
            m = re.search(rf"{re.escape(label)}\s*:\s*(.+)", text)
            if m:
                data[label] = _clean(m.group(1).split("\n")[0])

    return data


def parse_age_months(age_raw: Optional[str]) -> Optional[float]:
    if not age_raw:
        return None

    s = age_raw.lower()

    def grab(unit: str) -> float:
        m = re.search(rf"(\d+)\s*{unit}", s)
        return float(m.group(1)) if m else 0.0

    years = grab("year") + grab("years")
    months = grab("month") + grab("months")
    weeks = grab("week") + grab("weeks")
    days = grab("day") + grab("days")

    total = years * 12 + months + weeks * (7 / 30) + days * (1 / 30)
    return total if total > 0 else None


def extract_media(soup: BeautifulSoup, page_url: str) -> DogMedia:
    images = {
        urljoin(page_url, img["src"])
        for img in soup.select("img[src]")
    }

    videos = {
        urljoin(page_url, v["src"])
        for v in soup.select("video[src], video source[src]")
    }

    embeds = {
        urljoin(page_url, f["src"])
        for f in soup.select("iframe[src]")
    }

    return DogMedia(
        images=sorted(images),
        videos=sorted(videos),
        embeds=sorted(embeds),
    )


def scrape_dog_profile(url: str, session: requests.Session) -> DogProfile:
    resp = session.get(url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    labels = extract_label_values(soup)
    description = extract_description(soup)

    dog_id = (
        int(re.sub(r"[^\d]", "", labels["Animal ID"]))
        if "Animal ID" in labels
        else _extract_query_id(url)
    )
    if dog_id is None:
        raise ValueError(f"Missing dog_id for {url}")

    age_raw = labels.get("Age")

    return DogProfile(
        dog_id=dog_id,
        url=url,
        name=extract_name_from_description(description),
        breed=labels.get("Breed"),
        gender=labels.get("Gender"),
        age_raw=age_raw,
        age_months=parse_age_months(age_raw),
        weight_lbs=None,
        location=labels.get("Location"),
        status=labels.get("Stage"),
        ratings={},
        description=description,
        media=extract_media(soup, url),
    )


# ============================================================================
# STEP 3: PUBLIC API — URLs → DogProfiles
# ============================================================================

def scrape_dog_profiles(urls: set[str]) -> list[DogProfile]:
    session = requests.Session()
    profiles: list[DogProfile] = []

    for url in sorted(urls):
        try:
            profiles.append(scrape_dog_profile(url, session))
        except Exception as e:
            print(f"[WARN] Failed {url}: {e}")

    return profiles


# ============================================================================
# EXAMPLE USAGE
# ============================================================================

if __name__ == "__main__":
    puppy_urls = fetch_puppy_urls()
    dogs = scrape_dog_profiles(puppy_urls)

    for dog in dogs:
        print(dog)


DogProfile(dog_id=58414683, url='https://ws.petango.com/webservices/adoptablesearch/wsAdoptableAnimalDetails.aspx?id=58414683&css=http://ws.petango.com/WebServices/adoptablesearch/css/styles.css&authkey=io53xfw8b0k2ocet3yb83666507n2168taf513lkxrqe681kf8', name='Nann : [ 1 ] [ 2 ] Animal ID 58414683 Species Dog Breed Terrier, Jack Russell/Beagle Age 11 months 7 days Gender Female Size Small Color Red Spayed/Neutered Declawed No Housetrained Unknown Location Foster Home Intake Date 5/1/2025 Stage Reserved Meet this adorable litter of Jack Russell/Beagle mix puppies, born and raised in the heart of Missouri', breed='Terrier, Jack Russell/Beagle', gender='Female', age_raw='11 months 7 days', age_months=22.466666666666665, weight_lbs=None, location='Foster Home', status='Reserved', ratings={}, description="Meet Nann Click a number to change picture or play to see a video: [ 1 ] [ 2 ] Animal ID 58414683 Species Dog Breed Terrier, Jack Russell/Beagle Age 11 months 7 days Gender Female Size Sm