In [6]:
!pip install selenium webdriver-manager beautifulsoup4






[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
#!/usr/bin/env python3
"""
gmaps_full_extractor.py

Fetch ALL available business details within a given radius on Google Maps.
Strategy:
 - Convert place URL -> search URL (if needed)
 - Scroll results panel until stable
 - Collect unique result hrefs ("/maps/place/...")
 - Open each href in a new tab, parse place page (BeautifulSoup), extract many fields
 - Close tab and continue

Note: This scrapes Google Maps pages rendered client-side. Google may change DOM frequently.
Use responsibly and respect Google terms.
"""
import re
import json
import time
import logging
import random
from typing import Optional, Tuple, List, Dict
from urllib.parse import urlparse, parse_qs, quote_plus
from datetime import datetime, timezone

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager

# -------------------------
# Configuration & Logging
# -------------------------
# Configure root logging once at import time: records of level INFO and above go
# both to the file "scraper.log" and to a console stream handler.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.FileHandler("scraper.log"), logging.StreamHandler()],
)
# Module-level logger used throughout this script.
logger = logging.getLogger("gmaps-full")

# Pool of User-Agent strings; init_driver() picks one at random per browser session.
# NOTE(review): every entry stops after the AppleWebKit token (no browser/version
# suffix) - confirm this truncated form is intentional.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]

# Loose phone-number pattern: optional '+', one digit, at least six characters
# drawn from digits / hyphens / whitespace / parentheses, then a closing digit.
PHONE_RE = re.compile(r'(\+?\d[\d\-\s\(\)]{6,}\d)')

# Minimum radius (km) -> Google Maps zoom level used for the search URL.
ZOOM_LEVELS = {50: 8, 20: 11, 10: 12, 5: 13, 2: 14, 1: 15}


def radius_to_zoom(radius_km: int) -> int:
    """Return the zoom level for the largest ZOOM_LEVELS threshold <= radius_km.

    Falls back to zoom 15 when radius_km is below every threshold.
    """
    thresholds = sorted(ZOOM_LEVELS, reverse=True)
    return next((ZOOM_LEVELS[t] for t in thresholds if radius_km >= t), 15)

# -------------------------
# Helpers
# -------------------------
def parse_coordinates(url: str) -> Tuple[Optional[float], Optional[float]]:
    """Extract (lat, lng) from a Google Maps URL.

    Two forms are recognised, in this order:
      1. an ``@<lat>,<lng>`` segment anywhere in the URL;
      2. an ``ll=<lat>,<lng>`` query parameter.

    An "@" that is not followed by a numeric pair (for example an e-mail
    address inside the query string) no longer prevents the ``ll`` fallback
    from being tried.

    Returns:
        (lat, lng) as floats, or (None, None) when nothing could be parsed.
    """
    try:
        # Match only an '@' that is directly followed by "<number>,<number>".
        at_match = re.search(r"@([-+]?\d+(?:\.\d+)?),([-+]?\d+(?:\.\d+)?)", url)
        if at_match:
            return float(at_match.group(1)), float(at_match.group(2))

        qs = parse_qs(urlparse(url).query)
        if "ll" in qs:
            lat_s, lng_s = qs["ll"][0].split(",")[:2]
            return float(lat_s), float(lng_s)
    except (TypeError, ValueError) as e:
        # Non-string input or malformed numbers: best-effort, report "not found".
        logger.debug("parse_coordinates error: %s", e)
    return None, None

def make_search_url(lat: float, lng: float, zoom: int, keyword: Optional[str] = None) -> str:
    """Build a Google Maps search URL centred on (lat, lng) at the given zoom.

    The keyword is stripped and URL-quoted; when it is missing, empty or
    whitespace-only the generic term "businesses" is used instead, so the URL
    never contains an empty search segment.
    """
    term = keyword.strip() if keyword else ""
    k = quote_plus(term) if term else "businesses"
    return f"https://www.google.com/maps/search/{k}/@{lat},{lng},{zoom}z"

def safe_sleep(a=0.4, b=1.0):
    """Pause for a random duration drawn uniformly between a and b seconds."""
    delay = random.uniform(a, b)
    time.sleep(delay)

# -------------------------
# Selenium driver init
# -------------------------
def init_driver(headless: bool = False) -> webdriver.Chrome:
    """Create and return a configured Chrome WebDriver.

    Args:
        headless: when True, Chrome is started with ``--headless=new``.

    The driver binary is resolved through ChromeDriverManager, a random entry
    of USER_AGENTS is applied, and several automation-related switches are
    disabled. The ``navigator.webdriver`` override is registered through the
    DevTools protocol so that it runs on every new document; a plain
    ``execute_script`` only patches the initial blank page and is lost on the
    first navigation.
    """
    options = Options()
    if headless:
        options.add_argument("--headless=new")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-gpu")
    options.add_argument("window-size=1400,900")
    options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # attempt to hide webdriver flag (best-effort; failure must not stop the run)
    hide_webdriver_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"
    try:
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": hide_webdriver_js},
        )
    except Exception as e:
        logger.debug("CDP webdriver-flag override failed (%s); patching current page only", e)
        try:
            driver.execute_script(hide_webdriver_js)
        except Exception as e2:
            logger.debug("Could not override navigator.webdriver: %s", e2)

    logger.info("Chrome driver initialized")
    return driver

# -------------------------
# DOM utilities / scrolling
# -------------------------
def find_results_panel(driver, timeout=12):
    """Locate the scrollable results container on a Maps search page.

    Each locator below is tried in order, waiting up to `timeout` seconds for
    the element to be present. The first element found is returned; None is
    returned when every locator times out.
    """
    locators = (
        (By.CSS_SELECTOR, "div[role='feed']"),
        (By.XPATH, "//div[@role='region']//div[contains(@class,'scrollbox')]"),
        (By.XPATH, "//div[contains(@aria-label,'Results') or contains(@aria-label,'results')]"),
        (By.CSS_SELECTOR, "div[role='region']"),
    )
    waiter = WebDriverWait(driver, timeout)
    for locator in locators:
        try:
            return waiter.until(EC.presence_of_element_located(locator))
        except TimeoutException:
            # this locator did not show up in time; fall through to the next one
            pass
    return None

def scroll_until_stable(driver, panel, max_rounds=60):
    """Scroll the results panel until no new place links appear.

    Args:
        driver: Selenium WebDriver showing a Maps search page.
        panel: the scrollable results element (scrolled via JavaScript).
        max_rounds: upper bound on scroll iterations.

    Each round counts the distinct "/maps/place/" hrefs in the page, then
    scrolls the panel to its bottom (falling back to scrolling the window).
    The loop stops after four consecutive rounds without growth or after
    `max_rounds` rounds.

    Returns:
        Unique place hrefs in document order.
    """
    def _place_hrefs():
        # One get_attribute round-trip per anchor; anchors that disappear while
        # the list re-renders (stale references) are skipped instead of
        # aborting the whole scroll.
        found = []
        for anchor in driver.find_elements(By.XPATH, "//a[contains(@href,'/maps/place/')]"):
            try:
                link = anchor.get_attribute("href")
            except WebDriverException:
                continue
            if link:
                found.append(link)
        return found

    last_count = 0
    no_change_iters = 0
    for _ in range(max_rounds):
        cur_count = len(set(_place_hrefs()))
        if cur_count > last_count:
            logger.info("Loaded %d results", cur_count)
            last_count = cur_count
            no_change_iters = 0
        else:
            no_change_iters += 1
        # scroll
        try:
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", panel)
        except Exception:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
        safe_sleep(0.6, 1.0)
        if no_change_iters >= 4:
            break
    # dedupe while preserving order
    return list(dict.fromkeys(_place_hrefs()))

# -------------------------
# Place page extraction (open each place url in a new tab)
# -------------------------
def extract_from_place_url(driver, href) -> Dict:
    """
    Open href in a new tab, switch, wait for content, parse with BeautifulSoup,
    extract fields and close tab.

    Args:
        driver: Selenium WebDriver; its current window is restored on exit.
        href: Google Maps place URL to open.

    Returns:
        Dict with the keys initialised in `result` below. Fields that could not
        be extracted keep their defaults ("N/A", "0", [] or "").

    Raises:
        RuntimeError: when the page contains captcha / traffic-block markers.
            This is propagated (after the tab is closed) so the caller can
            stop the run. Any other extraction error is logged and the
            partially filled result is returned.
    """
    result = {
        "business_name": "N/A",
        "address": "N/A",
        "category": "N/A",
        "rating": "N/A",
        "reviews_count": "0",
        "google_maps_url": href,
        "company_url": "N/A",
        "phone": "N/A",
        "opening_hours": [],
        "price_level": "N/A",
        "attributes": [],
        "images": [],
        "description": "N/A",
        "raw_page_text_snippet": ""
    }

    original_window = driver.current_window_handle
    # open new tab
    driver.execute_script("window.open(arguments[0], '_blank');", href)
    safe_sleep(0.6, 1.2)
    # switch to new window
    windows = driver.window_handles
    new_window = [w for w in windows if w != original_window][-1]
    driver.switch_to.window(new_window)

    # Set just before raising the captcha error so the generic handler below
    # can tell it apart from ordinary parsing failures.
    captcha_detected = False
    try:
        # Wait for main content
        WebDriverWait(driver, 12).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        safe_sleep(1.0, 2.0)
        # Take one snapshot of the page so detection and parsing see the same HTML
        html = driver.page_source
        # Detect captcha / block
        page_text = html.lower()
        if "unusual traffic" in page_text or "are you a robot" in page_text or ("sorry" in page_text and "detected" in page_text):
            logger.error("Captcha/Block detected on place page; aborting extraction for this href.")
            captcha_detected = True
            raise RuntimeError("Captcha detected")

        soup = BeautifulSoup(html, "html.parser")

        # name
        h1 = soup.select_one("h1")
        if h1 and h1.get_text(strip=True):
            result["business_name"] = h1.get_text(strip=True)

        # address
        # buttons with data-item-id='address' or aria-label contains 'Address' etc
        addr = None
        cand = soup.select_one("button[data-item-id='address'] .Io6YTe, span[data-item-id='address'], div[aria-label*='Address']")
        if cand and cand.get_text(strip=True):
            addr = cand.get_text(strip=True)
        else:
            # try common class
            cand2 = soup.select_one("div[data-tooltip='Copy address'], div[data-item-id='address']")
            if cand2 and cand2.get_text(strip=True):
                addr = cand2.get_text(strip=True)
        if addr:
            result["address"] = addr

        # category
        cat = soup.select_one("button[jsaction*='pane.rating.category'], div[data-item-id='subtitle'], span.fontBodySmall")
        if cat and cat.get_text(strip=True):
            result["category"] = cat.get_text(strip=True)

        # rating: a <div> carrying an aria-label that mentions stars, or whose
        # text mentions "rating". The parentheses matter: without them the
        # `or` clause matched any tag at all whose text contained "rating".
        r_el = soup.find(
            lambda t: t.name == "div"
            and t.has_attr("aria-label")
            and ("star" in t["aria-label"].lower() or "rating" in t.get_text().lower())
        )
        if r_el and r_el.has_attr("aria-label"):
            mm = re.search(r"(\d+(\.\d+)?)", r_el["aria-label"])
            if mm:
                result["rating"] = mm.group(1)

        # reviews count: first number in the first button whose text mentions "review"
        for b in soup.select("button"):
            txt = b.get_text(" ", strip=True)
            if txt and "review" in txt.lower():
                m = re.search(r"(\d[\d,]*)", txt.replace(",", ""))
                if m:
                    result["reviews_count"] = m.group(1).replace(",", "")
                    break

        # website link: first absolute link that does not point at Google / Maps
        for a in soup.find_all("a", href=True):
            href_a = a["href"]
            if href_a.startswith("http") and ("google" not in href_a and "/maps" not in href_a):
                result["company_url"] = href_a
                break

        # phone - look for phone-like text or aria-label
        ph = None
        # aria-labels like 'Call' buttons
        phone_btn = soup.select_one("button[aria-label*='Call'], a[aria-label*='Call']")
        if phone_btn:
            text = phone_btn.get_text(" ", strip=True)
            match = PHONE_RE.search(text)
            if match:
                ph = match.group(1)
        if not ph:
            # search page text
            text_all = soup.get_text(" ")
            match = PHONE_RE.search(text_all)
            if match:
                ph = match.group(1)
        if ph:
            result["phone"] = ph

        # opening hours - look for table or li's in hours section
        hours = []
        hours_parent = soup.select_one("table[class*='WgFkxc']") or soup.select_one("div[aria-label*='Hours']") or soup.select_one("div[data-item-id='hours']")
        if hours_parent:
            for tr in hours_parent.select("tr"):
                txt = tr.get_text(" ", strip=True)
                if txt:
                    hours.append(txt)
        else:
            # fallback: find common day lines
            for li in soup.select("div.section-open-hours, div[jsinstance] li"):
                t = li.get_text(" ", strip=True)
                if t and any(day in t.lower() for day in ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]):
                    hours.append(t)
        if hours:
            result["opening_hours"] = hours

        # price level (if present)
        txt_all = soup.get_text(" ")
        price_m = re.search(r"(\£|\$|€)\s*\d+", txt_all)
        if price_m:
            result["price_level"] = price_m.group(0)
        else:
            # text like "$$ · Pricey"
            match2 = re.search(r"(\${1,4}|\£{1,4}|€{1,4}|\w+\s*·\s*Pricey)", txt_all)
            if match2:
                result["price_level"] = match2.group(0)

        # attributes - small badges near top - fetch small spans
        attrs = []
        for sp in soup.select("button[jsaction*='pane.placeActions'], span[class*='ucwH6d'] , div.fontBodySmall"):
            t = sp.get_text(" ", strip=True)
            if t and len(t) < 60:
                attrs.append(t)
        # de-duplicate while keeping order, cap at 12 entries
        result["attributes"] = list(dict.fromkeys([a for a in attrs if a and len(a) > 0]))[:12]

        # images - attempt to collect image srcs from carousel thumbnails
        images = []
        for img in soup.select("img[src]"):
            src = img.get("src", "")
            if src and ("maps" not in src and "google" not in src and len(src) > 30):
                images.append(src)
            elif src and "gstatic" in src:
                images.append(src)
        result["images"] = images[:10]

        # description / about
        desc_candidate = soup.select_one("div[data-section-id='overview'] , div[data-item-id='description']")
        if desc_candidate and desc_candidate.get_text(strip=True):
            result["description"] = desc_candidate.get_text(" ", strip=True)
        else:
            # try first paragraph-like text with moderate length
            long_texts = [p.get_text(" ", strip=True) for p in soup.select("div") if p.get_text(strip=True) and 40 < len(p.get_text(strip=True)) < 600]
            if long_texts:
                result["description"] = long_texts[0]

        # raw snippet for debug
        result["raw_page_text_snippet"] = soup.get_text(" ", strip=True)[:800]

    except Exception as e:
        if captcha_detected:
            # Let the block signal reach the caller; `finally` still closes the tab.
            raise
        logger.warning("Error extracting %s: %s", href, e)
    finally:
        # close the tab and switch back
        try:
            driver.close()
        except Exception:
            pass
        try:
            driver.switch_to.window(original_window)
        except Exception:
            # attempt to switch to first window
            try:
                driver.switch_to.window(driver.window_handles[0])
            except Exception:
                pass
    return result

# -------------------------
# Main scraping orchestration
# -------------------------
def scrape_area(input_url: str, radius_km: int = 5, keyword: Optional[str] = None, headless: bool = False) -> Dict:
    """Run a Maps search around the coordinates in `input_url` and scrape each result.

    Steps: parse coordinates, map the radius to a zoom level, open the search
    URL, scroll the results panel to collect place links, then visit every
    link with extract_from_place_url().

    Returns:
        Dict holding the run metadata (input/search URL, radius, coordinates,
        zoom, UTC timestamp, result count) and the per-place records in "data".

    Raises:
        ValueError: coordinates could not be parsed from `input_url`.
        RuntimeError: captcha markers were found on the search page.

    A RuntimeError raised while processing a place stops the loop; any other
    exception skips that place. The driver is always quit on exit.
    """
    lat, lng = parse_coordinates(input_url)
    if lat is None or lng is None:
        raise ValueError("Could not parse coordinates from URL. Provide a place or search URL containing coordinates.")

    zoom = radius_to_zoom(radius_km)
    search_url = make_search_url(lat, lng, zoom, keyword)

    driver = init_driver(headless=headless)
    try:
        logger.info("Opening search URL: %s", search_url)
        driver.get(search_url)
        safe_sleep(2.0, 3.5)

        # bail out early if Google served a block page instead of results
        lowered_source = driver.page_source.lower()
        if any(marker in lowered_source for marker in ("unusual traffic", "are you a robot")):
            raise RuntimeError("Captcha / unusual traffic detected. Aborting.")

        panel = find_results_panel(driver, timeout=12)
        if panel:
            hrefs = scroll_until_stable(driver, panel, max_rounds=60)
        else:
            # no panel located: take whatever place links the page already shows
            anchors = driver.find_elements(By.XPATH, "//a[contains(@href,'/maps/place/')]")
            hrefs = [a.get_attribute("href") for a in anchors if a.get_attribute("href")]

        # Keep the first link for every distinct base (text before '?');
        # the stored link itself is left unmodified.
        visited_bases = set()
        clean_hrefs = []
        for link in hrefs:
            if not link:
                continue
            base = link.split("?")[0]
            if base in visited_bases:
                continue
            visited_bases.add(base)
            clean_hrefs.append(link)

        total = len(clean_hrefs)
        logger.info("Collected %d unique place links to visit", total)

        results = []
        for idx, href in enumerate(clean_hrefs, start=1):
            logger.info("Processing %d/%d : %s", idx, total, href)
            try:
                info = extract_from_place_url(driver, href)
                results.append(info)
                safe_sleep(0.8, 1.8)
            except RuntimeError as e:
                logger.error("Aborting due to: %s", e)
                break
            except Exception as e:
                logger.warning("Failed to extract %s: %s", href, e)
                safe_sleep(0.5, 1.0)
                continue

        return {
            "input_url": input_url,
            "search_url": search_url,
            "radius_km": radius_km,
            "coordinates": {"lat": lat, "lng": lng},
            "zoom_level": zoom,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "total_businesses": len(results),
            "data": results
        }

    finally:
        # best-effort shutdown; a failing quit must not mask the real outcome
        try:
            driver.quit()
        except Exception:
            pass

# -------------------------
# Save helper & CLI
# -------------------------
def save_json(data: Dict, fname="results.json"):
    """Write `data` to `fname` as indented UTF-8 JSON and log the destination."""
    with open(fname, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=2)
    logger.info("Saved results to %s", fname)

def main():
    """Interactive entry point: prompt for inputs, run the scrape, save and print JSON.

    Ctrl-C is logged as a cancellation; any other exception is logged with its
    traceback instead of propagating.
    """
    try:
        url = input("Google Maps place/search URL: ").strip()
        if not url:
            print("URL required")
            return

        radius = input("Radius in km (default 5): ").strip()
        if not radius:
            radius = "5"

        keyword = input("Optional search keyword (e.g., hotels, restaurants). Leave blank for 'businesses': ").strip()
        if not keyword:
            keyword = None

        headless_answer = input("Headless? (y/N): ").strip().lower()
        headless = headless_answer == "y"

        out = scrape_area(url, radius_km=int(radius), keyword=keyword, headless=headless)
        save_json(out, "results.json")
        print(json.dumps(out, indent=2, ensure_ascii=False))
    except KeyboardInterrupt:
        logger.info("Cancelled by user")
    except Exception as e:
        logger.exception("Fatal: %s", e)


if __name__ == "__main__":
    main()


2025-11-22 23:52:36,969 - INFO - Get LATEST chromedriver version for google-chrome
2025-11-22 23:52:38,751 - INFO - Get LATEST chromedriver version for google-chrome
2025-11-22 23:52:39,891 - INFO - Driver [C:\Users\ABC\.wdm\drivers\chromedriver\win64\142.0.7444.175\chromedriver-win32/chromedriver.exe] found in cache
2025-11-22 23:52:43,051 - INFO - Chrome driver initialized
2025-11-22 23:52:43,051 - INFO - Opening search URL: https://www.google.com/maps/search/hotel/@33.6319495,73.0679672,12z
2025-11-22 23:52:52,958 - INFO - Loaded 10 results
2025-11-22 23:52:55,935 - INFO - Loaded 16 results
2025-11-22 23:52:59,336 - INFO - Loaded 21 results
2025-11-22 23:53:03,302 - INFO - Loaded 30 results
2025-11-22 23:53:04,911 - INFO - Loaded 36 results
2025-11-22 23:53:06,877 - INFO - Loaded 37 results
2025-11-22 23:53:12,361 - INFO - Loaded 44 results
2025-11-22 23:53:14,481 - INFO - Loaded 49 results
2025-11-22 23:53:16,631 - INFO - Loaded 51 results
2025-11-22 23:53:23,321 - INFO - Loaded 63

{
  "input_url": "https://www.google.com/maps/place/Cp+hotel+islamabad/@33.6319495,73.0679672,13z/data=!4m13!1m2!2m1!1shotel!3m9!1s0x38dfeb16e6026be5:0x6dd68482e121408e!5m2!4m1!1i2!8m2!3d33.6299321!4d73.1153929!15sCgVob3RlbJIBBWhvdGVsqgE2EAEqCSIFaG90ZWwoADIcEAEiGLv8DWy0GmcrQbW6pt4yK-nv9R63SKOHTzIJEAIiBWhvdGVs4AEA!16s%2Fg%2F11c3k6f842?authuser=0&entry=ttu&g_ep=EgoyMDI1MTExNy4wIKXMDSoASAFQAw%3D%3D",
  "search_url": "https://www.google.com/maps/search/hotel/@33.6319495,73.0679672,12z",
  "radius_km": 10,
  "coordinates": {
    "lat": 33.6319495,
    "lng": 73.0679672
  },
  "zoom_level": 12,
  "timestamp": "2025-11-22T19:08:05.607156+00:00",
  "total_businesses": 111,
  "data": [
    {
      "business_name": "Pearl Continental Hotel Rawalpindi",
      "address": "The Mall Rd, Rawalpindi, 46000",
      "category": "Select your dates to see the best prices",
      "rating": "N/A",
      "reviews_count": "19274",
      "google_maps_url": "https://www.google.com/maps/place/Pearl+Continental+H

In [3]:
#!/usr/bin/env python3
"""
gmaps_full_extractor_with_shortlink.py

Fetch ALL available business details within a given radius on Google Maps.
Supports short Google Maps links (maps.app.goo.gl, goo.gl/maps, etc.) by expanding them first.

Strategy:
 - Expand short link (requests -> follow redirects). If that doesn't reveal coords, fall back to Selenium navigation to resolve final URL.
 - Convert place URL -> search URL (if needed)
 - Scroll results panel until stable
 - Collect unique result hrefs ("/maps/place/...")
 - Open each href in a new tab, parse place page (BeautifulSoup), extract many fields
 - Close tab and continue

Note: This scrapes Google Maps pages rendered client-side. Google may change DOM frequently.
Use responsibly and respect Google terms.
"""
import re
import json
import time
import logging
import random
import requests
from typing import Optional, Tuple, List, Dict
from urllib.parse import urlparse, parse_qs, quote_plus
from datetime import datetime, timezone

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager

# -------------------------
# Configuration & Logging
# -------------------------
# Configure root logging once at import time: records of level INFO and above go
# both to the file "scraper.log" and to a console stream handler.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.FileHandler("scraper.log"), logging.StreamHandler()],
)
# Module-level logger used throughout this script.
logger = logging.getLogger("gmaps-full")

# Pool of User-Agent strings; init_driver() picks one at random per browser session.
# NOTE(review): every entry stops after the AppleWebKit token (no browser/version
# suffix) - confirm this truncated form is intentional.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]

# Loose phone-number pattern: optional '+', one digit, at least six characters
# drawn from digits / hyphens / whitespace / parentheses, then a closing digit.
PHONE_RE = re.compile(r'(\+?\d[\d\-\s\(\)]{6,}\d)')

# Minimum radius (km) -> Google Maps zoom level used for the search URL.
ZOOM_LEVELS = {50: 8, 20: 11, 10: 12, 5: 13, 2: 14, 1: 15}

# Host fragments treated as short links; scrape_area() substring-matches these
# against the URL's netloc before trying to expand it.
SHORT_DOMAINS = ("maps.app.goo.gl", "goo.gl", "maps.fi", "goo.gl/maps")


def radius_to_zoom(radius_km: int) -> int:
    """Return the zoom level for the largest ZOOM_LEVELS threshold <= radius_km.

    Falls back to zoom 15 when radius_km is below every threshold.
    """
    thresholds = sorted(ZOOM_LEVELS, reverse=True)
    return next((ZOOM_LEVELS[t] for t in thresholds if radius_km >= t), 15)

# -------------------------
# Helpers
# -------------------------
def parse_coordinates(url: str) -> Tuple[Optional[float], Optional[float]]:
    """Extract lat,lng from a Google Maps URL (supports @ style and ll query).

    Two forms are recognised, in this order:
      1. an ``@<lat>,<lng>`` segment anywhere in the URL;
      2. an ``ll=<lat>,<lng>`` query parameter.

    An "@" that is not followed by a numeric pair (for example an e-mail
    address inside the query string) no longer prevents the ``ll`` fallback
    from being tried.

    Returns:
        (lat, lng) as floats, or (None, None) when nothing could be parsed.
    """
    try:
        # Match only an '@' that is directly followed by "<number>,<number>".
        at_match = re.search(r"@([-+]?\d+(?:\.\d+)?),([-+]?\d+(?:\.\d+)?)", url)
        if at_match:
            return float(at_match.group(1)), float(at_match.group(2))

        qs = parse_qs(urlparse(url).query)
        if "ll" in qs:
            lat_s, lng_s = qs["ll"][0].split(",")[:2]
            return float(lat_s), float(lng_s)
    except (TypeError, ValueError) as e:
        # Non-string input or malformed numbers: best-effort, report "not found".
        logger.debug("parse_coordinates error: %s", e)
    return None, None

def make_search_url(lat: float, lng: float, zoom: int, keyword: Optional[str] = None) -> str:
    """Build a Google Maps search URL centred on (lat, lng) at the given zoom.

    The keyword is stripped and URL-quoted; when it is missing, empty or
    whitespace-only the generic term "businesses" is used instead, so the URL
    never contains an empty search segment.
    """
    term = keyword.strip() if keyword else ""
    k = quote_plus(term) if term else "businesses"
    return f"https://www.google.com/maps/search/{k}/@{lat},{lng},{zoom}z"

def safe_sleep(a=0.4, b=1.0):
    """Pause for a random duration drawn uniformly between a and b seconds."""
    delay = random.uniform(a, b)
    time.sleep(delay)

# -------------------------
# Short-link expansion
# -------------------------
def expand_short_url_requests(url: str, timeout: int = 10) -> Optional[str]:
    """
    Expand short URL using requests (follows redirects). Returns expanded URL or None.

    Args:
        url: the (short) link to resolve.
        timeout: timeout in seconds passed to requests.

    Only the final URL after redirects is needed, so the request is made with
    ``stream=True`` (the body is not downloaded) and the response is closed
    through a context manager. Any failure is logged at debug level and
    reported as None so the caller can fall back to browser-based resolution.
    """
    logger.info("Attempting to expand short URL via HTTP requests: %s", url)
    try:
        with requests.get(url, timeout=timeout, allow_redirects=True, stream=True) as resp:
            final = resp.url
    except Exception as e:
        logger.debug("Requests-based expansion failed: %s", e)
        return None
    logger.info("Expanded (requests) -> %s", final)
    return final

# -------------------------
# Selenium driver init
# -------------------------
def init_driver(headless: bool = False) -> webdriver.Chrome:
    """Create and return a configured Chrome WebDriver.

    Args:
        headless: when True, Chrome is started with ``--headless=new``.

    The driver binary is resolved through ChromeDriverManager, a random entry
    of USER_AGENTS is applied, and several automation-related switches are
    disabled. The ``navigator.webdriver`` override is registered through the
    DevTools protocol so that it runs on every new document; a plain
    ``execute_script`` only patches the initial blank page and is lost on the
    first navigation.
    """
    options = Options()
    if headless:
        options.add_argument("--headless=new")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-gpu")
    options.add_argument("window-size=1400,900")
    options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    # attempt to hide webdriver flag (best-effort; failure must not stop the run)
    hide_webdriver_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"
    try:
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": hide_webdriver_js},
        )
    except Exception as e:
        logger.debug("CDP webdriver-flag override failed (%s); patching current page only", e)
        try:
            driver.execute_script(hide_webdriver_js)
        except Exception as e2:
            logger.debug("Could not override navigator.webdriver: %s", e2)

    logger.info("Chrome driver initialized")
    return driver

# -------------------------
# DOM utilities / scrolling
# -------------------------
def find_results_panel(driver, timeout=12):
    """Locate the scrollable results container on a Maps search page.

    Each locator below is tried in order, waiting up to `timeout` seconds for
    the element to be present. The first element found is returned; None is
    returned when every locator times out.
    """
    locators = (
        (By.CSS_SELECTOR, "div[role='feed']"),
        (By.XPATH, "//div[@role='region']//div[contains(@class,'scrollbox')]"),
        (By.XPATH, "//div[contains(@aria-label,'Results') or contains(@aria-label,'results')]"),
        (By.CSS_SELECTOR, "div[role='region']"),
    )
    waiter = WebDriverWait(driver, timeout)
    for locator in locators:
        try:
            return waiter.until(EC.presence_of_element_located(locator))
        except TimeoutException:
            # this locator did not show up in time; fall through to the next one
            pass
    return None

def scroll_until_stable(driver, panel, max_rounds=60):
    """Scroll the results panel until no new place links appear.

    Args:
        driver: Selenium WebDriver showing a Maps search page.
        panel: the scrollable results element (scrolled via JavaScript).
        max_rounds: upper bound on scroll iterations.

    Each round counts the distinct "/maps/place/" hrefs in the page, then
    scrolls the panel to its bottom (falling back to scrolling the window).
    The loop stops after four consecutive rounds without growth or after
    `max_rounds` rounds.

    Returns:
        Unique place hrefs in document order.
    """
    def _place_hrefs():
        # One get_attribute round-trip per anchor; anchors that disappear while
        # the list re-renders (stale references) are skipped instead of
        # aborting the whole scroll.
        found = []
        for anchor in driver.find_elements(By.XPATH, "//a[contains(@href,'/maps/place/')]"):
            try:
                link = anchor.get_attribute("href")
            except WebDriverException:
                continue
            if link:
                found.append(link)
        return found

    last_count = 0
    no_change_iters = 0
    for _ in range(max_rounds):
        cur_count = len(set(_place_hrefs()))
        if cur_count > last_count:
            logger.info("Loaded %d results", cur_count)
            last_count = cur_count
            no_change_iters = 0
        else:
            no_change_iters += 1
        # scroll
        try:
            driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", panel)
        except Exception:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
        safe_sleep(0.6, 1.0)
        if no_change_iters >= 4:
            break
    # dedupe while preserving order
    return list(dict.fromkeys(_place_hrefs()))

# -------------------------
# Place page extraction (open each place url in a new tab)
# -------------------------
def extract_from_place_url(driver, href) -> Dict:
    """
    Open href in a new tab, switch, wait for content, parse with BeautifulSoup,
    extract fields and close tab.

    Args:
        driver: Selenium WebDriver; its current window is restored on exit.
        href: Google Maps place URL to open.

    Returns:
        Dict with the keys initialised in `result` below. Fields that could not
        be extracted keep their defaults ("N/A", "0", [] or "").

    Raises:
        RuntimeError: when the page contains captcha / traffic-block markers.
            This is propagated (after the tab is closed) so the caller can
            stop the run. Any other extraction error is logged and the
            partially filled result is returned.
    """
    result = {
        "business_name": "N/A",
        "address": "N/A",
        "category": "N/A",
        "rating": "N/A",
        "reviews_count": "0",
        "google_maps_url": href,
        "company_url": "N/A",
        "phone": "N/A",
        "opening_hours": [],
        "price_level": "N/A",
        "attributes": [],
        "images": [],
        "description": "N/A",
        "raw_page_text_snippet": ""
    }

    original_window = driver.current_window_handle
    # open new tab
    driver.execute_script("window.open(arguments[0], '_blank');", href)
    safe_sleep(0.6, 1.2)
    # switch to new window
    windows = driver.window_handles
    new_window = [w for w in windows if w != original_window][-1]
    driver.switch_to.window(new_window)

    # Set just before raising the captcha error so the generic handler below
    # can tell it apart from ordinary parsing failures.
    captcha_detected = False
    try:
        # Wait for main content
        WebDriverWait(driver, 12).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        safe_sleep(1.0, 2.0)
        # Take one snapshot of the page so detection and parsing see the same HTML
        html = driver.page_source
        # Detect captcha / block
        page_text = html.lower()
        if "unusual traffic" in page_text or "are you a robot" in page_text or ("sorry" in page_text and "detected" in page_text):
            logger.error("Captcha/Block detected on place page; aborting extraction for this href.")
            captcha_detected = True
            raise RuntimeError("Captcha detected")

        soup = BeautifulSoup(html, "html.parser")

        # name
        h1 = soup.select_one("h1")
        if h1 and h1.get_text(strip=True):
            result["business_name"] = h1.get_text(strip=True)

        # address: primary selectors first, then the fallback selectors
        addr = None
        cand = soup.select_one("button[data-item-id='address'] .Io6YTe, span[data-item-id='address'], div[aria-label*='Address']")
        if cand and cand.get_text(strip=True):
            addr = cand.get_text(strip=True)
        else:
            cand2 = soup.select_one("div[data-tooltip='Copy address'], div[data-item-id='address']")
            if cand2 and cand2.get_text(strip=True):
                addr = cand2.get_text(strip=True)
        if addr:
            result["address"] = addr

        # category
        cat = soup.select_one("button[jsaction*='pane.rating.category'], div[data-item-id='subtitle'], span.fontBodySmall")
        if cat and cat.get_text(strip=True):
            result["category"] = cat.get_text(strip=True)

        # rating: a <div> carrying an aria-label that mentions stars, or whose
        # text mentions "rating"; the number is read from the aria-label.
        r_el = soup.find(lambda t: t.name == "div" and t.has_attr("aria-label") and ("star" in t["aria-label"].lower() or "rating" in t.get_text().lower()))
        if r_el and r_el.has_attr("aria-label"):
            mm = re.search(r"(\d+(\.\d+)?)", r_el["aria-label"])
            if mm:
                result["rating"] = mm.group(1)

        # reviews count: first number in the first button whose text mentions "review"
        for b in soup.select("button"):
            txt = b.get_text(" ", strip=True)
            if txt and "review" in txt.lower():
                m = re.search(r"(\d[\d,]*)", txt.replace(",", ""))
                if m:
                    result["reviews_count"] = m.group(1).replace(",", "")
                    break

        # website link: first absolute link that does not point at Google / Maps
        for a in soup.find_all("a", href=True):
            href_a = a["href"]
            if href_a.startswith("http") and ("google" not in href_a and "/maps" not in href_a):
                result["company_url"] = href_a
                break

        # phone - look for phone-like text or aria-label
        ph = None
        phone_btn = soup.select_one("button[aria-label*='Call'], a[aria-label*='Call']")
        if phone_btn:
            text = phone_btn.get_text(" ", strip=True)
            match = PHONE_RE.search(text)
            if match:
                ph = match.group(1)
        if not ph:
            # search page text
            text_all = soup.get_text(" ")
            match = PHONE_RE.search(text_all)
            if match:
                ph = match.group(1)
        if ph:
            result["phone"] = ph

        # opening hours - look for table or li's in hours section
        hours = []
        hours_parent = soup.select_one("table[class*='WgFkxc']") or soup.select_one("div[aria-label*='Hours']") or soup.select_one("div[data-item-id='hours']")
        if hours_parent:
            for tr in hours_parent.select("tr"):
                txt = tr.get_text(" ", strip=True)
                if txt:
                    hours.append(txt)
        else:
            # fallback: keep list items that mention a weekday abbreviation
            for li in soup.select("div.section-open-hours, div[jsinstance] li"):
                t = li.get_text(" ", strip=True)
                if t and any(day in t.lower() for day in ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]):
                    hours.append(t)
        if hours:
            result["opening_hours"] = hours

        # price level (if present): explicit amount first, then symbol runs / "Pricey"
        txt_all = soup.get_text(" ")
        price_m = re.search(r"(\£|\$|€)\s*\d+", txt_all)
        if price_m:
            result["price_level"] = price_m.group(0)
        else:
            match2 = re.search(r"(\${1,4}|\£{1,4}|€{1,4}|\w+\s*·\s*Pricey)", txt_all)
            if match2:
                result["price_level"] = match2.group(0)

        # attributes - small badges near top - fetch small spans
        attrs = []
        for sp in soup.select("button[jsaction*='pane.placeActions'], span[class*='ucwH6d'] , div.fontBodySmall"):
            t = sp.get_text(" ", strip=True)
            if t and len(t) < 60:
                attrs.append(t)
        # de-duplicate while keeping order, cap at 12 entries
        result["attributes"] = list(dict.fromkeys([a for a in attrs if a and len(a) > 0]))[:12]

        # images - attempt to collect image srcs from carousel thumbnails
        images = []
        for img in soup.select("img[src]"):
            src = img.get("src", "")
            if src and len(src) > 30:
                images.append(src)
        result["images"] = images[:10]

        # description / about
        desc_candidate = soup.select_one("div[data-section-id='overview'] , div[data-item-id='description']")
        if desc_candidate and desc_candidate.get_text(strip=True):
            result["description"] = desc_candidate.get_text(" ", strip=True)
        else:
            # fallback: first <div> whose text length is between 40 and 600 characters
            long_texts = [p.get_text(" ", strip=True) for p in soup.select("div") if p.get_text(strip=True) and 40 < len(p.get_text(strip=True)) < 600]
            if long_texts:
                result["description"] = long_texts[0]

        # raw snippet for debug
        result["raw_page_text_snippet"] = soup.get_text(" ", strip=True)[:800]

    except Exception as e:
        if captcha_detected:
            # Let the block signal reach the caller; `finally` still closes the tab.
            raise
        logger.warning("Error extracting %s: %s", href, e)
    finally:
        # close the tab and switch back
        try:
            driver.close()
        except Exception:
            pass
        try:
            driver.switch_to.window(original_window)
        except Exception:
            # original handle unusable: fall back to the first remaining window
            try:
                driver.switch_to.window(driver.window_handles[0])
            except Exception:
                pass
    return result

# -------------------------
# Main scraping orchestration (with shortlink support)
# -------------------------
def scrape_area(input_url: str, radius_km: int = 5, keyword: Optional[str] = None, headless: bool = False) -> Dict:
    """
    Scrape every place listed around the location encoded in a Google Maps URL.

    Steps:
     - expand short links via HTTP when the host matches SHORT_DOMAINS
     - parse coordinates from the URL; if missing, load the URL in the browser
       and parse the URL it settles on
     - choose or build a search URL, open it, and scroll the results panel
     - visit each unique place link and extract its details

    Args:
        input_url: Place, search, or short Google Maps URL.
        radius_km: Search radius in kilometres; mapped to a zoom level.
        keyword: Optional search term. When given, a keyword search URL is
            always built from the parsed coordinates.
        headless: Forwarded to init_driver().

    Returns:
        Dict with the keys input_url, resolved_url, search_url, radius_km,
        coordinates, zoom_level, timestamp (UTC ISO-8601), total_businesses
        and data (one entry per extracted place).

    Raises:
        ValueError: No coordinates could be parsed from the resolved URL.
        RuntimeError: The search page looks like a captcha / unusual-traffic wall.
    """
    input_url = input_url.strip()
    full_url = input_url

    # Step 1: If input looks like a short domain, try to expand via requests
    domain = urlparse(input_url).netloc.lower()
    if any(d in domain for d in SHORT_DOMAINS):
        expanded = expand_short_url_requests(input_url)
        if expanded:
            full_url = expanded
        else:
            logger.info("Requests expansion failed; will resolve via browser after driver init.")

    # Step 2: Try to get coordinates from the (possibly expanded) url
    lat, lng = parse_coordinates(full_url)

    # Init driver now (we may need it to resolve final URL if coords still missing)
    driver = init_driver(headless=headless)
    try:
        # If coords are still missing, navigate with Selenium to let any JS redirects finish and get final URL
        if lat is None or lng is None:
            try:
                logger.info("Coordinates not found in URL; navigating browser to resolve final URL (may handle JS redirects).")
                driver.get(full_url)
                safe_sleep(2.0, 3.0)
                resolved = driver.current_url
                logger.info("Browser-resolved URL: %s", resolved)
                full_url = resolved
                lat, lng = parse_coordinates(full_url)
            except Exception as e:
                # Best effort: lat/lng may still be None and will raise below.
                # Logged at warning level so the root cause is visible next to
                # the ValueError that follows.
                logger.warning("Browser-based URL resolution failed: %s", e)

        if lat is None or lng is None:
            raise ValueError("Could not parse coordinates from the resolved URL. Provide a place or search URL containing coordinates (or a short link that redirects to one).")

        zoom = radius_to_zoom(radius_km)

        # Pick the search URL:
        #  - a keyword always wins and yields a keyword search at the parsed coordinates
        #  - a place page is converted into a generic search around it
        #  - an existing search URL is reused unchanged
        #  - anything else (e.g. a bare map view) is rebuilt as a generic search
        if keyword:
            search_url = make_search_url(lat, lng, zoom, keyword)
        elif "/place/" in full_url:
            search_url = make_search_url(lat, lng, zoom, None)
            logger.info("Converted place URL to search URL: %s", search_url)
        elif "/search/" in full_url or "maps/search" in full_url:
            search_url = full_url
        else:
            search_url = make_search_url(lat, lng, zoom, None)

        logger.info("Opening search URL: %s", search_url)
        driver.get(search_url)
        safe_sleep(2.0, 3.5)

        # detect captcha early
        page_src = driver.page_source.lower()
        if "unusual traffic" in page_src or "are you a robot" in page_src:
            raise RuntimeError("Captcha / unusual traffic detected. Aborting.")

        panel = find_results_panel(driver, timeout=12)
        if panel:
            hrefs = scroll_until_stable(driver, panel, max_rounds=60)
        else:
            # fallback: collect place links anywhere on the page, reading each
            # element's href only once
            anchors = driver.find_elements(By.XPATH, "//a[contains(@href,'/maps/place/')]")
            hrefs = [link for link in (a.get_attribute("href") for a in anchors) if link]

        # Dedupe by the href with its query string stripped, but keep the first
        # full href seen for each base path (that is the URL actually visited).
        clean_hrefs = []
        seen = set()
        for h in hrefs:
            if not h:
                continue
            base = h.split("?")[0]
            if base not in seen:
                seen.add(base)
                clean_hrefs.append(h)

        total = len(clean_hrefs)
        logger.info("Collected %d unique place links to visit", total)

        results = []
        for idx, href in enumerate(clean_hrefs, start=1):
            logger.info("Processing %d/%d : %s", idx, total, href)
            try:
                info = extract_from_place_url(driver, href)
                results.append(info)
                safe_sleep(0.8, 1.8)
            except RuntimeError as e:
                # Treated as fatal for the whole run; keep what was collected so far.
                logger.error("Aborting due to: %s", e)
                break
            except Exception as e:
                logger.warning("Failed to extract %s: %s", href, e)
                safe_sleep(0.5, 1.0)
                continue

        return {
            "input_url": input_url,
            "resolved_url": full_url,
            "search_url": search_url,
            "radius_km": radius_km,
            "coordinates": {"lat": lat, "lng": lng},
            "zoom_level": zoom,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "total_businesses": len(results),
            "data": results,
        }

    finally:
        # Always release the browser, even on errors; quitting is best effort.
        try:
            driver.quit()
        except Exception:
            pass

# -------------------------
# Save helper & CLI
# -------------------------
def save_json(data: Dict, fname="results1.json"):
    """Write *data* to *fname* as indented UTF-8 JSON and log the destination.

    Non-ASCII characters are written as-is rather than as \\uXXXX escapes.
    """
    with open(fname, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=2)
    logger.info("Saved results to %s", fname)

def main():
    """Interactive entry point: prompt for inputs, scrape, save and print the result.

    Prompts for the Maps URL, radius, optional keyword and headless flag, then
    runs scrape_area(), writes the output to results.json and echoes it to
    stdout. Invalid input is reported with a short message instead of a
    traceback; any other failure is logged with its traceback.
    """
    try:
        url = input("Google Maps place/search URL (short or full): ").strip()
        if not url:
            print("URL required")
            return

        radius_text = input("Radius in km (default 5): ").strip() or "5"
        # Validate here so a typo is reported clearly before a browser is launched.
        try:
            radius_km = int(radius_text)
        except ValueError:
            print(f"Invalid radius {radius_text!r}: enter a whole number of kilometres")
            return
        if radius_km <= 0:
            print("Radius must be a positive number of kilometres")
            return

        keyword = input("Optional search keyword (e.g., hotels, restaurants). Leave blank for 'businesses': ").strip() or None
        headless = input("Headless? (y/N): ").strip().lower() == "y"

        out = scrape_area(url, radius_km=radius_km, keyword=keyword, headless=headless)
        save_json(out, "results.json")
        print(json.dumps(out, indent=2, ensure_ascii=False))
    except KeyboardInterrupt:
        logger.info("Cancelled by user")
    except Exception as e:
        # Top-level boundary: record the full traceback rather than crash the session.
        logger.exception("Fatal: %s", e)

# Start the interactive CLI when this module is the entry point (run as a
# script or executed as a notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()


2025-11-22 13:59:27,886 - INFO - Attempting to expand short URL via HTTP requests: https://maps.app.goo.gl/eEwC6bKmcS8UM2vp8
2025-11-22 13:59:31,012 - INFO - Expanded (requests) -> https://www.google.com/maps/place/Oriole+Guest+House/@33.6615155,72.9399799,12z/data=!4m11!3m10!1s0x38dfbfd336314c8d:0xd74d393578fa954b!5m3!1s2025-11-22!4m1!1i2!8m2!3d33.6856644!4d73.0444932!15sCgVIb3RlbJIBC2d1ZXN0X2hvdXNlqgE2EAEqCSIFaG90ZWwoADIcEAEiGLv8DWy0GmcrQbW6pt4yK-nv9R63SKOHTzIJEAIiBWhvdGVs4AEA!16s%2Fg%2F11txhdqtrw?authuser=0&entry=tts&g_ep=EgoyMDI1MTExNy4wIPu8ASoASAFQAw%3D%3D&skid=276a88d4-6fe4-42b4-82e3-8ecce29e92b9
2025-11-22 13:59:32,049 - INFO - Get LATEST chromedriver version for google-chrome
2025-11-22 13:59:32,466 - INFO - Get LATEST chromedriver version for google-chrome
2025-11-22 13:59:32,866 - INFO - Driver [C:\Users\ABC\.wdm\drivers\chromedriver\win64\142.0.7444.175\chromedriver-win32/chromedriver.exe] found in cache
2025-11-22 13:59:34,822 - INFO - Chrome driver initialized
2025-11-22 1

{
  "input_url": "https://maps.app.goo.gl/eEwC6bKmcS8UM2vp8",
  "resolved_url": "https://www.google.com/maps/place/Oriole+Guest+House/@33.6615155,72.9399799,12z/data=!4m11!3m10!1s0x38dfbfd336314c8d:0xd74d393578fa954b!5m3!1s2025-11-22!4m1!1i2!8m2!3d33.6856644!4d73.0444932!15sCgVIb3RlbJIBC2d1ZXN0X2hvdXNlqgE2EAEqCSIFaG90ZWwoADIcEAEiGLv8DWy0GmcrQbW6pt4yK-nv9R63SKOHTzIJEAIiBWhvdGVs4AEA!16s%2Fg%2F11txhdqtrw?authuser=0&entry=tts&g_ep=EgoyMDI1MTExNy4wIPu8ASoASAFQAw%3D%3D&skid=276a88d4-6fe4-42b4-82e3-8ecce29e92b9",
  "search_url": "https://www.google.com/maps/search/hotel/@33.6615155,72.9399799,13z",
  "radius_km": 5,
  "coordinates": {
    "lat": 33.6615155,
    "lng": 72.9399799
  },
  "zoom_level": 13,
  "timestamp": "2025-11-22T09:10:30.966719+00:00",
  "total_businesses": 105,
  "data": [
    {
      "business_name": "Oriole Guest House",
      "address": "9 Street 55, G-9/4 G 9/4 G-9, Islamabad, 44090",
      "category": "Select your dates to see the best prices",
      "rating": "N/A",
 