In [None]:
%pip install html5lib
%pip install webdriver-manager
%pip install selenium
%pip install selenium webdriver-manager pandas matplotlib -q

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
import pandas as pd
import matplotlib.pyplot as plt

print("✅ Environment ready")

In [4]:
# Start Chrome browser via webdriver-manager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# webdriver-manager resolves a chromedriver binary via install(); the resulting
# path is wrapped in a Service and handed to the Chrome driver.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
)
driver.maximize_window()
print("✅ Chrome started")

✅ Chrome started


In [6]:

# Start URL: the esf.fang.com listing page that this cell opens in the browser
# and on which it waits for the listing container to appear.
START_URL: str = "https://esf.fang.com/house-a010-b05048/"

# Best-effort shutdown helper for a driver that may already be unusable
def safe_quit(drv):
    """Call ``drv.quit()`` and ignore any ``Exception`` it raises.

    Args:
        drv: Object expected to expose a ``quit()`` method (a Selenium driver).
            Anything else simply triggers an exception that is swallowed too.

    Returns:
        None, whether or not the quit call succeeded.
    """
    try:
        drv.quit()
    except Exception:
        # Deliberately silent: failing to quit must never abort the cell.
        return None
    return None

# Shut down any driver left over from an earlier cell (or an earlier run of this
# cell) before starting a new one. Previously only a *broken* driver was quit; a
# driver that was still alive was simply overwritten below, which leaked one
# Chrome/chromedriver process per re-run.
try:
    safe_quit(driver)  # safe_quit swallows errors raised by an already-dead session
except NameError:
    pass  # `driver` has not been defined in this kernel yet

# Start a new Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()

print(f"🌐 Opening {START_URL}")
driver.get(START_URL)

# Wait (up to 12 s) for the listing container to ensure the page is ready.
# WebDriverWait.until raises if the element never appears within the timeout.
WebDriverWait(driver, 12).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.shop_list.shop_list_4, #listBox"))
)
print("✅ Page loaded and listing container detected.")

🌐 Opening https://esf.fang.com/house-a010-b05048/
✅ Page loaded and listing container detected.


In [16]:
# Crawl first 20 pages (from page 1) for Tongzhou · Majuqiao on Fang.com
# Fields:
# - area_sqm: strictly from <p class="tel_shop"> ... ㎡ ... -> e.g., "94.09"
# - unit_price_yuan_per_sqm: from <dd class="price_right"> <span> ... 元/㎡ </span> -> e.g., "31402"

import re, time, pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# ---------- Config ----------
# Everything a reader might want to tune for the sale-listing crawl lives here.
BASE: str = "https://esf.fang.com"  # site root; used by get_next_href when resolving links
START_URL: str = "https://esf.fang.com/house-a010-b05048/"  # page 1 (Majuqiao)
MAX_PAGES: int = 20  # crawl stops once this many pages have been scraped
SLEEP_SEC: float = 0.4  # pause (seconds) after each page navigation
CSV_PATH: str = "majiaoqiao_first20pages_area_unit.csv"  # output file for the scraped rows

# ---------- Regex helpers ----------
# A number immediately followed (optionally after whitespace) by a square-metre
# unit written as "㎡", "m²" or "m2". Group 1 is the number.
# The former character class `[㎡m²]` accepted any ONE of those characters, so a
# bare "m" (e.g. a distance such as "500m") or a lone "²" counted as an area and
# could be captured instead of the real "...㎡" value later in the same text.
AREA_PATTERN = re.compile(r"(\d+(?:\.\d+)?)\s*(?:㎡|m²|m2)")
# First integer or decimal number anywhere in a string. Group 1 is the number.
NUM_PATTERN  = re.compile(r"(\d+(?:\.\d+)?)")

def num_first(text: str) -> str:
    """Return the first integer/decimal number found in ``text`` as a string.

    ASCII commas are removed before searching, so "31,402" yields "31402".

    Args:
        text: Text to search. ``None`` and the empty string are accepted.

    Returns:
        The matched number as a string, or '' when ``text`` is falsy or
        contains no digits.
    """
    if not text:
        return ""
    m = NUM_PATTERN.search(text.replace(",", ""))
    return m.group(1) if m else ""

# ---------- Driver helpers ----------
def ensure_driver():
    """Return a working Chrome driver, rebuilding the global one when needed.

    The module-level ``driver`` is probed by reading ``current_url``. If that
    raises any ``Exception`` (which includes ``driver`` not being defined), a
    best-effort ``quit()`` is attempted, a new maximized Chrome instance is
    created via webdriver-manager and stored back into the global ``driver``.

    Returns:
        The (possibly newly created) global ``driver``.
    """
    global driver
    try:
        driver.current_url  # probe only; the value itself is not used
    except Exception:
        # Existing session is unusable (or absent): clean up what we can, rebuild.
        try:
            driver.quit()
        except Exception:
            pass
        chrome_service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=chrome_service)
        driver.maximize_window()
    return driver

def wait_list_loaded(drv, timeout=12):
    """Block until the listing container is present in the page DOM.

    Args:
        drv: Selenium driver to poll.
        timeout: Value passed to ``WebDriverWait`` as its timeout (default 12).

    Returns:
        None. Whatever ``WebDriverWait.until`` raises when the element does
        not appear in time propagates to the caller.
    """
    # Either of the two selectors counts as "the list is there".
    container_locator = (By.CSS_SELECTOR, "div.shop_list.shop_list_4, #listBox")
    container_present = EC.presence_of_element_located(container_locator)
    WebDriverWait(drv, timeout).until(container_present)

# ---------- Scraping helpers ----------
def area_from_tel_shop(card_el) -> str:
    """Read the listing area out of the card's ``<p class="tel_shop">`` text.

    Args:
        card_el: Element for one listing card; must support ``find_element``.

    Returns:
        The number captured by ``AREA_PATTERN`` as a string, or '' when the
        paragraph is missing, no area is found, or any lookup step raises.
    """
    try:
        raw_text = card_el.find_element(By.CSS_SELECTOR, "p.tel_shop").text
        # Swap non-breaking spaces for plain ones, then collapse whitespace runs.
        cleaned = " ".join(raw_text.replace("\xa0", " ").split())
        hit = AREA_PATTERN.search(cleaned)
    except Exception:
        return ""
    if hit is None:
        return ""
    return hit.group(1)

def scrape_current_page_tel_area(drv):
    """Collect area and unit price for every listing card on the current page.

    Args:
        drv: Selenium driver already positioned on a listing page.

    Returns:
        A list of dicts with keys ``area_sqm`` and ``unit_price_yuan_per_sqm``
        (both strings, possibly ''). Cards for which both values are empty are
        skipped. An empty list is returned when no listing container is found.
    """
    # The first selector that matches anything supplies the container.
    container = next(
        (
            found[0]
            for found in (
                drv.find_elements(By.CSS_SELECTOR, css)
                for css in ("div.shop_list.shop_list_4", "#listBox")
            )
            if found
        ),
        None,
    )
    if container is None:
        return []

    # Prefer <dl class="clearfix"> cards; fall back to any <dl> if none exist.
    cards = (
        container.find_elements(By.CSS_SELECTOR, "dl.clearfix")
        or container.find_elements(By.CSS_SELECTOR, "dl")
    )

    # Tried in order: the price_right span first, then any node mentioning the unit.
    price_xpaths = (
        ".//dd[contains(@class,'price_right')]//span[contains(text(),'元/㎡')]",
        ".//*[contains(text(),'元/㎡')]",
    )

    rows = []
    for card in cards:
        area = area_from_tel_shop(card)

        price = ""
        for xpath in price_xpaths:
            try:
                price = num_first(card.find_element(By.XPATH, xpath).text.strip())
            except Exception:
                continue  # this locator failed; try the next one
            break  # a locator resolved; keep its result even if it is ''

        if not (area or price):
            continue
        rows.append({"area_sqm": area, "unit_price_yuan_per_sqm": price})
    return rows

def get_next_href(drv) -> str:
    """Return the absolute URL of the '下一页' (next page) link, or '' if none.

    The anchor's ``href`` is resolved with ``urllib.parse.urljoin`` against the
    page currently loaded in ``drv`` (falling back to ``BASE``). Compared with
    plain ``BASE + href`` this also handles protocol-relative hrefs
    (``//host/path``, which used to be glued onto ``BASE``) and page-relative
    hrefs (``i32/``, which used to be returned unresolved). Absolute hrefs and
    root-relative hrefs on the ``BASE`` host resolve exactly as before.

    Args:
        drv: Selenium driver positioned on a listing page.

    Returns:
        The resolved URL, or '' when the link is missing, has an empty href,
        or any lookup step raises.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    try:
        a = drv.find_element(By.XPATH, "//a[normalize-space(text())='下一页']")
        href = (a.get_attribute("href") or "").strip()
        if not href:
            return ""
        return urljoin(drv.current_url or BASE, href)
    except Exception:
        # Best effort: a missing link simply means "no next page".
        return ""

def crawl_all_pages(drv, start_url: str, max_pages: int = 20, sleep_sec: float = 0.4):
    """Follow '下一页' links from ``start_url`` and scrape every visited page.

    Crawling stops when ``max_pages`` pages have been scraped, when no next
    link is found, when the browser lands on a URL that was already visited,
    or when a follow-up page fails to show the listing container in time.

    Args:
        drv: Selenium driver used for navigation.
        start_url: URL of the first listing page.
        max_pages: Maximum number of pages to scrape.
        sleep_sec: Pause (seconds) after each navigation to a next page.

    Returns:
        A list with the rows of all scraped pages, in visiting order.

    Raises:
        Whatever ``wait_list_loaded`` raises if the FIRST page never shows the
        listing container. Timeouts on later pages end the crawl instead, so
        rows collected so far are not lost.
    """
    all_rows, seen = [], set()
    drv.get(start_url)
    wait_list_loaded(drv, 12)

    page_idx = 1
    while True:
        cur = drv.current_url
        if cur in seen:
            print("⚠️ Repeat URL, stop:", cur)
            break
        seen.add(cur)

        rows = scrape_current_page_tel_area(drv)
        print(f"Page {page_idx}: {len(rows)} rows | {cur}")
        all_rows.extend(rows)

        if page_idx >= max_pages:
            print("Reached max_pages, stop.")
            break

        nxt = get_next_href(drv)
        if not nxt or nxt == cur:
            print("No further '下一页', stop.")
            break

        try:
            drv.get(nxt)
            time.sleep(sleep_sec)
            # Previously only page 1 was awaited; wait for every page so the
            # scrape never runs against a list that has not rendered yet.
            wait_list_loaded(drv, 12)
        except TimeoutException:
            print(f"⚠️ Page {page_idx + 1} did not load in time, stop: {nxt}")
            break
        page_idx += 1

    return all_rows

# ---------- RUN ----------
# Reuse (or rebuild) the browser, crawl the configured pages, persist to CSV.
driver = ensure_driver()
records_20 = crawl_all_pages(
    driver,
    START_URL,
    max_pages=MAX_PAGES,
    sleep_sec=SLEEP_SEC,
)
print(f"TOTAL rows (first {MAX_PAGES} pages): {len(records_20)}")

df = pd.DataFrame(records_20)
# utf-8-sig writes a BOM at the start of the file.
df.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
print("💾 Saved:", CSV_PATH)

# quick preview: first and last 8 rows, each under its own label
print("Head:", df.head(8), "Tail:", df.tail(8), sep="\n")

Page 1: 60 rows | https://esf.fang.com/house-a010-b05048/
Page 2: 60 rows | https://esf.fang.com/house-a010-b05048/i32/
Page 3: 60 rows | https://esf.fang.com/house-a010-b05048/i33/
Page 4: 60 rows | https://esf.fang.com/house-a010-b05048/i34/
Page 5: 60 rows | https://esf.fang.com/house-a010-b05048/i35/
Page 6: 60 rows | https://esf.fang.com/house-a010-b05048/i36/
Page 7: 60 rows | https://esf.fang.com/house-a010-b05048/i37/
Page 8: 60 rows | https://esf.fang.com/house-a010-b05048/i38/
Page 9: 60 rows | https://esf.fang.com/house-a010-b05048/i39/
Page 10: 60 rows | https://esf.fang.com/house-a010-b05048/i310/
Page 11: 60 rows | https://esf.fang.com/house-a010-b05048/i311/
Page 12: 60 rows | https://esf.fang.com/house-a010-b05048/i312/
Page 13: 60 rows | https://esf.fang.com/house-a010-b05048/i313/
Page 14: 60 rows | https://esf.fang.com/house-a010-b05048/i314/
Page 15: 60 rows | https://esf.fang.com/house-a010-b05048/i315/
Page 16: 60 rows | https://esf.fang.com/house-a010-b05048/i316

In [21]:
# Crawl ALL rental pages for Tongzhou · Majuqiao on zu.fang.com
# Fields:
# - area_sqm: from <p class="font15 mt12 bold"> ... ㎡ ... </p> -> e.g., "52"
# - rent_yuan_per_month: from <div class="moreInfo"><span class="price">3000</span>元/月</div> -> "3000"
# Pagination:
# - Click/visit the anchor whose text is exactly "下一页" until it disappears.

import re, time, pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Rental-crawl configuration (these names replace the sale-crawl values above).
START_URL = "https://zu.fang.com/house-a010-b05048/"  # first rental listing page
BASE      = "https://zu.fang.com"  # site root; used by get_next_href when resolving links
CSV_PATH  = "majiaoqiao_rent_all_pages_area_rent.csv"  # output file for the scraped rows

# A number immediately followed (optionally after whitespace) by a square-metre
# unit written as "㎡", "m²" or "m2". Group 1 is the number.
# The former character class `[㎡m²]` accepted any ONE of those characters, so a
# bare "m" (e.g. a distance such as "800m") or a lone "²" counted as an area.
AREA_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(?:㎡|m²|m2)")
# First integer or decimal number anywhere in a string. Group 1 is the number.
NUM_RE  = re.compile(r"(\d+(?:\.\d+)?)")

def ensure_driver():
    """Return a working Chrome driver, rebuilding the global one when needed.

    The module-level ``driver`` is probed by reading ``current_url``. If that
    raises any ``Exception`` (which includes ``driver`` not being defined), a
    best-effort ``quit()`` is attempted, a new maximized Chrome instance is
    created via webdriver-manager and stored back into the global ``driver``.

    Returns:
        The (possibly newly created) global ``driver``.
    """
    global driver
    try:
        driver.current_url  # probe only; the value itself is not used
    except Exception:
        # Existing session is unusable (or absent): clean up what we can, rebuild.
        try:
            driver.quit()
        except Exception:
            pass
        chrome_service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=chrome_service)
        driver.maximize_window()
    return driver

def wait_list_loaded(drv, timeout=12):
    """Block until the listing container is present in the page DOM.

    Args:
        drv: Selenium driver to poll.
        timeout: Value passed to ``WebDriverWait`` as its timeout (default 12).

    Returns:
        None. Whatever ``WebDriverWait.until`` raises when the element does
        not appear in time propagates to the caller.
    """
    # Either of the two selectors counts as "the list is there".
    container_locator = (By.CSS_SELECTOR, "div.shop_list.shop_list_4, #listBox")
    container_present = EC.presence_of_element_located(container_locator)
    WebDriverWait(drv, timeout).until(container_present)

def area_from_rent_header(card_el) -> str:
    """Read the listing area out of the card's ``<p class="font15 mt12 bold">``.

    Args:
        card_el: Element for one rental card; must support ``find_element``.

    Returns:
        The number captured by ``AREA_RE`` as a string, or '' when the
        paragraph is missing, no area is found, or any lookup step raises.
    """
    try:
        header = card_el.find_element(By.CSS_SELECTOR, "p.font15.mt12.bold")
        # Swap non-breaking spaces for plain ones, then collapse whitespace runs.
        cleaned = " ".join(header.text.replace("\xa0", " ").split())
        hit = AREA_RE.search(cleaned)
    except Exception:
        return ""
    if hit is None:
        return ""
    return hit.group(1)

def rent_from_moreinfo(card_el) -> str:
    """Read the monthly rent from a rental card.

    The ``div.moreInfo span.price`` element is tried first. If it is missing,
    raises, or holds no number, any descendant whose text contains '元/月' is
    used instead.

    Args:
        card_el: Element for one rental card; must support ``find_element``.

    Returns:
        The first number found (commas removed) as a string, or '' when
        neither lookup yields one.
    """
    def _number_in(by, locator):
        # First number in the located element's text, or '' if it has none.
        label = card_el.find_element(by, locator).text.strip()
        hit = NUM_RE.search(label.replace(",", ""))
        return hit.group(1) if hit else ""

    # primary: the dedicated price span
    try:
        rent = _number_in(By.CSS_SELECTOR, "div.moreInfo span.price")
        if rent:
            return rent
    except Exception:
        pass

    # fallback: any node within the card that mentions the rent unit
    try:
        return _number_in(By.XPATH, ".//*[contains(text(),'元/月')]")
    except Exception:
        return ""

def scrape_rent_current_page(drv):
    """Collect area and monthly rent for every rental card on the current page.

    Args:
        drv: Selenium driver already positioned on a rental listing page.

    Returns:
        A list of dicts with keys ``area_sqm`` and ``rent_yuan_per_month``
        (both strings, possibly ''). Cards for which both values are empty are
        skipped. An empty list is returned when no listing container is found.
    """
    # The first selector that matches anything supplies the container.
    container = next(
        (
            found[0]
            for found in (
                drv.find_elements(By.CSS_SELECTOR, css)
                for css in ("div.shop_list.shop_list_4", "#listBox")
            )
            if found
        ),
        None,
    )
    if container is None:
        return []

    # Prefer <dl class="clearfix"> cards; fall back to any <dl> if none exist.
    cards = container.find_elements(By.CSS_SELECTOR, "dl.clearfix")
    if not cards:
        cards = container.find_elements(By.CSS_SELECTOR, "dl")

    rows = []
    for card in cards:
        area = area_from_rent_header(card)
        rent = rent_from_moreinfo(card)
        if not (area or rent):
            continue
        rows.append({"area_sqm": area, "rent_yuan_per_month": rent})
    return rows

def get_next_href(drv) -> str:
    """Return the absolute URL of the '下一页' (next page) link, or '' if none.

    The anchor's ``href`` is resolved with ``urllib.parse.urljoin`` against the
    page currently loaded in ``drv`` (falling back to ``BASE``). Compared with
    plain ``BASE + href`` this also handles protocol-relative hrefs
    (``//host/path``, which used to be glued onto ``BASE``) and page-relative
    hrefs (``i32/``, which used to be returned unresolved). Absolute hrefs and
    root-relative hrefs on the ``BASE`` host resolve exactly as before.

    Args:
        drv: Selenium driver positioned on a listing page.

    Returns:
        The resolved URL, or '' when the link is missing, has an empty href,
        or any lookup step raises.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    try:
        a = drv.find_element(By.XPATH, "//a[normalize-space(text())='下一页']")
        href = (a.get_attribute("href") or "").strip()
        if not href:
            return ""
        return urljoin(drv.current_url or BASE, href)
    except Exception:
        # Best effort: a missing link simply means "no next page".
        return ""

def crawl_all_pages(drv, start_url: str, sleep_sec: float = 0.4, max_pages: int = 1000):
    """Follow '下一页' links from ``start_url`` and scrape every rental page.

    Crawling stops when ``max_pages`` pages have been scraped, when no next
    link is found, when the browser lands on a URL that was already visited,
    or when a follow-up page fails to show the listing container in time.

    Args:
        drv: Selenium driver used for navigation.
        start_url: URL of the first listing page.
        sleep_sec: Pause (seconds) after each navigation to a next page.
        max_pages: Maximum number of pages to scrape.

    Returns:
        A list with the rows of all scraped pages, in visiting order.

    Raises:
        Whatever ``wait_list_loaded`` raises if the FIRST page never shows the
        listing container. Timeouts on later pages end the crawl instead, so
        rows collected so far are not lost.
    """
    # Imported here because this cell's import block does not bring it in.
    from selenium.common.exceptions import TimeoutException

    all_rows, seen = [], set()
    drv.get(start_url)
    wait_list_loaded(drv, 12)

    page_idx = 1
    while True:
        cur = drv.current_url
        if cur in seen:
            print("⚠️ Repeat URL, stop:", cur)
            break
        seen.add(cur)

        rows = scrape_rent_current_page(drv)
        print(f"Page {page_idx}: {len(rows)} rows | {cur}")
        all_rows.extend(rows)

        if page_idx >= max_pages:
            print("Reached max_pages, stop.")
            break

        nxt = get_next_href(drv)
        if not nxt or nxt == cur:
            print("No further '下一页', stop.")
            break

        try:
            drv.get(nxt)
            time.sleep(sleep_sec)
            # Previously only page 1 was awaited; wait for every page so the
            # scrape never runs against a list that has not rendered yet.
            wait_list_loaded(drv, 12)
        except TimeoutException:
            print(f"⚠️ Page {page_idx + 1} did not load in time, stop: {nxt}")
            break
        page_idx += 1

    return all_rows

# ---- RUN: paginate all pages ----
# Reuse (or rebuild) the browser, crawl every rental page, persist to CSV.
driver = ensure_driver()
records = crawl_all_pages(
    driver,
    START_URL,
    sleep_sec=0.4,
    max_pages=1000,
)
print(f"TOTAL rows: {len(records)}")

df = pd.DataFrame(records)
# utf-8-sig writes a BOM at the start of the file.
df.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
print("💾 Saved:", CSV_PATH)

# quick preview: first and last 8 rows, each under its own label
print("Head:", df.head(8), "Tail:", df.tail(8), sep="\n")


Page 1: 60 rows | https://zu.fang.com/house-a010-b05048/
Page 2: 60 rows | https://zu.fang.com/house-a010-b05048/i32/
Page 3: 60 rows | https://zu.fang.com/house-a010-b05048/i33/
Page 4: 60 rows | https://zu.fang.com/house-a010-b05048/i34/
Page 5: 60 rows | https://zu.fang.com/house-a010-b05048/i35/
Page 6: 60 rows | https://zu.fang.com/house-a010-b05048/i36/
Page 7: 60 rows | https://zu.fang.com/house-a010-b05048/i37/
Page 8: 60 rows | https://zu.fang.com/house-a010-b05048/i38/
Page 9: 60 rows | https://zu.fang.com/house-a010-b05048/i39/
Page 10: 60 rows | https://zu.fang.com/house-a010-b05048/i310/
Page 11: 60 rows | https://zu.fang.com/house-a010-b05048/i311/
Page 12: 60 rows | https://zu.fang.com/house-a010-b05048/i312/
Page 13: 29 rows | https://zu.fang.com/house-a010-b05048/i313/
No further '下一页', stop.
TOTAL rows: 749
💾 Saved: majiaoqiao_rent_all_pages_area_rent.csv
Head:
  area_sqm rent_yuan_per_month
0       52                3000
1       60                3000
2      118     