In [1]:
# -*- coding: utf-8 -*-
import re, time, random
from pathlib import Path

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException

# First results page to load; main() reaches later pages by clicking "next page".
START_URL   = "https://esf.fang.com/house-a012-b01182/i31/"
# Upper bound on the number of result pages main() will scrape.
MAX_PAGE    = 20
# Output CSV file written by main() once scraping finishes.
CSV_FILE    = "hlg_fang_esf.csv"
# Path to the chromedriver executable; if empty, init_driver() builds Service()
# without an explicit path.
DRIVER_PATH = r"E:\quant\B_homework\HW3\driver\chromedriver.exe"

def init_driver():
    """Create and return a configured Chrome WebDriver.

    The browser is started with the automation-related switches disabled and
    a script is registered (via CDP) that makes ``navigator.webdriver`` read
    as ``undefined`` on every new document. The page-load timeout is set to
    30 seconds.

    Returns:
        The started ``webdriver.Chrome`` instance.
    """
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_argument("--disable-blink-features=AutomationControlled")
    experimental_settings = (
        ("excludeSwitches", ["enable-automation"]),
        ('useAutomationExtension', False),
    )
    for setting_name, setting_value in experimental_settings:
        chrome_opts.add_experimental_option(setting_name, setting_value)
    # To run without a visible window, add the "--headless=new" argument here.

    # Use the configured chromedriver path when one is set; otherwise fall
    # back to a Service created without an explicit path.
    if DRIVER_PATH:
        chrome_service = Service(executable_path=DRIVER_PATH)
    else:
        chrome_service = Service()

    browser = webdriver.Chrome(service=chrome_service, options=chrome_opts)

    # Injected before any page script runs on each new document.
    hide_webdriver_js = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    browser.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": hide_webdriver_js},
    )
    browser.set_page_load_timeout(30)
    return browser

def _first_number(pattern, text):
    """Return the first capture group of *pattern* in *text* as a float.

    Commas inside the captured text (thousands separators) are removed before
    conversion. Returns None when the pattern does not match.

    Raises:
        ValueError: if the captured text is not a valid float.
    """
    match = re.search(pattern, text)
    if match is None:
        return None
    return float(match.group(1).replace(',', ''))


def _read_price(price_dd, xpath, pattern):
    """Read one numeric price field under *price_dd*; None if it is unavailable."""
    if price_dd is None:
        return None
    try:
        return _first_number(pattern, price_dd.find_element(By.XPATH, xpath).text)
    except Exception:
        # Best effort: a missing or unparsable price must not drop the listing.
        return None


def parse_one_page(driver):
    """Extract all listings from the results page currently loaded in *driver*.

    Waits up to 15 seconds for the ``shop_list`` container, then treats every
    ``dd`` element that has an ``h4`` child as one listing card.

    Args:
        driver: Selenium WebDriver positioned on a listing page.

    Returns:
        A list of dicts, one per successfully parsed card, with the keys
        "名称", "面积(㎡)", "小区", "总价(万)" and "单价(元/㎡)". The area and
        the two price fields are None when they cannot be parsed. Cards whose
        title, info line or community cannot be read are skipped with a
        warning printed to stdout.

    Raises:
        TimeoutException: if the listing container does not appear in time.
    """
    wait = WebDriverWait(driver, 15)
    container = wait.until(EC.presence_of_element_located(
        (By.XPATH, '//div[contains(@class,"shop_list")]')))
    cards = container.find_elements(By.XPATH, './/dd[h4]')  # only dd elements that carry a title
    records = []

    for card in cards:
        try:
            # Title
            title = card.find_element(By.XPATH, './/span[@class="tit_shop"]').text.strip()

            # Floor area
            info_txt = card.find_element(By.XPATH, './/p[@class="tel_shop"]').text
            area = _first_number(r'([\d.]+)\s*㎡', info_txt)

            # Community (housing estate) name
            community = card.find_element(By.XPATH, './/p[@class="add_shop"]/a').text.strip()

            # Prices live in a sibling node, dd.price_right. Each field is read
            # independently so a missing unit price does not discard the total.
            try:
                price_dd = card.find_element(
                    By.XPATH, './following-sibling::dd[@class="price_right"]')
            except Exception:
                price_dd = None
            total_price = _read_price(price_dd, './/span[@class="red"]', r'([\d.]+)')
            unit_price = _read_price(price_dd, './/span[2]', r'([\d,]+)')

            records.append({
                "名称": title,
                "面积(㎡)": area,
                "小区": community,
                "总价(万)": total_price,
                "单价(元/㎡)": unit_price
            })
        except Exception as e:
            print("[WARN] 丢弃一行：", e)

    return records


def go_next_page(driver, wait):
    """Click the "下一页" (next page) link and wait for the next listing to load.

    Args:
        driver: Selenium WebDriver positioned on a listing page.
        wait: ``WebDriverWait`` bound to *driver*, used for every wait here.

    Raises:
        TimeoutException: if the next-page link or the listing ``dd`` elements
            are not present within the wait's timeout.
    """
    # find_elements returns an empty list when nothing matches, so a missing
    # card needs no exception handling.
    current_cards = driver.find_elements(
        By.XPATH,
        '//div[contains(@class,"shop_list") and contains(@class,"shop_list_4")]//dd[1]')
    first_card = current_cards[0] if current_cards else None

    next_btn = wait.until(EC.presence_of_element_located(
        (By.XPATH, '//a[contains(text(),"下一页")]')
    ))

    # Scroll the button into view.
    driver.execute_script("arguments[0].scrollIntoView(true);", next_btn)
    time.sleep(0.5)

    # Click through JavaScript instead of a native click.
    driver.execute_script("arguments[0].click();", next_btn)

    # Wait for the old first card to go stale as a signal that the list was
    # replaced. This is best effort: on timeout we still fall through to the
    # presence check below.
    if first_card is not None:
        try:
            wait.until(EC.staleness_of(first_card))
        except TimeoutException:
            pass

    wait.until(EC.presence_of_element_located(
        (By.XPATH, '//div[contains(@class,"shop_list") and contains(@class,"shop_list_4")]//dd')))
    # Randomised pause between page turns.
    time.sleep(random.uniform(1.0, 1.8))

def main():
    """Scrape up to MAX_PAGE result pages and write the records to CSV_FILE.

    Opens START_URL, parses each page with parse_one_page() and advances with
    go_next_page(). Scraping stops early on a timeout or any other exception;
    records gathered so far are still written. The browser is always closed
    before the CSV is written.
    """
    import pandas as pd

    driver = init_driver()
    waiter = WebDriverWait(driver, 15)
    collected = []

    try:
        driver.get(START_URL)
        page = 1
        while page <= MAX_PAGE:
            print(f"正在采集第 {page:02d} 页 …")
            try:
                collected += parse_one_page(driver)
                # The last page has no following page to navigate to.
                if page < MAX_PAGE:
                    go_next_page(driver, waiter)
            except TimeoutException:
                print("[ERROR] 加载下一页超时，提前结束")
                break
            except Exception as err:
                print("[ERROR] 未知异常：", err)
                break
            page += 1
    finally:
        driver.quit()

    frame = pd.DataFrame(collected)
    frame.to_csv(CSV_FILE, index=False, encoding="utf-8-sig")
    output_path = Path(CSV_FILE).absolute()
    print("全部完成，共采集 {} 条，已写入 {}".format(len(collected), output_path))

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


正在采集第 01 页 …
正在采集第 02 页 …
正在采集第 03 页 …
正在采集第 04 页 …
正在采集第 05 页 …
正在采集第 06 页 …
正在采集第 07 页 …
正在采集第 08 页 …
正在采集第 09 页 …
正在采集第 10 页 …
正在采集第 11 页 …
正在采集第 12 页 …
正在采集第 13 页 …
正在采集第 14 页 …
正在采集第 15 页 …
正在采集第 16 页 …
正在采集第 17 页 …
正在采集第 18 页 …
正在采集第 19 页 …
正在采集第 20 页 …
全部完成，共采集 1200 条，已写入 E:\quant\B_homework\HW3\hlg_fang_esf.csv
