In [18]:
# -*- coding: utf-8 -*-
import re, time, csv, random
from pathlib import Path

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException

# -----------------  Parameters  -----------------
# First listing page opened by main(); later pages are reached by clicking
# the "next page" link rather than by building URLs.
START_URL   = "https://zu.fang.com/house-a012-b01182/"
# Upper bound on the number of listing pages main() will scrape.
MAX_PAGE    = 20
# Output CSV path; a relative path resolves against the current working
# directory.
CSV_FILE    = "hlg_fang_rent.csv"
# Location of the chromedriver executable. A falsy value (e.g. "") makes
# init_driver() build Service() without an explicit path.
DRIVER_PATH = r"E:\quant\B_homework\HW3\driver\chromedriver.exe"
# -------------------------------------------

# --------------  浏览器初始化  --------------
def init_driver():
    """Create a Chrome WebDriver set up to look less like an automated browser.

    The Blink ``AutomationControlled`` feature is disabled, the
    ``enable-automation`` switch is excluded, the automation extension is
    turned off, and a script is registered (via CDP) that makes
    ``navigator.webdriver`` read as ``undefined`` on every new document.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_argument("--disable-blink-features=AutomationControlled")
    chrome_opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_opts.add_experimental_option('useAutomationExtension', False)
    # To run without a visible window, also add:
    #     chrome_opts.add_argument("--headless")

    # Use the explicit chromedriver path only when one is configured.
    if DRIVER_PATH:
        chrome_service = Service(executable_path=DRIVER_PATH)
    else:
        chrome_service = Service()

    browser = webdriver.Chrome(service=chrome_service, options=chrome_opts)

    # Injected before any page script runs, so the page never sees the flag.
    hide_webdriver_js = (
        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
    )
    browser.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": hide_webdriver_js},
    )
    return browser

# --------------  单页解析（XPath 版） --------------
def parse_one_page(driver):
    """Extract title, area and monthly rent from each listing on the current page.

    Waits up to 15 seconds for the listing container inside ``#listBox`` to
    be present, then reads every ``<dd>`` element under ``#listBox``. Rows
    that lack an expected element or whose numbers cannot be parsed are
    skipped with a ``[WARN]`` message; any other error propagates to the
    caller instead of being swallowed.

    Args:
        driver: Selenium WebDriver currently showing a listing page.

    Returns:
        A list of dicts with the keys ``"名称"`` (str), ``"面积(㎡)"``
        (float, or None when the info text has no "㎡" figure) and
        ``"月租金(元)"`` (int).

    Raises:
        TimeoutException: if the listing container does not appear in time.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of the file.
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
    )

    # 1. Wait for the listing container to be present in the DOM.
    WebDriverWait(driver, 15).until(EC.presence_of_element_located(
        (By.XPATH, '//*[@id="listBox"]//div[contains(@class,"houseList")]')))

    # Compile once instead of re-resolving the patterns for every row.
    price_re = re.compile(r'(\d+)')
    area_re = re.compile(r'([\d.]+)\s*㎡')

    # 2. Each listing's details live in a <dd> under #listBox.
    data = []
    for row in driver.find_elements(By.XPATH, '//*[@id="listBox"]//dd'):
        try:
            # Title: link text inside the "title" paragraph.
            title = row.find_element(
                By.XPATH, './/p[contains(@class,"title")]/a').text.strip()

            # Monthly rent: first run of digits in the "price" span
            # (thousands separators removed first).
            price_txt = row.find_element(
                By.XPATH, './/span[contains(@class,"price")]').text
            price_match = price_re.search(price_txt.replace(',', ''))
            if price_match is None:
                # Explicit error instead of an opaque AttributeError on None.
                raise ValueError(f"无法解析租金: {price_txt!r}")
            price = int(price_match.group(1))

            # Area: the number directly preceding "㎡" in the info paragraph.
            info_txt = row.find_element(
                By.XPATH,
                './/p[contains(@class,"font15") or contains(@class,"room")]').text
            area_match = area_re.search(info_txt)
            # Tolerate a missing area: keep the row with area = None.
            area = float(area_match.group(1)) if area_match else None
        except (NoSuchElementException, StaleElementReferenceException,
                ValueError) as e:
            # Only per-row problems are dropped; driver-level failures propagate.
            print("[WARN] 丢弃一行：", e)
            continue

        data.append({"名称": title, "面积(㎡)": area, "月租金(元)": price})
    return data
# --------------  主流程  --------------
def main():
    """Scrape up to MAX_PAGE listing pages and write the records to CSV_FILE.

    Opens START_URL, parses each page with ``parse_one_page`` and moves on by
    clicking the "下一页" (next page) link, pausing a random 1.2-2.4 s after
    each click. Scraping stops early when the next-page link is missing, a
    page times out, or any other error occurs; whatever was collected up to
    that point is still written to the CSV. The browser is always closed,
    even if an exception or KeyboardInterrupt escapes the loop.
    """
    # Import before scraping so a missing pandas fails fast instead of
    # discarding all collected data at the very end.
    import pandas as pd

    all_records = []
    driver = init_driver()
    try:
        driver.get(START_URL)

        for page in range(1, MAX_PAGE + 1):
            print(f"正在采集第 {page:02d} 页 …")
            try:
                all_records += parse_one_page(driver)
                if page == MAX_PAGE:
                    break

                # find_elements returns an empty list instead of raising, so
                # a normal last page is not reported as an unknown exception.
                next_links = driver.find_elements(
                    By.XPATH, '//a[contains(text(),"下一页")]')
                if not next_links:
                    print("[INFO] 未找到“下一页”，已到最后一页，提前结束")
                    break

                next_links[0].click()
                # Randomised pause between page loads.
                time.sleep(random.uniform(1.2, 2.4))
            except TimeoutException:
                print("[ERROR] 加载下一页超时，提前结束")
                break
            except Exception as e:
                # Top-level boundary: report and keep what has been collected.
                print("[ERROR] 未知异常：", e)
                break
    finally:
        # Always release the Chrome process, whatever happened above.
        driver.quit()

    # Save as CSV; utf-8-sig lets Excel detect the encoding of the
    # Chinese headers.
    pd.DataFrame(all_records).to_csv(CSV_FILE, index=False, encoding="utf-8-sig")
    print(f"全部完成，共采集 {len(all_records)} 条，已写入 {Path(CSV_FILE).absolute()}")

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

正在采集第 01 页 …
正在采集第 02 页 …
正在采集第 03 页 …
正在采集第 04 页 …
正在采集第 05 页 …
正在采集第 06 页 …
正在采集第 07 页 …
正在采集第 08 页 …
正在采集第 09 页 …
正在采集第 10 页 …
正在采集第 11 页 …
正在采集第 12 页 …
正在采集第 13 页 …
正在采集第 14 页 …
正在采集第 15 页 …
正在采集第 16 页 …
正在采集第 17 页 …
正在采集第 18 页 …
正在采集第 19 页 …
正在采集第 20 页 …
全部完成，共采集 1200 条，已写入 E:\quant\B_homework\HW3\hlg_fang_rent.csv
