##### 本代码通过Selenium自动化爬取房产网站，具有鲁棒性强、容错能力高、易扩展等优点。它采用浏览器指纹伪装和随机延时，有效应对反爬虫措施。页面解析时用多重选择器和正则兜底，保证结构变动时依旧能抓取关键字段。数据清洗和导出前自动去除无效项，确保结果质量。主要参数变量化，便于迁移和复用。整体流程分为参数设定、浏览器初始化、数据解析、分页循环、数据清洗与导出，结构清晰，适合批量和长期数据采集任务。

In [1]:
import re
import time
import random
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# ====== Configuration ======
# Labels written into every scraped row and used to build the output filename.
CITY = "重庆"
DISTRICT = "渝北"
BLOCK = "中央公园"
# Upper bound on the number of listing pages visited in one run.
MAX_PAGES = 20
# First listing page to load.
START_URL = 'https://cq.esf.fang.com/house-a058-b021981/'

# Machine-specific output location; adjust when running on another machine.
OUTPUT_ROOT = Path(r'C:\Users\lenovo\Desktop')
OUTPUT_DIR = OUTPUT_ROOT / "CentralPark_sale_output"
# parents=True: also create OUTPUT_ROOT when it is missing instead of raising
# FileNotFoundError.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
SALES_CSV = OUTPUT_DIR / f"{CITY}_{DISTRICT}_{BLOCK}_sales.csv"

# ====== 浏览器初始化与反爬设置 ======
def init_driver(headless=False):
    """Build a Chrome WebDriver with flags meant to hide automation markers.

    Args:
        headless: When true, Chrome is started with ``--headless=new``.

    Returns:
        A ``webdriver.Chrome`` instance with a 10-second implicit wait set.
    """
    chrome_options = Options()
    for flag in (
        '--disable-blink-features=AutomationControlled',
        '--start-maximized',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    ):
        chrome_options.add_argument(flag)
    for option_name, option_value in (
        ('excludeSwitches', ['enable-automation']),
        ('useAutomationExtension', False),
    ):
        chrome_options.add_experimental_option(option_name, option_value)
    if headless:
        chrome_options.add_argument('--headless=new')
    # webdriver_manager downloads/locates a chromedriver binary for Service.
    browser = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )
    browser.implicitly_wait(10)
    return browser

def human_sleep(a=1.0, b=2.5):
    """Pause for a random duration drawn uniformly between *a* and *b* seconds."""
    delay_seconds = random.uniform(a, b)
    time.sleep(delay_seconds)

# ====== 字段解析工具 ======
def to_float(text):
    """Extract a number from *text*, scaling values written in 万 by 10000.

    Spaces and ASCII commas are stripped first. A number directly followed by
    ``万`` takes precedence; otherwise the first number found is returned.

    Returns:
        The parsed float, or ``np.nan`` when *text* is ``None`` or contains
        no number.
    """
    if text is None:
        return np.nan
    cleaned = re.sub(r'[ ,]', '', str(text))
    wan_match = re.search(r'(\d+(?:\.\d+)?)\s*万', cleaned)
    if wan_match is not None:
        return float(wan_match.group(1)) * 10000
    plain_match = re.search(r'(\d+(?:\.\d+)?)', cleaned)
    if plain_match is None:
        return np.nan
    return float(plain_match.group(1))

def to_area_m2(text):
    """Extract an area value from *text*.

    A number followed by ``㎡`` is preferred; if none is present, the first
    number in the text is used instead.

    Returns:
        The parsed float, or ``np.nan`` when *text* is ``None`` or contains
        no number.
    """
    if text is None:
        return np.nan
    raw = str(text)
    # Try the unit-qualified pattern first, then fall back to any number.
    for pattern in (r'(\d+(?:\.\d+)?)\s*㎡', r'(\d+(?:\.\d+)?)'):
        hit = re.search(pattern, raw)
        if hit:
            return float(hit.group(1))
    return np.nan

def to_unit_price(text):
    """Extract a unit price written as ``<number>元/㎡`` from *text*.

    ASCII commas are removed before matching.

    Returns:
        The parsed float, or ``np.nan`` when *text* is ``None`` or has no
        ``元/㎡`` price.
    """
    if text is None:
        return np.nan
    without_commas = str(text).replace(',', '')
    found = re.search(r'(\d+(?:\.\d+)?)\s*元/㎡', without_commas)
    if found is None:
        return np.nan
    return float(found.group(1))

# ====== 单页解析函数 ======
def parse_sale_page(page_source, location_keywords=('渝中', '大坪', '重庆')):
    """Parse one listing page into a DataFrame of sale records.

    Listing containers are located with several alternative CSS selectors,
    and every ``dl`` / ``li`` / ``div[class*=list]`` inside them is treated as
    a candidate listing. Each field is read from a dedicated element when one
    is found and otherwise recovered from the candidate's full text.

    Args:
        page_source: HTML of the listing page.
        location_keywords: Iterable of strings searched for in the candidate
            text when no address element is found; the first match becomes
            the location. Defaults to the previously hard-coded values. Pass
            an empty iterable to disable the fallback.

    Returns:
        A DataFrame with columns ``title``, ``total_price_yuan``,
        ``unit_price_yuan_per_m2``, ``area_m2``, ``rooms`` and ``location``;
        an empty DataFrame (no columns) when nothing was found.
    """
    soup = BeautifulSoup(page_source, 'lxml')

    keywords = [re.escape(k) for k in (location_keywords or ())]
    location_re = re.compile('(' + '|'.join(keywords) + ')') if keywords else None
    rooms_re = re.compile(r'(\d+)室(\d+)厅')

    items = []
    # The container selectors can match nested elements (e.g. a listBox that
    # wraps a shop_list); without this guard the same candidate element would
    # be parsed once per enclosing container and produce duplicate rows.
    seen = set()
    for block in soup.select('div.shop_list, div.listBox, div#listBox'):
        for c in block.select('dl, li, div[class*=list]'):
            if id(c) in seen:
                continue
            seen.add(id(c))

            txt = c.get_text(' ', strip=True)
            if not txt:
                continue

            # Title
            title = None
            tlink = c.select_one('a[title], a[class*=title], p.title a')
            if tlink and tlink.get_text(strip=True):
                title = tlink.get_text(strip=True)

            # Prices: prefer dedicated elements, fall back to the full text.
            p_total = c.select_one('dd.price span, span.price, b.price, div.price, p.price')
            p_unit = c.select_one('dd.price i, span.unitPrice, i.unitPrice, p.unitPrice')
            total_price = to_float(p_total.get_text(strip=True)) if p_total else to_float(txt)
            unit_price = to_unit_price(p_unit.get_text(strip=True)) if p_unit else to_unit_price(txt)

            # Area: try specific elements first; the candidate itself is the
            # last resort, so area_m2 is always assigned a float (maybe NaN).
            area_m2 = None
            for cand in [c.select_one('dd.area p'), c.select_one('span.area'), c.select_one('p.area'), c]:
                if cand:
                    area_m2 = to_area_m2(cand.get_text(' ', strip=True))
                    if not np.isnan(area_m2):
                        break

            # Layout, e.g. "3室2厅"
            rooms = None
            m = rooms_re.search(txt)
            if m:
                rooms = f"{m.group(1)}室{m.group(2)}厅"

            # Location: address link if present, otherwise keyword fallback.
            location = None
            loc = c.select_one('dd.address p a, p.address a, span.address a')
            if loc:
                location = loc.get_text(strip=True)
            elif location_re is not None:
                m2 = location_re.search(txt)
                location = m2.group(1) if m2 else None

            items.append({
                'title': title,
                'total_price_yuan': total_price,
                'unit_price_yuan_per_m2': unit_price,
                'area_m2': area_m2,
                'rooms': rooms,
                'location': location
            })
    return pd.DataFrame(items)

# ====== 翻页函数 ======
def go_next_page(driver, timeout=10):
    """Click the "next page" link and wait for the page to change.

    Several locators are tried in order; the first element found is clicked
    through JavaScript. After the click the function waits, best-effort, for
    the clicked element to become stale so the caller does not read the
    previous page's source again.

    Args:
        driver: Selenium WebDriver positioned on a listing page.
        timeout: Maximum seconds to wait for the old "next" element to go
            stale after the click.

    Returns:
        True if a "next" element was found and clicked, False otherwise.
    """
    locators = (
        (By.LINK_TEXT, '下一页'),
        (By.PARTIAL_LINK_TEXT, '下一'),
        (By.CSS_SELECTOR, 'a.next, a[rel=next]'),
    )
    for locator in locators:
        try:
            el = driver.find_element(*locator)
            driver.execute_script('arguments[0].click();', el)
        except (NoSuchElementException, WebDriverException):
            # Locator missing or click failed: try the next locator.
            continue
        try:
            WebDriverWait(driver, timeout).until(EC.staleness_of(el))
        except WebDriverException:
            # Best-effort only (covers the wait timing out): a page that
            # updates in place may never invalidate the element, so proceed
            # rather than clicking a second locator and skipping a page.
            pass
        human_sleep()
        return True
    return False
            

In [3]:
# ====== Main flow ======
# Load the start page, parse up to MAX_PAGES listing pages, then clean the
# collected rows, export them to SALES_CSV and print a quick summary.
driver = init_driver(headless=False)
sale_rows = []
page = 1
try:
    driver.get(START_URL)
    human_sleep()
    print('主页已加载')
    while page <= MAX_PAGES:
        try:
            df = parse_sale_page(driver.page_source)
            if not df.empty:
                df['city'] = CITY
                df['district'] = DISTRICT
                df['block'] = BLOCK
                df['page'] = page
                sale_rows.append(df)
                print(f'第{page}页: {len(df)}条')
            else:
                print(f'第{page}页: 无有效条目')
            # The last allowed page has been parsed; do not navigate further.
            if page >= MAX_PAGES:
                break
            if not go_next_page(driver):
                print('未找到下一页，提前终止于第', page, '页')
                break
            page += 1
        except Exception as e:
            # Top-level boundary: report and stop, keeping pages already parsed.
            print(f'第{page}页解析异常: {e}')
            break
finally:
    # Always release the browser, even if loading the start page fails or the
    # run is interrupted.
    driver.quit()

sale_df = pd.concat(sale_rows, ignore_index=True) if sale_rows else pd.DataFrame()
print('总计抓取:', len(sale_df))
# Drop every row that has a missing value in any column.
sale_df = sale_df.dropna()
sale_df.to_csv(SALES_CSV, index=False, encoding='utf-8-sig')
print('数据已保存:', SALES_CSV)
# Quick summary
if not sale_df.empty:
    if 'unit_price_yuan_per_m2' in sale_df.columns:
        sale_df['price_per_m2'] = sale_df['unit_price_yuan_per_m2']
    else:
        sale_df['price_per_m2'] = sale_df['total_price_yuan'] / sale_df['area_m2']
    print('中位单价:', pd.to_numeric(sale_df['price_per_m2'], errors='coerce').median())

主页已加载
第1页: 142条
第2页: 142条
第3页: 142条
第4页: 142条
第5页: 142条
第6页: 142条
第7页: 142条
第8页: 142条
第9页: 142条
第10页: 142条
第11页: 142条
第12页: 142条
第13页: 142条
第14页: 142条
第15页: 142条
第16页: 142条
第17页: 142条
第18页: 142条
第19页: 142条
第20页: 142条
总计抓取: 2840
数据已保存: C:\Users\lenovo\Desktop\CentralPark_sale_output\重庆_渝北_中央公园_sales.csv
中位单价: 15103.0
