In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import re

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# 初始URL
url = 'https://zhangjiakou.zu.fang.com/house-a011434/'


all_data = []

try:
    driver.get(url)
    print("开始爬取租房数据...")
    
    for page_num in range(20):
        print(f"正在爬取第 {page_num + 1} 页...")
        
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "houseList"))
        )
        
        # 查找所有的房源元素 (dl标签)
        house_elements = driver.find_elements(By.CSS_SELECTOR, "dl.list.hiddenMap.rel")
        
        # 遍历每个房源元素，提取数据
        for house in house_elements:
            try:
                # 1. 提取面积数据 - 从font15 mt12 bold中提取
                try:
                    area_element = house.find_element(By.CSS_SELECTOR, "p.font15.mt12.bold")
                    area_text = area_element.text
                    
                    area_match = re.search(r'(\d+)㎡', area_text)
                    if area_match:
                        area = area_match.group(1) 
                    else:
                        area = "N/A"
                except NoSuchElementException:
                    area = "N/A"
                
                # 2. 提取租金数据 - 从price类中提取
                try:
                    price_element = house.find_element(By.CLASS_NAME, "price")
                    rent_price = price_element.text
                except NoSuchElementException:
                    rent_price = "N/A"
                
                # 将数据添加到列表，按照要求的顺序
                all_data.append({
                    '面积(㎡)': area,
                    '租金(元/月)': rent_price
                })
                
            except Exception as e:
                # 如果单个房源解析失败，跳过并继续下一个
                print(f"解析房源时出错: {e}")
                continue
        
        print(f"第 {page_num + 1} 页爬取完成，共找到 {len(house_elements)} 个房源")
        
        # 检查是否还有下一页
        if page_num < 19:
            try:
                # 查找分页区域
                fanye_element = driver.find_element(By.CLASS_NAME, "fanye")
                
                # 查找所有链接，找到包含"下一页"的链接
                page_links = fanye_element.find_elements(By.TAG_NAME, "a")
                next_page_link = None
                
                for link in page_links:
                    if "下一页" in link.text:
                        next_page_link = link
                        break
                
                if next_page_link:
                    # 点击下一页
                    next_page_link.click()
                    
                    # 等待页面加载
                    time.sleep(2)
                    
                    # 检查是否成功跳转到新页面（通过URL变化或新内容加载）
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "houseList"))
                    )
                else:
                    print("找不到下一页链接，爬取结束")
                    break
                    
            except NoSuchElementException:
                print("找不到分页区域，爬取结束")
                break
            except TimeoutException:
                print("页面加载超时，可能已无更多页面，爬取结束")
                break
            except Exception as e:
                print(f"翻页时出错: {e}")
                break
        else:
            print("已达到20页目标，爬取结束")

finally:
    # 关闭浏览器
    driver.quit()
    print("浏览器已关闭")

if all_data:
    df = pd.DataFrame(all_data)
    
    print("\n爬取到的数据前5行:")
    print(df.head())
    
    df[['面积(㎡)', '租金(元/月)']].to_excel('hebei_zu_huailai_data.xlsx', index=False)
    print(f"\n数据已保存到 '租房数据.xlsx'，共 {len(df)} 条记录")

    print("\n数据统计信息:")
    print(df.describe())
    
else:
    print("没有爬取到任何数据")

开始爬取租房数据...
正在爬取第 1 页...
第 1 页爬取完成，共找到 60 个房源
正在爬取第 2 页...
第 2 页爬取完成，共找到 18 个房源
找不到下一页链接，爬取结束
浏览器已关闭

爬取到的数据前5行:
  面积(㎡) 租金(元/月)
0    40     950
1   104    1000
2    87    1200
3    78    2000
4   175    3000

数据已保存到 '租房数据.xlsx'，共 78 条记录

数据统计信息:
       面积(㎡) 租金(元/月)
count     78      78
unique    43      23
top       84    1200
freq       7      14
