In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import re
import time
import ipywidgets as widgets
from IPython.display import display
import os

In [2]:
def get_page():
    """Scrape every rental listing on the page currently loaded in ``driver``.

    Reads the module-level Selenium ``driver``. As a side effect, makes sure
    the ``house_rent`` folder exists in the current working directory (the
    CSV export in ``scrape_info`` writes into it).

    Returns:
        list[dict]: one dict per successfully parsed listing, with keys
        ``title`` (stripped link text), ``area`` (the ``|``-separated detail
        segment that contains "㎡", stripped; ``""`` when no segment has it)
        and ``total_price`` (stripped text of the price span).

    Raises:
        Any exception raised by ``driver.find_element`` when the listing
        container cannot be located. Failures on an individual listing are
        printed and that listing is skipped.
    """
    folder_path = "house_rent"

    # exist_ok=True creates the folder (and parents) only when missing,
    # without a separate existence check.
    os.makedirs(folder_path, exist_ok=True)

    # Locate the container, then every listing <dl> directly under it whose
    # class attribute is exactly "list hiddenMap rel".
    # NOTE(review): the absolute XPath depends on the page's exact div
    # nesting and will break if the site layout shifts — confirm/replace.
    house_list_xpath = '/html/body/div[4]/div[4]/div[1]/div[4]'
    house_list_container = driver.find_element(By.XPATH, house_list_xpath)
    house_elements = house_list_container.find_elements(
        By.XPATH, './dl[@class="list hiddenMap rel"]'
    )

    data = []
    for house in house_elements:
        try:
            # Listing title
            title = house.find_element(By.XPATH, './/p[@class="title"]/a').text.strip()

            # Floor area: first "|"-separated segment of the detail line
            # that contains the square-metre sign.
            detail_text = house.find_element(By.XPATH, './/p[contains(@class, "font15")]').text
            size = next(
                (part.strip() for part in detail_text.split("|") if "㎡" in part),
                "",
            )

            # Price
            price = house.find_element(By.XPATH, './/span[@class="price"]').text.strip()
        except Exception as e:
            # Best effort: report the broken listing and keep going.
            print("某条房源解析失败：", e)
            continue

        data.append({
            'title': title,
            'area': size,
            'total_price': price
        })

    return data

In [3]:
def scrape_info(area):
    """Scrape all result pages starting from the one loaded in ``driver`` and save them as CSV.

    Repeatedly calls ``get_page()`` on the current page, then follows the
    "下一页" (next page) link until that link is no longer present, an
    unexpected error occurs, or the user interrupts the kernel. Whatever was
    collected up to that point is written to
    ``./house_rent/rent_price_<area>.csv`` (UTF-8 with BOM, no index column).

    Args:
        area: suffix used in the output file name.

    Returns:
        None. Progress is shown in an ipywidgets label and via ``print``.
    """
    all_data = []
    page_cnt = 0
    label = widgets.Label(value="正在开始解析...")
    display(label)

    next_xpath = '//div[@class="houseList"]/following-sibling::div//a[text()="下一页"]'

    while True:
        try:
            all_data.extend(get_page())
            page_cnt += 1
            label.value = f"第 {page_cnt} 页信息已经解析。"

            # find_elements returns an empty list (instead of raising) when
            # nothing matches, so a missing next-page link is detected
            # explicitly rather than through a swallowed exception.
            next_buttons = driver.find_elements(By.XPATH, next_xpath)
            if not next_buttons:
                print(f"于第 {page_cnt} 页解析失败，已经解析到末尾。")
                break
            next_button = next_buttons[0]

            # Zoom the page out so the next-page link can be clicked.
            driver.execute_script("document.body.style.zoom='0.25'")
            # Scroll to the link so it is not covered by other elements.
            driver.execute_script("arguments[0].scrollIntoView();", next_button)
            time.sleep(0.2)

            next_button.click()

        except KeyboardInterrupt:
            # Deliberate best effort: a manual interrupt stops paging but
            # still falls through to saving what has been collected.
            print(f"用户中断，已解析 {page_cnt} 页，保存已有数据。")
            break
        except Exception as exc:
            # Unexpected failure: surface the cause instead of pretending
            # the end of the listing was reached, then save partial data.
            print(f"于第 {page_cnt} 页之后解析中断：{type(exc).__name__}: {exc}")
            break

    print(f"一共解析了 {page_cnt} 页。")

    # Write the CSV; create the folder here as well so saving does not
    # depend on get_page() having run far enough to create it.
    out_dir = "./house_rent"
    os.makedirs(out_dir, exist_ok=True)
    df = pd.DataFrame(all_data)
    df.to_csv(out_dir + "/rent_price_" + area + ".csv", index=False, encoding="utf-8-sig")
    print(f"已成功保存 {len(df)} 条房源信息.")

In [4]:
# Start a Chrome session using the chromedriver binary at ./chromedriver.exe
# (path is relative to the current working directory).
# `driver` is the module-level browser handle that get_page() and
# scrape_info() read.
service = Service('./chromedriver.exe')
driver = webdriver.Chrome(service=service)

In [5]:
# Shahe: load the listing page, then scrape it into ./house_rent/rent_price_shahe.csv
url = 'https://zu.fang.com/house-a012-b02698/s31/'
driver.get(url)
# Note: if a captcha or slider verification appears, clear it manually before the sleep ends
time.sleep(5)
scrape_info("shahe")

Label(value='正在开始解析...')

于第 36 页解析失败，已经解析到末尾。
一共解析了 36 页。
已成功保存 2126 条房源信息.


In [6]:
# Huoying: load the listing page, then scrape it into ./house_rent/rent_price_huoying.csv
url = "https://zu.fang.com/house-a012-b02697/s31/"
driver.get(url)
# Note: if a captcha or slider verification appears, clear it manually before the sleep ends
time.sleep(5)
scrape_info("huoying")

Label(value='正在开始解析...')

于第 11 页解析失败，已经解析到末尾。
一共解析了 11 页。
已成功保存 642 条房源信息.


In [7]:
# Huilongguan: load the listing page, then scrape it into ./house_rent/rent_price_huilongguan.csv
url = "https://zu.fang.com/house-a012-b01182/s31/"
driver.get(url)
# Note: if a captcha or slider verification appears, clear it manually before the sleep ends
time.sleep(5)
scrape_info("huilongguan")

Label(value='正在开始解析...')

于第 24 页解析失败，已经解析到末尾。
一共解析了 24 页。
已成功保存 1440 条房源信息.


In [9]:
# Tiantongyuan: load the listing page, then scrape it into ./house_rent/rent_price_tiantongyuan.csv
url = "https://zu.fang.com/house-a012-b01183/s31/"
driver.get(url)
# Note: if a captcha or slider verification appears, clear it manually before the sleep ends
time.sleep(5)
scrape_info("tiantongyuan")

Label(value='正在开始解析...')

于第 29 页解析失败，已经解析到末尾。
一共解析了 29 页。
已成功保存 1704 条房源信息.
