In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import re
import time
import ipywidgets as widgets
from IPython.display import display
import os

In [2]:
def _parse_listing(listing):
    """Turn one listing element into a record dict.

    Reads, in this order: the ``alt`` text of the cover image (used as the
    title), the second ``|``-separated field of the ``tel_shop`` paragraph
    (the floor area), and the text of the first two ``<span>`` elements under
    ``dd.price_right`` (total price and unit price). Only the first run of
    digits of the unit-price text is kept. Fields that are absent become
    empty strings; any exception raised by the lookups propagates to the
    caller.
    """
    # Title: taken from the alt text of the listing's cover image.
    title = listing.find_element(By.XPATH, './/dt/a/img').get_attribute('alt')

    # Area: second field of the "a | b | c ..." summary line, if there is one.
    summary = listing.find_element(By.XPATH, './/p[@class="tel_shop"]').text
    fields = summary.split('|')
    area = fields[1].strip() if len(fields) > 1 else ""

    # Prices: first span is the total price, second span is the unit price.
    price_box = listing.find_element(By.XPATH, './/dd[@class="price_right"]')
    price_spans = price_box.find_elements(By.TAG_NAME, 'span')
    total_price = price_spans[0].text.strip() if price_spans else ""
    unit_price_raw = price_spans[1].text.strip() if len(price_spans) >= 2 else ""

    # Keep only the leading number of the unit price text.
    digits = re.search(r'(\d+)', unit_price_raw)

    return {
        "tag": title,
        "area": area,
        "total_price": total_price,
        "unit_price": digits.group(1) if digits else "",
    }


def get_page():
    """Collect the listings on the page currently loaded in the global ``driver``.

    Returns:
        list[dict]: one dict per listing with the keys ``tag``, ``area``,
        ``total_price`` and ``unit_price`` (all strings). A listing whose
        parsing raises is reported with ``print`` and left out of the result.
    """
    # Every listing container on the current results page.
    listings = driver.find_elements(By.XPATH, '//div[@class="shop_list shop_list_4"]/dl[@class="clearfix"]')

    records = []
    for listing in listings:
        try:
            record = _parse_listing(listing)
        except Exception as err:
            # Best effort: one malformed listing must not abort the whole page.
            print("某条房源解析失败：", err)
        else:
            records.append(record)

    return records


In [3]:
def scrape_info(area):
    """Scrape every results page reachable from the current page and save it as CSV.

    Starting from whatever page the global ``driver`` currently shows, this
    repeatedly calls ``get_page()`` and clicks the "下一页" (next page) link
    until no such link is found, then writes all collected records to
    ``house_price/house_price_<area>.csv`` (UTF-8 with BOM, no index column).

    Args:
        area (str): name used as the suffix of the output CSV file.

    Returns:
        None. Progress is shown in an ipywidgets ``Label``; a summary is printed.

    Notes:
        * The end of pagination (no "next page" link) is detected explicitly
          and reported separately from real errors, which are printed with
          their exception type instead of being mistaken for the last page.
        * An error or a manual interrupt (KeyboardInterrupt) stops the loop
          but the records gathered so far are still written to disk.
        * NOTE(review): there is no explicit wait after the click; this relies
          on the next ``get_page()`` call seeing the new page — confirm.
    """
    folder_path = "house_price"

    # Create the output folder if needed; a no-op when it already exists.
    os.makedirs(folder_path, exist_ok=True)

    all_data = []
    page_cnt = 0
    label = widgets.Label(value="正在开始解析...")
    display(label)

    while True:
        try:
            all_data.extend(get_page())
            page_cnt += 1
            label.value = f"第 {page_cnt} 页信息已经解析。"

            # find_elements yields an empty list when nothing matches, so the
            # last page is recognised without relying on an exception.
            next_buttons = driver.find_elements(By.XPATH, '//p[@class="last"]/a[contains(text(),"下一页")]')
            if not next_buttons:
                print(f"第 {page_cnt} 页没有“下一页”按钮，已经解析到末尾。")
                break
            next_button = next_buttons[0]

            # Shrink the page so that the "next page" button can be clicked.
            driver.execute_script("document.body.style.zoom='0.25'")
            # Scroll to the button so nothing is covering it.
            driver.execute_script("arguments[0].scrollIntoView();", next_button)
            time.sleep(0.2)

            # Go to the next page.
            next_button.click()

        except KeyboardInterrupt:
            # Manual stop from the notebook: keep what has been collected.
            print(f"解析 {page_cnt} 页后被手动中断，将保存已解析的数据。")
            break
        except Exception as exc:
            # A real failure (e.g. verification page, click intercepted): say
            # so, with the cause, rather than claiming the end was reached.
            print(f"解析 {page_cnt} 页后出错，提前终止：{type(exc).__name__}: {exc}")
            break

    print(f"一共解析了 {page_cnt} 页。")

    # Write the CSV into the same folder that was created above.
    df = pd.DataFrame(all_data)
    csv_path = os.path.join(folder_path, "house_price_" + area + ".csv")
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"已成功保存 {len(df)} 条房源信息.")

In [4]:
# Initialise the driver: start a Chrome WebDriver session and keep it in the
# module-level name `driver`, which get_page() and scrape_info() read as a global.
driver = webdriver.Chrome()

In [5]:
# Shahe: open the first results page for this district, then scrape every page
# into house_price/house_price_shahe.csv.
url = "https://esf.fang.com/house-a012-b02698/i31/"
driver.get(url)
# Note: if a CAPTCHA or slider verification appears at run time, it has to be
# cleared manually before this sleep ends.
time.sleep(5)
scrape_info("shahe")

Label(value='正在开始解析...')

于第 42 页解析失败，已经解析到末尾。
一共解析了 42 页。
已成功保存 2513 条房源信息.


In [6]:
# Huoying: open the first results page for this district, then scrape every page
# into house_price/house_price_huoying.csv.
url = "https://esf.fang.com/house-a012-b02697/i31/"
driver.get(url)
# Note: if a CAPTCHA or slider verification appears at run time, it has to be
# cleared manually before this sleep ends.
time.sleep(5)
scrape_info("huoying")

Label(value='正在开始解析...')

于第 8 页解析失败，已经解析到末尾。
一共解析了 8 页。
已成功保存 458 条房源信息.


In [7]:
# Huilongguan: open the first results page for this district, then scrape every
# page into house_price/house_price_huilongguan.csv.
url = "https://esf.fang.com/house-a012-b01182/i31/"
driver.get(url)
# Note: if a CAPTCHA or slider verification appears at run time, it has to be
# cleared manually before this sleep ends.
time.sleep(5)
scrape_info("huilongguan")

Label(value='正在开始解析...')

于第 50 页解析失败，已经解析到末尾。
一共解析了 50 页。
已成功保存 2985 条房源信息.


In [8]:
# Tiantongyuan: open the first results page for this district, then scrape every
# page into house_price/house_price_tiantongyuan.csv.
url = "https://esf.fang.com/house-a012-b01183/i31/"
driver.get(url)
# Note: if a CAPTCHA or slider verification appears at run time, it has to be
# cleared manually before this sleep ends.
time.sleep(5)
scrape_info("tiantongyuan")

Label(value='正在开始解析...')

于第 43 页解析失败，已经解析到末尾。
一共解析了 43 页。
已成功保存 2530 条房源信息.
