In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import re

def _click_when_clickable(driver, xpath, label, timeout, settle_seconds):
    """Wait for the element at ``xpath`` to become clickable, then click it.

    Args:
        driver: Selenium WebDriver showing the page to search.
        xpath: XPath expression locating the link.
        label: Target name interpolated into the progress messages.
        timeout: Seconds to wait for the element to become clickable.
        settle_seconds: Seconds to sleep after the click so the next page
            has time to load.

    Raises:
        Exception: Whatever the wait or the click raises (for example a
            Selenium timeout); nothing is caught here.
    """
    link = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )
    print(f"找到{label}链接: {link.text}")
    link.click()
    print(f"✅ 已点击{label}")
    time.sleep(settle_seconds)


def _click_first_clickable(driver, xpaths, label, timeout, settle_seconds):
    """Try each XPath in order and click the first one that works.

    Args:
        driver: Selenium WebDriver showing the page to search.
        xpaths: Candidate XPath expressions, tried in the given order.
        label: Target name interpolated into the progress messages.
        timeout: Seconds to wait for each candidate.
        settle_seconds: Seconds to sleep after a successful click.

    Returns:
        True if one candidate was clicked, False if every candidate failed.
    """
    for xpath in xpaths:
        try:
            _click_when_clickable(driver, xpath, label, timeout, settle_seconds)
            return True
        except Exception:
            # Best effort: a failure only means this candidate does not fit
            # the current page. Catching Exception instead of using a bare
            # except keeps Ctrl-C able to abort the run.
            continue
    return False


def _collect_pages(driver, scrape_page, label, max_pages, rows):
    """Scrape up to ``max_pages`` result pages, appending records to ``rows``.

    Records are appended to the caller's list (instead of being returned) so
    that pages already scraped are kept if a later page raises.

    Args:
        driver: Selenium WebDriver positioned on the first result page.
        scrape_page: Callable ``(driver, page_num) -> list`` that scrapes the
            page currently shown.
        label: Dataset name interpolated into the progress messages.
        max_pages: Maximum number of pages to scrape.
        rows: List extended in place with the scraped records.
    """
    for page_num in range(1, max_pages + 1):
        print(f"正在爬取{label}第 {page_num} 页...")

        page_data = scrape_page(driver, page_num)
        if not page_data:
            print(f"{label}第 {page_num} 页没有数据，停止爬取")
            return
        rows.extend(page_data)
        print(f"{label}第 {page_num} 页成功爬取 {len(page_data)} 条数据")

        if page_num == max_pages:
            print(f"{label}已爬取完{max_pages}页，停止爬取")
            return

        if not go_to_next_page(driver, page_num):
            print(f"{label}第 {page_num} 页是最后一页，爬取完成")
            return

        time.sleep(3)


def scrape_fang_data():
    """Scrape fang.com resale and rental listings in one browser session.

    The resale site is navigated by clicking the Haidian and Beitaipingzhuang
    links and up to 4 pages are scraped; the rental site is then handled the
    same way for up to 5 pages, falling back to direct URLs when the links
    cannot be clicked. Any error during scraping is printed and ends the
    scraping early; whatever was collected up to that point is still returned.

    Before the browser is closed the function waits for the user to press
    Enter. The browser is closed even if that prompt cannot be read.

    Returns:
        A tuple ``(sale_df, rent_df)`` of pandas DataFrames built from the
        collected records. Either may be empty.
    """
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # Collected records for the resale and rental listings.
    sale_data = []
    rent_data = []

    try:
        # Inside the try so that a failure here still closes the browser.
        driver.maximize_window()

        # ==================== Resale listings ====================
        print("=" * 50)
        print("开始爬取二手房数据...")
        print("=" * 50)

        # 1. Open the resale home page.
        print("正在访问二手房首页...")
        driver.get("https://esf.fang.com/")
        time.sleep(3)

        # 2. Click the Haidian district link. A failure here propagates to
        #    the handler below and ends the whole run.
        print("正在定位海淀区...")
        _click_when_clickable(
            driver, '//*[@id="kesfqbfylb_A01_03_01"]/ul/li[2]/a', "海淀区", 10, 3
        )

        # 3. Click the Beitaipingzhuang link.
        print("正在定位北太平庄...")
        _click_when_clickable(
            driver, '//*[@id="ri010"]/div[1]/ul/li[2]/ul/li[3]/a', "北太平庄", 10, 5
        )

        # 4. Scrape the resale result pages (4 pages).
        _collect_pages(driver, scrape_sale_page, "二手房", 4, sale_data)
        print(f"二手房爬取完成！共获取 {len(sale_data)} 条数据")

        # ==================== Rental listings ====================
        print("\n" + "=" * 50)
        print("开始爬取租房数据...")
        print("=" * 50)

        # 1. Open the rental home page.
        print("正在访问租房首页...")
        driver.get("https://zu.fang.com/")
        time.sleep(3)

        # 2. Click the Haidian district link, or load the district URL.
        print("正在定位海淀区...")
        rent_haidian_selectors = [
            '//a[contains(text(), "海淀")]',
            '//*[contains(@id, "haidian")]',
            '//a[contains(@href, "haidian")]',
        ]
        if not _click_first_clickable(driver, rent_haidian_selectors, "海淀区", 5, 3):
            print("❌ 未找到海淀区链接，尝试直接访问...")
            driver.get("https://zu.fang.com/house-haidian/")
            time.sleep(3)

        # 3. Click the Beitaipingzhuang link, or load the area URL.
        # NOTE(review): "beitapingzhuang" below differs from the pinyin
        # "beitaipingzhuang"; left unchanged, confirm against the live site.
        print("正在定位北太平庄...")
        rent_beitaipingzhuang_selectors = [
            '//a[contains(text(), "北太平庄")]',
            '//a[contains(@href, "beitapingzhuang")]',
            '//*[contains(@id, "beitai")]',
        ]
        if not _click_first_clickable(
            driver, rent_beitaipingzhuang_selectors, "北太平庄", 5, 5
        ):
            print("❌ 未找到北太平庄链接，尝试直接访问...")
            driver.get("https://zu.fang.com/house-haidian/beitapingzhuang/")
            time.sleep(5)

        # 4. Scrape the rental result pages (5 pages).
        _collect_pages(driver, scrape_rent_page, "租房", 5, rent_data)
        print(f"租房爬取完成！共获取 {len(rent_data)} 条数据")

    except Exception as e:
        print(f"爬虫执行出错: {e}")

    finally:
        try:
            input("按Enter键关闭浏览器...")
        except EOFError:
            # No stdin to read from (non-interactive run): skip the pause so
            # the collected data is still returned.
            pass
        finally:
            # Always release the browser, even if the prompt is interrupted.
            driver.quit()

    return pd.DataFrame(sale_data), pd.DataFrame(rent_data)

def scrape_sale_page(driver, page_num):
    """Scrape the resale listings on the page currently shown by ``driver``.

    Waits up to 15 seconds for a ``.shop_list dl`` element to be present,
    then extracts one record per matching element. An error while waiting or
    locating the listings is printed, and the records gathered so far are
    returned.

    Args:
        driver: Selenium WebDriver positioned on a resale result page.
        page_num: Page number recorded in every extracted record.

    Returns:
        A list of record dicts as produced by ``extract_sale_details``;
        empty when nothing could be extracted.
    """
    house_data_list = []

    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".shop_list dl"))
        )

        house_elements = driver.find_elements(By.CSS_SELECTOR, ".shop_list dl")
        print(f"找到 {len(house_elements)} 个二手房房源")

        for i, house in enumerate(house_elements):
            try:
                house_data = extract_sale_details(house, i, page_num)
            except Exception:
                # Skip a listing that cannot be parsed. Exception (not a bare
                # except) so Ctrl-C still stops the scrape.
                continue
            if house_data:
                house_data_list.append(house_data)

    except Exception as e:
        print(f"二手房爬取时发生错误: {e}")

    return house_data_list

def scrape_rent_page(driver, page_num):
    """Scrape the rental listings on the page currently shown by ``driver``.

    Several CSS selectors are tried in order and the first one that matches
    at least one element supplies the listings. Unlike the resale scraper,
    no explicit wait is performed before looking the elements up.

    Args:
        driver: Selenium WebDriver positioned on a rental result page.
        page_num: Page number recorded in every extracted record.

    Returns:
        A list of record dicts as produced by ``extract_rent_details``;
        empty when no listing element was found or nothing could be
        extracted.
    """
    house_data_list = []

    try:
        # The rental page layout may differ, so try several selectors.
        rent_selectors = [
            ".shop_list dl",
            ".houseList dl",
            ".list dl",
            "dl[class*='list']",
        ]

        house_elements = []
        for selector in rent_selectors:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
            if elements:
                house_elements = elements
                print(f"使用选择器 {selector} 找到 {len(elements)} 个租房房源")
                break

        if not house_elements:
            print("未找到租房房源元素")
            return house_data_list

        for i, house in enumerate(house_elements):
            try:
                house_data = extract_rent_details(house, i, page_num)
            except Exception:
                # Skip a listing that cannot be parsed. Exception (not a bare
                # except) so Ctrl-C still stops the scrape.
                continue
            if house_data:
                house_data_list.append(house_data)

    except Exception as e:
        print(f"租房爬取时发生错误: {e}")

    return house_data_list

def extract_sale_details(house_element, index, page_num):
    """Extract area, total price and unit price from one resale listing.

    Args:
        house_element: Selenium element wrapping a single listing.
        index: Position of the listing on the page. Accepted for interface
            compatibility; not used.
        page_num: Page number stored in the record.

    Returns:
        A dict with the keys ``面积(㎡)`` (float, 0.0 when no area is found),
        ``总价(万)`` (the matched element's text, or ``"N/A"``),
        ``单价(元/㎡)`` (digit string, or ``"N/A"``), ``爬取页码`` and
        ``数据类型``; or None if extraction fails with an exception.
    """
    try:
        # Layout description: the first <p>/info block mentioning rooms,
        # halls and an area is taken as the detail line.
        detail_elements = house_element.find_elements(By.XPATH, './/p | .//div[contains(@class, "info")]')
        detail_text = ""
        for elem in detail_elements:
            text = elem.text.strip()
            if "室" in text and "厅" in text and "㎡" in text:
                detail_text = text
                break

        if not detail_text:
            try:
                detail_element = house_element.find_element(By.XPATH, './/*[contains(text(), "室") and contains(text(), "厅")]')
                detail_text = detail_element.text
            except Exception:
                # Best effort: without a detail line the area stays 0.0.
                pass

        # Total price: first element mentioning "万" that also has a digit.
        total_price = "N/A"
        try:
            price_elements = house_element.find_elements(By.XPATH, './/*[contains(text(), "万")]')
            for elem in price_elements:
                text = elem.text.strip()
                if "万" in text and any(c.isdigit() for c in text):
                    total_price = text
                    break
        except Exception:
            # Best effort: keep "N/A" when the price cannot be read.
            pass

        # Unit price: digits directly before "元/㎡" in the first match.
        unit_price = "N/A"
        try:
            unit_elements = house_element.find_elements(By.XPATH, './/*[contains(text(), "元/㎡")]')
            for elem in unit_elements:
                text = elem.text.strip()
                if "元/㎡" in text:
                    unit_price_match = re.search(r'(\d+)元/㎡', text)
                    unit_price = unit_price_match.group(1) if unit_price_match else "N/A"
                    break
        except Exception:
            # Best effort: keep "N/A" when the unit price cannot be read.
            pass

        # Area: number directly before "㎡" in the detail line.
        area_match = re.search(r'(\d+\.?\d*)㎡', detail_text)
        area = float(area_match.group(1)) if area_match else 0.0

        return {
            '面积(㎡)': area,
            '总价(万)': total_price,
            '单价(元/㎡)': unit_price,
            '爬取页码': page_num,
            '数据类型': '二手房'
        }

    except Exception:
        # Any failure yields "no record". Exception (not a bare except) so
        # KeyboardInterrupt and SystemExit still propagate.
        return None

def extract_rent_details(house_element, index, page_num):
    """Extract the monthly rent and the area from one rental listing.

    Both values are parsed from the listing's visible text. If the rent is
    not found there, child elements mentioning "元/月" are searched as a
    fallback.

    Args:
        house_element: Selenium element wrapping a single listing; its
            ``text`` attribute is read.
        index: Position of the listing on the page. Accepted for interface
            compatibility; not used.
        page_num: Page number stored in the record.

    Returns:
        A dict with the keys ``面积(㎡)`` (float, 0.0 when no area is found),
        ``月租金(元)`` (digit string, or ``"N/A"``), ``爬取页码`` and
        ``数据类型``; or None if extraction fails with an exception.
    """
    try:
        # Visible text of the whole listing.
        full_text = house_element.text

        # Monthly rent: digits directly before "元/月".
        rent_price = "N/A"
        rent_match = re.search(r'(\d+)元/月', full_text)
        if rent_match:
            rent_price = rent_match.group(1)
        else:
            # Fall back to the individual elements that mention the unit.
            price_elements = house_element.find_elements(By.XPATH, './/*[contains(text(), "元/月")]')
            for elem in price_elements:
                rent_match = re.search(r'(\d+)元/月', elem.text.strip())
                if rent_match:
                    rent_price = rent_match.group(1)
                    break

        # Area: prefer the "㎡" notation, then the "平米" spelling.
        area_match = (
            re.search(r'(\d+\.?\d*)㎡', full_text)
            or re.search(r'(\d+\.?\d*)平米', full_text)
        )
        area = float(area_match.group(1)) if area_match else 0.0

        return {
            '面积(㎡)': area,
            '月租金(元)': rent_price,
            '爬取页码': page_num,
            '数据类型': '租房'
        }

    except Exception:
        # Any failure yields "no record". Exception (not a bare except) so
        # KeyboardInterrupt and SystemExit still propagate.
        return None

def go_to_next_page(driver, current_page):
    """Click the "next page" control of the result list.

    Several XPath candidates are tried in order, each waited on for up to
    5 seconds. The first one that is clickable, displayed and enabled is
    clicked through JavaScript, followed by a 3-second pause.

    Args:
        driver: Selenium WebDriver positioned on a result page.
        current_page: Number of the page currently shown; only used in the
            progress message.

    Returns:
        True if a candidate was clicked, False if none could be used. The
        function does not verify that a new page was actually loaded.
    """
    next_selectors = [
        '/html/body/div[4]/div[4]/div[5]/div/p[1]/a',
        '//a[@id="PageContNew_next"]',
        '//a[contains(text(), "下一页")]',
        '//a[contains(text(), ">")]',
        '//a[contains(@class, "next")]'
    ]

    for selector in next_selectors:
        try:
            next_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, selector))
            )

            if next_button.is_displayed() and next_button.is_enabled():
                # Click via JavaScript rather than WebElement.click().
                driver.execute_script("arguments[0].click();", next_button)
                print(f"✅ 成功翻页到第 {current_page + 1} 页")
                time.sleep(3)
                return True

        except Exception:
            # This candidate did not work; try the next one. Exception (not
            # a bare except) so Ctrl-C during the waits still aborts.
            continue

    return False

# Main program
def _save_and_report(df, label):
    """Save one scraped dataset to CSV and print a summary of it.

    An empty DataFrame only produces a "no data" message. Otherwise the data
    is written to ``北太平庄<label>数据.csv`` in the working directory and the
    row count, a preview and the number of rows per scraped page are printed.

    Args:
        df: DataFrame of scraped records; must contain the ``爬取页码`` column
            when it is not empty.
        label: Dataset name used in the file name and the messages.
    """
    if df.empty:
        print(f"\n❌ 未获取到{label}数据")
        return

    filename = f"北太平庄{label}数据.csv"
    # utf-8-sig writes a BOM, presumably so spreadsheet tools detect UTF-8.
    df.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f"\n✅ {label}数据已保存到: {filename}")
    print(f"{label}数据量: {len(df)} 条")
    print(f"{label}数据预览:")
    print(df.head())

    # Number of records collected from each page.
    page_counts = df['爬取页码'].value_counts().sort_index()
    print(f"\n{label}各页数据统计:")
    for page, count in page_counts.items():
        print(f"第{page}页: {count}条数据")


if __name__ == "__main__":
    print("开始同时爬取二手房和租房数据...")
    print("提示：二手房4页，租房5页")

    df_sale, df_rent = scrape_fang_data()

    # Save and summarise the resale data, then the rental data.
    _save_and_report(df_sale, "二手房")
    _save_and_report(df_rent, "租房")

    print("\n爬虫程序执行完毕！")