In [None]:
import os
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common import NoSuchElementException, TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.microsoft import EdgeChromiumDriverManager

class QiaoxiHouseSpider:
    """Scraper for second-hand sale and rental listings in the Qiaoxi district.

    Result pages are loaded in a headless Edge browser driven by Selenium,
    the fields of every listing card are extracted, and the collected rows
    are written to CSV files under ``self.data_dir``.
    """

    # Patterns applied to listing text; compiled once because they run for
    # every listing on every page.
    _COMMUNITY_RE = re.compile(r'^([^\d]+?)[\s\-]')
    _LAYOUT_RE = re.compile(r"\d+室\d+厅")
    # The fractional part is optional as a whole, so single-digit areas such
    # as "9㎡" match as well as "85.5㎡".
    _AREA_RE = re.compile(r"\d+(?:\.\d+)?㎡")
    _DIRECTION_RE = re.compile(r"[东南西北]+向")
    _YEAR_RE = re.compile(r"\d{4}年")
    _FLOOR_RE = re.compile(r"(高|中|低)层.*?共(\d+)层")
    _RENT_RE = re.compile(r"\d+")

    # Seconds to pause after clicking "next page" so the new list can load.
    _PAGE_LOAD_PAUSE = 2

    def __init__(self, max_pages=30):
        """Set the start URLs, the page limit and the output directory.

        Args:
            max_pages: Maximum number of result pages visited per crawl.
        """
        self.ershou_url = "https://zhangjiakou.esf.fang.com/house-a014962"  # second-hand sale listings
        self.rent_url = "https://zhangjiakou.zu.fang.com/house-a014962/"  # rental listings
        self.max_pages = max_pages
        self.area_name = "桥西"  # written to the "区域" column of every row
        self.data_dir = "house_data"
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.data_dir, exist_ok=True)

    def _get_driver(self):
        """Build and return an Edge WebDriver configured for headless use."""
        edge_opts = Options()
        edge_opts.add_argument("--headless=new")  # run without a visible window
        edge_opts.add_argument("--disable-gpu")
        edge_opts.add_argument("--no-sandbox")
        edge_opts.add_experimental_option("useAutomationExtension", False)
        edge_opts.add_argument("--ignore-certificate-errors")
        edge_opts.add_argument("--ignore-ssl-errors")
        return webdriver.Edge(options=edge_opts)

    # ------------------------------------------------------------------
    # Pure text helpers (no browser access)
    # ------------------------------------------------------------------

    @staticmethod
    def _first_match(pattern, text):
        """Return the first substring of *text* matching *pattern*, or None."""
        found = pattern.search(text)
        return found.group() if found else None

    @classmethod
    def _community_from_title(cls, title):
        """Return the leading non-digit run of *title* up to the first
        whitespace or hyphen, or None when *title* is empty or has no such
        prefix."""
        if not title:
            return None
        found = cls._COMMUNITY_RE.search(title)
        return found.group(1).strip() if found else None

    @classmethod
    def _parse_ershou_info(cls, info_text):
        """Extract layout, area, orientation, build year and floor data.

        Args:
            info_text: Text of a sale listing's description line.

        Returns:
            A dict with the keys "户型", "建筑面积", "朝向", "建造年份",
            "楼层类型" and "总层数"; a value is None when it is not found.
        """
        floor = cls._FLOOR_RE.search(info_text)
        return {
            "户型": cls._first_match(cls._LAYOUT_RE, info_text),
            "建筑面积": cls._first_match(cls._AREA_RE, info_text),
            "朝向": cls._first_match(cls._DIRECTION_RE, info_text),
            "建造年份": cls._first_match(cls._YEAR_RE, info_text),
            "楼层类型": floor.group(1) + "层" if floor else None,
            "总层数": floor.group(2) + "层" if floor else None,
        }

    @classmethod
    def _parse_rent_info(cls, info_text):
        """Extract layout, area and orientation from a rental description.

        Returns:
            A dict with the keys "户型", "面积" and "朝向"; a value is None
            when it is not found.
        """
        return {
            "户型": cls._first_match(cls._LAYOUT_RE, info_text),
            "面积": cls._first_match(cls._AREA_RE, info_text),
            "朝向": cls._first_match(cls._DIRECTION_RE, info_text),
        }

    @classmethod
    def _parse_rent_price(cls, price_text):
        """Return the first run of digits in *price_text* as an int, or None."""
        digits = cls._first_match(cls._RENT_RE, price_text)
        return int(digits) if digits is not None else None

    # ------------------------------------------------------------------
    # Browser helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _text_or_none(item, by, locator):
        """Return the text of the first matching child of *item*, or None
        when the lookup fails (best effort: one missing field must not drop
        the whole listing)."""
        try:
            return item.find_element(by, locator).text
        except WebDriverException:
            return None

    def _click_next_page(self, driver, wait, next_xpath):
        """Click the "next page" link located by *next_xpath*.

        Returns:
            True after the click and the load pause, False when the link
            could not be found or clicked.
        """
        try:
            next_btn = wait.until(
                EC.element_to_be_clickable((By.XPATH, next_xpath))
            )
            driver.execute_script("arguments[0].click();", next_btn)
        except WebDriverException:
            # Covers TimeoutException / NoSuchElementException as well.
            return False
        time.sleep(self._PAGE_LOAD_PAUSE)
        return True

    def _save_csv(self, rows, label):
        """Write *rows* to ``<data_dir>/<label>数据_<max_pages>页.csv``.

        Returns:
            The DataFrame that was written.
        """
        file_path = os.path.join(self.data_dir, f"{label}数据_{self.max_pages}页.csv")
        df = pd.DataFrame(rows)
        # utf-8-sig writes a BOM, presumably so spreadsheet tools detect UTF-8.
        df.to_csv(file_path, index=False, encoding="utf-8-sig")
        print(f"{label}数据爬取完成，共{len(rows)}条，已保存到 {file_path}")
        return df

    # ------------------------------------------------------------------
    # Page parsers
    # ------------------------------------------------------------------

    def _parse_ershou_page(self, driver, wait):
        """Parse the sale listings on the page currently loaded.

        Args:
            driver: The WebDriver showing the result page (kept for interface
                compatibility; lookups go through *wait*).
            wait: WebDriverWait bound to that driver.

        Returns:
            A list with one dict per listing. On a page-level failure the
            error is printed and the rows gathered so far are returned.
        """
        house_list = []
        try:
            # Block until the listing container is present in the DOM.
            container = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "shop_list_4"))
            )
            items = container.find_elements(By.XPATH, ".//dl[@dataflag='bg']")

            for item in items:
                title_text = self._text_or_none(item, By.CLASS_NAME, "tit_shop")
                if title_text is not None:
                    title_text = title_text.strip()
                info_text = self._text_or_none(item, By.CLASS_NAME, "tel_shop")

                # Key insertion order determines the CSV column order.
                house_info = {
                    "区域": self.area_name,
                    "房源标题": title_text,
                    "小区名称": self._community_from_title(title_text),
                }
                house_info.update(self._parse_ershou_info(info_text or ""))
                house_info["总价"] = self._text_or_none(
                    item, By.XPATH, ".//dd[@class='price_right']/span[1]"
                )
                house_info["单价"] = self._text_or_none(
                    item, By.XPATH, ".//dd[@class='price_right']/span[2]"
                )
                house_info["区域位置"] = self._text_or_none(item, By.CLASS_NAME, "add_shop")

                house_list.append(house_info)
        except Exception as e:
            print(f"解析二手房页面出错: {str(e)}")

        return house_list

    def _parse_rent_page(self, driver, wait):
        """Parse the rental listings on the page currently loaded.

        Listings whose title link or ``title`` attribute is missing are
        skipped. Page-level failures (e.g. the list never appearing) are
        raised to the caller.

        Returns:
            A list with one dict per listing.
        """
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "houseList")))
        items = driver.find_elements(
            By.XPATH, "//div[@class='houseList']//dl[contains(@class, 'list')]"
        )

        rows = []
        for item in items:
            try:
                title_elem = item.find_element(By.XPATH, ".//dd//p[@class='title']/a")
                title_text = title_elem.get_attribute("title")
            except WebDriverException:
                continue
            if title_text is None:
                continue

            # Key insertion order determines the CSV column order.
            rent_info = {
                "区域": self.area_name,
                "房源标题": title_text,
                "小区名称": self._community_from_title(title_text),
            }

            info_text = self._text_or_none(item, By.XPATH, ".//dd//p[@class='font15 mt12']")
            rent_info.update(self._parse_rent_info(info_text or ""))

            price_text = self._text_or_none(item, By.CLASS_NAME, "price")
            rent_info["租金(元/月)"] = self._parse_rent_price(price_text or "")

            # A span mentioning "个人" or "业主" marks the listing as owner-direct.
            try:
                item.find_element(
                    By.XPATH, ".//span[contains(text(), '个人') or contains(text(), '业主')]"
                )
                rent_info["是否业主直租"] = "是"
            except WebDriverException:
                rent_info["是否业主直租"] = "否"

            rows.append(rent_info)
        return rows

    # ------------------------------------------------------------------
    # Public entry points
    # ------------------------------------------------------------------

    def crawl_ershou_data(self):
        """Crawl up to ``max_pages`` pages of sale listings and save them as CSV.

        Returns:
            The DataFrame that was written, or None when the crawl or the
            save failed (the error is printed).
        """
        all_data = []
        try:
            with self._get_driver() as driver:
                driver.get(self.ershou_url)
                wait = WebDriverWait(driver, 5)

                # A bounded for-loop guarantees termination on the last page.
                for current_page in range(1, self.max_pages + 1):
                    print(f"正在爬取桥西二手房第{current_page}页...")
                    all_data.extend(self._parse_ershou_page(driver, wait))

                    if current_page == self.max_pages:
                        break
                    if not self._click_next_page(driver, wait, "//a[text()='下一页']"):
                        print(f"第{current_page}页后无更多页面，终止爬取")
                        break

            return self._save_csv(all_data, "桥西二手房")
        except Exception as e:
            print(f"二手房爬取过程出错: {str(e)}")
            return None

    def crawl_rent_data(self):
        """Crawl up to ``max_pages`` pages of rental listings and save them as CSV.

        A failure while parsing a page stops the crawl; rows from earlier
        pages are still saved.

        Returns:
            The DataFrame that was written, or None when the crawl or the
            save failed (the error is printed).
        """
        all_data = []
        try:
            with self._get_driver() as driver:
                driver.get(self.rent_url)
                wait = WebDriverWait(driver, 5)

                # A bounded for-loop guarantees termination on the last page.
                for current_page in range(1, self.max_pages + 1):
                    print(f"正在爬取桥西租房第{current_page}页...")
                    try:
                        all_data.extend(self._parse_rent_page(driver, wait))
                    except Exception as e:
                        print(f"租房页面爬取错误: {str(e)}")
                        break

                    if current_page == self.max_pages:
                        break
                    if not self._click_next_page(
                        driver, wait, "//a[contains(text(), '下一页')]"
                    ):
                        print(f"第{current_page}页后无更多页面，终止爬取")
                        break

            return self._save_csv(all_data, "桥西租房")
        except Exception as e:
            print(f"租房爬取过程出错: {str(e)}")
            return None

if __name__ == "__main__":
    # Build a crawler limited to 30 result pages per listing type.
    crawler = QiaoxiHouseSpider(max_pages=30)

    # Second-hand sale listings first.
    crawler.crawl_ershou_data()

    # Pause 3 seconds between the two crawls to avoid requesting too frequently.
    time.sleep(3)

    # Then the rental listings.
    crawler.crawl_rent_data()
