In [None]:
# 导入必要的库
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import logging
import os
import sys
from logging.handlers import RotatingFileHandler
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

In [None]:
class LoggingConfig:
    """Logging configuration for the house crawler."""

    @staticmethod
    def setup_logging(log_dir="logs", log_level=logging.INFO):
        """Create (or fetch) the shared ``HouseCrawler`` logger.

        On the first call three handlers are attached: a console handler
        writing to stdout, a size-rotated main log file, and a size-rotated
        file that only receives ERROR and above. Later calls return the
        already-configured logger without adding handlers again.

        Args:
            log_dir: Directory that receives the log files; created if missing.
            log_level: Level applied to the logger and, on first setup, to the
                console handler and the main file handler.

        Returns:
            The ``logging.Logger`` named ``HouseCrawler``.
        """
        # exist_ok=True replaces the separate existence check, so a directory
        # created by someone else between the check and the creation no
        # longer raises FileExistsError.
        os.makedirs(log_dir, exist_ok=True)

        logger = logging.getLogger('HouseCrawler')
        logger.setLevel(log_level)

        # Handlers are attached only once per process; repeated setup calls
        # would otherwise duplicate every log line.
        if logger.handlers:
            return logger

        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )

        # Console output goes to stdout.
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(log_level)
        console_handler.setFormatter(formatter)

        # The date is computed once so both file names carry the same stamp
        # even when setup runs across midnight. Rotation itself is size-based
        # (RotatingFileHandler); the date only names the file at setup time.
        date_stamp = datetime.now().strftime('%Y%m%d')

        # Main log file: 10 MB per file, 5 backups.
        file_handler = RotatingFileHandler(
            os.path.join(log_dir, f"house_crawler_{date_stamp}.log"),
            maxBytes=10 * 1024 * 1024,
            backupCount=5,
            encoding='utf-8'
        )
        file_handler.setLevel(log_level)
        file_handler.setFormatter(formatter)

        # Error log file: ERROR and above only, 5 MB per file, 3 backups.
        error_file_handler = RotatingFileHandler(
            os.path.join(log_dir, f"error_{date_stamp}.log"),
            maxBytes=5 * 1024 * 1024,
            backupCount=3,
            encoding='utf-8'
        )
        error_file_handler.setLevel(logging.ERROR)
        error_file_handler.setFormatter(formatter)

        for handler in (console_handler, file_handler, error_file_handler):
            logger.addHandler(handler)

        return logger

class EnhancedHouseDataCrawler:
    """房屋数据爬虫"""
    
    def __init__(self):
        """Initialise empty result/driver containers and attach the shared logger."""
        # Every browser started through setup_driver is tracked here so that
        # close_all_drivers can shut them down later.
        self.drivers = []
        # Parsed records, filled by the two crawl_* methods.
        self.rental_results = []
        self.second_hand_results = []
        self.logger = LoggingConfig.setup_logging()
        
    def setup_driver(self, browser_type='chrome'):
        """设置浏览器驱动并记录日志"""
        try:
            self.logger.info(f"正在初始化{browser_type}浏览器驱动")
            
            if browser_type.lower() == 'chrome':
                driver = webdriver.Chrome()
            elif browser_type.lower() == 'edge':
                driver = webdriver.Edge()
            else:
                driver = webdriver.Chrome()
            
            driver.maximize_window()
            self.drivers.append(driver)
            self.logger.info(f"{browser_type}浏览器初始化成功")
            return driver
            
        except Exception as e:
            self.logger.error(f"浏览器初始化失败: {str(e)}", exc_info=True)
            return None

In [4]:


    
    def close_all_drivers(self):
        """关闭所有浏览器实例"""
        self.logger.info("正在关闭所有浏览器实例")
        for driver in self.drivers:
            try:
                driver.quit()
            except Exception as e:
                self.logger.warning(f"关闭浏览器时出现异常: {str(e)}")
    
    def extract_second_hand_info(self, house_element):
        """提取二手房信息并记录详细日志"""
        house_data = {"户型": "", "面积": "", "楼层": "", "朝向": "", "总价": "", "单价": ""}
        
        try:
            self.logger.debug("开始解析二手房元素")
            
            # 提取基本要素
            tel_shop = house_element.find("p", class_="tel_shop")
            if tel_shop:
                tel_text = tel_shop.get_text(strip=True).split("|")
                tel_text = [item.strip() for item in tel_text if item.strip()]
                
                if len(tel_text) >= 1:
                    house_data["户型"] = tel_text[0]
                if len(tel_text) >= 2:
                    house_data["面积"] = tel_text[1]
                if len(tel_text) >= 3:
                    house_data["楼层"] = tel_text[2]
                if len(tel_text) >= 4:
                    house_data["朝向"] = tel_text[3]
                
                self.logger.debug(f"解析到基础信息: 户型={house_data['户型']}, 面积={house_data['面积']}")
            
            # 提取价格信息
            price_right = house_element.find("dd", class_="price_right")
            if price_right:
                total_price = price_right.find("span", class_="red")
                if total_price:
                    house_data["总价"] = total_price.get_text(strip=True)
                
                unit_price = price_right.find("span", class_=None)
                if unit_price and "元/㎡" in unit_price.get_text():
                    house_data["单价"] = unit_price.get_text(strip=True)
                
                self.logger.debug(f"解析到价格信息: 总价={house_data['总价']}, 单价={house_data['单价']}")
            
            self.logger.info(f"二手房信息提取成功: {house_data['户型']} - {house_data['总价']}")
            
        except Exception as e:
            self.logger.error(f"二手房信息提取失败: {str(e)}", exc_info=True)
        
        return house_data
    
    def extract_rental_info(self, house_element):
        """提取租房信息并记录详细日志"""
        house_data = {
            "租赁类型": "", "户型": "", "面积": "", "朝向": "", 
            "区域": "", "交通（地铁距离）": "", "月租金": ""
        }
        
        try:
            self.logger.debug("开始解析租房元素")
            
            # 租赁类型
            basic_p = house_element.find("p", class_="font15 mt12 bold")
            if basic_p:
                basic_text = basic_p.get_text(strip=True)
                house_data["租赁类型"] = basic_text.split("|")[0].strip() or "整租"
            
            # 户型、面积、朝向
            if basic_p:
                basic_segments = [seg.strip() for seg in basic_text.split("|") if seg.strip()]
                if len(basic_segments) >= 2:
                    house_data["户型"] = basic_segments[1]
                if len(basic_segments) >= 3:
                    house_data["面积"] = basic_segments[2]
                if len(basic_segments) >= 4:
                    house_data["朝向"] = basic_segments[3].replace("朝", "")
            
            # 区域
            area_p = house_element.find("p", class_="gray6 mt12")
            if area_p:
                house_data["区域"] = area_p.get_text(strip=True)
            
            # 交通信息
            subway_span = house_element.find("span", class_="note subInfor")
            if subway_span:
                house_data["交通（地铁距离）"] = subway_span.get_text(strip=True)
            
            # 月租金
            more_info_div = house_element.find("div", class_="moreInfo")
            if more_info_div:
                price_num = more_info_div.find("span", class_="price")
                if price_num:
                    price_num_text = price_num.get_text(strip=True)
                    price_unit = more_info_div.get_text(strip=True).replace(price_num_text, "")
                    house_data["月租金"] = f"{price_num_text}{price_unit}"
            
            self.logger.info(f"租房信息提取成功: {house_data['户型']} - {house_data['月租金']}")
            self.logger.debug(f"详细租房数据: {house_data}")
            
        except Exception as e:
            self.logger.error(f"租房信息提取失败: {str(e)}", exc_info=True)
        
        return house_data
    
    def crawl_second_hand_houses(self):
        """爬取二手房数据"""
        self.logger.info("🏠 开始爬取二手房数据")
        
        base_url = "https://zhangjiakou.esf.fang.com/house-a011426/"
        target_pages = 5
        columns = ["户型", "面积", "楼层", "朝向", "总价", "单价"]
        
        driver = self.setup_driver('chrome')
        if not driver:
            self.logger.error("二手房爬虫启动失败：浏览器初始化失败")
            return []
        
        all_house_data = []
        successful_pages = 0
        
        for page_num in range(1, target_pages + 1):
            try:
                if page_num == 1:
                    current_url = base_url
                else:
                    current_url = f"{base_url}i3{page_num}/"
                
                self.logger.info(f"📄 正在访问二手房第{page_num}页: {current_url}")
                driver.get(current_url)
                
                # 等待页面加载
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.XPATH, "//dl[@dataflag='bg']"))
                )
                
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, "lxml")
                house_elements = soup.find_all("dl", attrs={"dataflag": "bg"})
                
                if not house_elements:
                    self.logger.warning(f"第{page_num}页未找到二手房数据")
                    if page_num == 1:
                        self.logger.error("第一页未找到数据，可能页面结构已变化")
                        break
                    continue
                
                self.logger.info(f"第{page_num}页找到{len(house_elements)}个房源")
                page_house_count = 0
                
                for idx, house_element in enumerate(house_elements, 1):
                    house_data = self.extract_second_hand_info(house_element)
                    if house_data.get("总价"):
                        all_house_data.append(house_data)
                        page_house_count += 1
                    
                    if idx % 5 == 0:
                        self.logger.debug(f"第{page_num}页: 已处理{idx}/{len(house_elements)}个房源")
                
                successful_pages += 1
                self.logger.info(f"第{page_num}页完成，提取{page_house_count}个有效房源")
                
                time.sleep(random.uniform(2, 4))
                
            except TimeoutException:
                self.logger.error(f"第{page_num}页加载超时")
            except Exception as e:
                self.logger.error(f"第{page_num}页爬取异常: {str(e)}", exc_info=True)
        
        self.logger.info(f"二手房爬取完成: 成功{successful_pages}页，共获取{len(all_house_data)}条数据")
        self.second_hand_results = all_house_data
        return all_house_data
    
    def crawl_rental_houses(self):
        """爬取租房数据"""
        self.logger.info("🏡 开始爬取租房数据")
        
        base_url = "http://zu.zhangjiakou.fang.com/house-a078/"
        target_pages = 2
        columns = ["租赁类型", "户型", "面积", "朝向", "区域", "交通（地铁距离）", "月租金"]
        
        driver = self.setup_driver('edge')
        if not driver:
            self.logger.error("租房爬虫启动失败：浏览器初始化失败")
            return []
        
        all_house_data = []
        successful_pages = 0
        
        for page_num in range(1, target_pages + 1):
            try:
                if page_num == 1:
                    current_url = base_url
                else:
                    current_url = f"{base_url}i{page_num}/"
                
                self.logger.info(f"📄 正在访问租房第{page_num}页: {current_url}")
                driver.get(current_url)
                
                WebDriverWait(driver, 15).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//dl[@class='list hiddenMap rel']"))
                )
                
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, "lxml")
                house_elements = soup.find_all("dl", class_="list hiddenMap rel")
                
                if not house_elements:
                    self.logger.warning(f"第{page_num}页未找到租房数据")
                    continue
                
                self.logger.info(f"第{page_num}页找到{len(house_elements)}个房源")
                page_house_count = 0
                
                for idx, house_element in enumerate(house_elements, 1):
                    house_data = self.extract_rental_info(house_element)
                    if house_data.get("月租金"):
                        all_house_data.append(house_data)
                        page_house_count += 1
                    
                    if idx % 5 == 0:
                        self.logger.debug(f"第{page_num}页: 已处理{idx}/{len(house_elements)}个房源")
                
                successful_pages += 1
                self.logger.info(f"第{page_num}页完成，提取{page_house_count}个有效房源")
                
                time.sleep(random.uniform(2, 4))
                
            except TimeoutException:
                self.logger.error(f"第{page_num}页加载超时")
            except Exception as e:
                self.logger.error(f"第{page_num}页爬取异常: {str(e)}", exc_info=True)
        
        self.logger.info(f"租房爬取完成: 成功{successful_pages}页，共获取{len(all_house_data)}条数据")
        self.rental_results = all_house_data
        return all_house_data
    
    def save_data(self, data, filename, columns):
        """保存数据到CSV文件"""
        try:
            if data:
                df = pd.DataFrame(data, columns=columns)
                df.to_csv(filename, index=False, encoding='utf-8-sig')
                self.logger.info(f"💾 数据已保存至: {filename}，共{len(data)}条记录")
                return True
            else:
                self.logger.warning(f"⚠️ {filename} 无数据可保存")
                return False
        except Exception as e:
            self.logger.error(f"数据保存失败 {filename}: {str(e)}", exc_info=True)
            return False
    
    def run_concurrent_crawlers(self):
        """同时运行两个爬虫"""
        self.logger.info("🚀 启动并发爬取模式")
        start_time = time.time()
        
        import threading
        
        # 创建线程
        second_hand_thread = threading.Thread(target=self.crawl_second_hand_houses)
        rental_thread = threading.Thread(target=self.crawl_rental_houses)
        
        # 启动线程
        second_hand_thread.start()
        rental_thread.start()
        
        # 等待线程完成
        second_hand_thread.join()
        rental_thread.join()
        
        # 保存数据
        second_hand_success = self.save_data(
            self.second_hand_results, 
            "张家口二手房数据.csv", 
            ["户型", "面积", "楼层", "朝向", "总价", "单价"]
        )
        
        rental_success = self.save_data(
            self.rental_results,
            "张家口租房数据.csv",
            ["租赁类型", "户型", "面积", "朝向", "区域", "交通（地铁距离）", "月租金"]
        )
        
        end_time = time.time()
        total_time = end_time - start_time
        
        self.logger.info(f"🎉 并发爬取完成! 总耗时: {total_time:.2f}秒")
        self.logger.info(f"🏠 二手房数据: {len(self.second_hand_results)}条")
        self.logger.info(f"🏡 租房数据: {len(self.rental_results)}条")
        
        self.close_all_drivers()
        return second_hand_success or rental_success

def main():
    """Interactive entry point: ask for a crawl mode, run it, always close browsers.

    Mode 1 crawls and saves second-hand listings, mode 2 crawls and saves
    rental listings, mode 3 runs both crawlers concurrently. Any other input
    logs an error and returns. ``KeyboardInterrupt`` and other exceptions are
    logged rather than propagated.
    """
    crawler = EnhancedHouseDataCrawler()
    log = crawler.logger

    separator = "=" * 50
    log.info(separator)
    log.info("🏠 房屋数据爬虫系统 - 增强日志版")
    log.info(separator)

    for menu_line in (
        "请选择爬虫模式:",
        "1. 只爬取二手房数据",
        "2. 只爬取租房数据",
        "3. 同时爬取二手房和租房数据（并发模式）",
    ):
        print(menu_line)

    choice = input("请输入选择 (1/2/3): ").strip()

    started_at = time.time()

    def run_second_hand_only():
        crawler.crawl_second_hand_houses()
        crawler.save_data(
            crawler.second_hand_results,
            "张家口二手房数据.csv",
            ["户型", "面积", "楼层", "朝向", "总价", "单价"]
        )

    def run_rental_only():
        crawler.crawl_rental_houses()
        crawler.save_data(
            crawler.rental_results,
            "张家口租房数据.csv",
            ["租赁类型", "户型", "面积", "朝向", "区域", "交通（地铁距离）", "月租金"]
        )

    def run_both():
        crawler.run_concurrent_crawlers()

    # Menu choice -> (log line announcing the mode, task to execute).
    modes = {
        "1": ("🔍 选择模式: 只爬取二手房数据", run_second_hand_only),
        "2": ("🔍 选择模式: 只爬取租房数据", run_rental_only),
        "3": ("🔍 选择模式: 并发爬取二手房和租房数据", run_both),
    }

    try:
        if choice not in modes:
            log.error("❌ 无效选择，请重新运行程序并输入1、2或3")
            return

        announcement, task = modes[choice]
        log.info(announcement)
        task()

        finished_at = time.time()
        log.info(f"✅ 所有任务完成! 总耗时: {finished_at - started_at:.2f}秒")

    except KeyboardInterrupt:
        log.warning("⏹️ 用户中断程序")
    except Exception as e:
        log.error(f"❌ 程序执行出错: {str(e)}", exc_info=True)
    finally:
        # Runs on every exit path, including the invalid-choice return.
        crawler.close_all_drivers()

# Start the interactive crawler only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

2025-10-16 05:13:07 - HouseCrawler - INFO - [2371627500.py:406] - 🏠 房屋数据爬虫系统 - 增强日志版
请选择爬虫模式:
1. 只爬取二手房数据
2. 只爬取租房数据
3. 同时爬取二手房和租房数据（并发模式）


请输入选择 (1/2/3):  3


2025-10-16 05:13:13 - HouseCrawler - INFO - [2371627500.py:438] - 🔍 选择模式: 并发爬取二手房和租房数据
2025-10-16 05:13:13 - HouseCrawler - INFO - [2371627500.py:361] - 🚀 启动并发爬取模式
2025-10-16 05:13:13 - HouseCrawler - INFO - [2371627500.py:214] - 🏠 开始爬取二手房数据
2025-10-16 05:13:13 - HouseCrawler - INFO - [2371627500.py:282] - 🏡 开始爬取租房数据
2025-10-16 05:13:13 - HouseCrawler - INFO - [2371627500.py:88] - 正在初始化chrome浏览器驱动
2025-10-16 05:13:13 - HouseCrawler - INFO - [2371627500.py:88] - 正在初始化edge浏览器驱动
2025-10-16 05:13:14 - HouseCrawler - INFO - [2371627500.py:99] - chrome浏览器初始化成功
2025-10-16 05:13:14 - HouseCrawler - INFO - [2371627500.py:235] - 📄 正在访问二手房第1页: https://zhangjiakou.esf.fang.com/house-a011426/
2025-10-16 05:13:14 - HouseCrawler - INFO - [2371627500.py:99] - edge浏览器初始化成功
2025-10-16 05:13:14 - HouseCrawler - INFO - [2371627500.py:303] - 📄 正在访问租房第1页: http://zu.zhangjiakou.fang.com/house-a078/
2025-10-16 05:13:16 - HouseCrawler - INFO - [2371627500.py:318] - 第1页找到60个房源
2025-10-16 05:13:16 - HouseCrawler