In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random


In [2]:
# URL template for the listing pages; the page number is appended after "i3".
_LIST_URL_TEMPLATE = "https://sh.zu.fang.com/house-a019-b02768/i3{page}/"

# Browser-like request headers, built once instead of on every page.
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Connection": "keep-alive"
}

# Placeholder stored when a field cannot be extracted from the listing.
_UNKNOWN = '未知'

# Field order of the "|"-separated detail line and the "-"-separated area line.
_DETAIL_FIELDS = ('租赁类型', '户型', '面积', '朝向')
_AREA_FIELDS = ('区域', '商圈', '小区名称')


def _decode_response(response):
    """Return the body of ``response`` as text using a sensible charset.

    ``Response.text`` never raises ``UnicodeDecodeError`` (undecodable bytes
    are replaced), so the charset has to be chosen up front rather than by
    catching an exception. Requests reports ISO-8859-1 when a text response
    carries no charset header; in that case (or when no encoding is known at
    all) the charset detected from the body is used instead, with GB18030 as
    the last resort.
    """
    encoding = (response.encoding or '').lower()
    if not encoding or encoding == 'iso-8859-1':
        encoding = (response.apparent_encoding or 'gb18030').lower()
    # GB18030 is a superset of GBK/GB2312, so rarer characters survive decoding.
    if encoding in ('gb2312', 'gbk'):
        encoding = 'gb18030'
    response.encoding = encoding
    return response.text


def _has_note_class(css_class):
    """Class filter for ``find_all``: true when the class value contains 'note'."""
    return bool(css_class) and 'note' in css_class


def _parse_house(house):
    """Extract one listing node into a dict that always has the same keys."""
    info = {
        '标题': '无标题',
        '租赁类型': _UNKNOWN,
        '户型': _UNKNOWN,
        '面积': _UNKNOWN,
        '朝向': _UNKNOWN,
        '区域': _UNKNOWN,
        '商圈': _UNKNOWN,
        '小区名称': _UNKNOWN,
        '地铁信息': '无',
        '标签': '无',
        '租金': _UNKNOWN,
    }

    # 1. Title: prefer the link's title attribute (full title), else its text.
    title_p = house.find('p', class_='title')
    title_a = title_p.find('a') if title_p is not None else None
    if title_a is not None:
        title = (title_a.get('title') or '').strip() or title_a.text.strip()
        if title:
            info['标题'] = title

    # 2. Rental type | layout | area | orientation. Only the direct text
    #    children are kept; the separator elements between them are skipped.
    detail_p = house.find('p', class_='font15 mt12 bold')
    if detail_p is not None:
        detail_parts = [child.strip() for child in detail_p.contents
                        if isinstance(child, str)]
        detail_parts = [part for part in detail_parts if part and part != '|']
        for key, value in zip(_DETAIL_FIELDS, detail_parts):
            info[key] = value

    # 3. District - business area - community name, taken in text order.
    #    Tokens that are only a "-" separator are dropped so they cannot be
    #    mistaken for a field value.
    area_p = house.find('p', class_='gray6 mt12')
    if area_p is not None:
        area_parts = [text.strip() for text in area_p.stripped_strings]
        area_parts = [part for part in area_parts if part.strip('-').strip()]
        for key, value in zip(_AREA_FIELDS, area_parts):
            info[key] = value

    # 4. Subway information.
    subway_span = house.find('span', class_='note subInfor')
    if subway_span is not None:
        info['地铁信息'] = subway_span.text.strip()

    # 5. Tags: every span whose class contains "note". The subway span also
    #    matches that filter, so it is excluded here to keep the subway text
    #    out of the tag column (it already has its own field).
    labels = [span.text.strip()
              for span in house.find_all('span', class_=_has_note_class)
              if span is not subway_span]
    labels = [label for label in labels if label]
    if labels:
        info['标签'] = ', '.join(labels)

    # 6. Monthly rent.
    price_span = house.find('span', class_='price')
    if price_span is not None:
        info['租金'] = f"{price_span.text.strip()}元/月"

    return info


def crawl_house_info(max_pages=20, timeout=10):
    """Scrape rental listings from the paginated listing pages.

    Pages 1..max_pages are fetched in order with a random 1-3 second pause
    between consecutive requests. A page that fails (network error, HTTP error
    status, parsing problem) is reported and skipped so the remaining pages
    are still crawled.

    Args:
        max_pages: Number of listing pages to fetch (default 20).
        timeout: Per-request timeout in seconds passed to ``requests.get``,
            so a stalled connection cannot hang the crawl.

    Returns:
        A list of dicts, one per listing. Every dict has the same keys;
        fields that could not be extracted hold a placeholder value.
    """
    all_houses = []
    for page in range(1, max_pages + 1):
        # Pause before every request except the first, so the delay also
        # applies after a failed page.
        if page > 1:
            time.sleep(random.uniform(1, 3))

        print(f"正在爬取第{page}页...")
        url = _LIST_URL_TEMPLATE.format(page=page)

        try:
            response = requests.get(url, headers=_REQUEST_HEADERS,
                                    allow_redirects=True, timeout=timeout)
            # Do not parse error pages as if they were listing pages.
            response.raise_for_status()

            soup = BeautifulSoup(_decode_response(response), 'html.parser')
            house_list = soup.find_all('dl', class_='list hiddenMap rel')
            if not house_list:
                # Make an unexpectedly empty page visible instead of silent.
                print(f"第{page}页未解析到房源")

            for house in house_list:
                all_houses.append(_parse_house(house))

        except Exception as e:
            # Deliberate best effort: report the failure and move on so the
            # listings collected so far are not lost.
            print(f"爬取第{page}页时出错: {str(e)}")

    print(f"爬取完成，共获取{len(all_houses)}条租房信息")
    return all_houses


In [3]:
# Save the scraped records to a CSV file
def save_to_csv(houses, filename='housing_rent_data_longhua.csv'):
    """Write the listing dicts in ``houses`` to ``filename`` as CSV.

    The header is the sorted union of the keys of all records, so records
    with differing keys still fit one table (missing values become empty
    cells). The file is written as UTF-8 with a BOM. When ``houses`` is
    empty, a notice is printed and no file is created.
    """
    if houses:
        # Collect every key that appears in any record, in sorted order.
        columns = sorted({key for record in houses for key in record.keys()})

        with open(filename, 'w', newline='', encoding='utf-8-sig') as out_file:
            csv_writer = csv.DictWriter(out_file, fieldnames=columns)
            csv_writer.writeheader()
            csv_writer.writerows(houses)

        print(f"租房信息已保存到{filename}")
    else:
        print("没有租房信息可保存")


In [4]:
if __name__ == "__main__":
    # Crawl the listing pages; the result is a list with one dict per listing.
    rental_data = crawl_house_info()
    # Export only when something was scraped; an empty result skips the CSV step.
    if rental_data:
        save_to_csv(rental_data)


正在爬取第1页...
正在爬取第2页...
正在爬取第3页...
正在爬取第4页...
正在爬取第5页...
正在爬取第6页...
正在爬取第7页...
正在爬取第8页...
正在爬取第9页...
正在爬取第10页...
正在爬取第11页...
正在爬取第12页...
正在爬取第13页...
正在爬取第14页...
正在爬取第15页...
正在爬取第16页...
正在爬取第17页...
正在爬取第18页...
正在爬取第19页...
正在爬取第20页...
爬取完成，共获取1200条租房信息
租房信息已保存到housing_rent_data_longhua.csv
