# HW3 Data Mining — 北京海淀·万柳

目标：抓取房天下（二手房与租房）万柳区域前 20 页数据，清洗并计算 `price/m2` 与 `rent/m2`，导出为 CSV。

说明：网站可能存在反爬策略，因此使用 Selenium + webdriver-manager 自动管理浏览器驱动。正如葛雷老师所强调的，代码要具备鲁棒性，所以这里对页面元素采用了多个备选 selector，并实现了稳健的分页逻辑。

In [37]:
%%capture
%pip install -q pandas numpy beautifulsoup4 lxml selenium webdriver-manager matplotlib

In [38]:
import re
import time
import random
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, WebDriverException

print('Versions -> pandas:', pd.__version__, 'numpy:', np.__version__)

Versions -> pandas: 2.2.3 numpy: 1.26.4


In [None]:
# Configuration: these labels are stamped onto every scraped row, and BLOCK is
# also the keyword typed into the site search box.
CITY = '北京'
DISTRICT = '海淀'
BLOCK = '万柳'
# Upper bound on the number of result pages visited per scrape loop.
MAX_PAGES = 20

# Output directory
# NOTE(review): OUTPUT_ROOT is an absolute path on one specific Windows machine;
# adjust it before running elsewhere.
OUTPUT_ROOT = Path(r'c:\Users\32854\Desktop\quant课\Data_mininig_HW3')
OUTPUT_DIR = OUTPUT_ROOT / 'output'
# Create the directory (and any missing parents); no error if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Destination files for the sales data, the rent data, and the combined table.
SALES_CSV = OUTPUT_DIR / 'beijing_haidian_wanliu_sales.csv'
RENT_CSV = OUTPUT_DIR / 'beijing_haidian_wanliu_rent.csv'
COMBINED_CSV = OUTPUT_DIR / 'beijing_haidian_wanliu_combined.csv'

print('Output dir:', OUTPUT_DIR)

Output dir: c:\Users\32854\Desktop\quant课\Data_mininig_HW3\output


In [None]:
def init_driver(headless=False):
    """Create and return a configured Chrome WebDriver.

    The chromedriver binary is resolved through webdriver-manager. The
    browser is started with flags intended to make the session look more
    like a regular user, and a 10-second implicit wait is set on the driver.

    Args:
        headless: When true, add the ``--headless=new`` flag.

    Returns:
        The started ``webdriver.Chrome`` instance.
    """
    chrome_options = Options()

    # Launch flags, added in the same order as before.
    for flag in (
        '--disable-blink-features=AutomationControlled',
        '--start-maximized',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    ):
        chrome_options.add_argument(flag)

    # Experimental options that drop the automation switch and extension.
    for option_name, option_value in (
        ('excludeSwitches', ['enable-automation']),
        ('useAutomationExtension', False),
    ):
        chrome_options.add_experimental_option(option_name, option_value)

    if headless:
        chrome_options.add_argument('--headless=new')

    browser = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )
    browser.implicitly_wait(10)
    return browser

def human_sleep(a=1.0, b=2.5):
    """Pause for a random duration drawn uniformly between *a* and *b* seconds."""
    delay = random.uniform(a, b)
    time.sleep(delay)

# Open the second-hand-sale (ESF) site in a visible browser window.
driver = init_driver(headless=False)
driver.get('https://esf.fang.com/')
human_sleep()
print('ESF homepage loaded')

# Try to switch to the target city (in case the page shows a city popup or a
# city-switch link). Best effort: any failure is reported and skipped.
try:
    beijing_links = driver.find_elements(By.PARTIAL_LINK_TEXT, CITY)
    if beijing_links:
        beijing_links[0].click()
        human_sleep()
        print('Switched to city:', CITY)
except Exception as e:
    print('City switch skipped:', e)

# Type BLOCK into the search box and submit the search.
try:
    # Try the common selectors in turn; the first one that exists is used.
    # NOTE(review): if none of the selectors matches, the loop ends without
    # printing anything, so a failed auto search can go unnoticed.
    for selector in ['input#keyword', 'input[name=keyword]', 'input[type=text]']:
        try:
            el = driver.find_element(By.CSS_SELECTOR, selector)
            el.clear()
            el.send_keys(BLOCK)
            human_sleep(0.5, 1.2)
            el.submit()
            human_sleep()
            print('Search submitted via', selector)
            break
        except NoSuchElementException:
            continue
except Exception as e:
    print('Auto search failed, you can search manually:', e)

ESF homepage loaded
Switched to city: 北京
Search submitted via input[type=text]


若上面的自动搜索失败：请在当前浏览器页面中手动搜索“万柳”，进入结果页后再运行下面的单元，开始抓取与翻页。

In [41]:
def to_float(text):
    """Extract a number from *text*, converting a "万" amount to yuan.

    Spaces and ASCII commas are removed first. If a number followed by "万"
    is present, it is returned multiplied by 10000; otherwise the first
    number in the text is returned.

    Returns:
        The parsed float, or NaN when *text* is None or holds no number.
    """
    if text is None:
        return np.nan

    cleaned = re.sub(r'[ ,]', '', str(text))

    # Amount expressed in units of 10,000 ("万") -> yuan
    wan_match = re.search(r'(\d+(?:\.\d+)?)\s*万', cleaned)
    if wan_match is not None:
        return float(wan_match.group(1)) * 10000

    plain_match = re.search(r'(\d+(?:\.\d+)?)', cleaned)
    if plain_match is None:
        return np.nan
    return float(plain_match.group(1))

def to_area_m2(text):
    """Extract a floor area in square metres from *text*.

    A number immediately followed by an area unit is preferred. Recognised
    units are ``㎡``, ``m²``, ``m2``, ``平方米`` and ``平米``. Matching the unit
    first keeps an earlier unrelated number (for example the room count in
    "3室2厅 89.5平米") from being mistaken for the area.

    Args:
        text: Any value; it is converted with ``str`` before matching.

    Returns:
        The area as a float. If no unit-tagged number is found, the first
        number in the text is returned as a fallback. NaN when *text* is
        None or contains no number.
    """
    if text is None:
        return np.nan
    raw = str(text)
    m = re.search(
        r'(\d+(?:\.\d+)?)\s*(?:㎡|m²|m2|平方米|平米)',
        raw,
        flags=re.IGNORECASE,
    )
    if m:
        return float(m.group(1))
    # Fallback: take the first number only
    m = re.search(r'(\d+(?:\.\d+)?)', raw)
    return float(m.group(1)) if m else np.nan

def to_unit_price(text):
    """Extract a per-square-metre price written as "<number> 元/㎡".

    ASCII commas are stripped before matching, so "141,983元/㎡" parses as
    141983.0.

    Returns:
        The price as a float, or NaN when *text* is None or has no
        "元/㎡" amount.
    """
    if text is None:
        return np.nan
    without_commas = str(text).replace(',', '')
    price_match = re.search(r'(\d+(?:\.\d+)?)\s*元/㎡', without_commas)
    if price_match is None:
        return np.nan
    return float(price_match.group(1))

def parse_esf_page(page_source):
    """Parse one second-hand-sale (ESF) result page into a DataFrame.

    The page structure may change, so several alternative container and
    field selectors are tried, with regex fallbacks on the card's full text.

    Args:
        page_source: HTML of the result page (e.g. ``driver.page_source``).

    Returns:
        A DataFrame with one row per candidate element and the columns
        ``source``, ``title``, ``total_price_yuan``,
        ``unit_price_yuan_per_m2``, ``area_m2``, ``rooms`` and ``location``.
        The frame is empty (no columns) when nothing matched.
    """
    soup = BeautifulSoup(page_source, 'lxml')
    items = []
    # The container selectors can match elements nested inside each other, so
    # the same candidate tag may be reachable through more than one container.
    # Remember which tags were already handled so each yields at most one row.
    seen = set()
    # Common containers
    for block in soup.select('div.shop_list, div.listBox, div#listBox'):
        # Entries may be dl / li / div elements
        for c in block.select('dl, li, div[class*=list]'):
            if id(c) in seen:
                continue
            seen.add(id(c))
            txt = c.get_text(' ', strip=True)
            if not txt:
                continue
            title = None
            tlink = c.select_one('a[title], a[class*=title], p.title a')
            if tlink and tlink.get_text(strip=True):
                title = tlink.get_text(strip=True)
            # Price (total / unit): try structured nodes first, then fall
            # back to parsing the card's full text.
            p_total = c.select_one('dd.price span, span.price, b.price, div.price, p.price')
            p_unit = c.select_one('dd.price i, span.unitPrice, i.unitPrice, p.unitPrice')
            total_price = to_float(p_total.get_text(strip=True)) if p_total else to_float(txt)
            unit_price = to_unit_price(p_unit.get_text(strip=True)) if p_unit else to_unit_price(txt)
            # Area: dedicated nodes first, the whole card as a last resort
            area_m2 = None
            for cand in [c.select_one('dd.area p'), c.select_one('span.area'), c.select_one('p.area'), c]:
                if cand:
                    area_m2 = to_area_m2(cand.get_text(' ', strip=True))
                    if not np.isnan(area_m2):
                        break
            # Layout, e.g. "3室2厅"
            rooms = None
            m = re.search(r'(\d+)室(\d+)厅', txt)
            if m:
                rooms = f"{m.group(1)}室{m.group(2)}厅"
            # Community / location
            loc = c.select_one('dd.address p a, p.address a, span.address a')
            if loc:
                location = loc.get_text(strip=True)
            else:
                # Fallback: first occurrence of one of the known place names
                m2 = re.search(r'(海淀|万柳|北京)', txt)
                location = m2.group(1) if m2 else None
            items.append({
                'source': 'esf',
                'title': title,
                'total_price_yuan': total_price,
                'unit_price_yuan_per_m2': unit_price,
                'area_m2': area_m2,
                'rooms': rooms,
                'location': location
            })
    return pd.DataFrame(items)

def go_next_page(driver):
    """Click the "next page" link using the first locator that works.

    Locators are tried in order: exact link text, partial link text, then a
    CSS selector. The click is issued through JavaScript and followed by a
    randomized pause.

    Args:
        driver: The WebDriver showing the current result page.

    Returns:
        True once a link has been clicked, False if every locator failed.
        NOTE(review): True only means the click was issued; the function
        does not check that the page content actually changed.
    """
    next_locators = (
        (By.LINK_TEXT, '下一页'),
        (By.PARTIAL_LINK_TEXT, '下一'),
        (By.CSS_SELECTOR, 'a.next, a[rel=next]'),
    )
    for strategy, value in next_locators:
        try:
            link = driver.find_element(strategy, value)
            driver.execute_script('arguments[0].click();', link)
            human_sleep()
        except (NoSuchElementException, WebDriverException):
            # Missing element or any other driver error: try the next locator.
            continue
        else:
            return True
    return False

# Starting from the current search-result page, scrape page by page
# (at most MAX_PAGES pages).
esf_rows = []
page = 1
while page <= MAX_PAGES:
    df = parse_esf_page(driver.page_source)
    if not df.empty:
        # Tag every row with the fixed region labels and the page it came from.
        df['city'] = CITY
        df['district'] = DISTRICT
        df['block'] = BLOCK
        df['page'] = page
        esf_rows.append(df)
        print(f'ESF page {page}: {len(df)} rows')
    else:
        print(f'ESF page {page}: no rows parsed')
    if not go_next_page(driver):
        print('No next page found, stop at page', page)
        break
    page += 1

esf_df = pd.concat(esf_rows, ignore_index=True) if esf_rows else pd.DataFrame()
print('ESF total rows:', len(esf_df))
esf_df.to_csv(SALES_CSV, index=False, encoding='utf-8-sig')
print('Saved:', SALES_CSV)

# Close this browser session once the sales data is saved. The rent scrape
# starts its own driver and rebinds `driver`, so leaving this one open would
# leak a browser/driver process.
driver.quit()
print('Driver closed')

ESF page 1: 141 rows
ESF page 2: 141 rows
ESF page 3: 141 rows
ESF page 4: 141 rows
ESF page 5: 141 rows
ESF page 6: 141 rows
ESF page 7: 141 rows
ESF page 8: 141 rows
ESF page 9: 141 rows
ESF page 10: 141 rows
ESF page 11: 141 rows
ESF page 12: 141 rows
ESF page 13: 141 rows
ESF page 14: 141 rows
ESF page 15: 141 rows
ESF page 16: 141 rows
ESF page 17: 141 rows
ESF page 18: 141 rows
ESF page 19: 141 rows
ESF page 20: 141 rows
ESF total rows: 2820
Saved: c:\Users\32854\Desktop\quant课\Data_mininig_HW3\output\beijing_haidian_wanliu_sales.csv


In [42]:
# Drop every row that has a missing value in any column, then overwrite the
# sales CSV. The path comes from SALES_CSV instead of a relative 'output\\...'
# string, so the cell does not depend on the working directory or on Windows
# path separators. utf-8-sig matches the encoding the file was first written with.
df_origin = pd.read_csv(SALES_CSV)
df_origin.dropna(inplace=True)
df_origin.to_csv(SALES_CSV, index=False, encoding='utf-8-sig')


In [43]:
# Reload the cleaned sales data and show its columns. SALES_CSV is used so the
# read does not depend on the working directory or on Windows path separators.
df_cleaned = pd.read_csv(SALES_CSV)
df_cleaned.columns


Index(['source', 'title', 'total_price_yuan', 'unit_price_yuan_per_m2',
       'area_m2', 'rooms', 'location', 'city', 'district', 'block', 'page'],
      dtype='object')

In [46]:
# Scrape the rental data: open the rental (ZU) site in a new browser session.
driver = init_driver(headless=False)
driver.get('https://zu.fang.com/')
human_sleep()
print('ZU homepage loaded')

# Search for BLOCK (as before, if this fails you can type it manually and then
# continue with the next step).
# NOTE(review): if the selector is not found, the loop ends without printing
# anything, so a failed auto search can go unnoticed.
try:
    for selector in ['#input_key']:
        try:
            el = driver.find_element(By.CSS_SELECTOR, selector)
            el.clear()
            el.send_keys(BLOCK)
            human_sleep(0.5, 1.2)
            el.submit()
            human_sleep()
            print('ZU search submitted via', selector)
            break
        except NoSuchElementException:
            continue
except Exception as e:
    print('ZU auto search failed, you can search manually:', e)

def parse_zu_page(page_source):
    """Parse one rental (ZU) result page into a DataFrame.

    Args:
        page_source: HTML of the result page (e.g. ``driver.page_source``).

    Returns:
        A DataFrame with one row per candidate element and the columns
        ``source``, ``title``, ``rent_month_yuan``, ``area_m2`` and
        ``location``. The frame is empty (no columns) when nothing matched.
    """
    soup = BeautifulSoup(page_source, 'lxml')
    items = []
    # The container selectors can match elements nested inside each other, so
    # the same candidate tag may be reachable through more than one container.
    # Remember which tags were already handled so each yields at most one row.
    seen = set()
    for block in soup.select('div.houseList, div.listBox, div#listBox'):
        for c in block.select('dl, li, div[class*=list]'):
            if id(c) in seen:
                continue
            seen.add(id(c))
            txt = c.get_text(' ', strip=True)
            if not txt:
                continue
            title = None
            tlink = c.select_one('a[title], a[class*=title], p.title a')
            if tlink and tlink.get_text(strip=True):
                title = tlink.get_text(strip=True)
            # Monthly rent, written as "<number> 元/月" (commas stripped first)
            rent_month = None
            rm = re.search(r'(\d+(?:\.\d+)?)\s*元/月', txt.replace(',', ''))
            if rm:
                rent_month = float(rm.group(1))
            # Area, parsed from the card's full text
            area_m2 = to_area_m2(txt)
            # Location: address link if present, else a known place name
            loc = c.select_one('dd.address p a, p.address a, span.address a')
            if loc:
                location = loc.get_text(strip=True)
            else:
                m2 = re.search(r'(海淀|万柳|北京)', txt)
                location = m2.group(1) if m2 else None
            items.append({
                'source': 'zu',
                'title': title,
                'rent_month_yuan': rent_month,
                'area_m2': area_m2,
                'location': location
            })
    return pd.DataFrame(items)

# Scrape the rental result pages one by one, at most MAX_PAGES pages,
# starting from whatever page the browser currently shows.
zu_rows = []
page = 1
while page <= MAX_PAGES:
    df = parse_zu_page(driver.page_source)
    if not df.empty:
        # Tag every row with the fixed region labels and the page it came from.
        df['city'] = CITY
        df['district'] = DISTRICT
        df['block'] = BLOCK
        df['page'] = page
        zu_rows.append(df)
        print(f'ZU page {page}: {len(df)} rows')
    else:
        print(f'ZU page {page}: no rows parsed')
    # Stop as soon as no "next page" link can be clicked.
    if not go_next_page(driver):
        print('No next page found, stop at page', page)
        break
    page += 1

# Combine the per-page frames (an empty frame if nothing was parsed) and save.
zu_df = pd.concat(zu_rows, ignore_index=True) if zu_rows else pd.DataFrame()
print('ZU total rows:', len(zu_df))
zu_df.to_csv(RENT_CSV, index=False, encoding='utf-8-sig')
print('Saved:', RENT_CSV)

# Close the browser session now that scraping is finished.
driver.quit()
print('Driver closed')

ZU homepage loaded
ZU search submitted via #input_key
ZU page 1: 204 rows
ZU page 2: 204 rows
ZU page 3: 204 rows
ZU page 4: 204 rows
ZU page 5: 204 rows
ZU page 6: 204 rows
ZU page 7: 204 rows
ZU page 8: 204 rows
ZU page 9: 204 rows
ZU page 10: 204 rows
ZU page 11: 204 rows
ZU page 12: 204 rows
ZU page 13: 204 rows
ZU page 14: 204 rows
ZU page 15: 204 rows
ZU page 16: 204 rows
ZU page 17: 204 rows
ZU page 18: 212 rows
ZU page 19: 204 rows
ZU page 20: 204 rows
ZU total rows: 4088
Saved: c:\Users\32854\Desktop\quant课\Data_mininig_HW3\output\beijing_haidian_wanliu_rent.csv
Driver closed


In [47]:
# Same cleanup for the rent data: drop every row with a missing value in any
# column, then overwrite the rent CSV. The path comes from RENT_CSV instead of
# a relative 'output\\...' string, so the cell does not depend on the working
# directory or on Windows path separators. utf-8-sig matches the encoding the
# file was first written with.
df_rent = pd.read_csv(RENT_CSV)
df_rent.dropna(inplace=True)
df_rent.to_csv(RENT_CSV, index=False, encoding='utf-8-sig')
df_rent.columns

Index(['source', 'title', 'rent_month_yuan', 'area_m2', 'location', 'city',
       'district', 'block', 'page'],
      dtype='object')

In [48]:
# Repeat of the rent cleanup: drop rows with any missing value and overwrite
# the rent CSV. On an already-cleaned file this should be a no-op.
# RENT_CSV replaces the relative 'output\\...' string so the cell does not
# depend on the working directory; utf-8-sig matches the original encoding.
df_rent = pd.read_csv(RENT_CSV)
df_rent.dropna(inplace=True)
df_rent.to_csv(RENT_CSV, index=False, encoding='utf-8-sig')
df_rent.columns

Index(['source', 'title', 'rent_month_yuan', 'area_m2', 'location', 'city',
       'district', 'block', 'page'],
      dtype='object')

In [49]:
# Repeat of the rent cleanup: drop rows with any missing value and overwrite
# the rent CSV. On an already-cleaned file this should be a no-op.
# RENT_CSV replaces the relative 'output\\...' string so the cell does not
# depend on the working directory; utf-8-sig matches the original encoding.
df_rent = pd.read_csv(RENT_CSV)
df_rent.dropna(inplace=True)
df_rent.to_csv(RENT_CSV, index=False, encoding='utf-8-sig')
df_rent.columns

Index(['source', 'title', 'rent_month_yuan', 'area_m2', 'location', 'city',
       'district', 'block', 'page'],
      dtype='object')

In [None]:
#df_rent.head(5)

In [50]:
# Compute the per-square-metre metrics for both datasets
def _safe_ratio(frame, numerator, denominator):
    """Return frame[numerator] / frame[denominator] as floats, NaN where undefined."""
    if numerator not in frame.columns or denominator not in frame.columns:
        return pd.Series(np.nan, index=frame.index, dtype='float64')
    top = pd.to_numeric(frame[numerator], errors='coerce')
    bottom = pd.to_numeric(frame[denominator], errors='coerce')
    # A zero or negative area would give inf or a meaningless value; use NaN.
    return top / bottom.where(bottom > 0)


def compute_metrics(esf_df, zu_df):
    """Add ``price_per_m2`` / ``rent_per_m2`` and a ``dataset`` label.

    The inputs are copied and left unmodified.

    Args:
        esf_df: Sales rows. ``unit_price_yuan_per_m2`` is used when present;
            rows where it is missing fall back to
            ``total_price_yuan / area_m2``.
        zu_df: Rent rows with ``rent_month_yuan`` and ``area_m2``.

    Returns:
        ``(esf, zu)``: the sales copy with ``price_per_m2`` and
        ``dataset == 'sales'``, and the rent copy with ``rent_per_m2``
        (monthly rent divided by area) and ``dataset == 'rent'``. Frames
        lacking the needed columns, including empty ones, get NaN metrics
        instead of raising ``KeyError``.
    """
    esf = esf_df.copy()
    zu = zu_df.copy()
    # price/m2: prefer the listed unit price, and fill each missing value
    # with total price / area.
    derived_price = _safe_ratio(esf, 'total_price_yuan', 'area_m2')
    if 'unit_price_yuan_per_m2' in esf.columns:
        esf['price_per_m2'] = esf['unit_price_yuan_per_m2'].fillna(derived_price)
    else:
        esf['price_per_m2'] = derived_price
    # Rent per square metre (monthly rent divided by area)
    zu['rent_per_m2'] = _safe_ratio(zu, 'rent_month_yuan', 'area_m2')
    # Label each table so the rows stay distinguishable after concatenation.
    esf['dataset'] = 'sales'
    zu['dataset'] = 'rent'
    return esf, zu

# Load whichever CSVs exist (an empty frame stands in for a missing file),
# compute the metrics, stack both tables and save the combined result.
esf_df = pd.read_csv(SALES_CSV) if SALES_CSV.exists() else pd.DataFrame()
zu_df = pd.read_csv(RENT_CSV) if RENT_CSV.exists() else pd.DataFrame()
esf_out, zu_out = compute_metrics(esf_df, zu_df)
# Row-wise concatenation; columns present in only one table become NaN in the other.
combined = pd.concat([esf_out, zu_out], ignore_index=True)
combined.to_csv(COMBINED_CSV, index=False, encoding='utf-8-sig')
print('Saved combined:', COMBINED_CSV)

# Quick summary
def quick_summary(esf, zu):
    """Return row counts and median per-m2 values for the non-empty tables.

    Args:
        esf: Sales frame with a ``price_per_m2`` column.
        zu: Rent frame with a ``rent_per_m2`` column.

    Returns:
        A dict with ``sales_rows`` / ``sales_price_per_m2_median`` when *esf*
        is non-empty and ``rent_rows`` / ``rent_per_m2_median`` when *zu* is
        non-empty. Non-numeric values are coerced to NaN before the median.
    """
    stats = {}
    plan = (
        (esf, 'sales_rows', 'sales_price_per_m2_median', 'price_per_m2'),
        (zu, 'rent_rows', 'rent_per_m2_median', 'rent_per_m2'),
    )
    for frame, rows_key, median_key, column in plan:
        if frame.empty:
            continue
        stats[rows_key] = len(frame)
        numeric = pd.to_numeric(frame[column], errors='coerce')
        stats[median_key] = float(numeric.median())
    return stats

# Print the summary dict, then build a one-row DataFrame from it for display.
print('Summary:', quick_summary(esf_out, zu_out))
# Put the summary into a single DataFrame
summary_df = pd.DataFrame([quick_summary(esf_out, zu_out)])
summary_df


Saved combined: c:\Users\32854\Desktop\quant课\Data_mininig_HW3\output\beijing_haidian_wanliu_combined.csv
Summary: {'sales_rows': 1175, 'sales_price_per_m2_median': 141983.0, 'rent_rows': 2400, 'rent_per_m2_median': 151.8987341772152}


Unnamed: 0,sales_rows,sales_price_per_m2_median,rent_rows,rent_per_m2_median
0,1175,141983.0,2400,151.898734


In [51]:
# Export in the format shared with the group: keep only the numeric columns.
# Paths are built from OUTPUT_DIR instead of relative 'output\\...' strings so
# the files land next to the other outputs regardless of the working
# directory or the operating system's path separator.
df_cleaned_sales = df_cleaned[['total_price_yuan','unit_price_yuan_per_m2','area_m2']]
df_cleaned_sales.to_csv(OUTPUT_DIR / 'beijing_haidian_wanliu_sales_groupshare.csv', index=False)
df_cleaned_rent = df_rent[['rent_month_yuan','area_m2']]
df_cleaned_rent.to_csv(OUTPUT_DIR / 'beijing_haidian_wanliu_rent_groupshare.csv', index=False)


In [None]:
# Display the cleaned rent DataFrame.
df_rent

Unnamed: 0,source,title,rent_month_yuan,area_m2,location,city,district,block,page
0,zu,15套起看 媲美万柳书院万城华府玺园 稳定出租,80000.0,416.0,万柳,北京,海淀,万柳,1
1,zu,10套可看万城华府尚园 新出平层4居室 带车库 同看,58000.0,468.0,海淀,北京,海淀,万柳,1
2,zu,新出位置安静 可直接入住 楼层好 330平米 三面采,138000.0,330.0,海淀,北京,海淀,万柳,1
3,zu,万城华府海园 5室3厅4卫 视野开阔 主卧朝南,120000.0,381.0,海淀,北京,海淀,万柳,1
4,zu,20套起看万城华府四面采光好山阁 万城华府龙园一层花,88000.0,395.0,海淀,北京,海淀,万柳,1
...,...,...,...,...,...,...,...,...,...
2395,zu,2室2厅阳春光华枫树园,15900.0,107.0,海淀,北京,海淀,万柳,20
2396,zu,南北通透万泉庄1室1厅精装修,6700.0,45.0,海淀,北京,海淀,万柳,20
2397,zu,4室2厅万泉新新家园,36000.0,246.0,海淀,北京,海淀,万柳,20
2398,zu,3室2厅光大水墨风景,31000.0,164.0,海淀,北京,海淀,万柳,20


: 