# 3-2 Data Mining

In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service

# 二手房数据爬取
DRIVER_PATH = r'C:\Users\lshte\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe'
MAX_PAGES = 20

all_house_data = []

def initialize_driver():
    Opt = webdriver.ChromeOptions()
    Opt.add_argument('start-maximized')
    s = Service(DRIVER_PATH)
    driver = webdriver.Chrome(service = s, options=Opt)
    driver.implicitly_wait(10) # 隐式等待，增加元素加载的容错性
    return driver

def click_element(driver, by, value, timeout=20):
    try:
        # 显式等待：直到元素可点击
        element = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((by, value))
        )
        element.click()
        time.sleep(1)
        print(f"click on: {value}")
    except Exception as e:
        print(f"error: {e}")

def navigate_and_filter(driver):
    print("--- start navigate and filter ---")
    
    city_pinyin = 'tj' # 天津
    target_url = f'https://{city_pinyin}.esf.fang.com/'
    driver.get(target_url)
    # 滨海新区
    area_selector = '/html/body/div[4]/div[3]/div[2]/div[1]/ul/li[1]/ul/li[11]/a'
    click_element(driver, By.XPATH, area_selector)
    time.sleep(2)
    # 中新生态城
    sub_area_selector = '/html/body/div[4]/div[3]/div[2]/div[1]/ul/li[2]/ul/li[34]/a'
    click_element(driver, By.XPATH, sub_area_selector)

    print("filter accomplished")
    time.sleep(2)

def extract_house_data(driver, page_num):
    print(f"\n--- working on page {page_num} ---")
    house_list_items = driver.find_elements(By.CSS_SELECTOR, '.shop_list_4 > dl')
    
    if not house_list_items:
        print("no house found on this page.")
        return False

    for item in house_list_items:
        try:
            # 提取面积
            p_element = item.find_element(By.CSS_SELECTOR, 'p.tel_shop')
            full_info_text = p_element.text
            index_end = full_info_text.rfind('㎡')
            index_end -= 1
            area_text = ''
            for i in range(index_end, -1, -1):
                char = full_info_text[i]
                if char.isspace() or char == '|':
                    break
                elif char.isdigit() or char == '.':
                    area_text = char + area_text
                else :
                    break
            area_num = float(area_text)

            # 提取总价
            total_price_element = item.find_element(By.CSS_SELECTOR, 'span.red')
            total_price = total_price_element.text.replace('万', '').strip()
            total_price = float(total_price)
            
            # 提取单价
            unit_price_element = item.find_element(By.CSS_SELECTOR, 'dd.price_right > span:nth-child(2)')
            unit_price = unit_price_element.text.replace('元/㎡', '').strip()
            unit_price = float(unit_price)
            
            all_house_data.append({
                '面积(㎡)': area_num,
                '总价(万)': total_price,
                '单价(元/㎡)': unit_price
            })
            
        except Exception as e:
            print(f"error occurs at extracting house data: {e}")

    return True

def find_next_page(driver):
    try:
        # 查找 class 为 'next' 的链接，并且它必须是可点击的
        next_page_link = driver.find_element(By.XPATH, '//a[text()="下一页"]')
        
        if 'nohref' in next_page_link.get_attribute('class'):
            print("已到达最后一页或下一页链接不可用。")
            return False
            
        # 使用点击函数
        click_element(driver, By.XPATH, '//a[text()="下一页"]')
        return True
    except:
        print("未找到下一页按钮，爬取结束。")
        return False


driver = initialize_driver()

navigate_and_filter(driver)

for page in range(1, MAX_PAGES + 1):
    # 提取当前页数据
    success = extract_house_data(driver, page)
    if not success:
        break # 如果当前页提取失败，则退出

    # 检查是否需要翻页
    if page < MAX_PAGES:
        if not find_next_page(driver):
            break # 如果找不到下一页，则退出
    else:
        print(f"已达到最大爬取页数 {MAX_PAGES}。")


driver.quit()

--- start navigate and filter ---
click on: /html/body/div[4]/div[3]/div[2]/div[1]/ul/li[1]/ul/li[11]/a
click on: /html/body/div[4]/div[3]/div[2]/div[1]/ul/li[2]/ul/li[34]/a
filter accomplished

--- working on page 1 ---
click on: //a[text()="下一页"]

--- working on page 2 ---
click on: //a[text()="下一页"]

--- working on page 3 ---
click on: //a[text()="下一页"]

--- working on page 4 ---
click on: //a[text()="下一页"]

--- working on page 5 ---
click on: //a[text()="下一页"]

--- working on page 6 ---
click on: //a[text()="下一页"]

--- working on page 7 ---
click on: //a[text()="下一页"]

--- working on page 8 ---
click on: //a[text()="下一页"]

--- working on page 9 ---
click on: //a[text()="下一页"]

--- working on page 10 ---
click on: //a[text()="下一页"]

--- working on page 11 ---
click on: //a[text()="下一页"]

--- working on page 12 ---
click on: //a[text()="下一页"]

--- working on page 13 ---
click on: //a[text()="下一页"]

--- working on page 14 ---
click on: //a[text()="下一页"]

--- working on page 15 ---
cli

In [2]:
# 输出文件
OUTPUT_FILE = 'Tianjin_Zhongxin_esf_data.xlsx'
if all_house_data:
    df = pd.DataFrame(all_house_data)
    print("\n--- result display ---")
    print(df.head())
    df.to_excel(OUTPUT_FILE, index=False)
    print(f"\ndata has been saved to : {OUTPUT_FILE}")
else:
    print("no data to save.")


--- result display ---
   面积(㎡)  总价(万)  单价(元/㎡)
0   80.0   60.0   7500.0
1   89.0   70.0   7865.0
2  125.0  100.0   8000.0
3   87.0   56.0   6436.0
4  108.0   70.0   6481.0

data has been saved to : Tianjin_Zhongxin_esf_data.xlsx


In [3]:
# 租房数据爬取
def navigate_and_filter_zu(driver):
    print("--- start navigate and filter ---")
    
    city_pinyin = 'tj' # 天津
    target_url = f'https://{city_pinyin}.zu.fang.com/'
    driver.get(target_url)
    # 滨海新区
    area_selector = '/html/body/div[4]/div[2]/div[2]/div/dl[1]/dd/a[12]'
    click_element(driver, By.XPATH, area_selector)
    time.sleep(2)
    # 中新生态城
    sub_area_selector = '/html/body/div[4]/div[2]/div[2]/div[1]/div/a[35]'
    click_element(driver, By.XPATH, sub_area_selector)

    print("filter accomplished")
    time.sleep(2)

def extract_house_data_zu(driver, page_num):
    print(f"\n--- working on page {page_num} ---")
    house_list_items = driver.find_elements(By.CSS_SELECTOR, '.houseList > dl')
    
    if not house_list_items:
        print("no house found on this page.")
        return False

    for item in house_list_items:
        try:
            # 提取面积
            p_element = item.find_element(By.CSS_SELECTOR, 'p.font15.mt12.bold')
            full_info_text = p_element.text
            index_end = full_info_text.rfind('㎡')
            index_end -= 1
            area_text = ''
            for i in range(index_end, -1, -1):
                char = full_info_text[i]
                if char.isspace() or char == '|':
                    break
                elif char.isdigit() or char == '.':
                    area_text = char + area_text
                else :
                    break
            area_num = float(area_text)
            
            # 提取单价
            unit_price_element = item.find_element(By.CSS_SELECTOR, 'p.mt5.alingC > span.price')
            unit_price = unit_price_element.text.strip()
            unit_price = float(unit_price)
            
            all_house_data.append({
                '面积(㎡)': area_num,
                '租房单价(元/月)': unit_price
            })
            
        except Exception as e:
            print(f"error occurs at extracting house data: {e}")

    return True

driver = initialize_driver()

navigate_and_filter_zu(driver)

for page in range(1, MAX_PAGES + 1):
    # 提取当前页数据
    success = extract_house_data_zu(driver, page)
    if not success:
        break # 如果当前页提取失败，则退出

    # 检查是否需要翻页
    if page < MAX_PAGES:
        if not find_next_page(driver):
            break # 如果找不到下一页，则退出
    else:
        print(f"已达到最大爬取页数 {MAX_PAGES}。")


driver.quit()

--- start navigate and filter ---
click on: /html/body/div[4]/div[2]/div[2]/div/dl[1]/dd/a[12]
click on: /html/body/div[4]/div[2]/div[2]/div[1]/div/a[35]
filter accomplished

--- working on page 1 ---
click on: //a[text()="下一页"]

--- working on page 2 ---
click on: //a[text()="下一页"]

--- working on page 3 ---
click on: //a[text()="下一页"]

--- working on page 4 ---
click on: //a[text()="下一页"]

--- working on page 5 ---
click on: //a[text()="下一页"]

--- working on page 6 ---
click on: //a[text()="下一页"]

--- working on page 7 ---
click on: //a[text()="下一页"]

--- working on page 8 ---
click on: //a[text()="下一页"]

--- working on page 9 ---
click on: //a[text()="下一页"]

--- working on page 10 ---
click on: //a[text()="下一页"]

--- working on page 11 ---
click on: //a[text()="下一页"]

--- working on page 12 ---
click on: //a[text()="下一页"]

--- working on page 13 ---
click on: //a[text()="下一页"]

--- working on page 14 ---
click on: //a[text()="下一页"]

--- working on page 15 ---
click on: //a[text()="下

In [4]:
#输出文件
OUTPUT_FILE = 'Tianjin_Zhongxin_zu_data.xlsx'

if all_house_data:
    df = pd.DataFrame(all_house_data)
    print("\n--- result display ---")
    print(df.head())
    df.to_excel(OUTPUT_FILE, index=False)
    print(f"\ndata has been saved to : {OUTPUT_FILE}")
else:
    print("no data to save.")



--- result display ---
   面积(㎡)  总价(万)  单价(元/㎡)  租房单价(元/月)
0   80.0   60.0   7500.0        NaN
1   89.0   70.0   7865.0        NaN
2  125.0  100.0   8000.0        NaN
3   87.0   56.0   6436.0        NaN
4  108.0   70.0   6481.0        NaN

data has been saved to : Tianjin_Zhongxin_zu_data.xlsx
