In [38]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import re

def fetch_area_ids(url):
    """
    Collect the element ids of every area link on the page.

    A new Chrome WebDriver is started, ``url`` is loaded, and every ``<a>``
    element carrying an ``onclick`` attribute is inspected. Links whose
    handler text starts with ``showAdminArea`` are treated as area links.

    :param url: URL of the page to scan.
    :return: list with the ``id`` attribute of each matching link.
    """
    # Requires ChromeDriver to be installed and reachable on PATH.
    driver = webdriver.Chrome()
    try:
        driver.get(url)

        # Every anchor with an onclick handler is a candidate.
        links = driver.find_elements(By.XPATH, "//a[@onclick]")

        # Read the handler once per link; `or ""` guards against a driver
        # that reports the attribute as None.
        return [
            link.get_attribute('id')
            for link in links
            if (link.get_attribute("onclick") or "").startswith("showAdminArea")
        ]
    finally:
        # Always release the browser, even when navigation or lookup fails.
        driver.quit()

# 使用範例
# url = "目標網頁URL"  # 替換成實際的網頁URL
# area_ids = fetch_area_ids(url)
# print("找到的地區元素ID：", area_ids)


In [39]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def fetch_district_ids(url, area_ids):
    """
    Click each area link and collect the ids of the district links shown.

    A new Chrome WebDriver is started and ``url`` is loaded once. For every
    id in ``area_ids`` the element with that id is clicked, then all ``<a>``
    elements that have an ``onclick`` attribute and whose text contains
    '區' are collected.

    :param url: URL of the page to scan.
    :param area_ids: iterable of area element ids to click.
    :return: dict mapping each area id to the list of district element ids
        found after clicking it.
    """
    # Single definition of the locator used both for waiting and for lookup.
    district_xpath = "//a[@onclick and contains(text(),'區')]"

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        district_ids = {}

        for area_id in area_ids:
            # Simulate a click on the area element once it is clickable.
            area_element = wait.until(EC.element_to_be_clickable((By.ID, area_id)))
            area_element.click()

            # Wait until at least one district link is visible.
            wait.until(EC.visibility_of_element_located((By.XPATH, district_xpath)))

            district_elements = driver.find_elements(By.XPATH, district_xpath)
            district_ids[area_id] = [
                element.get_attribute('id') for element in district_elements
            ]

            # NOTE(review): the page is not reset between areas; if the area
            # list is hidden after a click, a back-click or reload is needed
            # here before the next iteration -- confirm against the site.

        return district_ids
    finally:
        # Release the browser even when a wait times out.
        driver.quit()

# 假設已有的地區id陣列
# area_ids = ["taipei", "newtaipei"]  # 示範用地區id，請替換成實際的id
# url = "目標網頁URL"  # 替換成實際的網頁URL
# district_ids = fetch_district_ids(url, area_ids)
# print("找到的區域元素ID：", district_ids)


In [40]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def fetch_table_data_with_pagination(url, district_ids):
    """
    Click each district link and collect the text of every table cell,
    following the "next page" link until none is found.

    A new Chrome WebDriver is started and ``url`` is loaded once. For each
    id in ``district_ids`` the element is clicked, then for every page all
    ``<table>`` elements are read row by row.

    :param url: URL of the page to scan.
    :param district_ids: iterable of district element ids (iterating a dict
        yields its keys).
    :return: dict mapping each district id to a list of rows, each row being
        the list of ``<td>`` texts (rows without ``<td>`` yield ``[]``).
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        all_table_data = {}

        for district_id in district_ids:
            # Simulate a click on the district element once it is clickable.
            district_element = wait.until(EC.element_to_be_clickable((By.ID, district_id)))
            district_element.click()

            district_table_data = []

            # Walk through every result page.
            while True:
                # Wait for a table to be rendered before reading.
                wait.until(EC.visibility_of_element_located((By.TAG_NAME, "table")))

                for table in driver.find_elements(By.TAG_NAME, "table"):
                    for row in table.find_elements(By.TAG_NAME, "tr"):
                        cols = row.find_elements(By.TAG_NAME, "td")
                        district_table_data.append([col.text for col in cols])

                # Try to move to the next page; any failure to find or click
                # the link ends pagination. `Exception` (not a bare except)
                # so Ctrl-C / SystemExit still interrupt the scrape.
                try:
                    next_page_button = driver.find_element(By.XPATH, "//a[@onclick='nextPage()']")
                    next_page_button.click()
                except Exception:
                    break

            all_table_data[district_id] = district_table_data

            # NOTE(review): the page is not reset between districts; a
            # back-click or reload may be needed here -- confirm on the site.

        return all_table_data
    finally:
        # Release the browser even when a wait times out.
        driver.quit()

# 假設已有的區域id列表
# district_ids = ["district1", "district2"]  # 示範用區域id，請替換成實際的id
# url = "目標網頁URL"  # 替換成實際的網頁URL
# table_data = fetch_table_data_with_pagination(url, district_ids)
# print("找到的表格數據：", table_data)


In [41]:
# Maps the CSS class of a service icon to the service name shown to users.
# Each class appears exactly once (a repeated "store45" entry was removed;
# it carried the same value, so the resulting dict is unchanged).
service_mapping = {
    "store02": "夯番薯",
    "store05": "Fami!ce(有販售店)",
    "store04": "休憩區",
    "store27": "烤馬鈴薯",
    "store38": "SOHOT炎選-現烤點心",
    "store39": "ChargeSPOT",
    "store41": "福爾摩沙茶館",
    "store45": "鼎王麻辣蛋",
    "store23": "塑環真®循環杯",
    "store16": "SOHOT炎選-炸烤物",
    "store29": "馬尚煮",
    "store53": "哈肯舖",
    "store28": "咖啡複合店",
    "store48": "FamiSuper",
    "store26": "Fami自助洗衣",
    "store40": "智能咖啡機",
    "store10": "Fami!ce(雙口味店)",
    "store44": "Fami!ce(圓滾滾店)",
    "store50": "Fami!ce(萌布朗店)",
    "store42": "picard",
    "store18": "天和鮮物",
    "store06": "生鮮蔬菜",
    "store32": "好市多專架",
    "store52": "小熊菓子",
    "store54": "穆斯林友善商品店舖",
    "store08": "相片立可得",
    "store46": "gogoro電池交換站",
    "store03": "廁所",
    "store49": "蒸新鮮",
    "store14": "現磨咖啡",
    # Add further entries here as new service icons appear.
}


In [42]:


# def main(url):
#     driver = webdriver.Chrome()
#     driver.get(url)
    
#     area_ids = fetch_area_ids(driver, url)
#     all_data = {}
    
#     for area_id in area_ids:
#         district_ids = fetch_district_ids(driver, area_id)
#         for district_id in district_ids:
#             table_data = fetch_table_data_with_pagination(driver, district_id)
#             all_data[district_id] = table_data
    
#     driver.quit()
#     return all_data




In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import requests
def fetch_area_ids(driver, url):
    """
    Load ``url`` in ``driver`` and return the text of every city block.

    Every ``<div>`` whose ``class`` attribute contains "city" is read; its
    text is printed as it is collected and then returned in page order.

    :param driver: WebDriver used to load and query the page.
    :param url: URL of the page to load.
    :return: list of the matched elements' text.
    """
    def _report(label):
        # Echo each name while collecting so progress is visible.
        print(f"City Name: {label}")
        return label

    driver.get(url)
    city_nodes = driver.find_elements(By.XPATH, "//div[contains(@class, 'city')]")
    return [_report(node.text) for node in city_nodes]

def fetch_district_ids(driver, area_id):
    """
    Select an area on the already-loaded page and return its district names.

    Calls the page's ``showAdminArea`` JavaScript function with ``area_id``,
    then reads the text of every ``<li>`` inside the element with id
    ``showTownList``.

    :param driver: WebDriver whose current page defines ``showAdminArea``.
    :param area_id: value handed to ``showAdminArea``.
    :return: list with the text of each ``<li>`` in the district list.
    :raises selenium.common.exceptions.NoSuchElementException: if the
        ``showTownList`` element is not present.
    """
    # Pass the value as a script argument instead of formatting it into the
    # source: the previous plain (non-f) string sent the literal text
    # "{area_id}", and arguments[0] also avoids any quoting problems.
    driver.execute_script("showAdminArea(arguments[0]);", area_id)

    # NOTE(review): nothing waits for the list to refresh after the script
    # runs; if the site fills it asynchronously an explicit wait is needed.
    ul_element = driver.find_element(By.ID, 'showTownList')
    districts = ul_element.find_elements(By.CSS_SELECTOR, "li")

    district_ids = [district.text for district in districts]
    print(district_ids)

    # Return to the top-level document (no-op when no frame was entered).
    driver.switch_to.default_content()
    return district_ids

def _parse_shop_row(row):
    """Convert one parsed ``<tr>`` into a shop dict, or ``None`` if the row has fewer than two ``<td>`` cells."""
    cells = row.find_all("td")
    if len(cells) < 2:
        # Header/spacer rows previously raised IndexError and aborted the
        # whole district; they are skipped instead.
        return None

    shop_name = cells[0].text.strip()

    # The second cell holds the shop details, including a nested <table>
    # whose first cell carries the shop number after a full-width colon.
    shop_info_td = cells[1]
    shop_number = ""  # reset per row so a previous shop's number never leaks
    nested_table = shop_info_td.find("table")
    if nested_table:
        number_cell = nested_table.find("td")
        if number_cell:
            shop_number = number_cell.text.split('：')[-1].strip()

    # Address and phone are read from the 2nd and 3rd text lines of the cell.
    info_lines = shop_info_td.get_text(separator="\n").split("\n")
    address = info_lines[1].split('：')[-1].strip() if len(info_lines) > 1 else ""
    phone = info_lines[2].split('：')[-1].strip() if len(info_lines) > 2 else ""

    # Service icons: the first CSS class of each <span> in the last cell is
    # looked up in service_mapping; unknown classes get a placeholder.
    services = []
    for icon in cells[-1].find_all("span"):
        classes = icon.get("class")
        class_name = classes[0] if classes else None
        if class_name and class_name in service_mapping:
            services.append(service_mapping[class_name])
        else:
            services.append("未知服务")

    return {
        "shop_name": shop_name,
        "shop_number": shop_number,
        "address": address,
        "phone": phone,
        "services": ", ".join(services)
    }


def fetch_table_data_with_pagination(driver,url, district_id):
    """
    Click a district element and collect every shop row across all pages.

    The element with id ``district_id`` is clicked; then, for each page, the
    page source is parsed with BeautifulSoup and every ``<tr>`` of every
    ``<table>`` is converted to a shop dict. Pagination follows the link
    whose ``onclick`` is ``nextPage()`` until it is no longer found.

    :param driver: WebDriver already showing the page with the district list.
    :param url: not used by this function; kept for interface compatibility.
    :param district_id: id of the district element to click.
    :return: list of dicts with keys ``shop_name``, ``shop_number``,
        ``address``, ``phone`` and ``services``. Any exception is printed
        and whatever was collected so far is returned.
    """
    district_table_data = []
    try:
        driver.find_element(By.ID, district_id).click()
        while True:
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, "table")))
            soup = BeautifulSoup(driver.page_source, "html.parser")
            for table in soup.find_all("table"):
                for row in table.find_all("tr"):
                    shop_data = _parse_shop_row(row)
                    if shop_data is None:
                        continue
                    print(shop_data)
                    district_table_data.append(shop_data)

            try:
                next_page_button = driver.find_element(By.XPATH, "//a[@onclick='nextPage()']")
                next_page_button.click()
            except NoSuchElementException:
                break  # no next-page link: pagination is finished
    except Exception as e:
        print(f"Error fetching table data for district {district_id}: {e}")
    return district_table_data


def setup_headless_chrome(headless=False):
    """
    Create and return a Chrome WebDriver instance.

    Despite the function's name, the browser is started with a visible
    window by default, exactly as before. Pass ``headless=True`` to start
    Chrome with the ``--headless`` and ``--disable-gpu`` arguments.

    :param headless: when true, run Chrome without a window.
    :return: a new ``webdriver.Chrome`` instance; the caller must ``quit()`` it.
    """
    if not headless:
        return webdriver.Chrome()

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    # Further Chrome arguments can be added here when needed.
    return webdriver.Chrome(options=options)

def fetch_data(driver, area_id, district_id):
    """
    Fetch all paginated shop rows for one district, never raising on failure.

    Delegates to ``fetch_table_data_with_pagination``; any ``Exception`` is
    printed together with the area/district and an empty list is returned.

    :param driver: WebDriver already showing the page with the district list.
    :param area_id: area label, used only in the error message.
    :param district_id: id of the district element to scrape.
    :return: list of shop rows, or ``[]`` when scraping failed.
    """
    try:
        # The callee's signature is (driver, url, district_id); the previous
        # two-argument call always raised TypeError, so nothing was scraped.
        return fetch_table_data_with_pagination(driver, driver.current_url, district_id)
    except Exception as e:
        # No `return` inside `finally` any more: that form also swallowed
        # KeyboardInterrupt/SystemExit.
        print(f"Error in fetch_data for {area_id}-{district_id}: {e}")
        return []

def _scrape_district(url, area_id, district_id):
    """Scrape one district using a browser owned exclusively by the calling thread."""
    driver = setup_headless_chrome()
    try:
        driver.get(url)
        # Re-select the area in this browser so its district list is shown
        # before fetch_data clicks the district.
        fetch_district_ids(driver, area_id)
        return fetch_data(driver, area_id, district_id)
    finally:
        driver.quit()


def main_parallel(url):
    """
    Scrape every (area, district) pair of the store map in parallel.

    Phase 1 uses one browser on the calling thread to list all areas and
    their districts. Phase 2 submits one task per pair to a thread pool;
    each task starts, uses and quits its own browser. A WebDriver instance
    must not be driven from several threads at once, and the previous
    version also kept changing the shared page while workers were using it
    and never quit the browser.

    Note: one browser is launched per district (at most 4 at a time).

    :param url: URL of the store-map page.
    :return: dict mapping ``(area_id, district_id)`` to the list of shop rows
        returned by ``fetch_data``; pairs whose task raised are omitted.
    """
    # Phase 1: discover the work items with a single, short-lived browser.
    targets = []
    driver = setup_headless_chrome()
    try:
        for area_id in fetch_area_ids(driver,url):
            for district_id in fetch_district_ids(driver, area_id):
                print(area_id,district_id)
                targets.append((area_id, district_id))
    finally:
        driver.quit()

    # Phase 2: scrape the districts concurrently, one browser per task.
    all_data = {}
    with ThreadPoolExecutor(max_workers=4) as executor:  # tune max_workers to the machine
        future_to_id = {
            executor.submit(_scrape_district, url, area_id, district_id): (area_id, district_id)
            for area_id, district_id in targets
        }

        for future in as_completed(future_to_id):
            area_id, district_id = future_to_id[future]
            try:
                all_data[(area_id, district_id)] = future.result()
            except Exception as exc:
                print(f'{area_id}-{district_id} generated an exception: {exc}')

    print("抓取到的所有数据：", all_data)
    return all_data

import csv

def save_data_to_csv(all_data, filename='FM_2.csv'):
    """
    Write all scraped rows to a UTF-8 CSV file (overwriting it).

    One CSV line is written per row with three columns: area id, district
    id, and the row's fields joined by ``' | '``.

    :param all_data: dict mapping ``(area_id, district_id)`` to an iterable
        of rows. A row may be a dict (its values are written, in insertion
        order) or any other iterable of fields.
    :param filename: path of the CSV file to create.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for (area_id, district_id), data in all_data.items():
            for row in data:
                # Joining a dict directly would write its keys
                # ("shop_name | shop_number | ...") instead of the data.
                fields = row.values() if isinstance(row, dict) else row
                # str() so non-string fields do not break the join.
                writer.writerow([area_id, district_id, ' | '.join(str(field) for field in fields)])

# Entry point: scrape the store map and save the result as CSV.
if __name__ == "__main__":
    url = "https://www.family.com.tw/Marketing/storemap/?v=1"
    all_data = main_parallel(url)  # main_parallel returns the collected data dict
    save_data_to_csv(all_data)  # written to the function's default filename
    print("数据已保存到CSV文件。")





City Name: 宜蘭縣
City Name: 花蓮縣
City Name: 台東縣
City Name: 基隆市
City Name: 台北市
City Name: 新北市
City Name: 桃園市
City Name: 新竹市
City Name: 新竹縣
City Name: 苗栗縣
City Name: 雲林縣
City Name: 嘉義市
City Name: 嘉義縣
City Name: 台南市
City Name: 高雄市
City Name: 屏東縣
City Name: 澎湖縣
City Name: 金門縣
City Name: 連江縣
City Name: 台中市
City Name: 彰化縣
City Name: 南投縣


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF79532AD02+56930]
	(No symbol) [0x00007FF79529F602]
	(No symbol) [0x00007FF7951542E5]
	(No symbol) [0x00007FF7951998ED]
	(No symbol) [0x00007FF795199A2C]
	(No symbol) [0x00007FF7951DA967]
	(No symbol) [0x00007FF7951BBCDF]
	(No symbol) [0x00007FF7951D81E2]
	(No symbol) [0x00007FF7951BBA43]
	(No symbol) [0x00007FF79518D438]
	(No symbol) [0x00007FF79518E4D1]
	GetHandleVerifier [0x00007FF7956A6F8D+3711213]
	GetHandleVerifier [0x00007FF7957004CD+4077101]
	GetHandleVerifier [0x00007FF7956F865F+4044735]
	GetHandleVerifier [0x00007FF7953C9736+706710]
	(No symbol) [0x00007FF7952AB8DF]
	(No symbol) [0x00007FF7952A6AC4]
	(No symbol) [0x00007FF7952A6C1C]
	(No symbol) [0x00007FF7952968D4]
	BaseThreadInitThunk [0x00007FFE1661257D+29]
	RtlUserThreadStart [0x00007FFE1784AA58+40]


In [58]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Imported in this cell so it also runs on a fresh kernel; the cell
# previously relied on an earlier cell having imported BeautifulSoup.
from bs4 import BeautifulSoup

# Start the WebDriver (ChromeDriver must be installed and on PATH).
driver = webdriver.Chrome()
url = "https://www.family.com.tw/Marketing/storemap/?v=1"
try:
    driver.get(url)
    html_content = driver.page_source
finally:
    # Close the browser as soon as the page source is captured, and also
    # when loading fails; parsing below does not need the browser.
    driver.quit()

# Debug dump of the whole page (very large output).
print(html_content)
soup = BeautifulSoup(html_content, 'html.parser')

# Find every <a> that is a direct child of a <div class="city">.
city_links = soup.select("div.city > a")

# Print each city's name together with its onclick handler.
for link in city_links:
    city_name = link.text
    onclick_value = link.get('onclick')
    print(f"City Name: {city_name}, OnClick: {onclick_value}")

<html xmlns="http://www.w3.org/1999/xhtml"><head id="Head1"><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1.0"><meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"><meta http-equiv="X-UA-Compatible" content="ie=edge"><meta name="description" content="FamilyMart 全家便利商店"><meta property="og:type" content="article"><meta property="og:title" content="FamilyMart 全家便利商店"><meta property="og:url" content="https://ec.fami.store/family/Marketing"><meta property="og:image" content="https://ec.fami.store/family./images/logo.png"><meta property="og:image:width" content="1024"><meta property="og:image:height" content="1024"><meta property="og:title" content="全家與你親密在一起，讓生活更有意思"><meta property="og:image" content="http://www.family.com.tw/manager/Marketing/Images/FamilyMart.gif"><meta property="og:description" content="全家便利商店開發的新產品、提供的新服務，讓日常生活的每一天，感到驚喜!!來到全家便利商店，享受的是不斷推陳出新的生活便利屋，發掘到優惠且豐富的商品，全年無休一直在你身邊，全家就是你家!!"><meta name="copyright" content="Copyright 