In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import csv
import re

In [None]:
# Initialise the WebDriver
def init_driver():
    """Build and return a Chrome WebDriver configured for this scraper.

    The only option set is ``--disable-gpu``; no headless flag is added,
    so the browser window is shown while scraping.
    """
    chrome_options = webdriver.ChromeOptions()
    # Headless mode is deliberately not used so the browser stays visible.
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    return driver

# Stage 1: collect attraction IDs and the city each one belongs to
def scrape_spot_ids():
    """Walk the Trip.com Taiwan attraction listing and save it to spot_id.csv.

    For every listing page (up to ``max_pages``) the attraction cards are
    parsed from the rendered HTML, and the city slug and spot id are taken
    from each card's link. The rows are written to ``spot_id.csv`` with the
    header ``city, spot_id, title``.

    Paging stops when a page has no cards, when the next-page control
    cannot be found, or when a page yields exactly the same rows as the
    previous one (the click did not advance, e.g. on the last page).

    Returns:
        None. The result is the CSV file written to the working directory.
    """
    # Imported here so the block does not rely on the file-level import list.
    from selenium.common.exceptions import TimeoutException

    url = "https://tw.trip.com/travel-guide/attraction/taiwan-100076/tourist-attractions"
    max_pages = 100  # upper bound on the number of listing pages to visit
    # Captures <city>/<spot_id> from .../attraction/<city>/<spot_id>[?query]
    href_pattern = re.compile(r'attraction/([^/]+)/([^?]+)')

    spot_data = []
    previous_page_rows = None
    driver = init_driver()
    try:
        driver.get(url)

        for current_page in range(1, max_pages + 1):
            # Scroll to the bottom and give lazily loaded content time to render.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

            # Parse the cards out of the rendered page.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            spots = soup.select('a.online-poi-item-card[href^="https://tw.trip.com/travel-guide/attraction/"]')

            print(f"第 {current_page} 頁，找到 {len(spots)} 個景點項目")
            if not spots:
                break

            # Parsing works on already-downloaded HTML, so no per-item delay
            # is needed here.
            page_rows = []
            for spot in spots:
                href = spot.get('href') or ''
                match = href_pattern.search(href)
                if match is None:
                    # A link without both a city and an id segment cannot be
                    # revisited in stage two; skip it instead of crashing.
                    print(f"無法解析連結，跳過: {href}")
                    continue
                page_rows.append([match.group(1), match.group(2), spot.get('title')])

            # Identical rows twice in a row mean the pager did not move
            # (a disabled "next" button is still present in the DOM).
            if page_rows == previous_page_rows:
                print("頁面內容與上一頁相同，停止翻頁")
                break
            previous_page_rows = page_rows

            for city, spot_id, title in page_rows:
                print(f"抓取: {title}, {city}/{spot_id}")
            spot_data.extend(page_rows)

            # Locate the next-page control; its absence ends the crawl
            # normally instead of discarding everything collected so far.
            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'button.ant-pagination-item-link>[aria-label="right"]'))
                )
            except TimeoutException:
                print("找不到下一頁按鈕，停止翻頁")
                break

            # Bring the button into view, then click through JavaScript so an
            # overlapping element cannot intercept the click.
            driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(3)
    finally:
        # Always release the browser, even when scraping fails part-way.
        driver.quit()

    # Write the collected rows to CSV.
    with open('spot_id.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['city', 'spot_id', 'title'])
        writer.writerows(spot_data)

    print("抓取完成，已寫入 spot_id.csv")

# Run stage one only when this code is executed directly rather than imported.
if __name__ == "__main__":
    scrape_spot_ids()


第 1 頁，找到 10 個景點項目
抓取: 臺北故宮博物院, taipei/taipei-palace-museum-80595
抓取: 台北101大樓, taipei/taipei-101-10758289
抓取: 野柳地質公園, new-taipei-city/yehliu-geopark-88560
抓取: 日月潭, nantou/sun-moon-lake-81233
抓取: 臺北市立動物園, taipei/taipei-zoo-80594
抓取: 士林夜市, taipei/shilin-market-10524212
抓取: 打狗英國領事館, kaohsiung/former-british-consulate-residence-at-dagou-88570
抓取: 貓空, taipei/maokong-85951
抓取: 貓空纜車, taipei/maokong-gondola-10572530
抓取: 台北101觀景台, taipei/taipei-101-observatory-23865324
第 2 頁，找到 10 個景點項目
抓取: 九份老街, new-taipei-city/jiufen-old-street-96936
抓取: 阿里山國家風景區, chiayi-county/alishan-national-scenic-area-81253
抓取: 彩虹眷村, taichung/rainbow-village-10523103
抓取: 台北逛夜市, taipei/taipei-night-market-tour-148805072
抓取: 國立臺灣博物館, taipei/taiwan-museum-10520764
抓取: 國立中正紀念堂, taipei/chiang-kai-shek-memorial-hall-80598
抓取: 自由廣場, taipei/liberty-square-10522872
抓取: 桃園 · 五月天跨年演唱會2024-2025《回到那一天》25週年巡迴演唱會新年特別版, taoyuan/taoyuanmayday-concert-tour-149542165
抓取: 清境農場, nantou/qingjing-farm-86072
抓取: 士林官邸正館, taipei/chiang-kai-shek-sh

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import csv
import re
import time

# Initialise the WebDriver
def init_driver():
    """Build and return a Chrome WebDriver configured for this scraper.

    The only option set is ``--disable-gpu``; no headless flag is added.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    return driver

# Stage 2: scrape reviews and ratings
def _parse_review(review):
    """Return ``[rating, comment, date]`` for one review element, or None.

    None is returned when any of the three expected child elements is
    missing, so the caller can skip the review without raising.
    """
    rating_tag = review.select_one('div.ovh > div.score-box > span.review_score')
    content_tag = review.select_one('div > a[alt]')
    date_tag = review.select_one('div.ovh > span.create-time')
    if rating_tag is None or content_tag is None or date_tag is None:
        return None

    date_time = date_tag.text.replace('撰寫日期：', '').strip()
    # Normalise "YYYY 年 M 月 D 日" to "YYYY-M-D".
    date_time = re.sub(r'(\d+) 年 (\d+) 月 (\d+) 日', r'\1-\2-\3', date_time)
    return [rating_tag.text, content_tag.get('alt', ''), date_time]


def _click_next_review_page(driver):
    """Try to move the review list to its next page.

    Returns True when the next-page button was clicked, False when it is
    missing, disabled, or the click failed (the reason is printed).
    """
    try:
        next_buttons = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'button.btn-next'))
        )
        # The second matching button is the one that pages the reviews.
        if len(next_buttons) < 2:
            print("未找到足夠的下一頁按鈕")
            return False
        next_button = next_buttons[1]

        if not next_button.is_enabled():
            # A disabled button means the last page has been reached.
            print("已到最後一頁")
            return False

        # Centre the button in the viewport and let the scroll settle.
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_button)
        time.sleep(1)

        next_button.click()
        time.sleep(3)  # wait for the next page of reviews to load
    except Exception as e:
        # Best effort: any failure here simply ends paging for this spot.
        print(f"無法切換評論頁面：{e}")
        return False

    print("成功切換到下一頁")
    return True


def scrape_comments(max_comments=100):
    """Scrape reviews for every spot in spot_id.csv and save comments.csv.

    Each spot page is opened, its ``h1.basicName`` title is compared with
    the title stored in the CSV (mismatches are skipped), the review list
    is switched to the second sort option, and reviews are collected page
    by page.

    Args:
        max_comments: Maximum number of reviews to keep per spot.

    Returns:
        None. Rows are written to ``comments.csv`` with the header
        ``city, spot_id, title, rating, comment_content, date_time``.
    """
    with open('spot_id.csv', 'r', encoding='utf-8') as csvfile:
        spots = list(csv.DictReader(csvfile))

    all_comments = []
    driver = init_driver()
    try:
        for spot in spots:
            city = spot['city']
            spot_id = spot['spot_id']
            title = spot['title']
            url = f"https://tw.trip.com/travel-guide/attraction/{city}/{spot_id}"
            print(f"正在抓取景點：{title}, URL: {url}")
            driver.get(url)

            # Confirm the page shows the attraction we expect.
            try:
                h1_title = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.basicName'))
                ).text
            except Exception:
                print(f"無法找到標題，跳過：{title}")
                continue
            if h1_title != title:
                print(f"名稱不符，跳過：{h1_title}")
                continue

            # Scroll down in steps so the review section gets loaded.
            print("滾動頁面以加載評論區...")
            for _ in range(10):
                driver.execute_script("window.scrollBy(0, 500);")
                time.sleep(2)

            # Switch the review sort order; failure is not fatal.
            try:
                sort_button = driver.find_element(By.CSS_SELECTOR, 'div.switch-item[data-index="1"]')
                sort_button.click()

                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.switch-item[data-index="0"]'))
                )
                time.sleep(5)  # give the review list time to refresh
                print("成功切換評論排序為最新")
            except Exception as e:
                print(f"無法切換評論排序: {e}")

            # Collect reviews page by page until the cap is reached or
            # paging can no longer make progress.
            spot_rows = []
            previous_page_rows = None
            while len(spot_rows) < max_comments:
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                reviews = soup.select('div.review-item')

                if not reviews:
                    print(f"無評論可抓取：{title}")
                    break

                page_rows = []
                for review in reviews:
                    parsed = _parse_review(review)
                    if parsed is None:
                        print("評論處理錯誤：缺少必要欄位")
                        continue
                    page_rows.append([city, spot_id, title, *parsed])

                # The same rows twice in a row mean the page did not change;
                # stop rather than loop forever or store duplicates.
                if page_rows == previous_page_rows:
                    break
                previous_page_rows = page_rows

                remaining = max_comments - len(spot_rows)
                spot_rows.extend(page_rows[:remaining])
                if len(spot_rows) >= max_comments:
                    # Cap reached: no need to load another page.
                    break

                if not _click_next_review_page(driver):
                    break

            all_comments.extend(spot_rows)
    finally:
        # Always release the browser, even when scraping fails part-way.
        driver.quit()

    # Write the collected reviews to CSV.
    with open('comments.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['city', 'spot_id', 'title', 'rating', 'comment_content', 'date_time'])
        writer.writerows(all_comments)
    print("評論抓取完成，已存入 comments.csv")

# Run stage two only when this code is executed directly rather than imported.
if __name__ == "__main__":
    scrape_comments()
