# 爬蟲

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime

In [None]:

def _format_event_date(date_text):
    """Reformat a tixcraft date label, dropping the dot after the weekday.

    Accepts either a single date such as ``2024/09/07 (Sat.)`` or a range
    such as ``2024/09/07 (Sat.) ~ 2024/09/08 (Sun.)`` and returns the same
    text with weekdays rendered as ``(Sat)``. Returns "日期格式錯誤" when the
    label cannot be parsed.
    """
    source_format = '%Y/%m/%d (%a.)'
    target_format = '%Y/%m/%d (%a)'
    try:
        if ' ~ ' in date_text:
            # Unpacking raises ValueError when there are more than two parts,
            # which is reported as a format error like any other bad label.
            start_text, end_text = date_text.split(' ~ ')
            start_date = datetime.strptime(start_text.strip(), source_format)
            end_date = datetime.strptime(end_text.strip(), source_format)
            return (
                f"{start_date.strftime(target_format)} ~ "
                f"{end_date.strftime(target_format)}"
            )
        return datetime.strptime(date_text, source_format).strftime(target_format)
    except ValueError:
        return "日期格式錯誤"


def fetch_tixcraft_data():
    """Scrape the tixcraft activity listing into a DataFrame.

    The page is visited twice: once with Selenium/Chrome (switching to the
    list view to collect the per-event page links) and once with a plain
    ``requests`` GET whose HTML is parsed for event name, date, venue and
    image URL.

    Returns:
        pandas.DataFrame with columns ``EventName``, ``EventTime``,
        ``Venue``, ``ImageURL`` and ``PageURL``.

    Raises:
        requests.RequestException: if the HTTP request fails, times out or
            returns an error status.
        selenium exceptions: if the browser cannot be started or the
            list-view toggle never becomes clickable.

    NOTE(review): links and event details come from two separate page loads
    and are joined purely by row position; confirm both loads list the same
    events in the same order.
    """
    # Chrome options. Add '--headless', '--no-sandbox' and
    # '--disable-dev-shm-usage' here to run without a display.
    options = Options()

    driver = webdriver.Chrome(options=options)
    try:
        driver.get('https://tixcraft.com/activity')

        # Give the page time to load.
        time.sleep(5)

        # Switch the listing to the list-style view.
        list_view_toggle = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#display-type > li:nth-child(2)'))
        )
        list_view_toggle.click()

        # Give the page time to re-render.
        time.sleep(2)

        link_elements = driver.find_elements(By.CSS_SELECTOR, ".btn.btn-outline-primary.text-bold.m-0")
        links = [link.get_attribute("href") for link in link_elements]
    finally:
        # Always close the browser, even when scraping fails.
        driver.quit()

    links_df = pd.DataFrame(links, columns=['PageURL'])

    # Request headers.
    header = {'User-Agent': 'William-requests/2.31.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*',
              'Connection': 'keep-alive'}
    url = 'https://tixcraft.com/activity'

    # A timeout keeps a stalled connection from hanging the notebook, and an
    # error status is raised instead of silently parsing an error page.
    r = requests.get(url, headers=header, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Event detail containers and event images.
    tixcraft_info = soup.find_all('div', {'class': 'col-lg-8 col-md-7 col-sm-8 col-xs-5 col-12 mb-2 mb-sm-0'})
    images = soup.find_all('img', {'class': 'img-fluid'})

    result = []
    for info in tixcraft_info:
        # Event date.
        date_div = info.find('div', {'class': 'text-small date'})
        if date_div:
            formatted_date = _format_event_date(date_div.text.strip())
        else:
            formatted_date = "無日期信息"

        # Event name.
        titles_div = info.find('div', {'class': 'text-bold pt-1 pb-1'})
        titles = titles_div.text.strip() if titles_div else "無活動名稱"

        # Event venue.
        location_div = info.find('div', {'class': 'text-small text-med-light'})
        location = location_div.text.strip() if location_div else "請參考活動網頁"

        result.append((titles, formatted_date, location))

    # Image URLs, skipping <img> tags without a usable src.
    image_urls = [src for src in (img.get('src') for img in images) if src]

    activity_df = pd.DataFrame(result, columns=['EventName', 'EventTime', 'Venue'])

    # Fit the image list to the number of events: drop the surplus, and pad
    # with None when fewer images were found so the assignment cannot fail
    # on a length mismatch.
    row_count = len(activity_df)
    image_urls = image_urls[:row_count]
    image_urls.extend([None] * (row_count - len(image_urls)))
    activity_df['ImageURL'] = image_urls

    # Combine event details and page links side by side (by row position).
    df = pd.concat([activity_df, links_df], axis=1)

    return df

# Run the scraper once; the resulting frame is cleaned up in the cells below.
tix_df = fetch_tixcraft_data()

# 整理dataframe（移除重複、篩掉不符合的活動）

In [None]:
# Remove duplicates (for an unknown reason the scraped frame lists everything twice).
new_tix_df = tix_df.drop_duplicates(subset=['EventName'])

# Listings whose name contains any of these keywords are filtered out.
excluded_keywords = ('蛋黃酥', '專區', 'VIP Upgrade', 'FUJI ROCK')

# Build one boolean mask covering every keyword. The keywords are matched as
# literal text (regex=False), and rows with a missing name count as "no match"
# (na=False) so they cannot break the boolean indexing.
event_names = new_tix_df['EventName']
excluded_mask = pd.Series(False, index=new_tix_df.index)
for keyword in excluded_keywords:
    excluded_mask |= event_names.str.contains(keyword, regex=False, na=False)

indices_to_drop = new_tix_df.index[excluded_mask.to_numpy()]
print(indices_to_drop)

# Drop rows with these indices
new_tix_df = new_tix_df.drop(indices_to_drop)


# Export a CSV file for checking the result locally.
new_tix_df.to_csv('tixcraft.csv',index=True)



# 用google api找出地址並存進df

In [None]:
# 打api找地址
import requests

# Google Maps API key, sent as the "key" parameter of every lookup below.
google_map_api = ""


def formatted__address(show_loc):
    """Look up the address of a venue with the Google Places "Find Place" API.

    Args:
        show_loc: Venue name used as the text query.

    Returns:
        The ``formatted_address`` of the first candidate, or ``None`` when
        ``show_loc`` is empty or not a string, when the API returns no
        candidates, or when the first candidate carries no address.

    Raises:
        requests.RequestException: if the request fails, times out or
            returns an error status.
    """
    # Skip blank or non-text venues (e.g. NaN from a DataFrame) instead of
    # spending an API call on a query that cannot match anything.
    if not isinstance(show_loc, str) or not show_loc.strip():
        return None

    # URL of the "Find Place" endpoint.
    find_place_url = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"
    params = {"key": google_map_api,
              "input": show_loc,
              "inputtype": "textquery",
              "language": "zh-TW",
              "fields": "formatted_address,geometry"
              }

    # A timeout keeps one stalled lookup from hanging the whole loop.
    response = requests.get(find_place_url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()

    # Use .get so a payload without these keys yields None, not a KeyError.
    candidates = data.get('candidates') or []
    if not candidates:
        return None
    return candidates[0].get('formatted_address')



In [None]:
venues = new_tix_df.Venue

# Look every distinct venue up only once: several events can share a venue,
# and each lookup is a separate request to the Google API.
address_by_venue = {}
address = []
for venue in venues:
    if venue not in address_by_venue:
        address_by_venue[venue] = formatted__address(venue)
    address.append(address_by_venue[venue])

new_tix_df['Address'] = address

# Export a CSV file for checking the result locally.
new_tix_df.to_csv('tixcraft.csv',index=True)

# 新增StartTime,EndTime

In [None]:
def _iso_date(label):
    """Return the leading ``YYYY/MM/DD`` token of *label* as ``YYYY-MM-DD``.

    Returns ``None`` when *label* is blank or does not start with a valid date.
    """
    tokens = label.split()
    if not tokens:
        return None
    try:
        return datetime.strptime(tokens[0], '%Y/%m/%d').strftime('%Y-%m-%d')
    except ValueError:
        return None


event_time = new_tix_df.EventTime
# Example value: 2024/09/07 (Sat) ~ 2024/09/08 (Sun)
start_time = []
end_time = []

# The loop variable must not be called `time`: that would rebind the `time`
# module used by fetch_tixcraft_data and break any later call to it.
for event_label in event_time:
    if not isinstance(event_label, str):
        # Missing value (e.g. NaN): no dates to extract.
        start_time.append(None)
        end_time.append(None)
        continue

    # "start ~ end" for a range; a single date has no separator, in which
    # case the event starts and ends on the same day.
    first_part, separator, last_part = event_label.partition('~')
    start = _iso_date(first_part)
    start_time.append(start)
    end_time.append(_iso_date(last_part) if separator else start)

new_tix_df['StartTime'] = start_time
new_tix_df['EndTime'] = end_time

# Export a CSV file for checking the result locally.
new_tix_df.to_csv('tixcraft.csv',index=True)

# 連線DB並存入資料

In [None]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

# This script is for local use only: it connects to the database, creates the
# table and stores the scraped data in it. It is shared for reference; the
# main program does not use it.

from urllib.parse import quote_plus

# PostgreSQL connection details (fill in with the values shown on Render).
# Defined once and shared by the psycopg2 connection and the SQLAlchemy engine.
DATABASE_TYPE = 'postgresql'
DBAPI = 'psycopg2'
ENDPOINT = 'dpg--a.singapore-postgres.render.com'  # Replace with your endpoint
USER = ''  # Replace with your username
PASSWORD = ''  # Replace with your password
PORT = 5432  # Default PostgreSQL port
DATABASE = ''  # Replace with your database name

# Create table (if it doesn't exist)
create_table_query = '''
CREATE TABLE IF NOT EXISTS tb_tixcraft (
    id SERIAL PRIMARY KEY,
    EventName VARCHAR(255) NOT NULL,
    EventTime VARCHAR(255),
    Venue VARCHAR(255),
    Address VARCHAR(255),
    ImageURL VARCHAR(1000),
    PageURL VARCHAR(1000),
    StartTime TIMESTAMP,
    EndTime TIMESTAMP
);
'''

conn = psycopg2.connect(
    dbname=DATABASE,
    user=USER,
    password=PASSWORD,
    host=ENDPOINT,
    port=PORT
)
try:
    # The cursor is closed by the with-block; the connection is closed in
    # `finally` so it is released even when the statement fails.
    with conn.cursor() as cursor:
        cursor.execute(create_table_query)
    conn.commit()
finally:
    conn.close()

# Use SQLAlchemy to load the scraped data into Postgres (pandas.to_sql needs
# an SQLAlchemy engine for databases other than SQLite).
# User name and password are URL-escaped so characters such as '@' or '/'
# cannot corrupt the connection URL.
engine = create_engine(
    f'{DATABASE_TYPE}+{DBAPI}://{quote_plus(USER)}:{quote_plus(PASSWORD)}'
    f'@{ENDPOINT}:{PORT}/{DATABASE}'
)

# Insert the data into the PostgreSQL table.
# Assumes new_tix_df already holds the scraped, pandas-cleaned data.
# NOTE(review): if_exists='replace' drops tb_tixcraft and recreates it from
# the DataFrame, so the schema created above (id column, TIMESTAMP types) is
# not retained -- confirm whether that is intended.
try:
    new_tix_df.to_sql('tb_tixcraft', engine, if_exists='replace', index=False)
finally:
    # Release the engine's pooled connections.
    engine.dispose()

print("Data inserted successfully.")