# 爬Indievox售票網頁

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By


def _scroll_to_end(driver, times, pause):
    """Send the END key to the page `times` times, sleeping `pause` seconds after each."""
    for _ in range(times):
        driver.find_element(By.TAG_NAME, 'html').send_keys(Keys.END)
        time.sleep(pause)


def _pad_to(values, length):
    """Return `values` truncated or padded with None to exactly `length` items."""
    return list(values[:length]) + [None] * max(0, length - len(values))


def scrape_indievox_events():
    """Scrape the Indievox activity listings into a DataFrame.

    Two Chrome sessions are used: the table view supplies each event's
    date, name, venue and the page links; the card view supplies the image
    URLs. Links and images are attached to events by position.

    Returns:
        pandas.DataFrame with the columns ``EventName``, ``EventTime``,
        ``Venue``, ``PageURL`` and ``ImageURL``. ``PageURL`` / ``ImageURL``
        are ``None`` for events that have no link / image at the same
        position (previously this raised ``IndexError``).
    """
    # Initialize WebDriver options (shared by both browser sessions)
    option = webdriver.ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation'])

    # NOTE(review): the table URL starts at 2024/06/13 while the card URL
    # starts at 2024/06/14. Because images are matched to events purely by
    # position, differing listings would misalign them — confirm intended.
    url_table = 'https://www.indievox.com/activity/list?type=table&startDate=2024%2F06%2F13&endDate='
    url_card = 'https://www.indievox.com/activity/list?type=card&startDate=2024%2F06%2F14&endDate='

    # --- First pass: table view (event text + page links) ---
    driver = webdriver.Chrome(options=option)
    try:
        driver.maximize_window()
        driver.get(url_table)
        driver.implicitly_wait(5)

        # Scroll down the page to load all events
        _scroll_to_end(driver, times=5, pause=3)

        # Extract event information; rows with fewer than 3 cells are skipped
        event_data = []
        for event in driver.find_elements(By.CSS_SELECTOR, '#activityListTab tr'):
            details = event.find_elements(By.TAG_NAME, 'td')
            if len(details) >= 3:
                date = details[0].text
                name = details[1].text
                location = details[2].text
                event_data.append((name, date, location))
        print(event_data)

        # Collect every event link on the page (must happen while the
        # browser session is still alive)
        links = driver.find_elements(By.CLASS_NAME, "fcLightBlue")
        link_urls = [link.get_attribute("href") for link in links]
        print(link_urls)
    finally:
        # quit() ends the whole browser session; close() would only close
        # the current window.
        driver.quit()

    # --- Second pass: card view (event images) ---
    driver = webdriver.Chrome(options=option)
    try:
        driver.get(url_card)
        driver.implicitly_wait(5)

        # Scroll down the page to load all images
        _scroll_to_end(driver, times=40, pause=2)

        imgs = driver.find_elements(By.CSS_SELECTOR, ".wrap img")
        img_urls = [img.get_attribute("src") for img in imgs]
        print(img_urls)
    finally:
        driver.quit()

    # Combine event information with links and image URLs by position,
    # padding with None so a short list cannot raise IndexError.
    count = len(event_data)
    rows = [
        (*event, link, img)
        for event, link, img in zip(
            event_data, _pad_to(link_urls, count), _pad_to(img_urls, count)
        )
    ]

    # Create DataFrame
    return pd.DataFrame(
        rows,
        columns=['EventName', 'EventTime', 'Venue', 'PageURL', 'ImageURL'],
    )

In [None]:
# Run the scraper; the resulting DataFrame `df` is reused by the cells below.
df = scrape_indievox_events()

# 利用google map api 找到展演空間地址

In [None]:
import requests

# Google Maps API key, sent as the "key" request parameter by
# formatted__address(); left empty here and must be filled in before use.
google_map_api = ""

# Address lookup for a performance venue
def formatted__address(show_loc):
    """Return the formatted address Google's Find Place endpoint reports for a venue.

    Args:
        show_loc: Free-text venue name used as the text query.

    Returns:
        The ``formatted_address`` of the first candidate, or ``None`` when
        the query is empty, the response carries no candidates, or the
        first candidate has no ``formatted_address`` field.

    Raises:
        requests.RequestException: on network failure or when the request
            exceeds the 10-second timeout.
    """
    # An empty query cannot match anything; skip the API call.
    if not show_loc:
        return None

    # URL of the "Find Place" endpoint
    find_place_url = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json"
    params = {"key": google_map_api,
              "input": show_loc,
              "inputtype": "textquery",
              "language": "zh-TW",
              "fields": "formatted_address,geometry"
              }
    # Send the request; the timeout stops a stalled connection from
    # hanging the whole run.
    response = requests.get(find_place_url, params=params, timeout=10)
    data = response.json()

    # Error payloads may lack "candidates" altogether, so use .get()
    # instead of indexing.
    candidates = data.get('candidates')
    if not candidates:
        return None
    return candidates[0].get('formatted_address')



# 將地址欄位存進dataframe

In [None]:
venues = df.Venue
address = []
# Cache of venue text -> looked-up address, so each distinct venue costs
# only one API request (the original also called the API twice per row:
# once to store the result and once more to print it).
address_by_venue = {}
for venue in venues:
    if venue not in address_by_venue:
        address_by_venue[venue] = formatted__address(venue)
    found = address_by_venue[venue]
    address.append(found)
    print(found)

df['Address'] = address

# Export a CSV file for local inspection
df.to_csv('indievox.csv',index=True)

# 利用EventTime整理成StartTime, EndTime並存進dataframe

In [None]:
event_time = df.EventTime
# Expected formats:
#   2024/09/07 (Sat.)
#   2024/09/07 (Sat.) ~ 2024/09/08 (Sun.)
start_time = []
end_time = []

# The loop variable is deliberately not named `time`: at module level that
# would rebind the imported `time` module and break later `time.sleep` calls.
for raw_time in event_time:
    # Split a range on "~"; without "~" the end part is empty.
    start_part, _, end_part = raw_time.partition("~")
    start_tokens = start_part.split()
    end_tokens = end_part.split()

    # The date is the first whitespace-separated token of each part.
    start = start_tokens[0].replace("/", "-") if start_tokens else None
    # Single-day events end on the day they start.
    end = end_tokens[0].replace("/", "-") if end_tokens else start

    start_time.append(start)
    end_time.append(end)


df['StartTime'] = start_time
df['EndTime'] = end_time

# Export a CSV file for local inspection
df.to_csv('indievox.csv',index=True)

# 連線DB並將整理過的爬蟲資料存進資料庫

In [None]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

# 此py檔案是本地做資料庫連線、建table、存爬蟲資料進table用的
# 僅供本地用，放上來給大家參考，主程式並沒有用到


# Postgres DB connection
# Fill these in with the parameters shown on Render
from urllib.parse import quote_plus

# PostgreSQL connection details (single source for both psycopg2 and SQLAlchemy)
DATABASE_TYPE = 'postgresql'
DBAPI = 'psycopg2'
ENDPOINT = 'dpg--a.singapore-postgres.render.com'  # Replace with your endpoint
USER = ''  # Replace with your username
PASSWORD = ''  # Replace with your password
PORT = 5432  # Default PostgreSQL port
DATABASE = ''  # Replace with your database name

# Create table (if it doesn't exist)
create_table_query = '''
CREATE TABLE IF NOT EXISTS tb_indievox (
    id SERIAL PRIMARY KEY,
    EventName VARCHAR(255) NOT NULL,
    EventTime VARCHAR(255),
    Venue VARCHAR(255),
    Address VARCHAR(255),
    ImageURL VARCHAR(1000),
    PageURL VARCHAR(1000),
    StartTime TIMESTAMP,
    EndTime TIMESTAMP
);
'''

conn = psycopg2.connect(
    dbname=DATABASE,
    user=USER,
    password=PASSWORD,
    host=ENDPOINT,
    port=PORT
)
# try/finally guarantees the cursor and connection are released even when
# the CREATE TABLE statement fails.
try:
    cursor = conn.cursor()
    try:
        cursor.execute(create_table_query)
        conn.commit()
    finally:
        cursor.close()
finally:
    conn.close()

# Use SQLAlchemy to load the scraped data into Postgres (pandas' to_sql
# needs a SQLAlchemy engine for PostgreSQL, and it keeps the code short).

# Create SQLAlchemy engine. User and password are URL-escaped so that
# characters such as "@", ":" or "/" in them cannot corrupt the URL.
engine = create_engine(
    f'{DATABASE_TYPE}+{DBAPI}://{quote_plus(USER)}:{quote_plus(PASSWORD)}@{ENDPOINT}:{PORT}/{DATABASE}'
)

# Insert the data into the PostgreSQL table.
# This assumes `df` already holds the scraped data cleaned up with pandas.
# NOTE(review): if_exists='replace' makes pandas drop and recreate
# tb_indievox, so the schema created by the CREATE TABLE above (id column,
# TIMESTAMP types) presumably does not survive this call — confirm intended.
try:
    df.to_sql('tb_indievox', engine, if_exists='replace', index=False)
finally:
    # Release the engine's pooled connections.
    engine.dispose()


print("Data inserted successfully.")