# DPL001-01 台股股票代碼資訊收集  
* 包含上市櫃股票、ETF

In [None]:
import os
import sys
from pathlib import Path
from datetime import date
from datetime import datetime
import requests
import pandas as pd
import duckdb

In [None]:
from finlab import data
import finlab

In [None]:
# Import the project's shared utility package.
# The notebook's parent directory is put at the front of sys.path so that
# `proj_util_pkg` can be resolved from there.
sys.path.insert(0, str(Path.cwd().parent))
from proj_util_pkg.settings import ProjEnvSettings

from proj_util_pkg.finlab_api import finlab_manager as flm

## 公用參數設定

In [None]:
# Initialise the finlab API service through the project's FinlabManager wrapper.
# NOTE(review): this rebinds the name `finlab`, which an earlier cell imported
# as a module (`import finlab`); the module is no longer reachable under that
# name afterwards — confirm this is intended.
finlab = flm.FinlabManager()
# Turn off the finlab `force_cloud_download` flag.
data.force_cloud_download = False

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Request headers that present the client as a desktop Chrome browser.
_CHROME_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/58.0.3029.110 Safari/537.3"
)
headers = {"User-Agent": _CHROME_USER_AGENT}

## 外部資料讀取  
### 證交所ETF資料讀取  
- 來源1: 證交所 open api  
  [https://openapi.twse.com.tw/v1/opendata/t187ap47_L](https://openapi.twse.com.tw/v1/opendata/t187ap47_L)
- 來源2: 政府資料開放平台  
  [基金基本資料彙總表](https://data.gov.tw/dataset/157399)
- 來源3: 鉅亨網  
  [https://www.cnyes.com/twstock/etf/detail/discount-premium](https://www.cnyes.com/twstock/etf/detail/discount-premium)

In [None]:
# # 取得ETF資料 (來源: 證交所 open api)
# tw_etf_info = requests.get(
#     "https://openapi.twse.com.tw/v1/opendata/t187ap47_L", 
#     verify=False,
#     headers=headers
# ).json()
# print(tw_etf_info)

In [None]:
# # json 轉成 dataframe
# tw_etf_info_df = pd.DataFrame(tw_etf_info)
# print(tw_etf_info_df.shape)
# # tw_etf_info_df.columns

# tw_etf_info_df = tw_etf_info_df[["基金代號", "基金名稱", "英文名稱", "標的指數|追蹤指數名稱", "基金類型"]]
# tw_etf_info_df["category"] = "ETF"
# tw_etf_info_df.columns = ["stock_id", "stock_name", "stock_en_name", "etf_index_name", "etf_type", "industry"]
# tw_etf_info_df

In [None]:
# Fetch the ETF master list (per the original note: the dataset published on
# the government open-data platform; the CSV itself is served from
# mopsfin.twse.com.tw). Every column is read as str so fund codes stay
# strings and keep any leading zeros.
etf_info_url = "https://mopsfin.twse.com.tw/opendata/t187ap47_L.csv"
tw_etf_info_df = pd.read_csv(etf_info_url, dtype=str)

print(tw_etf_info_df.shape)

# Source column -> unified column name. An explicit mapping keeps each
# rename tied to its source column instead of relying on positional order.
etf_column_map = {
    "基金代號": "stock_id",
    "基金簡稱": "stock_name",
    "基金英文名稱": "stock_en_name",
    "標的指數/追蹤指數名稱": "etf_index_name",
    "基金類型": "etf_type",
}

# rename() returns a new DataFrame, so the column added below is written to
# an independent frame rather than to a selection of the raw data (the
# original pattern triggered pandas' SettingWithCopyWarning).
tw_etf_info_df = tw_etf_info_df[list(etf_column_map)].rename(columns=etf_column_map)

# Every row from this source is an ETF; the marker lives in `industry`.
tw_etf_info_df["industry"] = "ETF"
tw_etf_info_df

In [None]:
# Fetch ETF data from cnyes.
# Per the original note the page is rendered dynamically (Next.js), so it is
# loaded in headless Chrome through Selenium and the rendered HTML is then
# parsed with BeautifulSoup.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

etf_info_url = "https://www.cnyes.com/twstock/etf/detail/discount-premium"

# Upper bound on scroll rounds so that a page whose height keeps growing
# cannot keep this cell running forever.
MAX_SCROLL_ROUNDS = 60

# Headless Chrome configuration.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

driver = webdriver.Chrome(options=chrome_options)

# Only the browser interaction lives inside try/finally, so the browser is
# released as soon as the rendered HTML has been captured.
try:
    driver.get(etf_info_url)

    # Wait (up to 15 s) for the #react-table element to be present.
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, "react-table"))
    )
    time.sleep(3)

    # Scroll to the bottom repeatedly until the page height stops changing,
    # which is how the original triggers loading of the remaining rows.
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(MAX_SCROLL_ROUNDS):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1.5)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    else:
        # The loop ran out of rounds without the height settling.
        print(f"已達捲動上限 {MAX_SCROLL_ROUNDS} 次，資料可能未完整載入")

    page_html = driver.page_source
finally:
    driver.quit()

# Parse the rendered HTML.
soup = BeautifulSoup(page_html, "html.parser")
react_table = soup.find("table", id="react-table")
if react_table is None:
    raise ValueError("找不到 #react-table 元素")

table_body = react_table.find("tbody")
if table_body is None:
    raise ValueError("#react-table 缺少 tbody 元素")

# Column labels are listed by hand: per the original note, the page's header
# row is not inside the table element.
columns = [
    "代碼", "名稱", "淨值", "淨值漲跌%",
    "市價", "市價漲跌%", "折溢價", "折溢價%",
    "最新股價", "成交金額(百萬)", "受益人數", "當日成交量", "自選"
]

# Collect the text of every data cell, one list per table row; rows that
# contain no data cells are skipped.
rows_data = []
for tr in table_body.find_all("tr", role="row"):
    row_data = [cell.get_text(strip=True) for cell in tr.find_all("td", role="cell")]
    if row_data:
        rows_data.append(row_data)

# Fail with a clear message instead of an IndexError on rows_data[0].
if not rows_data:
    raise ValueError("#react-table 內沒有任何資料列")

# Build the DataFrame (labels trimmed to the width of the first row) and
# drop the trailing "自選" column (a checkbox column per the original note).
tw_etf_cnyes_df = pd.DataFrame(rows_data, columns=columns[:len(rows_data[0])])
tw_etf_cnyes_df = tw_etf_cnyes_df.drop(columns=["自選"], errors="ignore")

print(f"共取得 {len(tw_etf_cnyes_df)} 筆 ETF 資料")
# Last expression of the cell, so the notebook actually renders the preview
# (inside the original try block it was evaluated but never displayed).
tw_etf_cnyes_df.head(10)

In [None]:
# Spot check: show the cnyes row(s) whose code is "00733".
tw_etf_cnyes_df.loc[tw_etf_cnyes_df["代碼"] == "00733"]

In [None]:
# Reduce the cnyes table to code + name under the unified column names and
# tag every row as an ETF so it can supplement the main ETF list.
tw_etf_cnyes_simple = (
    tw_etf_cnyes_df.loc[:, ["代碼", "名稱"]]
    .rename(columns={"代碼": "stock_id", "名稱": "stock_name"})
    .assign(industry="ETF")
)

# Stack both ETF sources. The open-data rows come first, so keep="first"
# makes them win whenever a stock_id appears in both sources. Columns that
# cnyes does not provide are filled with empty strings.
tw_etf_info_df = (
    pd.concat([tw_etf_info_df, tw_etf_cnyes_simple], ignore_index=True)
    .drop_duplicates(subset="stock_id", keep="first")
    .fillna("")
)
print(f"合併後 ETF 資料共 {len(tw_etf_info_df)} 筆")
tw_etf_info_df

In [None]:
# Show the ETF frame's column labels as a plain Python list.
tw_etf_info_df.columns.tolist()

In [None]:
# Spot check: show the merged ETF row(s) whose stock_id is "00733".
tw_etf_info_df.loc[tw_etf_info_df["stock_id"] == "00733"]

### 讀取台股企業資訊

In [None]:
# Fetch the 'company_basic_info' dataset through the finlab data API.
# NOTE(review): save_to_storage=True presumably keeps a local copy of the
# download — confirm against the finlab documentation.
tw_stock_info = data.get('company_basic_info', save_to_storage=True)

# Quick size check: (rows, columns).
print(tw_stock_info.shape)
# tw_stock_info.head(3)

In [None]:
# Display the column labels of tw_stock_info.
tw_stock_info.columns

In [None]:
# Keep only the identifying columns of the company data and give them the
# unified column names used for the ETF list.
_stock_column_map = {
    "stock_id": "stock_id",
    "公司簡稱": "stock_name",
    "英文簡稱": "stock_en_name",
    "產業類別": "industry",
}
tw_stock_info = tw_stock_info.loc[:, list(_stock_column_map)].rename(
    columns=_stock_column_map
)
tw_stock_info

In [None]:
# Display the column labels of tw_stock_info.
tw_stock_info.columns

In [None]:
# Display the column labels of tw_etf_info_df.
tw_etf_info_df.columns

In [None]:
# Spot check: show the company row(s) whose stock_id is "009811".
tw_stock_info.loc[tw_stock_info["stock_id"] == "009811"]

## 合併股票和 ETF 資料

In [None]:
# Stack the stock list and the ETF list into one frame with a shared schema.
# The SQL refers to the pandas DataFrames `tw_stock_info` and
# `tw_etf_info_df` by their variable names. Stock rows get empty strings for
# the two ETF-only columns.
# The in-memory connection is only needed for this one query, so it is opened
# in a `with` block and closed as soon as the result has been fetched
# (previously it was left open).
with duckdb.connect(':memory:') as conn:
    combined_df = conn.execute("""
        SELECT
            stock_id,
            stock_name,
            stock_en_name,
            industry,
            '' AS etf_index_name,
            '' AS etf_type
        FROM tw_stock_info
        UNION ALL
        SELECT
            stock_id,
            stock_name,
            stock_en_name,
            industry,
            etf_index_name,
            etf_type
        FROM tw_etf_info_df
    """).fetchdf()

# Per-source counts come straight from the input frames: UNION ALL keeps
# every row of both inputs. Counting by `etf_index_name == ''` would report
# any ETF that has an empty index name as a stock.
stock_row_count = len(tw_stock_info)
etf_row_count = len(tw_etf_info_df)

print(f"合併完成！總共 {len(combined_df)} 筆資料")
print(f"股票資料: {stock_row_count} 筆")
print(f"ETF資料: {etf_row_count} 筆")
combined_df.head()

## 資料留存 DB

In [None]:
# Resolve the DuckDB file location from the `hist_data_path` environment
# variable.
TWSTOCK_DATA_ROOT = os.environ.get("hist_data_path")

# Fail early with a clear message: without this check an unset variable is
# formatted into the path as the literal text "None" ("None/twstock.duckdb").
if not TWSTOCK_DATA_ROOT:
    raise RuntimeError("環境變數 hist_data_path 未設定，無法決定資料庫路徑")

twstock_db_path = f"{TWSTOCK_DATA_ROOT}/twstock.duckdb"

In [None]:
# Open a DuckDB connection to the database file at `twstock_db_path`.
conn_duckdb = duckdb.connect(twstock_db_path)

In [None]:
# Name of the database table that holds the combined stock/ETF list.
table_name = "tw_stock_list"

In [None]:
# The stock list is a complete snapshot, so the table is rebuilt from scratch
# on every run.
#
# The DataFrame is exposed to DuckDB as a temporary view and materialised
# with CREATE OR REPLACE TABLE. This replaces the earlier DROP + pandas
# `to_sql` sequence: `to_sql` is documented for SQLAlchemy connectables and
# sqlite3 connections, not for DuckDB connections, and the separate DROP
# reported success even when no table existed and swallowed any error.
conn_duckdb.register("combined_df_view", combined_df)
try:
    conn_duckdb.execute(
        f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM combined_df_view"
    )
finally:
    # Always remove the temporary view, even if the table build failed.
    conn_duckdb.unregister("combined_df_view")

print(f"已將 {len(combined_df)} 筆資料存入資料庫表格: {table_name}")

In [None]:
# Read the table back to verify what was stored.
# Rows are classified by the `industry` marker rather than by
# `etf_index_name`: an ETF with an empty index name would otherwise be
# counted as a stock.
# NOTE(review): assumes ETF rows carry industry = 'ETF' and that no listed
# company uses 'ETF' as its industry — confirm against the data.
# COUNT(*) FILTER yields 0 (not NULL) on an empty table, and
# IS DISTINCT FROM also counts rows whose industry is NULL as stocks.
result_df = conn_duckdb.execute(f"""
    SELECT
        COUNT(*) AS total_count,
        COUNT(*) FILTER (WHERE industry IS DISTINCT FROM 'ETF') AS stock_count,
        COUNT(*) FILTER (WHERE industry = 'ETF') AS etf_count
    FROM {table_name}
""").fetch_df()

print("資料庫中的資料統計:")
print(result_df)

# Show a small sample of the stored rows.
print("\n前5筆資料:")
sample_df = conn_duckdb.execute(f"SELECT * FROM {table_name} LIMIT 5").fetch_df()
print(sample_df)

In [None]:
# Close the DuckDB connection and report it.
conn_duckdb.close()
print("資料庫連線已關閉")