In [2]:
import pandas as pd
import yfinance as yf
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import warnings

# Silence FutureWarning (version-deprecation) messages for the whole run.
warnings.simplefilter("ignore", FutureWarning)

# ==========================================
# Run parameters: MediaTek (2454.TW)
# ==========================================
START_DATE, END_DATE = '2023-07-01', '2023-12-31'  # date window passed to the price download
STOCK_ID = "2454.TW"                               # ticker symbol
MODEL_NAME = "ProsusAI/finbert"                    # Hugging Face checkpoint used for sentiment

# Use CUDA when PyTorch reports it as available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"目前使用的運算裝置: {device}")

# ==========================================
# 1. Numeric data: download prices and derive features
# ==========================================
print(f"1. 正在下載 {STOCK_ID} 股價資料...")
# Pin auto_adjust explicitly: the library default has differed between
# yfinance releases, and relying on it makes 'Close' mean different things
# depending on the installed version. True matches the adjusted-price columns
# this script was written against (no separate 'Adj Close' column).
stock_df = yf.download(STOCK_ID, start=START_DATE, end=END_DATE, auto_adjust=True)

# A failed download comes back as an empty frame instead of raising; stop here
# rather than reporting a "successful" fusion of zero rows later on.
if stock_df is None or stock_df.empty:
    raise RuntimeError(
        f"No price data returned for {STOCK_ID} between {START_DATE} and {END_DATE}"
    )

# Flatten two-level columns, e.g. ('Close', '2454.TW') -> 'Close', so the
# frame can be merged on a plain 'Date' column.
if isinstance(stock_df.columns, pd.MultiIndex):
    stock_df.columns = stock_df.columns.droplevel(1)

# Move the date index into a regular 'Date' column.
stock_df = stock_df.reset_index()

# Make 'Date' timezone-naive so it compares equal to the news dates.
stock_df['Date'] = pd.to_datetime(stock_df['Date']).dt.tz_localize(None)

# Feature engineering: 5-row rolling mean of Close (NaN for the first 4 rows).
stock_df['MA5'] = stock_df['Close'].rolling(window=5).mean()

# Print the column list to confirm it is single-level: ['Date', 'Open', ...].
print(f"   股價資料格式確認: {stock_df.columns.tolist()}")

# ==========================================
# 2. Text data: sentiment scoring of news headlines
# ==========================================
print("2. 載入 FinBERT 模型與模擬聯發科新聞...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)

# Resolve the class positions from the checkpoint's own label map instead of
# assuming an order: different FinBERT checkpoints order their labels
# differently, and a wrong index would silently produce wrong scores.
# Falls back to 0 / 1 (the positions this script originally assumed) when the
# checkpoint does not expose "positive" / "negative" label names.
label_to_index = {
    str(label).lower(): int(idx) for idx, label in model.config.id2label.items()
}
positive_idx = label_to_index.get("positive", 0)
negative_idx = label_to_index.get("negative", 1)

# [Simulated news data]
news_data = {
    'Date': pd.to_datetime([
        '2023-07-31', '2023-08-15', '2023-09-20',
        '2023-10-27', '2023-11-10', '2023-12-15'
    ]),
    'Title': [
        "MediaTek unveils new Dimensity 9300 chip with powerful AI capabilities.",
        "Smartphone demand slows down, affecting mobile chip shipments.",
        "MediaTek partners with NVIDIA for automotive cockpit solutions.",
        "Q3 earnings miss expectations due to global inventory adjustments.",
        "Analyst upgrades MediaTek rating citing 5G growth potential.",
        "New flagship SoC shows record-breaking benchmark performance."
    ]
}
news_df = pd.DataFrame(news_data)

sentiments = []
print("   開始計算情緒分數...")
for text in tqdm(news_df['Title'], desc="Sentiment Inference"):
    # One headline per forward pass; inputs are truncated to 128 tokens.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # Score = P(positive) - P(negative), so it lies in [-1, 1].
    score = probs[0][positive_idx].item() - probs[0][negative_idx].item()
    sentiments.append(score)

news_df['Sentiment_Score'] = sentiments

# ==========================================
# 3. Multimodal data fusion
# ==========================================
print("3. 執行多模態資料融合 (Data Alignment & Fusion)...")

# Collapse the news to one row per calendar day before joining:
#   * normalize() drops any time-of-day so a headline stamped e.g. 09:30
#     still matches the midnight-stamped trading date;
#   * averaging per day keeps the join many-to-one, so two headlines on the
#     same date cannot duplicate that day's price row.
daily_sentiment = (
    news_df.assign(Date=news_df['Date'].dt.normalize())
    .groupby('Date', as_index=False)['Sentiment_Score']
    .mean()
)

# Left join keeps every trading day; days without news get NaN here.
merged_df = pd.merge(stock_df, daily_sentiment, on='Date', how='left')

# Fill no-news days with 0, meaning neutral / no influence.
merged_df['Sentiment_Score'] = merged_df['Sentiment_Score'].fillna(0)

# ==========================================
# 4. Output check
# ==========================================
print("\n[驗證成功] 多模態融合資料表 (含新聞的交易日)：")
# Rows whose score is non-zero are the trading days that matched a headline;
# listing them with their Close price shows that the join lined up.
has_news = merged_df['Sentiment_Score'].ne(0)
print(merged_df.loc[has_news, ['Date', 'Close', 'Sentiment_Score']])

print("\n資料結構 (前 5 筆)：")
# First five rows of the fused table.
print(merged_df.head())

[*********************100%***********************]  1 of 1 completed

目前使用的運算裝置: cpu
1. 正在下載 2454.TW 股價資料...
   股價資料格式確認: ['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'MA5']
2. 載入 FinBERT 模型與模擬聯發科新聞...





   開始計算情緒分數...


Sentiment Inference: 100%|██████████| 6/6 [00:02<00:00,  2.26it/s]

3. 執行多模態資料融合 (Data Alignment & Fusion)...

[驗證成功] 多模態融合資料表 (含新聞的交易日)：
          Date       Close  Sentiment_Score
20  2023-07-31  631.728333         0.737804
30  2023-08-15  620.741760        -0.961125
56  2023-09-20  694.901123         0.434905
80  2023-10-27  733.354126        -0.955735
90  2023-11-10  809.344727         0.914452
115 2023-12-15  911.886108         0.929232

資料結構 (前 5 筆)：
        Date       Close        High         Low        Open   Volume  \
0 2023-07-03  632.643860  639.052698  631.728312  633.559408  2906690   
1 2023-07-04  640.883789  642.714886  632.643855  637.221596  3753123   
2 2023-07-05  638.137146  643.630436  637.221598  643.630436  3610090   
3 2023-07-06  631.728333  639.968267  629.897236  633.559429  4200756   
4 2023-07-07  624.403870  628.066062  622.572773  628.066062  4297744   

          MA5  Sentiment_Score  
0         NaN              0.0  
1         NaN              0.0  
2         NaN              0.0  
3         NaN              0.0  
4  


