In [4]:
import pandas as pd
import yfinance as yf
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from tqdm import tqdm  # 進度條套件

# ==========================================
# 1. Analysis window (6 months, kept short for speed) and model choice
# ==========================================
# A more volatile stretch, e.g. the second half of 2023, is suggested so the
# analysis is more meaningful.
START_DATE, END_DATE = '2023-07-01', '2023-12-31'

# Model choice: unless the assignment mandates otherwise, FinBERT is suggested
# (finance-tuned, and per the original author faster than RoBERTa-large).
# To follow the lecture notes' RoBERTa instead, set this to "roberta-large"
# (the original author estimates it is more than 3x slower).
MODEL_NAME = "ProsusAI/finbert"

# ==========================================
# 2. Environment and model setup (GPU acceleration when available)
# ==========================================
# Prefer CUDA when PyTorch reports it as available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"目前使用的運算裝置: {device}")  # shows whether the GPU is actually used

print(f"正在載入模型 {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load the classification model, then move its weights to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model = model.to(device)

# ==========================================
# 3. Data acquisition and filtering (the key speed-up step)
# ==========================================
print(f"正在下載股價資料 ({START_DATE} ~ {END_DATE})...")
# The ticker is assumed to be TSMC (2330.TW); switch to the index (^TWII) or
# another symbol as the assignment requires.
# auto_adjust is passed explicitly: its default differs between yfinance
# releases, so relying on the default makes the price columns depend on the
# installed version (and triggers a warning on some releases).
# NOTE(review): yfinance documents `end` as exclusive, so END_DATE itself is
# not part of the download — confirm that this is intended.
stock_df = yf.download(
    "2330.TW",
    start=START_DATE,
    end=END_DATE,
    auto_adjust=True,
)

# If you already have a news CSV, load it and keep only the rows inside the
# analysis window; the file must have a 'Date' column. Example:
# news_df = pd.read_csv('your_news_data.csv')
# news_df['Date'] = pd.to_datetime(news_df['Date'])
# mask = (news_df['Date'] >= START_DATE) & (news_df['Date'] <= END_DATE)
# news_df = news_df.loc[mask].reset_index(drop=True)

# [Mock data] A few fabricated headlines so the script can run end to end.
news_data = {
    # Month-end dates are generated from an offset object rather than a string
    # alias: the 'M' alias is deprecated in recent pandas, while its
    # replacement 'ME' is rejected by older releases.
    'Date': pd.date_range(
        start=START_DATE,
        periods=5,
        freq=pd.offsets.MonthEnd(),
    ),
    'Title': [
        "TSMC revenue beats expectations due to AI demand.",
        "Market crash fears rise as inflation data disappoints.",
        "Tech sector rallies on new chip breakthrough.",
        "Global supply chain issues persist, affecting output.",
        "Quarterly earnings report shows steady growth."
    ]
}
news_df = pd.DataFrame(news_data)

print(f"篩選後資料筆數: {len(news_df)} 筆 (已大幅減少運算量)")

# ==========================================
# 4. Sentiment scoring (inference loop)
# ==========================================
# Resolve the class indices from the model config instead of hard-coding 0/1,
# so the score stays correct if MODEL_NAME is switched to a checkpoint whose
# label order differs. Fail loudly when the checkpoint has no
# positive/negative labels, because the score would be meaningless.
label_to_index = {
    str(label).lower(): int(index)
    for index, label in model.config.id2label.items()
}
try:
    positive_index = label_to_index["positive"]
    negative_index = label_to_index["negative"]
except KeyError as err:
    raise ValueError(
        f"Model {MODEL_NAME!r} does not expose 'positive'/'negative' labels "
        f"(found: {sorted(label_to_index)}); cannot compute a sentiment score."
    ) from err

sentiments = []

print("開始計算情緒分數 (Inference)...")
# tqdm renders a progress bar over the headlines.
for text in tqdm(news_df['Title'], desc="Processing"):
    # Truncate to at most 128 tokens: a headline plus lead is usually short
    # enough (the original author estimates 512 would be about 4x slower).
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    ).to(device)

    # Gradients are not needed for inference; disabling them saves memory.
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Simple sentiment score: P(positive) - P(negative) for the single input.
    score = probs[0][positive_index].item() - probs[0][negative_index].item()
    sentiments.append(score)

news_df['Sentiment_Score'] = sentiments

# ==========================================
# 5. Report the results
# ==========================================
print("\n運算完成！前 5 筆結果：")
# Show the first five scored headlines.
print(news_df.head(5))

# Next step: merge news_df with stock_df for the downstream prediction work.

目前使用的運算裝置: cpu
正在載入模型 ProsusAI/finbert...
正在下載股價資料 (2023-07-01 ~ 2023-12-31)...


  stock_df = yf.download("2330.TW", start=START_DATE, end=END_DATE)
[*********************100%***********************]  1 of 1 completed
  'Date': pd.date_range(start=START_DATE, periods=5, freq='M'),


篩選後資料筆數: 5 筆 (已大幅減少運算量)
開始計算情緒分數 (Inference)...


Processing: 100%|██████████| 5/5 [00:00<00:00,  6.53it/s]


運算完成！前 5 筆結果：
        Date                                              Title  \
0 2023-07-31  TSMC revenue beats expectations due to AI demand.   
1 2023-08-31  Market crash fears rise as inflation data disa...   
2 2023-09-30      Tech sector rallies on new chip breakthrough.   
3 2023-10-31  Global supply chain issues persist, affecting ...   
4 2023-11-30     Quarterly earnings report shows steady growth.   

   Sentiment_Score  
0         0.932132  
1        -0.844522  
2         0.368385  
3        -0.953435  
4         0.932569  



