In [41]:
!jupyter nbconvert --to script first_code.ipynb

usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: kernel kernelspec migrate run troubleshoot

Jupyter command `jupyter-nbconvert` not found.


In [34]:
# ============================== #
#         匯入必要的套件         #
# ============================== #

import os
import re
import numpy as np
import pandas as pd
from datetime import datetime

import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE

# 設定顯示選項
pd.set_option('display.max_columns', None)

print("所有套件匯入完成。")


所有套件匯入完成。


In [35]:
print("===================================")
print("        CUDA 測試程式碼開始        ")
print("===================================")
print(f"PyTorch 版本: {torch.__version__}")
print(f"CUDA 可用: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA 驅動版本: {torch.version.cuda}")
    print(f"CUDA CUDNN 版本: {torch.backends.cudnn.version()}")
    print(f"CUDA 設備數量: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"CUDA 設備 {i}: {torch.cuda.get_device_name(i)}")
    print(f"當前使用的 CUDA 設備: {torch.cuda.current_device()}")
else:
    print("CUDA 不可用。請檢查以下幾點：")
    print("1. 確保已安裝 NVIDIA GPU 驅動程序。")
    print("2. 確保已安裝與 GPU 驅動程序匹配的 CUDA Toolkit。")
    print("3. 確保已安裝支持 CUDA 的 PyTorch 版本。")
    print("4. 確保您的環境中沒有其他導致 CUDA 初始化錯誤的問題。")

print("===================================")
print("          CUDA 測試程式碼結束       ")
print("===================================")

        CUDA 測試程式碼開始        
PyTorch 版本: 2.6.0.dev20241221+cu126
CUDA 可用: True
CUDA 驅動版本: 12.6
CUDA CUDNN 版本: 90501
CUDA 設備數量: 1
CUDA 設備 0: NVIDIA GeForce RTX 3050 Laptop GPU
當前使用的 CUDA 設備: 0
          CUDA 測試程式碼結束       


In [36]:
# ============================== #
#       定義輔助函數               #
# ============================== #

def clean_text(txt: str) -> str:
    """簡單文本清理，去除多餘空白和雜訊。"""
    if not isinstance(txt, str):
        return ""
    txt = re.sub(r'\s+', ' ', txt)
    return txt.strip()

def parse_cnbc_time(t):
    """解析 CNBC 的時間格式: '7:51 PM ET Fri, 17 July 2020'"""
    if not isinstance(t, str):
        return None
    try:
        dt = datetime.strptime(t, '%I:%M %p ET %a, %d %B %Y')
        return dt.strftime('%Y-%m-%d')
    except:
        return None

def parse_guardian_time(t):
    """解析 Guardian 的時間格式: '18-Jul-20'"""
    if not isinstance(t, str):
        return None
    try:
        dt = datetime.strptime(t.strip(), '%d-%b-%y')
        return dt.strftime('%Y-%m-%d')
    except:
        return None

def parse_reuters_time(t):
    """解析 Reuters 的時間格式: 'Jul 18 2020'"""
    if not isinstance(t, str):
        return None
    try:
        dt = datetime.strptime(t.strip(), '%b %d %Y')
        return dt.strftime('%Y-%m-%d')
    except:
        return None

def load_cnbc(csv_path: str) -> pd.DataFrame:
    df = pd.read_csv(csv_path, on_bad_lines='skip', engine='python')
    print(f"CNBC 原始欄位: {df.columns.tolist()}")
    expected_cols = ['Headlines', 'Time', 'Description']
    for col in expected_cols:
        if col not in df.columns:
            print(f"警告: CNBC CSV 缺少欄位 '{col}'")
    df['headline'] = df['Headlines'].fillna("").apply(clean_text)
    df['article_content'] = df['Description'].fillna("").apply(clean_text)
    df['date'] = df['Time'].apply(parse_cnbc_time)
    df = df[['date', 'headline', 'article_content']].dropna(subset=['date', 'headline'])
    df['source'] = 'CNBC'
    return df

def load_guardian(csv_path: str) -> pd.DataFrame:
    df = pd.read_csv(csv_path, on_bad_lines='skip', engine='python')
    print(f"Guardian 原始欄位: {df.columns.tolist()}")
    expected_cols = ['Time', 'Headlines']
    for col in expected_cols:
        if col not in df.columns:
            print(f"警告: Guardian CSV 缺少欄位 '{col}'")
    df['headline'] = df['Headlines'].fillna("").apply(clean_text)
    df['article_content'] = ""  # Guardian 無 Description
    df['date'] = df['Time'].apply(parse_guardian_time)
    df = df[['date', 'headline', 'article_content']].dropna(subset=['date', 'headline'])
    df['source'] = 'Guardian'
    return df

def load_reuters(csv_path: str) -> pd.DataFrame:
    df = pd.read_csv(csv_path, on_bad_lines='skip', engine='python')
    print(f"Reuters 原始欄位: {df.columns.tolist()}")
    expected_cols = ['Headlines', 'Time', 'Description']
    for col in expected_cols:
        if col not in df.columns:
            print(f"警告: Reuters CSV 缺少欄位 '{col}'")
    df['headline'] = df['Headlines'].fillna("").apply(clean_text)
    df['article_content'] = df['Description'].fillna("").apply(clean_text)
    df['date'] = df['Time'].apply(parse_reuters_time)
    df = df[['date', 'headline', 'article_content']].dropna(subset=['date', 'headline'])
    df['source'] = 'Reuters'
    return df

def merge_news(cnbc_csv, guardian_csv, reuters_csv):
    """
    讀取並合併 CNBC、Guardian 和 Reuters 的新聞資料。
    將 'Time' 欄位轉換為 'date'，並統一格式。
    """
    # 讀取 CSV 檔案
    cnbc = pd.read_csv(cnbc_csv)
    guardian = pd.read_csv(guardian_csv)
    reuters = pd.read_csv(reuters_csv)
    
    # 轉換 'Time' 欄位為 datetime 格式，並命名為 'date'
    cnbc['date'] = pd.to_datetime(cnbc['Time']).dt.date
    guardian['date'] = pd.to_datetime(guardian['Time']).dt.date
    reuters['date'] = pd.to_datetime(reuters['Time']).dt.date
    
    # 選取必要欄位
    cnbc = cnbc[['date', 'Headlines', 'Description']]
    guardian = guardian[['date', 'Headlines']]
    reuters = reuters[['date', 'Headlines', 'Description']]
    
    # 合併資料
    df_news = pd.concat([cnbc, guardian, reuters], ignore_index=True)
    
    # 確保 'date' 欄位為 datetime 類型
    df_news['date'] = pd.to_datetime(df_news['date'])
    
    # 排序
    df_news.sort_values('date', inplace=True)
    
    # 重設索引
    df_news.reset_index(drop=True, inplace=True)
    
    return df_news


def setup_finbert_pipeline():
    """初始化 FinBERT pipeline"""
    tokenizer = BertTokenizer.from_pretrained("yiyanghkust/finbert-tone")
    model = BertForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
    device = 0 if torch.cuda.is_available() else -1
    finbert_pipeline = pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        device=device
    )
    print(f"FinBERT pipeline 使用的設備: {'GPU (0)' if device == 0 else 'CPU'}")
    return finbert_pipeline


def batch_sentiment(text_list, pipeline_fn, batch_size=16, max_length=128):
    """
    手動分批，並使用 tqdm 進度條顯示處理進度。
    pipeline_fn: transformers pipeline
    text_list: 需要做情緒分析的文本列表
    """
    results = []
    for i in tqdm(range(0, len(text_list), batch_size), desc="Sentiment Analysis Batches"):
        batch_texts = text_list[i:i+batch_size]
        try:
            batch_out = pipeline_fn(
                batch_texts,
                truncation=True,
                max_length=max_length
            )
            for out in batch_out:
                label = out['label'].lower()
                score = out['score']
                if label == 'positive':
                    results.append(+score)
                elif label == 'negative':
                    results.append(-score)
                else:
                    results.append(0.0)
        except Exception as e:
            print(f"批次 {i//batch_size + 1} 解析錯誤: {e}")
            results.extend([0.0]*len(batch_texts))
    return results

def compute_daily_sentiment(df_news: pd.DataFrame, sentiment_pipeline, batch_size=16, max_length=128) -> pd.DataFrame:
    """
    對 headline & article_content 做批次情緒分析，顯示 tqdm 進度條，並彙整成每日平均 (mean_headline_sent, mean_content_sent)。
    """
    print("開始批次情緒分析 (FinBERT) ...")

    # headline
    headlines = df_news['headline'].tolist()
    print("-> 分析 Headline")
    headline_scores = batch_sentiment(headlines, sentiment_pipeline, batch_size=batch_size, max_length=max_length)

    # content
    contents = df_news['article_content'].tolist()
    print("-> 分析 Content")
    content_scores = batch_sentiment(contents, sentiment_pipeline, batch_size=batch_size, max_length=max_length)

    df_news['headline_sent'] = headline_scores
    df_news['content_sent']  = content_scores

    print("-> 彙整當日情緒...")
    daily_sentiment = (
        df_news
        .groupby('date')
        .agg({'headline_sent':'mean','content_sent':'mean'})
        .reset_index()
        .rename(columns={
            'headline_sent': 'mean_headline_sent',
            'content_sent': 'mean_content_sent'
        })
    )
    print("情緒分析完成。若出現負值，可能代表負面新聞居多。")
    return daily_sentiment

def load_sp500(sp_csv_path: str) -> pd.DataFrame:
    df_sp = pd.read_csv(sp_csv_path, on_bad_lines='skip', engine='python')
    print(f"S&P500 原始欄位: {df_sp.columns.tolist()}")
    df_sp.rename(columns={'Date':'date','S&P500':'sp_close'}, inplace=True)
    df_sp['date'] = pd.to_datetime(df_sp['date']).dt.strftime('%Y-%m-%d')
    df_sp.sort_values('date', inplace=True)
    print(f"S&P500 行數: {len(df_sp)}")
    return df_sp

def create_labels_for_prediction(df: pd.DataFrame, mode='classification'):
    """
    mode='classification': 隔日漲(1)/跌(0)
    mode='regression': 預測隔日收盤價
    """
    df['sp_close_next'] = df['sp_close'].shift(-1)
    df = df.dropna(subset=['sp_close_next'])

    if mode == 'classification':
        df['target'] = (df['sp_close_next'] > df['sp_close']).astype(int)
    else:  # regression
        df['target'] = df['sp_close_next']
    return df

def add_technical_indicators(df: pd.DataFrame) -> pd.DataFrame:
    """
    添加技術指標到 DataFrame 中。
    包括移動平均線（MA）、相對強弱指標（RSI）、移動平均收斂發散指標（MACD）、
    Bollinger Bands 等。
    """
    # 移動平均線
    df['MA_5'] = df['sp_close'].rolling(window=5).mean()
    df['MA_10'] = df['sp_close'].rolling(window=10).mean()
    
    # 相對強弱指標 (RSI)
    delta = df['sp_close'].diff()
    gain = delta.where(delta > 0, 0).rolling(window=14).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=14).mean()
    RS = gain / loss
    df['RSI'] = 100 - (100 / (1 + RS))
    
    # 移動平均收斂發散指標 (MACD)
    ema_12 = df['sp_close'].ewm(span=12, adjust=False).mean()
    ema_26 = df['sp_close'].ewm(span=26, adjust=False).mean()
    df['MACD'] = ema_12 - ema_26
    df['MACD_signal'] = df['MACD'].ewm(span=9, adjust=False).mean()
    
    # Bollinger Bands
    df['BB_upper'] = df['MA_10'] + 2 * df['sp_close'].rolling(window=10).std()
    df['BB_lower'] = df['MA_10'] - 2 * df['sp_close'].rolling(window=10).std()
    
    # 移動平均收斂發散指標差值
    df['MACD_diff'] = df['MACD'] - df['MACD_signal']
    
    # 去除因計算技術指標產生的 NaN 值
    df = df.dropna()
    
    return df

def add_sentiment_features(df: pd.DataFrame, sentiment_col='mean_headline_sent', lags=[1,2,3]) -> pd.DataFrame:
    """
    添加情緒變動值和滯後情緒特徵。
    """
    # 情緒變動值
    df['sentiment_change_1'] = df[sentiment_col].diff(1)
    df['sentiment_change_2'] = df[sentiment_col].diff(2)
    df['sentiment_change_3'] = df[sentiment_col].diff(3)
    
    # 滯後情緒特徵
    for lag in lags:
        df[f'sentiment_lag_{lag}'] = df[sentiment_col].shift(lag)
    
    # 去除因計算差分和滯後特徵產生的 NaN 值
    df = df.dropna()
    
    return df

def add_sp500_features(df: pd.DataFrame, lags=[1,2,3], rolling_window=5) -> pd.DataFrame:
    """
    添加 S&P500 的滯後漲幅和滾動平均漲幅特徵。
    同時添加滯後的 sp_close 特徵。
    """
    # 計算每日漲幅
    df.loc[:, 'sp_return'] = df['sp_close'].pct_change()
    
    # 滯後漲幅
    for lag in lags:
        df.loc[:, f'sp_return_lag_{lag}'] = df['sp_return'].shift(lag)
    
    # 滾動平均漲幅
    df.loc[:, f'sp_return_mean_{rolling_window}'] = df['sp_return'].rolling(window=rolling_window).mean()
    
    # 添加滯後的 sp_close 特徵
    for lag in lags:
        df.loc[:, f'sp_close_lag_{lag}'] = df['sp_close'].shift(lag)
    
    # 去除因計算漲幅、滯後和滾動平均產生的 NaN 值
    df = df.dropna()
    
    return df


def add_feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """
    添加所有特徵：技術指標、情緒變動與滯後、S&P500 漲幅與滯後。
    保留 'date' 欄位。
    """
    # 添加技術指標
    df = add_technical_indicators(df)
    
    # 添加情緒變動特徵和滯後特徵
    df = add_sentiment_features(df, sentiment_col='mean_headline_sent', lags=[1,2,3])
    
    # 添加 S&P500 的漲幅特徵
    df = add_sp500_features(df, lags=[1,2,3], rolling_window=5)
    
    # 保留 'date' 欄位
    if 'date' not in df.columns:
        raise KeyError("'date' 欄位在特徵工程後遺失。")
    
    return df




In [37]:
# ============================== #
#        定義模型訓練函數         #
# ============================== #

def train_random_forest(X_train, y_train):
    """訓練 Random Forest 模型。"""
    clf = RandomForestClassifier(
        n_estimators=150,
        random_state=42,
        class_weight="balanced"
    )
    clf.fit(X_train, y_train)
    return clf

def train_xgboost(X_train, y_train):
    """訓練 XGBoost 模型，使用 GPU。"""
    clf = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        n_estimators=150,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,  # 根據類別不平衡調整
        tree_method='gpu_hist',  # 使用 GPU
        predictor='gpu_predictor',  # 使用 GPU 進行預測
        gpu_id=0,  # 指定 GPU ID
        random_state=42
    )
    clf.fit(X_train, y_train)
    return clf


def train_lightgbm(X_train, y_train):
    """訓練 LightGBM 模型，使用 GPU。"""
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt',
        device='gpu',
        gpu_platform_id=0,
        gpu_device_id=0,
        n_estimators=150,
        learning_rate=0.1,
        max_depth=10,               # 增加最大深度
        num_leaves=31,              # 增加葉子節點數
        min_data_in_leaf=10,        # 減少每葉最小樣本數
        class_weight='balanced',
        random_state=42
    )
    clf.fit(X_train, y_train)
    return clf



def train_catboost(X_train, y_train):
    """訓練 CatBoost 模型，使用 GPU。"""
    clf = cb.CatBoostClassifier(
        iterations=150,
        learning_rate=0.1,
        depth=5,
        class_weights=[2,1],  # 根據類別不平衡調整
        task_type='GPU',       # 使用 GPU
        devices='0',           # 指定 GPU ID
        verbose=0,
        random_state=42
    )
    clf.fit(X_train, y_train)
    return clf


def train_mlp(X_train, y_train):
    """訓練 MLP 模型。"""
    clf = MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=42
    )
    clf.fit(X_train, y_train)
    return clf

def get_stacking_model():
    """建立 Stacking 模型，結合 Random Forest 和 Gradient Boosting。"""
    rf = RandomForestClassifier(n_estimators=150, random_state=42, class_weight='balanced')
    xgb = train_xgboost  # 已修改的 XGBoost GPU 函數
    lgbm = train_lightgbm  # 已修改的 LightGBM GPU 函數
    catboost = train_catboost  # 已修改的 CatBoost GPU 函數

    # 建立 VotingClassifier，這裡假設您已經有預先訓練的模型
    stacking_clf = VotingClassifier(
        estimators=[
            ('rf', rf),
            ('xgb', xgb),
            ('lgbm', lgbm),
            ('catboost', catboost)
        ],
        voting='soft'  # 使用概率投票
    )
    return stacking_clf
def train_stacking(X_train, y_train):
    """訓練 Stacking 模型。"""
    stacking_clf = get_stacking_model()
    stacking_clf.fit(X_train, y_train)
    return stacking_clf


In [38]:
# ============================== #
#      定義評估與交叉驗證函數     #
# ============================== #

def evaluate_classification(y_true, y_pred, y_prob=None):
    """評估分類模型。"""
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    if auc is not None:
        print(f"AUC-ROC: {auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))
    return acc, f1, auc

def timeseries_cv_and_train(X, y, model_fn, n_splits=5, is_lstm=False):
    """
    使用 TimeSeriesSplit 進行交叉驗證，回傳平均 Accuracy、F1 分數和 AUC-ROC。
    model_fn: 傳入可呼叫 (X_train, y_train) -> (model, predict_fn)
    is_lstm: True 表示 LSTM，需要 reshape
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    acc_scores = []
    f1_scores = []
    auc_scores = []
    
    for fold_idx, (train_idx, val_idx) in enumerate(tscv.split(X)):
        print(f"\nFold {fold_idx+1}/{n_splits} 開始...")
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        print(f"Fold {fold_idx+1} - X_train shape: {X_train.shape}")
        print(f"Fold {fold_idx+1} - X_val shape: {X_val.shape}")
        print(f"Fold {fold_idx+1} - y_train shape: {y_train.shape}")
        print(f"Fold {fold_idx+1} - y_val shape: {y_val.shape}")
        
        model, pred_fn = model_fn(X_train, y_train)
        y_pred = pred_fn(X_val)
        
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_val)[:,1]
        else:
            y_prob = None
        
        acc, f1, auc = evaluate_classification(y_val, y_pred, y_prob)
        acc_scores.append(acc)
        f1_scores.append(f1)
        auc_scores.append(auc if auc is not None else 0)
    
    avg_acc = np.mean(acc_scores)
    avg_f1 = np.mean(f1_scores)
    avg_auc = np.mean(auc_scores)
    print(f"\n=== TimeSeriesSplit 平均 ACC: {avg_acc:.4f}, F1: {avg_f1:.4f}, AUC-ROC: {avg_auc:.4f} ===")
    return avg_acc, avg_f1, avg_auc


In [39]:
def add_sentiment_features(df: pd.DataFrame, sentiment_col='mean_headline_sent', lags=[1,2,3]) -> pd.DataFrame:
    """
    添加情緒變動值和滯後情緒特徵。
    """
    # 使用 .loc 進行安全的賦值操作
    df.loc[:, 'sentiment_change_1'] = df[sentiment_col].diff(1)
    df.loc[:, 'sentiment_change_2'] = df[sentiment_col].diff(2)
    df.loc[:, 'sentiment_change_3'] = df[sentiment_col].diff(3)
    
    # 滯後情緒特徵
    for lag in lags:
        df.loc[:, f'sentiment_lag_{lag}'] = df[sentiment_col].shift(lag)
    
    # 去除因計算差分和滯後特徵產生的 NaN 值
    df = df.dropna()
    
    return df


In [40]:
# ============================== #
#          主流程開始               #
# ============================== #


# ============================== #
#       1. 讀取並合併新聞資料       #
# ============================== #

# 使用範例路徑（請根據您的實際檔案路徑修改）
cnbc_csv     = r"C:\Users\morri\Desktop\IRTM-project\test\cnbc_headlines.csv"
guardian_csv = r"C:\Users\morri\Desktop\IRTM-project\test\guardian_headlines.csv"
reuters_csv  = r"C:\Users\morri\Desktop\IRTM-project\test\reuters_headlines.csv"

# 讀取並合併新聞資料
df_news = merge_news(cnbc_csv, guardian_csv, reuters_csv)
print(f"[News] total: {len(df_news)} rows from 3 sources.")

# 檢查合併後的資料
print("\n合併後的資料預覽：")
display(df_news.head())
print("df_news 的欄位:")
print(df_news.columns.tolist())

# ============================== #
#     2. 初始化 FinBERT pipeline     #
# ============================== #

finbert_pipe = setup_finbert_pipeline()
print("FinBERT pipeline 初始化完成。")

# ============================== #
#      3. 批次情緒分析             #
# ============================== #

# 計算每日情緒分數
daily_sentiment_df = compute_daily_sentiment(df_news, finbert_pipe, batch_size=32, max_length=128)
daily_sentiment_df.to_csv("daily_sentiment.csv", index=False)
print("[Output] daily_sentiment.csv 已輸出.")

# ============================== #
#        4. 讀取 S&P500 資料          #
# ============================== #

# 使用範例路徑（請根據您的實際檔案路徑修改）
sp500_csv    = r"C:\Users\morri\Desktop\IRTM-project\test\sp500_index.csv"

# 讀取 S&P500 資料
df_sp = load_sp500(sp500_csv)
print(f"[SP500] total: {len(df_sp)} rows.")

# ============================== #
#  5. 合併情緒分數與 S&P500 資料  #
# ============================== #

print("\n合併情緒分數與 S&P500 資料...")
df_merged = pd.merge(daily_sentiment_df, df_sp, on='date', how='inner')
df_merged.sort_values('date', inplace=True)
df_merged.dropna(inplace=True)
print(f"[Merged] total: {len(df_merged)} rows.")

# 檢查合併後的資料
print("\n合併後的資料預覽：")
display(df_merged.head())

# ============================== #
#      6. 建立標籤 (隔日漲跌)       #
# ============================== #

print("\n建立標籤 (隔日漲跌)...")
df_merged = create_labels_for_prediction(df_merged, mode='classification')
print(f"[Merged+Target] total: {len(df_merged)} rows with target.")

# 檢查建立標籤後的資料
print("\n建立標籤後的資料預覽：")
display(df_merged.head())




ParserError: Error tokenizing data. C error: Expected 3 fields in line 4, saw 4


In [33]:
# ============================== #
#       7. 添加所有特徵              #
# ============================== #

# 添加技術指標、情緒變動特徵、滯後漲幅等
df_merged = add_feature_engineering(df_merged)
print("\n添加所有特徵後的資料預覽：")
display(df_merged.head())

# ============================== #
#    8. 準備特徵與標準化           #
# ============================== #

# 定義特徵列表
features = [
    'mean_headline_sent',
    'mean_content_sent',
    'sp_close',
    'sp_close_lag_1',
    'sp_close_lag_2',
    'sp_close_lag_3',
    'MA_5',
    'MA_10',
    'RSI',
    'MACD',
    'MACD_signal',
    'MACD_diff',
    'BB_upper',
    'BB_lower',
    'sentiment_change_1',
    'sentiment_change_2',
    'sentiment_change_3',
    'sentiment_lag_1',
    'sentiment_lag_2',
    'sentiment_lag_3',
    'sp_return',
    'sp_return_lag_1',
    'sp_return_lag_2',
    'sp_return_lag_3',
    'sp_return_mean_5'
]

# 確認所有特徵都存在
missing_features = [feat for feat in features if feat not in df_merged.columns]
if missing_features:
    print(f"錯誤: 缺少特徵 {missing_features}")
else:
    print("所有特徵均存在。")

# 填補 NaN 值並確保數據類型正確
df_merged[features] = df_merged[features].astype(float).fillna(0)

# 準備特徵和標籤
X_full = df_merged[features].values
y_full = df_merged['target'].values

# 標準化特徵
scaler = StandardScaler()
X_full_scaled = scaler.fit_transform(X_full)
print("[Info] 特徵標準化完成.")
print(f"X_full_scaled shape: {X_full_scaled.shape}")

# ============================== #
#         9. 交叉驗證與訓練        #
# ============================== #

# 定義模型函數
def model_fn_rf(X, y):
    model = train_random_forest(X, y)
    return model, lambda X_val: model.predict(X_val)

def model_fn_xgb(X, y):
    model = train_xgboost(X, y)
    return model, lambda X_val: model.predict(X_val)

def model_fn_lightgbm(X, y):
    model = train_lightgbm(X, y)
    return model, lambda X_val: model.predict(X_val)

def model_fn_catboost(X, y):
    model = train_catboost(X, y)
    return model, lambda X_val: model.predict(X_val)

def model_fn_mlp(X, y):
    model = train_mlp(X, y)
    return model, lambda X_val: model.predict(X_val)

def model_fn_stacking(X, y):
    model = train_stacking(X, y)
    return model, lambda X_val: model.predict(X_val)

def model_fn_xgb_grid(X, y):
    model = train_xgboost_gridsearch(X, y)
    return model, lambda X_val: model.predict(X_val)

def model_fn_xgb_random(X, y):
    model = train_xgboost_randomsearch(X, y)
    return model, lambda X_val: model.predict(X_val)

# 定義超參數調整函數
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def train_xgboost_gridsearch(X_train, y_train):
    """
    使用 GridSearchCV 進行超參數調整的 XGBoost 模型。
    """
    param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'scale_pos_weight': [1, 2, 3]  # 根據類別不平衡調整
    }
    
    clf = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42
    )
    
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        scoring='accuracy',
        cv=3,
        verbose=1,
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
    
    best_model = grid_search.best_estimator_
    return best_model

def train_xgboost_randomsearch(X_train, y_train):
    """
    使用 RandomizedSearchCV 進行超參數調整的 XGBoost 模型。
    """
    param_dist = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 10],
        'subsample': [0.6, 0.7, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 1.0],
        'gamma': [0, 0.1, 0.2, 0.3],
        'reg_alpha': [0, 0.01, 0.1],
        'reg_lambda': [1, 1.5, 2]
    }
    
    clf = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42
    )
    
    random_search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_dist,
        n_iter=100,
        scoring='accuracy',
        cv=3,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )
    
    random_search.fit(X_train, y_train)
    print(f"Best parameters: {random_search.best_params_}")
    print(f"Best cross-validation accuracy: {random_search.best_score_:.4f}")
    
    best_model = random_search.best_estimator_
    return best_model

# 執行交叉驗證
print("\n=== TimeSeriesSplit: Random Forest ===")
avg_acc_rf, avg_f1_rf, avg_auc_rf = timeseries_cv_and_train(
    X_full_scaled, y_full, 
    model_fn=model_fn_rf, 
    n_splits=3, 
    is_lstm=False
)

print("\n=== TimeSeriesSplit: XGBoost ===")
avg_acc_xgb, avg_f1_xgb, avg_auc_xgb = timeseries_cv_and_train(
    X_full_scaled, y_full, 
    model_fn=model_fn_xgb, 
    n_splits=3, 
    is_lstm=False
)

print("\n=== TimeSeriesSplit: LightGBM ===")
avg_acc_lgbm, avg_f1_lgbm, avg_auc_lgbm = timeseries_cv_and_train(
    X_full_scaled, y_full, 
    model_fn=model_fn_lightgbm, 
    n_splits=3, 
    is_lstm=False
)

print("\n=== TimeSeriesSplit: CatBoost ===")
avg_acc_cb, avg_f1_cb, avg_auc_cb = timeseries_cv_and_train(
    X_full_scaled, y_full, 
    model_fn=model_fn_catboost, 
    n_splits=3, 
    is_lstm=False
)

print("\n=== TimeSeriesSplit: MLP ===")
avg_acc_mlp, avg_f1_mlp, avg_auc_mlp = timeseries_cv_and_train(
    X_full_scaled, y_full, 
    model_fn=model_fn_mlp, 
    n_splits=3, 
    is_lstm=False
)

print("\n=== TimeSeriesSplit: Stacking ===")
avg_acc_stack, avg_f1_stack, avg_auc_stack = timeseries_cv_and_train(
    X_full_scaled, y_full, 
    model_fn=model_fn_stacking, 
    n_splits=3, 
    is_lstm=False
)

print("\n=== TimeSeriesSplit: XGBoost GridSearch ===")
avg_acc_xgb_grid, avg_f1_xgb_grid, avg_auc_xgb_grid = timeseries_cv_and_train(
    X_full_scaled, y_full, 
    model_fn=model_fn_xgb_grid, 
    n_splits=3, 
    is_lstm=False
)

print("\n=== TimeSeriesSplit: XGBoost RandomSearch ===")
avg_acc_xgb_random, avg_f1_xgb_random, avg_auc_xgb_random = timeseries_cv_and_train(
    X_full_scaled, y_full, 
    model_fn=model_fn_xgb_random, 
    n_splits=3, 
    is_lstm=False
)

# 選擇最佳模型
model_acc = {
    "RandomForest": avg_acc_rf,
    "XGBoost": avg_acc_xgb,
    "LightGBM": avg_acc_lgbm,
    "CatBoost": avg_acc_cb,
    "MLP": avg_acc_mlp,
    "Stacking": avg_acc_stack,
    "XGBoost_GridSearch": avg_acc_xgb_grid,
    "XGBoost_RandomSearch": avg_acc_xgb_random
}

best_model_name = max(model_acc, key=model_acc.get)
best_acc = model_acc[best_model_name]

print(f"\n[Best Model] {best_model_name}, ACC={best_acc:.4f}\n")

# ============================== #
#        10. 訓練最佳模型           #
# ============================== #

# 分割訓練集和測試集的比例
split_ratio = 0.8
split_index = int(len(df_merged) * split_ratio)

# 提取訓練集和測試集
df_train = df_merged.iloc[:split_index].copy()
df_test = df_merged.iloc[split_index:].copy()

# 確保 'date' 欄位存在
if 'date' not in df_test.columns:
    raise KeyError("'date' 欄位在測試集中不存在。請檢查特徵工程和數據分割步驟。")

# 準備訓練集和測試集的特徵和標籤
X_train, X_test = X_full_scaled[:split_index], X_full_scaled[split_index:]
y_train, y_test = y_full[:split_index], y_full[split_index:]

print(f"訓練集大小: {X_train.shape[0]}")
print(f"測試集大小: {X_test.shape[0]}")

# 處理類別不平衡（過採樣）
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"重新採樣後訓練集大小: {X_train_resampled.shape[0]}")
print(f"重新採樣後測試集大小: {X_test.shape[0]}")

# 定義並訓練最佳模型
def get_best_model(model_name):
    if model_name == "RandomForest":
        return train_random_forest(X_train_resampled, y_train_resampled)
    elif model_name == "XGBoost":
        return train_xgboost(X_train_resampled, y_train_resampled)
    elif model_name == "LightGBM":
        return train_lightgbm(X_train_resampled, y_train_resampled)
    elif model_name == "CatBoost":
        return train_catboost(X_train_resampled, y_train_resampled)
    elif model_name == "MLP":
        return train_mlp(X_train_resampled, y_train_resampled)  # 若使用 PyTorch 的 MLP，需另外處理
    elif model_name == "Stacking":
        return train_stacking(X_train_resampled, y_train_resampled)  # 使用 GPU 支持的 Stacking 模型
    elif model_name == "XGBoost_GridSearch":
        return train_xgboost_gridsearch(X_train_resampled, y_train_resampled)
    elif model_name == "XGBoost_RandomSearch":
        return train_xgboost_randomsearch(X_train_resampled, y_train_resampled)
    else:
        raise ValueError("未知的模型名稱")

# 獲取最佳模型
final_model = get_best_model(best_model_name)

# 在測試集上進行預測
if best_model_name in ["Stacking"]:
    y_pred_test = final_model.predict(X_test)
    y_prob_test = final_model.predict_proba(X_test)[:,1]
else:
    y_pred_test = final_model.predict(X_test)
    if hasattr(final_model, "predict_proba"):
        y_prob_test = final_model.predict_proba(X_test)[:,1]
    else:
        y_prob_test = None

# 添加預測結果到 df_test
df_test.loc[:, 'y_pred'] = y_pred_test


# 評估模型
def evaluate_final_model(y_true, y_pred, y_prob):
    """評估最終模型。"""
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None
    print(f"Final Model Accuracy: {acc:.4f}")
    print(f"Final Model F1 Score: {f1:.4f}")
    if auc is not None:
        print(f"Final Model AUC-ROC: {auc:.4f}")
    print("\nFinal Model Classification Report:")
    print(classification_report(y_true, y_pred))
    return acc, f1, auc

acc_test, f1_test, auc_test = evaluate_final_model(y_test, y_pred_test, y_prob_test)


添加所有特徵後的資料預覽：


Unnamed: 0,date,mean_headline_sent,mean_content_sent,sp_close,sp_close_next,target,MA_5,MA_10,RSI,MACD,MACD_signal,BB_upper,BB_lower,MACD_diff,sentiment_change_1,sentiment_change_2,sentiment_change_3,sentiment_lag_1,sentiment_lag_2,sentiment_lag_3,sp_return,sp_return_lag_1,sp_return_lag_2,sp_return_lag_3,sp_return_mean_5,sp_close_lag_1,sp_close_lag_2,sp_close_lag_3
126,2018-06-27,-0.355807,-0.135176,2699.63,2716.31,1,2728.88,2751.021,26.277666,3.077169,13.039142,2808.053551,2693.988449,-9.961972,-0.170921,-0.126648,-0.324425,-0.184886,-0.22916,-0.031382,-0.008604,0.002205,-0.013725,0.001862,-0.004922,2723.06,2717.07,2754.88
127,2018-06-28,-0.130231,-0.10373,2716.31,2718.37,1,2722.19,2744.403,30.040733,0.809837,10.593281,2800.558253,2688.247747,-9.783444,0.225576,0.054655,0.098928,-0.355807,-0.184886,-0.22916,0.006179,-0.008604,0.002205,-0.013725,-0.002417,2699.63,2723.06,2717.07
128,2018-06-29,-0.000166,-0.090418,2718.37,2726.71,1,2714.888,2738.274,29.633186,-0.811462,8.312332,2790.573108,2685.974892,-9.123794,0.130065,0.355641,0.18472,-0.130231,-0.355807,-0.184886,0.000758,0.006179,-0.008604,0.002205,-0.002637,2716.31,2699.63,2723.06
129,2018-07-02,-0.117394,-0.172609,2726.71,2713.22,0,2716.816,2733.57,31.170946,-1.407164,6.368433,2779.796957,2687.343043,-7.775597,-0.117228,0.012837,0.238413,-0.000166,-0.130231,-0.355807,0.003068,0.000758,0.006179,-0.008604,0.000721,2718.37,2716.31,2699.63
130,2018-07-03,-0.123707,-0.146778,2713.22,2736.61,1,2714.848,2728.633,30.734087,-2.933971,4.507952,2771.509144,2685.756856,-7.441923,-0.006312,-0.123541,0.006525,-0.117394,-0.000166,-0.130231,-0.004947,0.003068,0.000758,0.006179,-0.000709,2726.71,2718.37,2716.31


所有特徵均存在。
[Info] 特徵標準化完成.
X_full_scaled shape: (517, 25)

=== TimeSeriesSplit: Random Forest ===

Fold 1/3 開始...
Fold 1 - X_train shape: (130, 25)
Fold 1 - X_val shape: (129, 25)
Fold 1 - y_train shape: (130,)
Fold 1 - y_val shape: (129,)
Accuracy: 0.4031
F1 Score: 0.1149
AUC-ROC: 0.6028

Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.90      0.55        52
           1       0.50      0.06      0.11        77

    accuracy                           0.40       129
   macro avg       0.45      0.48      0.33       129
weighted avg       0.46      0.40      0.29       129


Fold 2/3 開始...
Fold 2 - X_train shape: (259, 25)
Fold 2 - X_val shape: (129, 25)
Fold 2 - y_train shape: (259,)
Fold 2 - y_val shape: (129,)
Accuracy: 0.4961
F1 Score: 0.4961
AUC-ROC: 0.4788

Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.62      0.50        52
           1       0.62      0.42   


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.



Accuracy: 0.4884
F1 Score: 0.3774
AUC-ROC: 0.5450

Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.83      0.57        52
           1       0.69      0.26      0.38        77

    accuracy                           0.49       129
   macro avg       0.56      0.54      0.47       129
weighted avg       0.58      0.49      0.45       129


Fold 2/3 開始...
Fold 2 - X_train shape: (259, 25)
Fold 2 - X_val shape: (129, 25)
Fold 2 - y_train shape: (259,)
Fold 2 - y_val shape: (129,)
Accuracy: 0.4419
F1 Score: 0.4194
AUC-ROC: 0.4488

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.60      0.46        52
           1       0.55      0.34      0.42        77

    accuracy                           0.44       129
   macro avg       0.47      0.47      0.44       129
weighted avg       0.48      0.44      0.44       129


Fold 3/3 開始...
Fold 3 - X_train shape: (388, 25)
Fold 3


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



Accuracy: 0.5349
F1 Score: 0.6341
AUC-ROC: 0.4647

Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.30      0.36        57
           1       0.57      0.72      0.63        72

    accuracy                           0.53       129
   macro avg       0.51      0.51      0.50       129
weighted avg       0.52      0.53      0.51       129


=== TimeSeriesSplit 平均 ACC: 0.4884, F1: 0.4770, AUC-ROC: 0.4861 ===

=== TimeSeriesSplit: LightGBM ===

Fold 1/3 開始...
Fold 1 - X_train shape: (130, 25)
Fold 1 - X_val shape: (129, 25)
Fold 1 - y_train shape: (130,)
Fold 1 - y_val shape: (129,)
[LightGBM] [Info] Number of positive: 64, number of negative: 66
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1124
[LightGBM] [Info] Number of data points in the train set: 130, number of used features: 25
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 305



Accuracy: 0.4806
F1 Score: 0.3619
AUC-ROC: 0.5562

Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.83      0.56        52
           1       0.68      0.25      0.36        77

    accuracy                           0.48       129
   macro avg       0.55      0.54      0.46       129
weighted avg       0.58      0.48      0.44       129


Fold 2/3 開始...
Fold 2 - X_train shape: (259, 25)
Fold 2 - X_val shape: (129, 25)
Fold 2 - y_train shape: (259,)
Fold 2 - y_val shape: (129,)




Accuracy: 0.3876
F1 Score: 0.3577
AUC-ROC: 0.4101

Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.54      0.41        52
           1       0.48      0.29      0.36        77

    accuracy                           0.39       129
   macro avg       0.41      0.41      0.39       129
weighted avg       0.42      0.39      0.38       129


Fold 3/3 開始...
Fold 3 - X_train shape: (388, 25)
Fold 3 - X_val shape: (129, 25)
Fold 3 - y_train shape: (388,)
Fold 3 - y_val shape: (129,)




Accuracy: 0.5194
F1 Score: 0.6220
AUC-ROC: 0.4683

Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.28      0.34        57
           1       0.55      0.71      0.62        72

    accuracy                           0.52       129
   macro avg       0.49      0.49      0.48       129
weighted avg       0.50      0.52      0.50       129


=== TimeSeriesSplit 平均 ACC: 0.4625, F1: 0.4472, AUC-ROC: 0.4782 ===

=== TimeSeriesSplit: MLP ===

Fold 1/3 開始...
Fold 1 - X_train shape: (130, 25)
Fold 1 - X_val shape: (129, 25)
Fold 1 - y_train shape: (130,)
Fold 1 - y_val shape: (129,)
Accuracy: 0.4574
F1 Score: 0.4615
AUC-ROC: 0.5320

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.56      0.45        52
           1       0.57      0.39      0.46        77

    accuracy                           0.46       129
   macro avg       0.47      0.47      0.46       129
weighted avg

ValueError: The estimator function should be a classifier.

In [None]:
# ============================== #
#        7. 視覺化結果            #
# ============================== #

def smooth_curve(y, window=5):
    """使用移動平均平滑曲線。"""
    return pd.Series(y).rolling(window=window, min_periods=1).mean().values

# 平滑處理
df_test['sp_close_smooth'] = smooth_curve(df_test['sp_close'], window=5)
df_test['mean_headline_sent_smooth'] = smooth_curve(df_test['mean_headline_sent'], window=5)
df_test['y_pred_smooth'] = smooth_curve(df_test['y_pred'], window=5)

# 設定日期為索引（如果尚未設定）
df_test['date'] = pd.to_datetime(df_test['date'])
df_test.set_index('date', inplace=True)

# 創建一個雙 y 軸的圖表
fig, ax1 = plt.subplots(figsize=(14,7))

# 繪製 S&P500 指數
ax1.plot(df_test.index, df_test['sp_close_smooth'], color='blue', label='S&P 500')
ax1.set_xlabel('Date')
ax1.set_ylabel('S&P500', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# 繪製情緒分析分數
ax2 = ax1.twinx()
ax2.plot(df_test.index, df_test['mean_headline_sent_smooth'], color='green', label='avg sentiment(smoothed)')
ax2.set_ylabel('sentiment', color='green')
ax2.tick_params(axis='y', labelcolor='green')

# 繪製預測結果
plt.plot(df_test.index, df_test['y_pred_smooth'], color='red', linestyle='--', label='predict (smothed)')

# 合併圖例
lines_1, labels_1 = ax1.get_legend_handles_labels()
lines_2, labels_2 = ax2.get_legend_handles_labels()
plt.legend(lines_1 + lines_2, labels_1 + labels_2, loc='upper left')

plt.title(f"Final Model ({best_model_name}) on Test Data (Smoothed)")
plt.grid(True)
plt.show()


KeyError: 'date'

In [14]:
# ============================== #
#       8. 輸出合併資料              #
# ============================== #

df_merged.to_csv("merged_data.csv", index=False)
print("merged_data.csv 已輸出。")


merged_data.csv 已輸出。
