In [None]:
# 導入必要的庫
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

# Display options: never truncate columns, cap printed rows at 100
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)


In [None]:
# 讀取和準備數據
def prepare_data(file_path, stock_id):
    """
    Load one CSV file and shape it for feature extraction.

    The value of ``DIRECTION`` on the *following* row becomes the label
    column ``PD``; rows left without a label (such as the final row, which
    has no following row) are dropped.

    Parameters:
    file_path (str): path of the CSV file to read
    stock_id (str): value written into the ``Stock_ID`` column of every row

    Returns:
    DataFrame: the loaded rows without ``DIRECTION`` / ``PREDICT_TARGET``,
        with ``Stock_ID`` and ``PD`` appended and ``Date`` parsed to datetimes
    """
    raw = pd.read_csv(file_path)

    # Tag every row with the stock id and build the label from the next row
    labelled = raw.assign(
        Stock_ID=stock_id,
        PD=lambda frame: frame['DIRECTION'].shift(-1),
    ).drop(columns=['DIRECTION'])

    # Parse the date strings in place (the column keeps its position)
    dated = labelled.assign(Date=lambda frame: pd.to_datetime(frame['Date']))

    # PREDICT_TARGET is optional in the input, hence errors='ignore'
    cleaned = dated.drop(columns=['PREDICT_TARGET'], errors='ignore')
    return cleaned.dropna(subset=['PD'])

# Example usage
file_path = "data_prep/ta_data/Consumer Discretionary/AAP/AAP_1D_ta.csv"
df = prepare_data(file_path=file_path, stock_id='AAP')
print("數據準備完成，形狀:", df.shape)


In [None]:
# 特徵提取函數
def extract_time_series_features(df, minimal=False, target_column='PD'):
    """
    Extract tsfresh features from a time-series frame.

    Series are identified by the ``Stock_ID`` column and ordered by the
    ``Date`` column; every remaining column is handed to tsfresh as a value
    column.

    Parameters:
    df (DataFrame): input frame containing ``Stock_ID`` and ``Date`` columns
    minimal (bool): if True, restrict extraction to ``MinimalFCParameters()``
        to speed processing up; otherwise use tsfresh's default feature set
    target_column (str or None): label column to keep out of the extraction
        (ignored when absent from ``df``); pass None to extract from every
        column

    Returns:
    DataFrame: the feature matrix produced by ``tsfresh.extract_features``.
        tsfresh documents this as one row per distinct id, so the row count
        follows the number of ``Stock_ID`` values, not the number of dates.
    """
    # Keep the label out of the inputs: features computed from the target
    # itself would leak the answer into the feature matrix.
    if target_column is None:
        value_frame = df
    else:
        value_frame = df.drop(columns=[target_column], errors='ignore')

    extraction_kwargs = {'column_id': 'Stock_ID', 'column_sort': 'Date'}
    if minimal:
        # Smaller feature set, much faster to compute
        extraction_kwargs['default_fc_parameters'] = MinimalFCParameters()

    return extract_features(value_frame, **extraction_kwargs)

# Run the extraction with the minimal feature set and report the column count
features = extract_time_series_features(df, minimal=True)
n_feature_columns = features.shape[1]
print("特徵提取完成，特徵數量:", n_feature_columns)


In [None]:
# 特徵選擇函數
def select_features(X, y, n_estimators=100, random_state=42):
    """
    Select features by random-forest importance.

    A ``RandomForestClassifier`` is fitted on ``(X, y)`` and wrapped in
    ``SelectFromModel`` with its default threshold, which keeps the columns
    the selector marks as supported.

    Parameters:
    X (DataFrame): feature matrix, one row per sample (column labels are
        used to name the selected features)
    y (Series): target variable, one entry per row of ``X``
    n_estimators (int): number of trees in the random forest
    random_state (int or None): seed for the forest, for reproducible
        selections

    Returns:
    tuple: ``(X_reduced, selected_features)`` where ``X_reduced`` is the
        output of ``SelectFromModel.transform(X)`` and ``selected_features``
        is the list of the kept column names from ``X``

    Raises:
    ValueError: if ``X`` and ``y`` do not have the same number of rows
    """
    n_samples, n_targets = len(X), len(y)
    if n_samples != n_targets:
        # Fail early with an actionable message instead of deep inside fit()
        raise ValueError(
            f"X has {n_samples} rows but y has {n_targets} entries; "
            "the feature matrix and the target must be aligned row-for-row"
        )

    # Fit the forest whose feature importances drive the selection
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X, y)

    # prefit=True: reuse the fitted forest instead of refitting a clone
    selector = SelectFromModel(rf, prefit=True)
    X_reduced = selector.transform(X)

    # Boolean support mask -> names of the kept columns
    selected_features = X.columns[selector.get_support()].tolist()

    return X_reduced, selected_features

# Pair the extracted feature matrix with the label column.
# NOTE(review): tsfresh documents extract_features as returning one row per
# id, while y holds one label per row of df — confirm that X and y have the
# same number of rows before fitting.
X = features
y = df['PD']

# Run the selection and report how many columns survived
X_reduced, selected_features = select_features(X, y)
n_original = X.shape[1]
n_kept = X_reduced.shape[1]
print(f"原始特徵數量: {n_original}")
print(f"選擇後特徵數量: {n_kept}")
print("\n選擇的特徵:")
for rank, feature_name in enumerate(selected_features, start=1):
    print(f"{rank}. {feature_name}")


In [None]:
# 保存處理後的數據
def save_processed_data(X_reduced, y, selected_features, output_path):
    """
    Save the selected features, the target and the feature-name list.

    Two files are written: the CSV at ``output_path`` (selected feature
    columns plus a ``PD`` column) and a text file next to it, named
    ``<output_path without extension>_features.txt``, with one feature name
    per line. The output directory is created if it does not exist.

    Parameters:
    X_reduced (ndarray): selected feature matrix, one row per sample
    y (Series): target variable, one entry per row of ``X_reduced``
    selected_features (list): names of the columns of ``X_reduced``
    output_path (str): path of the CSV file to write

    Raises:
    ValueError: if ``y`` and ``X_reduced`` have different lengths
    """
    # Build the frame holding the selected features
    processed_df = pd.DataFrame(X_reduced, columns=selected_features)

    # Attach the target by position. Assigning the Series directly would
    # align on index labels, and y usually carries the source frame's index
    # rather than 0..n-1, which silently yields NaN / shifted labels.
    target_values = np.asarray(y).ravel()
    if len(target_values) != len(processed_df):
        raise ValueError(
            f"y has {len(target_values)} entries but X_reduced has "
            f"{len(processed_df)} rows"
        )
    processed_df['PD'] = target_values

    # Make sure the destination directory exists before writing
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save to CSV
    processed_df.to_csv(output_path, index=False)
    print(f"數據已保存到: {output_path}")

    # Derive the companion file name from the extension only, so a path
    # without '.csv' can never resolve to the CSV file itself.
    path_root, _ = os.path.splitext(output_path)
    feature_path = path_root + '_features.txt'
    with open(feature_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{feature}\n" for feature in selected_features)
    print(f"特徵列表已保存到: {feature_path}")

# Write the selected features, the labels and the feature-name list to disk
output_path = "ta_train/AAP_1D_ta_features_processed.csv"
save_processed_data(
    X_reduced=X_reduced,
    y=y,
    selected_features=selected_features,
    output_path=output_path,
)


In [5]:
import os

import pandas as pd
from tsfresh import extract_features

# Read the CSV file
df = pd.read_csv("data_prep/ta_data/Consumer Discretionary/AAP/AAP_1D_ta.csv")

# tsfresh needs a column identifying each series; this file holds one stock
df['Stock_ID'] = 'AAP'

# Label: the DIRECTION value of the following row, stored as PD
df['PD'] = df['DIRECTION'].shift(-1)

# Drop the original DIRECTION column
df = df.drop(columns=['DIRECTION'])

# Parse the Date column into datetimes (it stays a regular column)
df['Date'] = pd.to_datetime(df['Date'])

# Drop columns that are not needed, such as PREDICT_TARGET if present
df = df.drop(columns=['PREDICT_TARGET'], errors='ignore')

# Drop rows that have no label
df = df.dropna(subset=['PD'])

# Extract time-series features. The label column is removed first so that no
# feature is computed from the target itself (label leakage).
extracted_features = extract_features(
    df.drop(columns=['PD']),
    column_id='Stock_ID',
    column_sort='Date',
)

# Target column
y = df['PD']

# Reset both indexes so the concat below pairs rows by position
y = y.reset_index(drop=True)
extracted_features = extracted_features.reset_index(drop=True)

# Combine features and target.
# NOTE(review): tsfresh documents extract_features as returning one row per
# id; with a single Stock_ID the feature frame may have far fewer rows than
# y, leaving NaN features on the unmatched rows — confirm. tsfresh's rolling
# utilities are the documented way to get one feature row per timestamp.
features_with_target = pd.concat([extracted_features, y], axis=1)

# Save as CSV in the ta_train folder, creating the folder if needed
os.makedirs("ta_train", exist_ok=True)
features_with_target.to_csv("ta_train/AAP_1D_ta_features.csv", index=False)


Feature Extraction: 100%|██████████| 26/26 [06:03<00:00, 13.97s/it]


In [2]:
import os

import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

# Read the CSV file
df = pd.read_csv("data_prep/ta_data/Consumer Discretionary/AAP/AAP_1D_ta.csv")

# tsfresh needs a column identifying each series; this file holds one stock
df['Stock_ID'] = 'AAP'

# Label: the DIRECTION value of the following row, stored as PD
df['PD'] = df['DIRECTION'].shift(-1)

# Drop the original DIRECTION column
df = df.drop(columns=['DIRECTION'])

# Parse the Date column into datetimes (it stays a regular column)
df['Date'] = pd.to_datetime(df['Date'])

# Drop columns that are not needed, such as PREDICT_TARGET if present
df = df.drop(columns=['PREDICT_TARGET'], errors='ignore')

# Drop rows that have no label
df = df.dropna(subset=['PD'])

# Extract features with MinimalFCParameters, which yields fewer features.
# The label column is removed first so that no feature is computed from the
# target itself (label leakage).
extracted_features = extract_features(
    df.drop(columns=['PD']),
    column_id='Stock_ID',
    column_sort='Date',
    default_fc_parameters=MinimalFCParameters(),
)

# Target column
y = df['PD']

# Reset both indexes so the concat below pairs rows by position
y = y.reset_index(drop=True)
extracted_features = extracted_features.reset_index(drop=True)

# Combine features and target.
# NOTE(review): tsfresh documents extract_features as returning one row per
# id; with a single Stock_ID the feature frame may have far fewer rows than
# y, leaving NaN features on the unmatched rows — confirm. tsfresh's rolling
# utilities are the documented way to get one feature row per timestamp.
features_with_target = pd.concat([extracted_features, y], axis=1)

# Save as CSV in the ta_train folder, creating the folder if needed
os.makedirs("ta_train", exist_ok=True)
features_with_target.to_csv("ta_train/AAP_1D_ta_features_mini.csv", index=False)


Unnamed: 0_level_0,Open,High,Low,Close,Volume,MA5,MA6,MA10,MA20,BIAS5,...,low_pct_change,VolPctChg,volume_direction,TREND,VIX,S&P 500,Dow Jones Industrial Average,NASDAQ Composite,Russell 2000,PD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2002-01-18,15.70,16.00,15.57,15.77,70500.0,15.35,15.21,14.67,14.98,2.74,...,-0.64,64.34,1,-1,22.52,1127.58,9771.85,1930.34,474.37,-1.0
2002-01-22,15.77,15.77,15.23,15.57,130800.0,15.61,15.39,14.83,15.05,-0.26,...,-2.18,85.53,1,-1,23.61,1119.31,9713.80,1882.53,469.43,0.0
2002-01-23,15.60,15.72,15.57,15.63,38700.0,15.67,15.61,15.02,15.08,-0.26,...,2.23,-70.41,-1,-1,21.88,1128.18,9730.96,1922.38,477.45,-1.0
2002-01-24,15.67,15.73,15.33,15.42,125100.0,15.62,15.62,15.19,15.09,-1.28,...,-1.54,223.26,1,1,21.15,1132.15,9796.07,1942.58,479.73,1.0
2002-01-25,15.47,15.77,15.32,15.63,88800.0,15.60,15.62,15.35,15.10,0.19,...,-0.07,-29.02,-1,1,21.01,1133.28,9840.08,1937.70,479.35,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-04,67.00,67.00,64.51,65.21,1264900.0,66.54,67.15,68.28,71.54,-2.00,...,-2.67,-23.56,-1,-1,13.16,5291.34,38711.29,16857.05,2033.94,0.0
2024-06-05,65.75,66.18,63.81,65.27,1579600.0,67.10,66.33,67.74,71.08,-2.73,...,-1.09,24.88,1,-1,12.63,5354.03,38807.33,17187.90,2063.87,-1.0
2024-06-06,65.68,65.87,64.13,64.56,1031300.0,66.62,66.68,67.17,70.56,-3.09,...,0.50,-34.71,-1,-1,12.58,5352.96,38886.17,17173.12,2049.44,0.0
2024-06-07,63.80,64.96,63.55,64.58,1071900.0,65.41,66.28,66.69,70.01,-1.27,...,-0.90,3.94,1,-1,12.22,5346.99,38798.99,17133.13,2026.55,-1.0


In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Load the feature table. A forward slash is used because the previous
# 'ta_train\AAP...' literal contained the invalid escape sequence '\A' and
# only resolved on Windows; '/' works on every platform and matches the path
# the extraction cell writes to.
data = pd.read_csv('ta_train/AAP_1D_ta_features.csv')
y = data['PD']
X = data.drop('PD', axis=1)


# Random forest as the base model; the fixed seed makes the selection
# reproducible between runs
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# SelectFromModel keeps the features the fitted forest ranks above its
# default importance threshold (prefit=True reuses the fitted forest)
selector = SelectFromModel(rf, prefit=True)
X_reduced = selector.transform(X)

# X_reduced now holds only the selected features
print(f"Original feature count: {X.shape[1]}, Reduced feature count: {X_reduced.shape[1]}")

KeyboardInterrupt: 