In [5]:
import pandas as pd
from tsfresh import extract_features

# 讀取CSV文件
df = pd.read_csv("data_prep/ta_data/Consumer Discretionary/AAP/AAP_1D_ta.csv")

# 假設 'Stock_ID' 是用於識別不同股票的列，你需要添加這個列
df['Stock_ID'] = 'AAP'

# 將 DIRECTION 列移動並重命名為 PD
df['PD'] = df['DIRECTION'].shift(-1)

# 去掉原 DIRECTION 列
df = df.drop(columns=['DIRECTION'])

# 將日期列轉換為日期時間索引
df['Date'] = pd.to_datetime(df['Date'])

# 去掉任何不需要的列如 PREDICT_TARGET
df = df.drop(columns=['PREDICT_TARGET'], errors='ignore')

# 刪除含有 NaN 值的行
df = df.dropna(subset=['PD'])

# 提取時間序列特徵，這次加入 'column_id' 和 'column_sort'
extracted_features = extract_features(df, column_id='Stock_ID', column_sort='Date')

# 提取目標列
y = df['PD']

# 重置索引
y = y.reset_index(drop=True)
extracted_features = extracted_features.reset_index(drop=True)

# 合併特徵和目標
features_with_target = pd.concat([extracted_features, y], axis=1)

# 保存為CSV在ta_train文件夾
features_with_target.to_csv("ta_train/AAP_1D_ta_features.csv", index=False)


Feature Extraction: 100%|██████████| 26/26 [06:03<00:00, 13.97s/it]


In [8]:
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

# 讀取CSV文件
df = pd.read_csv("data_prep/ta_data/Consumer Discretionary/AAP/AAP_1D_ta.csv")

# 假設 'Stock_ID' 是用於識別不同股票的列，你需要添加這個列
df['Stock_ID'] = 'AAP'

# 將 DIRECTION 列移動並重命名為 PD
df['PD'] = df['DIRECTION'].shift(-1)

# 去掉原 DIRECTION 列
df = df.drop(columns=['DIRECTION'])

# 將日期列轉換為日期時間索引
df['Date'] = pd.to_datetime(df['Date'])

# 去掉任何不需要的列如 PREDICT_TARGET
df = df.drop(columns=['PREDICT_TARGET'], errors='ignore')

# 刪除含有 NaN 值的行
df = df.dropna(subset=['PD'])

# 使用 MinimalFCParameters 提取時間序列特徵，這將生成較少的特徵
extracted_features = extract_features(df, column_id='Stock_ID', column_sort='Date', default_fc_parameters=MinimalFCParameters())

# 提取目標列
y = df['PD']

# 重置索引
y = y.reset_index(drop=True)
extracted_features = extracted_features.reset_index(drop=True)

# 合併特徵和目標
features_with_target = pd.concat([extracted_features, y], axis=1)

# 保存為CSV在ta_train文件夾
features_with_target.to_csv("ta_train/AAP_1D_ta_features_mini.csv", index=False)


Feature Extraction: 100%|██████████| 26/26 [00:04<00:00,  5.74it/s]


In [9]:
import pandas as pd

# 讀取CSV文件
df = pd.read_csv("data_prep/ta_data/Consumer Discretionary/AAP/AAP_1D_ta.csv")

# 假設 'Stock_ID' 是用於識別不同股票的列，你需要添加這個列
df['Stock_ID'] = 'AAP'

# 將 DIRECTION 列移動並重命名為 PD
df['PD'] = df['DIRECTION'].shift(-1)

# 去掉原 DIRECTION 列
df = df.drop(columns=['DIRECTION'])

# 將日期列轉換為日期時間索引
df['Date'] = pd.to_datetime(df['Date'])

# 去掉任何不需要的列如 PREDICT_TARGET
df = df.drop(columns=['PREDICT_TARGET'], errors='ignore')

# 刪除含有 NaN 值的行
df = df.dropna(subset=['PD'])

# 現在 df 已經處理好，可以直接用於模型訓練或其他分析，無需進一步的特徵提取
# 保存處理後的數據
df.to_csv("ta_train/AAP_1D_ta_cleaned.csv", index=False)


In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# 加載數據
data = pd.read_csv('ta_train\AAP_1D_ta_features.csv')
y = data['PD']
X = data.drop('PD', axis=1)


# 使用隨機森林作為基模型
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X, y)

# 使用 SelectFromModel 來選擇特徵
selector = SelectFromModel(rf, prefit=True)
X_reduced = selector.transform(X)

# 現在 X_reduced 包含的是被選擇的特徵
print(f"Original feature count: {X.shape[1]}, Reduced feature count: {X_reduced.shape[1]}")

KeyboardInterrupt: 