In [41]:
import sys
sys.path.append('/home/s312657018/TBrain/code')

import pandas as pd
import glob
from itertools import combinations
from dataset import process_serial

from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

In [42]:
avgdata_paths = glob.glob('/home/s312657018/TBrain/示範程式/LSTM(比賽用)/ExampleTrainData(AVG)/AvgDATA_*.csv')
avgdata_dict = {}

for i in range(len(avgdata_paths)):
    df_num = avgdata_paths[i].split('.')[0][-2:]
    avgdata_dict[f'avgdata_{df_num}'] = pd.read_csv(avgdata_paths[i])

avgdata_dict = {key: process_serial(df, 'Serial') for key, df in avgdata_dict.items()}
data = avgdata_dict['avgdata_01']

## 使用 Lasso 回歸進行特徵選擇

In [44]:
X = data[['WindSpeed(m/s)', 'Pressure(hpa)', 'Temperature(°C)', 'Humidity(%)', 'Sunlight(Lux)', 'Station_ID']]
y = data['Power(mW)']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 使用 LassoCV 來自動選擇 alpha 值
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_scaled, y)

# 獲取特徵
selected_features = X.columns[lasso.coef_ != 0]
print("Selected features:", selected_features)
print("Feature coefficients:", lasso.coef_)

Selected features: Index(['WindSpeed(m/s)', 'Temperature(°C)', 'Sunlight(Lux)'], dtype='object')
Feature coefficients: [ 10.35816239   0.         -53.71293634  -0.         548.24315826
   0.        ]


##### 注意：雖然 `Huminity` 負相關性很高，代表可能是重要的特徵，但如果它與其他特徵有共線性，Lasso 可能會選擇刪除它來保持模型的簡潔性。
（例如 `Humunity_x_Sunlight`, `Huminity_x_Temperature`）

## 創建交互項

In [36]:
X_interactions = pd.DataFrame()

# 為選中的特徵創建兩兩交互項
for feature1, feature2 in combinations(selected_features, 2):
    interaction_term = X[feature1] * X[feature2]
    X_interactions[f"{feature1}_x_{feature2}"] = interaction_term

X_combined = pd.concat([X[selected_features], X_interactions], axis=1)
X_combined

Unnamed: 0,WindSpeed(m/s),Temperature(°C),Sunlight(Lux),WindSpeed(m/s)_x_Temperature(°C),WindSpeed(m/s)_x_Sunlight(Lux),Temperature(°C)_x_Sunlight(Lux)
0,1.92,18.33,8395.25,35.1936,16118.8800,153884.9325
1,0.09,18.02,7046.50,1.6218,634.1850,126977.9300
2,0.02,17.73,5919.58,0.3546,118.3916,104954.1534
3,0.02,17.90,8838.92,0.3580,176.7784,158216.6680
4,0.04,18.08,5774.67,0.7232,230.9868,104406.0336
...,...,...,...,...,...,...
5695,0.00,25.13,901.08,0.0000,0.0000,22644.1404
5696,0.00,25.10,758.08,0.0000,0.0000,19027.8080
5697,0.00,25.10,507.17,0.0000,0.0000,12729.9670
5698,0.00,25.10,370.00,0.0000,0.0000,9287.0000


## 再次使用 Lasso 選擇交互項

In [37]:
X_combined_scaled = scaler.fit_transform(X_combined)
lasso.fit(X_combined_scaled, y)

selected_combined_features = X_combined.columns[lasso.coef_ != 0]
print("Selected features and interactions:", selected_combined_features)

Selected features and interactions: Index(['WindSpeed(m/s)', 'Temperature(°C)', 'Sunlight(Lux)',
       'WindSpeed(m/s)_x_Sunlight(Lux)', 'Temperature(°C)_x_Sunlight(Lux)'],
      dtype='object')


In [38]:
df

Unnamed: 0,Temperature(°C),Sunlight(Lux),Hour,WindSpeed(m/s)_x_Sunlight(Lux),WindSpeed(m/s)_x_Hour,Temperature(°C)_x_Sunlight(Lux),Power(mW)
0,18.33,8395.25,9,16118.8800,17.28,153884.9325,17.37
1,18.02,7046.50,9,634.1850,0.81,126977.9300,12.61
2,17.73,5919.58,9,118.3916,0.18,104954.1534,8.81
3,17.90,8838.92,9,176.7784,0.18,158216.6680,17.56
4,18.08,5774.67,9,230.9868,0.36,104406.0336,7.51
...,...,...,...,...,...,...,...
5695,25.13,901.08,16,0.0000,0.00,22644.1404,0.15
5696,25.10,758.08,16,0.0000,0.00,19027.8080,0.11
5697,25.10,507.17,16,0.0000,0.00,12729.9670,0.05
5698,25.10,370.00,16,0.0000,0.00,9287.0000,0.03


> 最後選擇 `Temperature(°C)`, `Sunlight(Lux)`, `Hour`, `WindSpeed(m/s)_x_Sunlight(Lux)`, `Temperature(°C)_x_Sunlight(Lux)`, `WindSpeed(m/s)_x_Hour`

In [39]:
df = pd.concat([X_combined[selected_combined_features], data[['Month', 'Day', 'Hour', 'Minute']], y], axis=1)
df.to_csv('/home/s312657018/TBrain/data-preprocess/selected_features.csv', index=False)
print("Selected features saved to selected_features.csv")

Selected features saved to selected_features.csv


--------
## Regression Model Selection

In [8]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

import sys
sys.path.append('/home/s312657018/TBrain/code')
from dataset import process_serial
from Configs import Config

In [9]:
regressor_models = {
    'RandomForest': RandomForestRegressor(random_state=42, n_estimators=500),
    'XGBoost': XGBRegressor(random_state=42, n_estimators=500),
    'SVM': SVR(kernel='rbf', C=1.0, epsilon=0.1),
    'KNN': KNeighborsRegressor(n_neighbors=5),
    'LinearRegression': LinearRegression()
}

In [10]:
def evaluate_feature_completion(X_train, X_valid, features, target_columns, time_features):
    # 保存不同回歸模型的補全效果
    completion_results = {}

    # 逐個回歸模型測試
    for model_name, regressor in regressor_models.items():
        print(f"Testing feature completion with {model_name}...")

        # 存儲補全特徵的 DataFrame
        X_valid_pred = X_valid.copy()  
        
        # 使用 time_features 作為特徵來補全 target_columns
        train_features = X_train[time_features]
        valid_features = X_valid[time_features]
        
        errors = {}  # 存儲每個目標特徵的補全誤差
        
        # 僅補全原始目標特徵，不包括乘積特徵
        for target in original_target_columns:
            regressor.fit(train_features, X_train[target])
            X_valid_pred[target] = regressor.predict(valid_features)
            mse = mean_squared_error(X_valid[target], X_valid_pred[target])
            errors[target] = mse
            print(f"{model_name} - MSE for {target}: {mse:.4f}")

        # 計算乘積特徵的誤差（如果需要）
        # for product in product_columns:
        #     factors = product.split('_x_')
        #     X_valid_pred[product] = X_valid_pred[factors[0]] * X_valid_pred[factors[1]]
        #     mse = mean_squared_error(X_valid[product], X_valid_pred[product])
        #     errors[product] = mse
        #     print(f"{model_name} - MSE for {product}: {mse:.4f}")

        # 計算該模型的總體補全誤差
        avg_mse = np.mean(list(errors.values()))
        completion_results[model_name] = {'MSE': errors, 'Avg MSE': avg_mse}
        print(f"{model_name} - Average MSE across all features: {avg_mse:.4f}")
    
    return completion_results



In [None]:
data = pd.read_csv('/home/s312657018/TBrain/data-preprocess/AvgDATA.csv')
# X = data[['WindSpeed(m/s)', 'Pressure(hpa)', 'Temperature(°C)', 'Humidity(%)', 'Sunlight(Lux)', 'Month', 'Day', 'Hour', 'Minute', 'Station_ID']]

time_features = ['Month', 'Day', 'Hour', 'Minute', 'Station_ID']
features = ['WindSpeed(m/s)', 'Temperature(°C)', 'Sunlight(Lux)',
       'WindSpeed(m/s)_x_Sunlight(Lux)', 'Temperature(°C)_x_Sunlight(Lux)',
       'Month', 'Day', 'Hour', 'Minute', 'Station_ID']
target_columns = [col for col in features if col not in time_features]  # 需要補全的特徵

X = data[features]
y = data['Power(mW)']

original_target_columns = ['WindSpeed(m/s)', 'Temperature(°C)', 'Sunlight(Lux)', 'WindSpeed(m/s)_x_Sunlight(Lux)', 'Temperature(°C)_x_Sunlight(Lux)']

# 評估特徵補全效果
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=Config.seed)
# 對原始目標特徵進行標準化
scaler = StandardScaler()
X_train[original_target_columns] = scaler.fit_transform(X_train[original_target_columns])
X_valid[original_target_columns] = scaler.transform(X_valid[original_target_columns])

completion_results = evaluate_feature_completion(X_train, X_valid, features, target_columns, time_features)


Testing feature completion with RandomForest...
RandomForest - MSE for WindSpeed(m/s): 0.1201
RandomForest - MSE for Temperature(°C): 0.0172
RandomForest - MSE for Sunlight(Lux): 0.0886
RandomForest - MSE for WindSpeed(m/s)_x_Sunlight(Lux): 0.1237
RandomForest - MSE for Temperature(°C)_x_Sunlight(Lux): 0.0824
RandomForest - Average MSE across all features: 0.0864
Testing feature completion with XGBoost...
XGBoost - MSE for WindSpeed(m/s): 0.1402
XGBoost - MSE for Temperature(°C): 0.0201
XGBoost - MSE for Sunlight(Lux): 0.1025
XGBoost - MSE for WindSpeed(m/s)_x_Sunlight(Lux): 0.1363
XGBoost - MSE for Temperature(°C)_x_Sunlight(Lux): 0.0911
XGBoost - Average MSE across all features: 0.0980
Testing feature completion with SVM...
SVM - MSE for WindSpeed(m/s): 0.9296
SVM - MSE for Temperature(°C): 0.4663
SVM - MSE for Sunlight(Lux): 0.7126
SVM - MSE for WindSpeed(m/s)_x_Sunlight(Lux): 1.1061
SVM - MSE for Temperature(°C)_x_Sunlight(Lux): 0.7124
SVM - Average MSE across all features: 0.7854


-----
#### Grid Search

In [16]:
regressor_models = {
    'RandomForest1': RandomForestRegressor(random_state=42, n_estimators=300),
    'RandomForest2': RandomForestRegressor(random_state=42, n_estimators=100),
    'RandomForest3': RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10),
    'RandomForest4': RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10, min_samples_leaf=5),
}

In [17]:
completion_results = evaluate_feature_completion(X_train, X_valid, features, target_columns, time_features)


Testing feature completion with RandomForest1...
RandomForest1 - MSE for WindSpeed(m/s): 0.1096
RandomForest1 - MSE for Temperature(°C): 0.0192
RandomForest1 - MSE for Sunlight(Lux): 0.0792
RandomForest1 - MSE for WindSpeed(m/s)_x_Sunlight(Lux): 0.1332
RandomForest1 - MSE for Temperature(°C)_x_Sunlight(Lux): 0.0695
RandomForest1 - Average MSE across all features: 0.0821
Testing feature completion with RandomForest2...
RandomForest2 - MSE for WindSpeed(m/s): 0.1104
RandomForest2 - MSE for Temperature(°C): 0.0196
RandomForest2 - MSE for Sunlight(Lux): 0.0799
RandomForest2 - MSE for WindSpeed(m/s)_x_Sunlight(Lux): 0.1329
RandomForest2 - MSE for Temperature(°C)_x_Sunlight(Lux): 0.0701
RandomForest2 - Average MSE across all features: 0.0826
Testing feature completion with RandomForest3...
RandomForest3 - MSE for WindSpeed(m/s): 0.2737
RandomForest3 - MSE for Temperature(°C): 0.1767
RandomForest3 - MSE for Sunlight(Lux): 0.3615
RandomForest3 - MSE for WindSpeed(m/s)_x_Sunlight(Lux): 0.3893
R

In [19]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import numpy as np
import pandas as pd

# 1. 定义回归模型
regressor_models = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
}

def evaluate_feature_completion(X_train, X_valid, features, target_columns, time_features):
    # 保存不同回归模型的补全效果
    completion_results = {}

    # 逐个回归模型测试
    for model_name, regressor in regressor_models.items():
        print(f"Testing feature completion with {model_name}...")

        # 存储补全特征的 DataFrame
        X_valid_pred = X_valid.copy()
        
        # 使用 time_features 作为特征来补全 target_columns
        train_features = X_train[time_features]
        valid_features = X_valid[time_features]
        
        errors = {}  # 存储每个目标特征的补全误差
        
        # 仅补全原始目标特征，不包括乘积特征
        for target in original_target_columns:
            regressor.fit(train_features, X_train[target])
            X_valid_pred[target] = regressor.predict(valid_features)
            mse = mean_squared_error(X_valid[target], X_valid_pred[target])
            errors[target] = mse
            print(f"{model_name} - MSE for {target}: {mse:.4f}")

        # 计算乘积特征的误差（如果需要）
        for product in product_columns:
            factors = product.split('_x_')
            X_valid_pred[product] = X_valid_pred[factors[0]] * X_valid_pred[factors[1]]
            mse = mean_squared_error(X_valid[product], X_valid_pred[product])
            errors[product] = mse
            print(f"{model_name} - MSE for {product}: {mse:.4f}")

        # 计算该模型的总体补全误差
        avg_mse = np.mean(list(errors.values()))
        completion_results[model_name] = {'MSE': errors, 'Avg MSE': avg_mse}
        print(f"{model_name} - Average MSE across all features: {avg_mse:.4f}")
    
    return completion_results

# 2. 数据加载和预处理
data = pd.read_csv('/home/s312657018/TBrain/data-preprocess/AvgDATA.csv')
time_features = ['Month', 'Day', 'Hour', 'Minute', 'Station_ID']
features = ['WindSpeed(m/s)', 'Temperature(°C)', 'Sunlight(Lux)',
            'WindSpeed(m/s)_x_Sunlight(Lux)', 'Temperature(°C)_x_Sunlight(Lux)',
            'Month', 'Day', 'Hour', 'Minute', 'Station_ID']
target_columns = [col for col in features if col not in time_features]
original_target_columns = ['WindSpeed(m/s)', 'Temperature(°C)', 'Sunlight(Lux)']
product_columns = ['WindSpeed(m/s)_x_Sunlight(Lux)', 'Temperature(°C)_x_Sunlight(Lux)']

# 3. 分割数据集
X = data[features]
y = data['Power(mW)']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# 4. 标准化
scaler = StandardScaler()
X_train[original_target_columns] = scaler.fit_transform(X_train[original_target_columns])
X_valid[original_target_columns] = scaler.transform(X_valid[original_target_columns])

# 5. 评估特征补全效果
completion_results = evaluate_feature_completion(X_train, X_valid, features, target_columns, time_features)

# 输出结果
print("Feature completion results:\n", completion_results)


Testing feature completion with RandomForest...
RandomForest - MSE for WindSpeed(m/s): 0.1004
RandomForest - MSE for Temperature(°C): 0.0206
RandomForest - MSE for Sunlight(Lux): 0.0834
RandomForest - MSE for WindSpeed(m/s)_x_Sunlight(Lux): 2109389575.6678
RandomForest - MSE for Temperature(°C)_x_Sunlight(Lux): 3464337008088.7280
RandomForest - Average MSE across all features: 693289279532.9200
Feature completion results:
 {'RandomForest': {'MSE': {'WindSpeed(m/s)': np.float64(0.10040393898269438), 'Temperature(°C)': np.float64(0.020609487053280445), 'Sunlight(Lux)': np.float64(0.08337994321345758), 'WindSpeed(m/s)_x_Sunlight(Lux)': np.float64(2109389575.6677878), 'Temperature(°C)_x_Sunlight(Lux)': np.float64(3464337008088.728)}, 'Avg MSE': np.float64(693289279532.92)}}
