# 特征选择

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from tabulate import tabulate

# 加载数据集
data = pd.read_csv('../data/raw/cleaned_data_after_mvp.csv')
data = data.drop('Index', axis=1)
data_encoded = pd.get_dummies(data, columns=['Electrolyte', 'Current collector'])

data_encoded['target_class'] = pd.qcut(data_encoded['target'], q=10, labels=False)

X = data_encoded.drop(['target', 'target_class'], axis=1)
y = data_encoded['target']
stratify_column = data_encoded['target_class']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 拆分训练和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=stratify_column)

## Lasso回归

In [13]:
# Lasso回归
lasso = LassoCV(cv=10, random_state=21).fit(X_train, y_train)
lasso_importance = np.abs(lasso.coef_)

## 梯度提升机特征重要性

In [14]:
gbm = GradientBoostingRegressor(n_estimators=100, random_state=21).fit(X_train, y_train)
gbm_importance = gbm.feature_importances_

## RFE递归特征消除

In [15]:
# RFE
rfe = RFECV(estimator=GradientBoostingRegressor(n_estimators=100, random_state=21), step=1, cv=10)
rfe.fit(X_train, y_train)
rfe_importance = rfe.ranking_

In [16]:
# 创建特征重要性 DataFrame
features = X.columns
results_df = pd.DataFrame({
    'Feature': features,
    'GBM Importance': gbm_importance,
    'RFE Ranking': rfe_importance,
    'Lasso Importance': lasso_importance
})

# 排序特征
results_df = results_df.sort_values(by='GBM Importance', ascending=False).reset_index(drop=True)

## 打印结果

In [17]:
# 打印结果
print("Feature Importance:")
print(tabulate(results_df, headers='keys', tablefmt='pipe', showindex=False))

# 保存为 Markdown 文件
markdown_content = "### Feature Importance\n" + tabulate(results_df, headers='keys', tablefmt='pipe', showindex=False)
with open('../reports/feature_engineering/feature_selection.md', 'w') as md_file:
    md_file.write(markdown_content)

Feature Importance:
| Feature                                |   GBM Importance |   RFE Ranking |   Lasso Importance |
|:---------------------------------------|-----------------:|--------------:|-------------------:|
| Current density                        |      0.268715    |             1 |          0.544614  |
| Specific surface area                  |      0.240809    |             1 |          0.0204549 |
| N                                      |      0.116365    |             1 |          2.35257   |
| Active mass loading                    |      0.102371    |             1 |          0         |
| Pore volume                            |      0.0818256   |             1 |          0         |
| O                                      |      0.0770964   |             1 |          0         |
| Rmic/mes                               |      0.0654372   |             1 |          0         |
| ID/IG                                  |      0.0276997   |             1 |          0 