In [1]:
from sklearn.preprocessing import StandardScaler
from function import metrics_to_dataframe, plot_actual_vs_predicted
import pandas as pd
from function import split_data
import optuna
from sklearn.model_selection import cross_val_score

# Load the reduced dataset and split it into train/test parts with 'Cs' as the target.
# split_data is a project helper; its split ratio and seed are not visible here.
data = pd.read_csv("../data/dataset_reduced.csv")
X_train, X_test, y_train, y_test = split_data(data, 'Cs')

# Standardize the features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics leak into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [2]:
# Linear Regression baseline, fitted on the unscaled features.
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train, y_train)
y_train_pred_lr, y_test_pred_lr = map(lr.predict, (X_train, X_test))

# metrics_to_dataframe is a project helper; the saved output shows one row of
# R2 / MAE / MAPE / RMSE values for the train and test splits.
lr_metrics = metrics_to_dataframe(
    y_train, y_train_pred_lr, y_test, y_test_pred_lr, 'Linear Regression'
)

# Keep actual-vs-predicted pairs per split and write them next to the notebook.
lr_train, lr_test = (
    pd.DataFrame({'Actual': actual, 'Predicted': predicted})
    for actual, predicted in ((y_train, y_train_pred_lr), (y_test, y_test_pred_lr))
)
lr_train.to_csv('lr_train.csv', index=False)
lr_test.to_csv('lr_test.csv', index=False)
lr_metrics

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,Linear Regression,0.421558,22.900748,34.682054,30.511129,0.547166,19.912334,37.042903,27.922848


In [3]:
# Support Vector Regression, fitted on the standardized features.
from sklearn.svm import SVR

# Hyperparameters for the polynomial-kernel SVR.
# verbose=True makes the fit emit LibSVM log output.
svr_params = dict(
    kernel='poly',
    degree=6,
    gamma='scale',
    coef0=3,
    epsilon=10,
    verbose=True,
    C=0.5,
)

svr = SVR(**svr_params).fit(X_train_scaled, y_train)
y_train_pred_svr, y_test_pred_svr = map(svr.predict, (X_train_scaled, X_test_scaled))
svr_metrics = metrics_to_dataframe(
    y_train, y_train_pred_svr, y_test, y_test_pred_svr, 'Support Vector Regression'
)

# Keep actual-vs-predicted pairs per split and write them next to the notebook.
svr_train, svr_test = (
    pd.DataFrame({'Actual': actual, 'Predicted': predicted})
    for actual, predicted in ((y_train, y_train_pred_svr), (y_test, y_test_pred_svr))
)
svr_train.to_csv('svr_train.csv', index=False)
svr_test.to_csv('svr_test.csv', index=False)
svr_metrics

[LibSVM]

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,Support Vector Regression,0.852376,10.862432,14.656757,15.413714,0.868775,11.107893,17.472105,15.031371


In [4]:
# Random Forest, fitted on the unscaled features.
from sklearn.ensemble import RandomForestRegressor

# Forest hyperparameters; random_state pins the bootstrap/feature sampling.
rf_params = dict(
    n_estimators=50,
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=0.5,
    random_state=21,
)

rf = RandomForestRegressor(**rf_params).fit(X_train, y_train)
y_train_pred_rf, y_test_pred_rf = map(rf.predict, (X_train, X_test))
rf_metrics = metrics_to_dataframe(
    y_train, y_train_pred_rf, y_test, y_test_pred_rf, 'Random Forest'
)

# Keep actual-vs-predicted pairs per split and write them next to the notebook.
rf_train, rf_test = (
    pd.DataFrame({'Actual': actual, 'Predicted': predicted})
    for actual, predicted in ((y_train, y_train_pred_rf), (y_test, y_test_pred_rf))
)
rf_train.to_csv('rf_train.csv', index=False)
rf_test.to_csv('rf_test.csv', index=False)
rf_metrics

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,Random Forest,0.987623,2.984936,4.127428,4.463032,0.947415,6.913925,10.246498,9.515234


In [5]:
# XGBoost regressor, fitted on the unscaled features.
from xgboost import XGBRegressor

# Booster hyperparameters; random_state pins the row/column subsampling.
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.17,
    max_depth=8,
    min_child_weight=5,
    subsample=0.5,
    gamma=0.05,
    reg_alpha=0.8,
    reg_lambda=5,
    colsample_bytree=0.6,
    colsample_bylevel=1,
    colsample_bynode=1,
    random_state=21,
)

xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)
y_train_pred_xgb, y_test_pred_xgb = map(xgb.predict, (X_train, X_test))
xgb_metrics = metrics_to_dataframe(
    y_train, y_train_pred_xgb, y_test, y_test_pred_xgb, 'XGBoost'
)

# Keep actual-vs-predicted pairs per split and write them next to the notebook.
xgb_train, xgb_test = (
    pd.DataFrame({'Actual': actual, 'Predicted': predicted})
    for actual, predicted in ((y_train, y_train_pred_xgb), (y_test, y_test_pred_xgb))
)
xgb_train.to_csv('xgb_train.csv', index=False)
xgb_test.to_csv('xgb_test.csv', index=False)
xgb_metrics

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,XGBoost,0.996976,1.43701,1.938355,2.205923,0.982383,4.06983,6.243979,5.507518


In [6]:
# LightGBM regressor, fitted on the unscaled features.
from lightgbm import LGBMRegressor

# Booster hyperparameters; verbose=-1 is passed to quiet LightGBM's logging and
# random_state pins the column subsampling.
lgbm_params = dict(
    n_estimators=300,
    learning_rate=0.18,
    max_depth=5,
    min_child_samples=2,
    colsample_bytree=0.3,
    num_leaves=31,
    reg_alpha=0.6,
    reg_lambda=0,
    verbose=-1,
    random_state=21,
)

lgbm = LGBMRegressor(**lgbm_params)
lgbm.fit(X_train, y_train)
y_train_pred_lgbm, y_test_pred_lgbm = map(lgbm.predict, (X_train, X_test))
lgbm_metrics = metrics_to_dataframe(
    y_train, y_train_pred_lgbm, y_test, y_test_pred_lgbm, 'LightGBM'
)

# Keep actual-vs-predicted pairs per split and write them next to the notebook.
lgbm_train, lgbm_test = (
    pd.DataFrame({'Actual': actual, 'Predicted': predicted})
    for actual, predicted in ((y_train, y_train_pred_lgbm), (y_test, y_test_pred_lgbm))
)
lgbm_train.to_csv('lgbm_train.csv', index=False)
lgbm_test.to_csv('lgbm_test.csv', index=False)
lgbm_metrics

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,LightGBM,0.99856,1.117106,1.498665,1.52208,0.986155,3.618184,5.91515,4.882483


In [7]:
# Stack the one-row metric tables of all five models into a single report.
# ignore_index=True gives each model its own row label; without it every row
# keeps label 0 from its source frame, which makes label-based selection
# ambiguous and the displayed table misleading.
metrics = pd.concat(
    [lr_metrics, svr_metrics, rf_metrics, xgb_metrics, lgbm_metrics],
    ignore_index=True,
)
metrics_rounded = metrics.round(3)

# Order from worst to best test R2 and renumber the rows to match that order.
metrics_rounded_sorted = metrics_rounded.sort_values(
    by='R2_test', ascending=True, ignore_index=True
)

# The CSV is written without the index, so the file content is unaffected by
# the renumbering above. The 'output/' directory must already exist.
metrics_rounded_sorted.to_csv('output/report_models.csv', index=False)
metrics_rounded_sorted

Unnamed: 0,model,R2_train,MAE_train,MAPE_train,RMSE_train,R2_test,MAE_test,MAPE_test,RMSE_test
0,Linear Regression,0.422,22.901,34.682,30.511,0.547,19.912,37.043,27.923
0,Support Vector Regression,0.852,10.862,14.657,15.414,0.869,11.108,17.472,15.031
0,Random Forest,0.988,2.985,4.127,4.463,0.947,6.914,10.246,9.515
0,XGBoost,0.997,1.437,1.938,2.206,0.982,4.07,6.244,5.508
0,LightGBM,0.999,1.117,1.499,1.522,0.986,3.618,5.915,4.882


In [13]:
# Predict Cs for the experimentally measured samples with the fitted LightGBM model.
# The 'Name' column is dropped right after loading; everything except the
# target 'Cs' is used as model input.
data_actual = pd.read_excel('../data/Experimental actual.xlsx').drop(columns='Name')
y_actual = data_actual['Cs']
X_actual = data_actual.drop(columns='Cs')

# Standardized copy of the inputs; not used by the LightGBM prediction below.
X_actual_scaled = scaler.transform(X_actual)

# NOTE(review): the saved output shows Cs_pred well above the measured Cs for
# the displayed rows -- confirm that the column order and units of this sheet
# match the training data.
y_actual_pred = lgbm.predict(X_actual).round(1)

# Append the predictions as 'Cs_pred' on a copy, leaving data_actual unchanged.
data_actual_pred = data_actual.assign(Cs_pred=y_actual_pred)
data_actual_pred.to_excel('output/data_actual_pred.xlsx', index=False)
data_actual_pred

Unnamed: 0,O,N,SSA,PV,RMIC,Dap,ID/IG,Anion,CD,Cs,Cs_pred
0,5.31,2.55,1318.97,0.697,50.5,2.11,1.87,0,0.1,50.2,139.7
1,5.31,2.55,1318.97,0.697,50.5,2.11,1.87,0,0.2,45.9,115.6
2,5.31,2.55,1318.97,0.697,50.5,2.11,1.87,0,0.3,43.2,113.7
3,5.31,2.55,1318.97,0.697,50.5,2.11,1.87,0,0.5,39.3,110.1
4,5.31,2.55,1318.97,0.697,50.5,2.11,1.87,0,0.8,35.3,103.2
5,5.31,2.55,1318.97,0.697,50.5,2.11,1.87,0,1.0,33.5,103.2
6,5.31,2.55,1318.97,0.697,50.5,2.11,1.87,0,2.0,27.3,94.7
7,5.31,2.55,1318.97,0.697,50.5,2.11,1.87,0,3.0,23.9,89.0
8,4.81,10.56,885.83,0.607,32.29,2.74,2.23,0,0.1,68.0,155.1
9,4.81,10.56,885.83,0.607,32.29,2.74,2.23,0,0.2,60.6,143.0
