# In [None]:
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm
from xgboost import XGBRFClassifier

def load_and_preprocess(file_path):
    """Load the Excel dataset and build the model inputs and preprocessor.

    The only input feature is the ``Density`` column; the regression targets
    are every column of the sheet except the last two.

    Args:
        file_path: Path to the Excel workbook, forwarded to ``pd.read_excel``.

    Returns:
        A tuple ``(X, y_regression, preprocessor)``:

        * ``X`` -- one-column DataFrame holding the ``Density`` feature.
        * ``y_regression`` -- DataFrame with the target columns.
        * ``preprocessor`` -- an unfitted ``ColumnTransformer`` that
          median-imputes and then standardises the feature column(s).
    """
    data = pd.read_excel(file_path)

    feature_columns = ['Density']
    # NOTE(review): this presumes 'Density' is one of the two trailing
    # columns; otherwise it would also appear among the targets -- confirm
    # against the workbook layout.
    target_columns = data.columns[:-2]

    X = data[feature_columns]
    y_regression = data[target_columns]

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # ColumnTransformer entries are (name, transformer, columns); the column
    # selector is mandatory and names the columns of X to be transformed.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_pipeline, feature_columns),
        ]
    )

    return X, y_regression, preprocessor

def evaluate_regressor(model, X_test, y_test):
    """Score a fitted regressor on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix of the evaluation set.
        y_test: True target values matching ``X_test``.

    Returns:
        A dict with two entries:

        * ``'rmse'`` -- root mean squared error of the predictions.
        * ``'r2'`` -- coefficient of determination from ``r2_score``.
    """
    y_pred = model.predict(X_test)
    # RMSE is the square root of the mean *squared* error. Taking the root of
    # the mean absolute error yields a number that is neither MAE nor RMSE.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return {
        'rmse': rmse,
        'r2': r2_score(y_test, y_pred),
    }
def get_regressor_config(preprocessor):
    """Return the candidate regressors and their hyper-parameter grids.

    Args:
        preprocessor: Transformer used as the first (``'pre'``) step of every
            pipeline.

    Returns:
        A list of dicts, one per candidate model, each with the keys:

        * ``'name'`` -- display name of the model.
        * ``'model'`` -- ``Pipeline`` of ``preprocessor`` followed by the
          regressor (step name ``'reg'``).
        * ``'params'`` -- parameter grid keyed by pipeline parameter names.
          Models wrapped in ``MultiOutputRegressor`` are addressed through
          the ``reg__estimator__`` prefix.
    """
    return [
        {
            'name': 'XGBoost',
            'model': Pipeline([
                ('pre', preprocessor),
                ('reg', MultiOutputRegressor(xgb.XGBRegressor())),
            ]),
            'params': {
                'reg__estimator__n_estimators': [190, 200, 300],
                'reg__estimator__max_depth': [None, 4, 6, 8],
                'reg__estimator__learning_rate': [0.08, 0.1, 0.11],
                'reg__estimator__subsample': [0.8, 1.0],
                'reg__estimator__colsample_bytree': [0.8, 1.0],
            },
        },
        {
            'name': 'GradientBoosting',
            'model': Pipeline([
                ('pre', preprocessor),
                ('reg', MultiOutputRegressor(GradientBoostingRegressor())),
            ]),
            'params': {
                # Alternative grid centred on the defaults:
                # 'reg__estimator__n_estimators': [100, 190],  # default 100
                # 'reg__estimator__max_depth': [3],  # default 3
                # 'reg__estimator__learning_rate': [0.09, 0.1, 0.11],  # default 0.1
                # 'reg__estimator__subsample': [1],  # default 1.0
                'reg__estimator__n_estimators': [190, 200],
                'reg__estimator__max_depth': [4, 6, 8],
                'reg__estimator__learning_rate': [0.015, 0.018, 0.02, 0.1],
                'reg__estimator__subsample': [0.7, 0.8, 0.85],
                # Wider alternative grid:
                # 'reg__estimator__n_estimators': [150, 200, 250],
                # 'reg__estimator__max_depth': [4, 5, 6, 7, 8, 9],
                # 'reg__estimator__learning_rate': [0.005, 0.01, 0.05, 0.1],
                # 'reg__estimator__subsample': [0.7, 0.8, 0.9],
                # 'reg__estimator__min_samples_split': [2, 5, 10],
                # 'reg__estimator__min_samples_leaf': [1, 2, 4],
                # 'reg__estimator__max_features': ['sqrt', 'log2', None]
            },
        },
        {
            'name': 'RandomForest',
            'model': Pipeline([
                ('pre', preprocessor),
                ('reg', MultiOutputRegressor(RandomForestRegressor(n_jobs=-1))),
            ]),
            'params': {
                'reg__estimator__n_estimators': [300, 410, 500],
                'reg__estimator__max_depth': [None, 20, 30, 40],
                'reg__estimator__max_features': ['sqrt', 0.6, 0.8],
                'reg__estimator__min_samples_split': [3, 4, 7],
            },
        },
        {
            'name': 'SVR',
            'model': Pipeline([
                ('pre', preprocessor),
                ('reg', MultiOutputRegressor(SVR())),
            ]),
            'params': {
                'reg__estimator__C': [0.1, 1, 10],
                'reg__estimator__kernel': ['rbf', 'poly', 'sigmoid'],
                'reg__estimator__epsilon': [0.1, 0.2, 0.3],
                'reg__estimator__gamma': ['scale', 'auto'],
            },
        },
        {
            'name': 'MLP',
            # MLPRegressor is used directly (no MultiOutputRegressor wrapper),
            # so its grid keys use the plain ``reg__`` prefix.
            'model': Pipeline([
                ('pre', preprocessor),
                ('reg', MLPRegressor(early_stopping=True)),
            ]),
            'params': {
                'reg__hidden_layer_sizes': [(64, 32), (100, 50), (128, 64, 32)],
                'reg__alpha': [0.0001, 0.001, 0.01],
                'reg__learning_rate_init': [0.001, 0.0005, 0.0001],
                'reg__batch_size': [32, 64],
            },
        },
    ]
    

# Location of the Excel workbook with the training data.
file_path = r'./database/hist_data_with_density_noisy2.xlsx'

if __name__ == '__main__':
    # All disk access and training sits under the guard so that merely
    # importing this module has no side effects. The names stay at module
    # scope so they remain inspectable after the script has run.
    X, y_regression, preprocessor = load_and_preprocess(file_path)
    X_train, X_test, y_reg_train, y_reg_test = train_test_split(
        X, y_regression,
        test_size=0.2,
        random_state=42
    )
    regressor_config = get_regressor_config(preprocessor)
    regressor_results = {}

    print("=== Training Regressors ===")
    for config in tqdm(regressor_config, desc="Training Regressors"):
        # Exhaustive 5-fold grid search over this model's parameter grid.
        gs = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            scoring='neg_mean_squared_error',
            cv=5,
            n_jobs=-1,
            verbose=2
        )
        gs.fit(X_train, y_reg_train)

        # Persist the refitted best pipeline next to the script.
        joblib.dump(gs.best_estimator_, f"{config['name']}_regressor.pkl")

        test_metrics = evaluate_regressor(gs.best_estimator_, X_test, y_reg_test)
        regressor_results[config['name']] = {
            'best_params': gs.best_params_,
            # best_score_ is the mean cross-validated negative MSE of the best
            # parameter set, so this is a cross-validation RMSE rather than a
            # training-set RMSE. The key name is kept for compatibility.
            'train_rmse': np.sqrt(-gs.best_score_),
            'test_rmse': test_metrics['rmse'],
            'r2_score': test_metrics['r2']
        }

    print("\n=== Final Regressor Performance ===")
    print(f"{'Model':<20} | {'CV RMSE':<10} | {'Test RMSE':<10} | {'R² Score':<10}")
    for name, res in regressor_results.items():
        print(f"{name:<20} | {res['train_rmse']:<10.4f} | {res['test_rmse']:<10.4f} | {res['r2_score']:<10.4f}")

# Notebook output captured from an earlier run of this cell:
# ImportError: cannot import name 'labelEncoder' from 'sklearn.preprocessing' (D:\ProgramData\anaconda3\Lib\site-packages\sklearn\preprocessing\__init__.py)