In [None]:
import numpy as np
import pandas as pd
import optuna
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from xgboost import XGBRegressor as XGBR
from interpret.glassbox import ExplainableBoostingRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible.
# NOTE(review): train_test_split, x0 and y are not defined in this cell --
# presumably they come from an earlier cell; confirm.
X_train_raw, X_test_raw, Ytrain, Ytest = train_test_split(
    x0, y, test_size=0.2, random_state=167
)

# Column typing is derived from the training split only.
# 'number' matches every numeric dtype (int32, float32, uint8, ...), not just
# int64/float64. A column matched by neither list below is silently discarded
# by the ColumnTransformer (remainder='drop'), so a too-narrow dtype filter
# loses features without any warning.
# timedelta columns also satisfy 'number' in pandas, hence the explicit
# exclude so they keep being left out as before.
numeric_features = X_train_raw.select_dtypes(
    include=['number'], exclude=['timedelta']
).columns.tolist()
categorical_features = X_train_raw.select_dtypes(
    include=['object', 'category', 'bool']
).columns.tolist()

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become their own '__MISSING__' level,
# then the column is one-hot encoded into a dense float64 matrix.
# The one-hot columns are deliberately NOT standardized.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='__MISSING__')),
    ('onehot', OneHotEncoder(
        handle_unknown='ignore',
        sparse_output=False,
        dtype=np.float64,
    )),
])

# Route each column group through its own branch; columns in neither group
# are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='drop',
    # Important: disable the automatic "<transformer>__" prefix on output names.
    verbose_feature_names_out=False,
)


# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the test split.
X_train_processed = preprocessor.fit_transform(X_train_raw)
X_test_processed = preprocessor.transform(X_test_raw)

# Output column names are only available once the transformer is fitted.
feature_names = preprocessor.get_feature_names_out()

# Wrap the arrays back into DataFrames, restoring the original row indices
# and attaching the generated feature names.
Xtrain = pd.DataFrame(
    data=X_train_processed,
    index=X_train_raw.index,
    columns=feature_names,
)
Xtest = pd.DataFrame(
    data=X_test_processed,
    index=X_test_raw.index,
    columns=feature_names,
)


# Shared 5-fold shuffled splitter used by every model below.
kf = KFold(n_splits=5, random_state=1, shuffle=True)

# RF
def objective_rf(trial):
    """Optuna objective: mean cross-validated score of a random forest.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-fold scores returned by ``cross_val_score`` over the
        ``kf`` splits of the training data (the estimator's default scorer).
    """
    # NOTE(review): random_state is searched like a hyperparameter; the
    # parameter name is kept so study.best_trial.params stays unchanged.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 200),
        'max_depth': trial.suggest_int('max_depth', 1, 200),
        'max_features': trial.suggest_int('max_features', 1, Xtrain.shape[1]),
        'random_state': trial.suggest_int('random_state', 0, 200),
    }
    model = RandomForestRegressor(n_jobs=-1, **params)

    # np.ravel accepts both a Series and a single-column DataFrame;
    # Series.ravel() is deprecated and DataFrame has no .ravel() at all.
    y_flat = np.ravel(Ytrain)
    return cross_val_score(model, Xtrain, y_flat, cv=kf).mean()

# Seed the sampler so the sequence of sampled hyperparameters is reproducible
# across runs, in line with the SVR study in this file.
study_rf = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective_rf, n_trials=150)
print("RF Best trial:", study_rf.best_trial.params)

# EBM
def objective_ebm(trial):
    """Optuna objective: mean cross-validated score of an EBM regressor.

    Only the model seed is sampled; every other EBM setting keeps its default.

    Args:
        trial: Optuna trial used to sample ``random_state``.

    Returns:
        Mean of the per-fold scores returned by ``cross_val_score`` over the
        ``kf`` splits of the training data (the estimator's default scorer).
    """
    seed = trial.suggest_int('random_state', 1, 10)
    model = ExplainableBoostingRegressor(random_state=seed)

    # np.ravel accepts both a Series and a single-column DataFrame;
    # Series.ravel() is deprecated and DataFrame has no .ravel() at all.
    y_flat = np.ravel(Ytrain)
    return cross_val_score(model, Xtrain, y_flat, cv=kf).mean()

# Seed the sampler so the sequence of sampled seeds is reproducible across
# runs, in line with the SVR study in this file.
# NOTE(review): the search space holds 10 values but only 9 trials run, and
# the sampler may draw the same value twice; a grid over the seeds would
# cover each exactly once -- confirm whether that is the intent.
study_ebm = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_ebm.optimize(objective_ebm, n_trials=9)
print("EBM Best trial:", study_ebm.best_trial.params)

# GBDT (gradient-boosted decision trees; identifiers below keep the "gdbt" spelling)
def objective_gdbt(trial):
    """Optuna objective: mean cross-validated score of gradient boosting.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-fold scores returned by ``cross_val_score`` over the
        ``kf`` splits of the training data (the estimator's default scorer).
    """
    # NOTE(review): random_state is searched like a hyperparameter; the
    # parameter name is kept so study.best_trial.params stays unchanged.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 200),
        'max_depth': trial.suggest_int('max_depth', 1, 200),
        'random_state': trial.suggest_int('random_state', 0, 200),
    }
    model = GradientBoostingRegressor(**params)

    # np.ravel accepts both a Series and a single-column DataFrame;
    # Series.ravel() is deprecated and DataFrame has no .ravel() at all.
    y_flat = np.ravel(Ytrain)
    return cross_val_score(model, Xtrain, y_flat, cv=kf).mean()

# Seed the sampler so the sequence of sampled hyperparameters is reproducible
# across runs, in line with the SVR study in this file.
study_gdbt = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_gdbt.optimize(objective_gdbt, n_trials=150)
print("GDBT Best trial:", study_gdbt.best_trial.params)

# XGB
def objective_xgb(trial):
    """Optuna objective: mean cross-validated score of an XGBoost regressor.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-fold scores returned by ``cross_val_score`` over the
        ``kf`` splits of the training data (the estimator's default scorer).
    """
    # NOTE(review): random_state is searched like a hyperparameter; the
    # parameter name is kept so study.best_trial.params stays unchanged.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 200),
        'max_depth': trial.suggest_int('max_depth', 1, 200),
        'random_state': trial.suggest_int('random_state', 0, 200),
    }
    model = XGBR(**params)

    # np.ravel accepts both a Series and a single-column DataFrame;
    # Series.ravel() is deprecated and DataFrame has no .ravel() at all.
    y_flat = np.ravel(Ytrain)
    return cross_val_score(model, Xtrain, y_flat, cv=kf).mean()

# Seed the sampler so the sequence of sampled hyperparameters is reproducible
# across runs, in line with the SVR study in this file.
study_xgb = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xgb, n_trials=150)
print("XGB Best trial:", study_xgb.best_trial.params)

# ANN
# One network is trained per CV fold; the fold models are kept so their test
# predictions can be averaged into an ensemble afterwards.
models_ann = []
# Out-of-fold targets and predictions, accumulated across all folds.
Ytrain_actual, Ytrain_pred = [], []

for fold, (train_index, val_index) in enumerate(kf.split(Xtrain)):
    X_tr, X_va = Xtrain.iloc[train_index], Xtrain.iloc[val_index]
    y_tr, y_va = Ytrain.iloc[train_index], Ytrain.iloc[val_index]

    # Declare the input with an explicit Input object: passing `input_shape`
    # to the first Dense layer is deprecated in recent Keras releases.
    model = Sequential([
        tf.keras.Input(shape=(Xtrain.shape[1],)),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Stop once validation loss has not improved for 20 epochs and roll back
    # to the best weights seen.
    # NOTE(review): the same fold drives early stopping and supplies the
    # out-of-fold predictions below, so metrics computed from
    # Ytrain_actual / Ytrain_pred will be optimistic.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=20, restore_best_weights=True
    )
    model.fit(
        X_tr, y_tr,
        validation_data=(X_va, y_va),
        epochs=550,
        batch_size=32,
        verbose=0,
        callbacks=[early_stopping],
    )

    y_va_pred = model.predict(X_va).flatten()
    Ytrain_actual.extend(y_va.values.flatten())
    Ytrain_pred.extend(y_va_pred)
    models_ann.append(model)

# Ensemble: average the test-set predictions of the per-fold models.
Ytest_preds = [m.predict(Xtest).flatten() for m in models_ann]
Ytest_pred_ensemble = np.mean(Ytest_preds, axis=0)
print("ANN Ensemble Test R2:", r2_score(Ytest.values.flatten(), Ytest_pred_ensemble))

# SVR
def objective_svr(trial):
    """Optuna objective: mean cross-validated score of an SVR.

    ``degree`` is sampled only for the polynomial kernel and ``coef0`` only
    for the polynomial and sigmoid kernels; otherwise fixed values (3 and
    0.0) are passed to the estimator.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean of the per-fold scores returned by ``cross_val_score`` over the
        ``kf`` splits of the training data (the estimator's default scorer).
    """
    kernel = trial.suggest_categorical('kernel', ["rbf", "poly", "sigmoid"])
    C = trial.suggest_float('C', 0.1, 20, log=True)

    # Upper bound for gamma shrinks as the feature count grows, clamped to
    # the interval [0.01, 1].
    n_features = Xtrain.shape[1]
    gamma_upper = max(0.01, min(1, 10 / n_features))
    gamma = trial.suggest_float('gamma', 1e-5, gamma_upper, log=True)

    degree = trial.suggest_int('degree', 2, 5) if kernel == 'poly' else 3
    coef0 = trial.suggest_float('coef0', 0.0, 1.0) if kernel in ['poly', 'sigmoid'] else 0.0

    model = SVR(kernel=kernel, C=C, gamma=gamma, degree=degree, coef0=coef0)

    # np.ravel accepts both a Series and a single-column DataFrame;
    # Series.ravel() is deprecated and DataFrame has no .ravel() at all.
    y_flat = np.ravel(Ytrain)
    return cross_val_score(model, Xtrain, y_flat, cv=kf).mean()

# Run the SVR search with a seeded TPE sampler and report the best
# hyperparameters found.
study_svr = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=42),
    direction="maximize",
)
study_svr.optimize(objective_svr, n_trials=150)
print("SVR Best trial:", study_svr.best_trial.params)