In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
import warnings
# NOTE(review): this silences every warning category for the rest of the
# interpreter session, including numpy runtime warnings such as division by
# zero. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
def load_and_preprocess_data(file_path):
    """Read the train/test sheets of an Excel workbook and build model inputs.

    The workbook is read twice, once per sheet ('训练集' for training and
    '测试集' for testing). Columns are selected purely by position: per the
    original author's layout notes, the first 30 columns are numeric features,
    the 31st is the drug type and the 32nd is the regression target.

    The drug type is label-encoded with an encoder fitted on the training
    sheet only; the same encoder is then applied to the test sheet.

    Args:
        file_path: Path of the Excel workbook, passed to ``pd.read_excel``.

    Returns:
        A 6-tuple ``(X_train, X_test, y_train, y_test, feature_names, classes)``:
        the stacked feature matrices (30 numeric columns plus the encoded
        drug column), the target arrays, the 31 synthetic feature names
        (``feature_0`` .. ``feature_29`` and ``drug_encoded``) and the
        encoder's ``classes_`` array.
    """
    # Positional layout of the sheets (0-based).
    numeric_cols = slice(0, 30)
    drug_col = 30
    target_col = 31

    train_df = pd.read_excel(file_path, sheet_name='训练集')
    test_df = pd.read_excel(file_path, sheet_name='测试集')

    print(f"训练集形状: {train_df.shape}")
    print(f"测试集形状: {test_df.shape}")

    # Split each sheet into numeric features, drug type and target.
    train_numeric = train_df.iloc[:, numeric_cols]
    test_numeric = test_df.iloc[:, numeric_cols]

    train_drug = train_df.iloc[:, drug_col]
    test_drug = test_df.iloc[:, drug_col]

    train_target = train_df.iloc[:, target_col]
    test_target = test_df.iloc[:, target_col]

    # Fit the encoder on training labels only, then reuse it for the test set.
    encoder = LabelEncoder()
    train_codes = encoder.fit_transform(train_drug)
    test_codes = encoder.transform(test_drug)

    # Append the encoded drug column to the right of the numeric block.
    X_train = np.column_stack((train_numeric.values, train_codes))
    X_test = np.column_stack((test_numeric.values, test_codes))

    feature_names = [f'feature_{i}' for i in range(30)]
    feature_names.append('drug_encoded')

    return (
        X_train,
        X_test,
        train_target.values,
        test_target.values,
        feature_names,
        encoder.classes_,
    )

In [3]:
def squared_relative_error(y_true, y_pred):
    """Return the element-wise squared relative error.

    Computes ``((y_true - y_pred) / y_true) ** 2`` for every element.

    Args:
        y_true: Ground-truth values (array-like or scalar). Must not contain
            zeros, because the relative error is undefined there.
        y_pred: Predicted values (array-like or scalar). When both inputs are
            arrays they must have exactly the same shape; a scalar is still
            broadcast against an array.

    Returns:
        A float ``numpy.ndarray`` with one squared relative error per element
        (a numpy float scalar when both inputs are scalars).

    Raises:
        ValueError: If two array inputs have different shapes, or if
            ``y_true`` contains a zero.
    """
    # Coerce up front so lists and other array-likes work and the arithmetic
    # is always done in floating point.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Refuse silent broadcasting between arrays: (n,) against (n, 1) would
    # otherwise produce an (n, n) matrix and corrupt any mean/variance taken
    # over the result.
    if y_true.ndim and y_pred.ndim and y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    # A zero target would yield inf/nan, which then poisons aggregate metrics
    # and comparisons made on them; fail loudly instead.
    if np.any(y_true == 0):
        raise ValueError(
            "y_true contains zero; relative error is undefined for zero targets"
        )

    return ((y_true - y_pred) / y_true) ** 2

def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted model on the test set and print a short report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: Ground-truth targets matching ``X_test``.
        model_name: Label used in the printed report header.

    Returns:
        A 3-tuple ``(predictions, mean_sre, var_sre)``: the model's
        predictions plus the mean and variance of the element-wise squared
        relative error returned by ``squared_relative_error``.
    """
    predictions = model.predict(X_test)

    # Absolute-scale metrics first, then the relative-error statistics.
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    relative_errors = squared_relative_error(y_test, predictions)
    mean_sre, var_sre = np.mean(relative_errors), np.var(relative_errors)

    report_lines = (
        f"\n{model_name} 性能评估:",
        f"均方误差 (MSE): {mse:.4f}",
        f"决定系数 (R²): {r2:.4f}",
        f"平方和相对误差均值: {mean_sre:.6f}",
        f"平方和相对误差方差: {var_sre:.6f}",
    )
    for line in report_lines:
        print(line)

    return predictions, mean_sre, var_sre

In [4]:
def _fit_grid_search(estimator, param_grid, X_train, y_train):
    """Fit a 5-fold, MSE-scored grid search over ``param_grid`` and return it."""
    search = GridSearchCV(
        estimator,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )
    search.fit(X_train, y_train)
    return search


def main(file_path="D:/桌面/附件1/回归预测.xlsx"):
    """Train, tune and compare two tree-ensemble regressors on the workbook.

    Steps: load the data, grid-search a random forest and a gradient boosting
    regressor, evaluate both best estimators on the test set, pick the one
    with the lower mean squared relative error, and print its top feature
    importances.

    Args:
        file_path: Path of the Excel workbook to load. Defaults to the
            location used by the original notebook; pass another path to run
            on a different machine.

    Returns:
        ``(best_model, best_mean_sre, best_var_sre)`` on success, or
        ``(None, None, None)`` if loading, training or evaluation fails (the
        error is printed rather than raised).
    """
    # Local import: only needed to report unexpected failures below.
    import traceback

    # Loading is handled separately so that the "check the file path" hint is
    # only shown when the failure really happened while reading the workbook.
    try:
        X_train, X_test, y_train, y_test, feature_names, drug_classes = load_and_preprocess_data(file_path)
    except Exception as e:
        print(f"错误: {e}")
        print("请检查文件路径是否正确，以及文件格式是否符合要求")
        return None, None, None

    try:
        print(f"\n药物类型: {drug_classes}")
        print(f"训练样本数: {X_train.shape[0]}, 测试样本数: {X_test.shape[0]}")
        print(f"目标变量范围: {y_train.min():.1f} - {y_train.max():.1f}")

        # Model 1: random forest regression.
        print("\n" + "="*50)
        print("训练随机森林回归模型...")

        rf_param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
            'random_state': [42]
        }

        rf_model = _fit_grid_search(RandomForestRegressor(), rf_param_grid, X_train, y_train)
        # best_score_ is a negated MSE, so flip the sign for display.
        print(f"随机森林最佳参数: {rf_model.best_params_}")
        print(f"随机森林最佳交叉验证分数: {-rf_model.best_score_:.4f}")

        # Model 2: gradient boosting regression.
        print("\n" + "="*50)
        print("训练梯度提升回归模型...")

        gb_param_grid = {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5],
            'min_samples_split': [2, 5],
            'random_state': [42]
        }

        gb_model = _fit_grid_search(GradientBoostingRegressor(), gb_param_grid, X_train, y_train)
        print(f"梯度提升最佳参数: {gb_model.best_params_}")
        print(f"梯度提升最佳交叉验证分数: {-gb_model.best_score_:.4f}")

        # Evaluate both tuned models on the held-out test set.
        print("\n" + "="*50)
        print("在测试集上评估模型性能:")

        rf_pred, rf_mean_sre, rf_var_sre = evaluate_model(rf_model.best_estimator_, X_test, y_test, "随机森林")
        gb_pred, gb_mean_sre, gb_var_sre = evaluate_model(gb_model.best_estimator_, X_test, y_test, "梯度提升")

        # NOTE(review): the winner is chosen using the test-set metric, so the
        # numbers reported below are an optimistic estimate of generalisation.
        # Selecting on the cross-validation score would avoid this.
        # Ties (and NaN comparisons) fall through to gradient boosting.
        if rf_mean_sre < gb_mean_sre:
            best_model = rf_model.best_estimator_
            best_model_name = "随机森林"
            best_mean_sre = rf_mean_sre
            best_var_sre = rf_var_sre
        else:
            best_model = gb_model.best_estimator_
            best_model_name = "梯度提升"
            best_mean_sre = gb_mean_sre
            best_var_sre = gb_var_sre

        print("\n" + "="*50)
        print("最终结果:")
        print(f"最佳模型: {best_model_name}")
        print(f"测试集平方和相对误差均值: {best_mean_sre:.6f}")
        print(f"测试集平方和相对误差方差: {best_var_sre:.6f}")

        # Feature importance report, available for estimators that expose
        # feature_importances_.
        if hasattr(best_model, 'feature_importances_'):
            print("\n特征重要性前10:")
            importances = best_model.feature_importances_
            # Indices sorted by importance, largest first.
            indices = np.argsort(importances)[::-1]

            for i in range(min(10, len(importances))):
                print(f"{feature_names[indices[i]]}: {importances[indices[i]]:.4f}")

        return best_model, best_mean_sre, best_var_sre

    except Exception as e:
        # Training or evaluation failed: this is not a file-path problem, so
        # show the full traceback instead of the file hint. The None triple is
        # kept so callers that unpack the result keep working.
        print(f"错误: {e}")
        traceback.print_exc()
        return None, None, None

# Entry point: run the full pipeline and keep its results as module-level
# names. All three values are None when main() reports a failure.
if __name__ == "__main__":
    best_model, mean_sre, var_sre = main()

训练集形状: (549, 32)
测试集形状: (136, 32)

药物类型: ['利培酮' '喹硫平' '奋乃静' '奥氮平' '氟哌啶醇' '阿立哌唑' '齐拉西酮']
训练样本数: 549, 测试样本数: 136
目标变量范围: 31.0 - 157.0

训练随机森林回归模型...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
随机森林最佳参数: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}
随机森林最佳交叉验证分数: 341.4490

训练梯度提升回归模型...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
梯度提升最佳参数: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}
梯度提升最佳交叉验证分数: 348.8425

在测试集上评估模型性能:

随机森林 性能评估:
均方误差 (MSE): 275.3413
决定系数 (R²): 0.0383
平方和相对误差均值: 0.086052
平方和相对误差方差: 0.023226

梯度提升 性能评估:
均方误差 (MSE): 301.0144
决定系数 (R²): -0.0514
平方和相对误差均值: 0.091155
平方和相对误差方差: 0.025627

最终结果:
最佳模型: 随机森林
测试集平方和相对误差均值: 0.086052
测试集平方和相对误差方差: 0.023226

特征重要性前10:
feature_13: 0.0785
feature_28: 0.0642
feature_12: 0.0474
feature_18: 0.0468
feature_8: 0.0442
feature_11: 0.0442
feature_3: 0.0409
feature_20: 0.0381
feature_1: 0.038