In [None]:
import os
import sys
import time
import re
import warnings
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm, trange
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.font_manager as fm
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from pyvis.network import Network
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer
from sklearn.feature_selection import RFECV, SelectFromModel, mutual_info_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor, BayesianRidge
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor, BaggingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic, WhiteKernel
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, concatenate, BatchNormalization, LSTM, GRU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from umap import UMAP
from scipy.cluster.hierarchy import linkage, dendrogram
import shap
import lime
import lime.lime_tabular
from pygam import GAM, s, f, l
from sklearn.inspection import permutation_importance, partial_dependence, PartialDependenceDisplay
import statsmodels.api as sm
from statsmodels.graphics.gofplots import ProbPlot
from causallearn.search.ConstraintBased.PC import pc
from causallearn.search.ScoreBased.GES import ges
from causallearn.utils.GraphUtils import GraphUtils
from dowhy import CausalModel
from econml.dml import CausalForestDML
from econml.dr import DRLearner
import gym
from gym import spaces
import skopt
from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt.plots import plot_convergence, plot_objective
from pymoo.core.problem import Problem
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.operators.sampling.rnd import FloatRandomSampling
from pymoo.operators.crossover.sbx import SBX
from pymoo.operators.mutation.pm import PM
from pymoo.optimize import minimize
from pymoo.visualization.scatter import Scatter
from pymoo.util.plotting import plot
import networkx as nx
from networkx.algorithms import community
from sklearn.metrics.pairwise import cosine_similarity
from imblearn.over_sampling import SMOTE
import joblib
# Silence all warnings for the rest of the session. The second filter is
# already covered by the blanket 'ignore'; it is kept so LightGBM
# UserWarnings stay hidden if the blanket filter is ever removed.
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

# Fix the random seeds of every framework used in this file.
np.random.seed(42)
tf.random.set_seed(42)
# Seed PyTorch unconditionally: the CPU generator needs a fixed seed on
# machines without CUDA as well, otherwise CPU-only runs are not reproducible.
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Plot styling (the seaborn call runs last, as in the original order).
plt.style.use('ggplot')
sns.set(style="whitegrid")

# Make sure every output directory exists before anything is written to it.
os.makedirs('outputs', exist_ok=True)
os.makedirs('models', exist_ok=True)
os.makedirs('visualizations', exist_ok=True)
os.makedirs('data_exports', exist_ok=True)

# Configure a Chinese-capable font for matplotlib
def setup_chinese_font():
    """Select an installed Chinese-capable font for matplotlib.

    The first candidate found among matplotlib's registered fonts is set as
    ``font.family`` and ``axes.unicode_minus`` is set to False.

    Returns:
        bool: True if a candidate font was found and applied. False
        otherwise; in that case the module-level ``label_translation`` dict
        (Chinese label -> English label) is created so callers can fall back
        to English labels. ``label_translation`` is NOT created when True is
        returned.
    """
    print("设置中文字体显示...")

    import matplotlib.font_manager as fm

    # Candidate fonts in priority order.
    # 'DejaVu Sans' is intentionally not a candidate: it is the font bundled
    # with matplotlib, so it is always found, but it has no CJK glyphs.
    # Listing it made this function always report success and made the
    # English-label fallback below unreachable.
    chinese_fonts = [
        # Windows
        'SimHei', 'SimSun', 'NSimSun', 'Microsoft YaHei',
        # macOS
        'STHeiti', 'STKaiti', 'STSong',
        # Linux
        'WenQuanYi Micro Hei', 'WenQuanYi Zen Hei',
        # Cross-platform
        'Arial Unicode MS',
    ]

    # Names of every font matplotlib knows about
    font_names = {font.name for font in fm.fontManager.ttflist}

    # Candidates that are actually installed, still in priority order
    available_chinese_fonts = [name for name in chinese_fonts if name in font_names]

    if available_chinese_fonts:
        plt.rcParams['font.family'] = available_chinese_fonts[0]
        plt.rcParams['axes.unicode_minus'] = False
        print(f"成功设置中文字体: {available_chinese_fonts[0]}")
        return True

    print("未找到适合的中文字体，将使用英文标签")
    # Chinese -> English label table for later use by plotting code
    global label_translation
    label_translation = {
        '水接触角': 'Water Contact Angle',
        '循环使用次数': 'Cycle Times',
        '吸油能力': 'Oil Absorption Capacity',
        '综合性能': 'Overall Performance',
        '基底材料': 'Base Material',
        '改性材料': 'Modifiers',
        '制备方法': 'Preparation Method',
        '预测性能': 'Predicted Performance',
        '综合得分': 'Overall Score'
    }
    return False

# Configure the GPU
def setup_gpu():
    """Configure GPU usage and report whether a GPU is available.

    If TensorFlow lists a GPU, the first one is made the only visible device
    and memory growth is enabled for it. Otherwise PyTorch is asked whether
    CUDA is available.

    Returns:
        bool: True if a GPU was found through TensorFlow or PyTorch, False
        if none was found or if configuration raised an exception.
    """
    try:
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            # Restrict TensorFlow to the first GPU
            tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
            tf.config.experimental.set_memory_growth(gpus[0], True)
            print(f"GPU加速已启用: {gpus[0].name}")
            return True

        # TensorFlow sees no GPU: fall back to a CUDA check through PyTorch
        try:
            import torch
        except ImportError:
            print("未检测到GPU，将使用CPU")
            return False

        if torch.cuda.is_available():
            print(f"通过PyTorch检测到GPU: {torch.cuda.get_device_name(0)}")
            return True

        print("未检测到GPU，将使用CPU")
        return False
    except Exception as exc:
        # `except Exception` instead of a bare `except` so KeyboardInterrupt and
        # SystemExit still propagate; the cause is reported instead of hidden.
        print(f"GPU配置失败，将使用CPU ({type(exc).__name__}: {exc})")
        return False

# Custom progress bar
class CustomProgressBar:
    """Text progress bar written to stdout using carriage returns.

    Args:
        total: Number of steps at which the bar is complete. A value of 0 or
            less is rendered as an already complete bar.
        desc: Label printed in front of the bar.
        bar_length: Width of the bar in characters.
    """

    def __init__(self, total, desc="进度", bar_length=50):
        self.total = total
        self.desc = desc
        self.n = 0
        self.bar_length = bar_length
        self.start_time = time.time()
        self.last_print_time = 0
        self._print_progress()

    def update(self, n=1):
        """Advance by *n* steps; redraw at most every 0.1 s, or when complete."""
        self.n += n
        current_time = time.time()
        if current_time - self.last_print_time >= 0.1 or self.n >= self.total:
            self._print_progress()
            self.last_print_time = current_time

    def _print_progress(self):
        """Redraw the bar in place; emit a newline once it is complete."""
        if self.total > 0:
            percent = min(100, self.n * 100 / self.total)
            # Clamp like `percent` so overshooting `total` cannot widen the bar
            filled_length = min(self.bar_length,
                                int(self.bar_length * self.n // self.total))
        else:
            # Nothing to do: show a full bar instead of dividing by zero
            percent = 100.0
            filled_length = self.bar_length
        bar = '█' * filled_length + '-' * (self.bar_length - filled_length)

        elapsed_time = time.time() - self.start_time
        if self.n > 0:
            time_per_iter = elapsed_time / self.n
            # Never report a negative remaining time when n exceeds total
            remaining_iters = max(self.total - self.n, 0)
            remaining_time = time_per_iter * remaining_iters
            time_str = f" - 预计剩余: {self._format_time(remaining_time)}"
        else:
            time_str = ""

        print(f'\r{self.desc}: |{bar}| {percent:.1f}% {self.n}/{self.total}{time_str}', end='', flush=True)
        if self.n >= self.total:
            print()

    def _format_time(self, seconds):
        """Format a number of seconds as hours/minutes/seconds text."""
        m, s = divmod(int(seconds), 60)
        h, m = divmod(m, 60)
        if h > 0:
            return f"{h}时{m}分{s}秒"
        elif m > 0:
            return f"{m}分{s}秒"
        else:
            return f"{s}秒"

    def set_description(self, desc):
        """Replace the label and redraw immediately."""
        self.desc = desc
        self._print_progress()

    def close(self):
        """Force the bar to 100% if needed, then print a trailing newline."""
        if self.n < self.total:
            self.n = self.total
            self._print_progress()
        print()

# Set up the environment: pick a font, then detect the GPU
setup_chinese_font()
gpu_available = setup_gpu()

# Custom LightGBM wrapper
class CustomLGBMRegressor:
    """Wrapper around the native LightGBM API that avoids feature-name issues.

    Inputs are converted to plain arrays before they reach LightGBM, so the
    booster never sees column names. The original names are kept on the
    wrapper and are available through :meth:`get_feature_names`.

    Args:
        **params: Passed as ``params`` to ``lgb.train``. ``n_estimators``
            (default 100) is also used as ``num_boost_round``.
    """

    def __init__(self, **params):
        self.params = params
        self.booster = None
        self.feature_count = None
        self.feature_names = None
        self.fitted = False

    def fit(self, X, y):
        """Train the booster on ``X`` / ``y`` and return ``self``."""
        # Remember the feature count for the consistency check in predict()
        self.feature_count = X.shape[1]

        # Keep the column names, or generate positional ones
        self.feature_names = X.columns.tolist() if hasattr(X, 'columns') else [f'feature_{i}' for i in range(X.shape[1])]

        # Convert to arrays so no feature names reach LightGBM
        X_values = X.values if hasattr(X, 'values') else X
        y_values = y.values if hasattr(y, 'values') else y

        # Dataset without feature names
        train_data = lgb.Dataset(X_values, label=y_values)

        # Train with the native LightGBM API
        self.booster = lgb.train(
            params=self.params,
            train_set=train_data,
            num_boost_round=self.params.get('n_estimators', 100)
        )

        self.fitted = True
        return self

    def predict(self, X):
        """Predict for ``X``.

        Raises:
            ValueError: If the model is not fitted, or if ``X`` has a
                different number of columns than the training data.
        """
        if not self.fitted:
            raise ValueError("模型尚未训练")

        # Check that the feature count matches the training data
        if X.shape[1] != self.feature_count:
            raise ValueError(f"特征数量不匹配。期望 {self.feature_count}，实际 {X.shape[1]}")

        # Convert to a plain array
        X_values = X.values if hasattr(X, 'values') else X

        return self.booster.predict(X_values)

    # feature_importances_ is exposed so feature-importance analysis works
    @property
    def feature_importances_(self):
        """Split-count importances, padded or truncated to ``feature_count``.

        Raises:
            ValueError: If the model is not fitted.
        """
        if not self.fitted:
            raise ValueError("模型尚未训练，无法获取特征重要性")

        # Importances from the LightGBM booster
        importances = self.booster.feature_importance(importance_type='split')

        # Always return one importance per training feature
        if len(importances) != self.feature_count:
            print(f"警告: LightGBM特征重要性数量({len(importances)})与特征数量({self.feature_count})不匹配")
            aligned_importances = np.zeros(self.feature_count)
            for i in range(min(len(importances), self.feature_count)):
                aligned_importances[i] = importances[i]
            return aligned_importances

        return importances

    def get_feature_names(self):
        """Return the feature names recorded by :meth:`fit`."""
        return self.feature_names

    def get_params(self, deep=True):
        """Return a copy of the parameters.

        A copy is returned so that callers editing the result cannot change
        this model's parameters by accident; use :meth:`set_params` to change
        them. ``deep`` is accepted for interface compatibility and is unused.
        """
        return dict(self.params)

    def set_params(self, **params):
        """Update parameters in place and return ``self``."""
        self.params.update(params)
        return self

# Custom ensemble (weighted-average) regressor
class CustomVotingRegressor:
    """Weighted average of already-fitted regressors.

    Args:
        estimators: Sequence of ``(name, model)`` pairs.
        weights: One weight per estimator, in the same order. Defaults to
            equal weights. Weights are normalised to sum to 1 on construction.
    """

    def __init__(self, estimators, weights=None):
        self.estimators = estimators
        self.weights = weights if weights is not None else [1] * len(estimators)
        self.normalize_weights()

    def normalize_weights(self):
        """Rescale ``self.weights`` so that they sum to 1."""
        weight_sum = sum(self.weights)
        self.weights = [w / weight_sum for w in self.weights]

    def fit(self, X, y):
        """No-op: every wrapped model is expected to be trained already."""
        return self

    def predict(self, X):
        """Return the weighted average of the wrapped models' predictions.

        Estimators without a ``predict`` method are skipped. Each remaining
        prediction is combined with the weight of ITS OWN estimator, and the
        weights of the remaining estimators are rescaled to sum to 1.
        (Previously weights were indexed by position among the collected
        predictions, so skipping one estimator shifted every later weight
        onto the wrong model.) If no estimator can predict, zeros are
        returned, as before.
        """
        weighted_preds = np.zeros(len(X))
        used_weight = 0.0
        skipped_any = False

        for index, (_, model) in enumerate(self.estimators):
            if not hasattr(model, 'predict'):
                skipped_any = True
                continue
            # Index by estimator position, not by prediction position
            weight = self.weights[index]
            weighted_preds += model.predict(X) * weight
            used_weight += weight

        # Rescale only when something was skipped, so the ordinary path
        # produces exactly the same numbers as before
        if skipped_any and used_weight > 0:
            weighted_preds /= used_weight

        return weighted_preds

# Custom stacking regressor
class CustomStackingRegressor:
    """Stacking ensemble: base-model predictions are the meta-model's inputs.

    Args:
        base_models: Sequence of ``(name, model)`` pairs.
        meta_model: Model whose ``predict`` receives the matrix of base
            predictions (one column per base model).
        use_lightgbm: When truthy, ``lightgbm_model`` supplies the last column.
        lightgbm_model: Extra model used only when ``use_lightgbm`` is truthy.
    """

    def __init__(self, base_models, meta_model, use_lightgbm=False, lightgbm_model=None):
        self.base_models, self.meta_model = base_models, meta_model
        self.use_lightgbm, self.lightgbm_model = use_lightgbm, lightgbm_model

    def predict(self, X):
        """Return the meta-model's prediction for ``X``."""
        # Predictors in column order: base models first, LightGBM last
        level_zero = [member for _, member in self.base_models]
        if self.use_lightgbm:
            level_zero.append(self.lightgbm_model)

        # One column of predictions per predictor
        stacked = np.zeros((len(X), len(level_zero)))
        for column, member in enumerate(level_zero):
            stacked[:, column] = member.predict(X)

        # The meta-model makes the final prediction
        return self.meta_model.predict(stacked)

# Progress-bar callback for neural-network training
class MyProgressBar(tf.keras.callbacks.Callback):
    """Keras callback that shows a CustomProgressBar with one step per epoch.

    Args:
        epochs: Total number of epochs, used as the bar's total.
    """

    def __init__(self, epochs):
        super().__init__()
        self.epochs = epochs
        self.progress_bar = None

    def on_train_begin(self, logs=None):
        """Create the bar when training starts."""
        self.progress_bar = CustomProgressBar(total=self.epochs, desc="神经网络训练")

    def on_epoch_end(self, epoch, logs=None):
        """Advance the bar and show the epoch's ``val_loss`` (0 if absent)."""
        # `logs` defaults to None in the signature, so guard before .get()
        logs = logs or {}
        self.progress_bar.update(1)
        val_loss = logs.get('val_loss', 0)
        self.progress_bar.set_description(f"神经网络训练 - epoch {epoch+1}/{self.epochs} - val_loss: {val_loss:.4f}")

    def on_train_end(self, logs=None):
        """Close the bar, if one was created."""
        if self.progress_bar is not None:
            self.progress_bar.close()

# Optimisation progress tracker
class OptimizationProgress:
    """Progress bar driven by an optimiser callback.

    Args:
        description: Label of the bar. It is also kept as the prefix of the
            convergence message (previously the first update replaced it with
            a hard-coded label, losing any custom description).
        small_dataset: Use 50 bar steps instead of 100.
    """

    def __init__(self, description="优化进度", small_dataset=False):
        self.description = description
        self.iter = 0
        self.max_iter = 50 if small_dataset else 100
        self.progress_bar = CustomProgressBar(total=self.max_iter, desc=description)
        self.last_update_time = time.time()

    def update(self, xk, convergence=None):
        """Callback: advance one step, at most once every 0.5 s.

        Args:
            xk: Current solution passed by the optimiser (unused here).
            convergence: Optional convergence value shown in the label.
        """
        current_time = time.time()
        # Throttle: ignore calls within 0.5 s of the last accepted one, and
        # stop advancing once the bar is full
        if current_time - self.last_update_time <= 0.5 or self.iter >= self.max_iter:
            return

        self.progress_bar.update(1)
        self.iter += 1
        self.last_update_time = current_time

        if convergence is not None:
            self.progress_bar.set_description(f"{self.description} - 收敛度: {convergence:.6f}")

    def close(self):
        """Fill the remaining steps and close the bar."""
        if self.iter < self.max_iter:
            self.progress_bar.update(self.max_iter - self.iter)
        self.progress_bar.close()

In [None]:
# ====================== Data loading and basic preprocessing ======================
# Location of the Excel workbook
excel_path = r"E:\360MoveData\Users\DELL\Desktop\代码\最终加上的512.xlsx"  # change to your own file path
use_smote = True  # enable SMOTE data augmentation
handle_outliers = True  # enable outlier handling

print("=" * 80, "油吸附材料预测与优化系统 - 数据处理阶段", "=" * 80, sep="\n")

# Read the workbook
data = pd.read_excel(excel_path)
print(f"成功加载数据文件: {excel_path}")
print(f"数据集包含 {data.shape[0]} 行和 {data.shape[1]} 列")

# Overview of column dtypes
print("\n数据类型概览:", data.dtypes, sep="\n")

# Columns that contain at least one missing value
print("\n缺失值情况:")
missing_data = data.isna().sum()
print(missing_data.loc[missing_data.gt(0)])

# Untouched copy of the data as loaded
raw_data = data.copy()

# Column roles are fixed by position in the sheet:
#   index 3      -> base material
#   index 4..7   -> modifier materials
#   index 8      -> preparation method
#   index 10..12 -> target variables
base_material_column = data.columns[3]
mod_material_columns = data.columns[4:8].tolist()
method_column = data.columns[8]
target_columns = data.columns[10:13].tolist()

# Every column that names a material: the base first, then the modifiers
material_columns = [base_material_column, *mod_material_columns]

print(
    f"基底材料列: {base_material_column}",
    f"改性材料列: {mod_material_columns}",
    f"制备方法列: {method_column}",
    f"目标列: {target_columns}",
    sep="\n",
)

# Collect the distinct materials and preparation methods.
# Only non-empty strings are kept; NaN and non-string cells are dropped.
unique_materials = {
    # Distinct base materials
    'base': {
        m for m in set(data[base_material_column].dropna().unique())
        if m and isinstance(m, str)
    },
    'mod': set(),
}

# Distinct modifiers, merged over all modifier columns
for col in mod_material_columns:
    unique_values = set(data[col].dropna().unique())
    unique_materials['mod'] |= unique_values
unique_materials['mod'] = {m for m in unique_materials['mod'] if m and isinstance(m, str)}

# Distinct preparation methods
unique_methods = {
    m for m in set(data[method_column].dropna().unique())
    if m and isinstance(m, str)
}

print(
    f"提取了 {len(unique_materials['base'])} 种基底材料",
    f"提取了 {len(unique_materials['mod'])} 种改性材料",
    f"提取了 {len(unique_methods)} 种制备方法",
    sep="\n",
)

# Map each material name to its composition (base, modifiers, method)
name_to_material_map = {}

for row_label, row in data.iterrows():
    # Use the "材料名称" column when present, else a name built from the row label
    material_name = row.get("材料名称", f"材料_{row_label}")

    # A str is never NA, so this single test covers both original checks
    if not isinstance(material_name, str):
        continue

    base_material = row[base_material_column]
    # Keep only modifier cells that actually hold text
    mod_materials = [
        cell for cell in (row[name] for name in mod_material_columns)
        if isinstance(cell, str)
    ]

    name_to_material_map[material_name] = dict(
        base=base_material,
        mod=mod_materials,
        method=row[method_column],
    )

# Feature-name sanitising helper
def sanitize_feature_name(name):
    """Turn *name* into an identifier made only of ``[A-Za-z0-9_]``.

    Every run of other characters (spaces, JSON punctuation, non-ASCII
    text, repeated underscores) becomes a single ``_``; a leading digit is
    prefixed with ``f_``; trailing underscores are removed.

    If nothing is left after that (for example a name written entirely in
    Chinese), ``feature_<crc32 of the original name>`` is returned. Before
    this fallback all such names sanitised to the empty string and collided.

    NOTE(review): names that differ only in stripped characters can still map
    to the same result (e.g. two Chinese prefixes followed by ``_1``); callers
    that need unique names should check for duplicates.

    Args:
        name: Any object; it is converted with ``str()``.

    Returns:
        str: The sanitized, non-empty name.
    """
    import zlib  # local import: the file's import block does not provide it

    original = str(name)
    # One pass covers the space, JSON-character, non-alphanumeric and
    # underscore-collapsing steps
    sanitized = re.sub(r'[^a-zA-Z0-9]+', '_', original)
    if sanitized and sanitized[0].isdigit():
        sanitized = 'f_' + sanitized
    sanitized = sanitized.rstrip('_')

    if not sanitized:
        # Stable across runs, unlike the built-in hash() of a str
        checksum = zlib.crc32(original.encode('utf-8', 'replace'))
        sanitized = f'feature_{checksum:08x}'

    return sanitized

# Two-way lookup between original and sanitized feature names
feature_name_map = dict()     # original name -> sanitized name
reverse_feature_map = dict()  # sanitized name -> original name

# ====================== Outlier handling ======================
if handle_outliers:
    print("=" * 80)
    print("执行异常值处理")

    # Winsorization percentiles per target; targets not listed use 1% / 99%
    outlier_params = {
        '水接触角': {'lower': 0.03, 'upper': 0.97},
        '循环使用次数': {'lower': 0.01, 'upper': 0.99},
        '吸油能力': {'lower': 0.01, 'upper': 0.99}
    }

    for target in target_columns:
        if target not in data.columns:
            continue

        # Convert text-typed targets to numbers BEFORE the processing below.
        # The conversion used to happen only in the later statistics section,
        # so such targets were silently skipped here.
        if not pd.api.types.is_numeric_dtype(data[target]):
            print(f"警告: 目标变量 {target} 不是数值类型，尝试转换...")
            data[target] = pd.to_numeric(data[target], errors='coerce')

        # Percentiles for this target
        lower_percentile = outlier_params.get(target, {}).get('lower', 0.01)
        upper_percentile = outlier_params.get(target, {}).get('upper', 0.99)

        # Distribution before processing
        print(f"\n{target} 异常值处理前统计:")
        valid_values = data[target].dropna()
        print(f"  最小值: {valid_values.min():.4f}, 最大值: {valid_values.max():.4f}")
        print(f"  1%分位数: {valid_values.quantile(0.01):.4f}, 99%分位数: {valid_values.quantile(0.99):.4f}")

        # Percentile thresholds
        lower_bound = valid_values.quantile(lower_percentile)
        upper_bound = valid_values.quantile(upper_percentile)

        # Number of values outside the thresholds
        n_lower_outliers = (valid_values < lower_bound).sum()
        n_upper_outliers = (valid_values > upper_bound).sum()
        print(f"  检测到 {n_lower_outliers} 个低于{lower_percentile*100}%分位数的异常值")
        print(f"  检测到 {n_upper_outliers} 个高于{upper_percentile*100}%分位数的异常值")

        if n_lower_outliers > 0 or n_upper_outliers > 0:
            # Winsorize with clip(): missing values stay missing, and the
            # column is replaced as a whole instead of writing float bounds
            # element-wise into a possibly integer column.
            data[target] = data[target].clip(lower=lower_bound, upper=upper_bound)
            print(f"  已对 {target} 应用缩尾处理，限制在[{lower_bound:.4f}, {upper_bound:.4f}]范围内")

            # Distribution after processing
            print(f"  处理后 - 最小值: {data[target].min():.4f}, 最大值: {data[target].max():.4f}")

# Inspect the distribution of every target variable
for target in target_columns:
    # Targets that are not numeric are coerced; unparseable cells become NaN
    if not pd.api.types.is_numeric_dtype(data[target]):
        print(f"警告: 目标变量 {target} 不是数值类型，尝试转换...")
        data[target] = pd.to_numeric(data[target], errors='coerce')

    # Summary statistics over the non-missing values
    valid_values = data[target].dropna()
    print(
        f"\n{target} 统计信息:",
        f"  有效值数量: {len(valid_values)}",
        f"  最小值: {valid_values.min()}",
        f"  最大值: {valid_values.max()}",
        f"  平均值: {valid_values.mean()}",
        f"  中位数: {valid_values.median()}",
        f"  标准差: {valid_values.std()}",
        sep="\n",
    )

# Histogram with KDE for every target, side by side in one row
fig, axes = plt.subplots(1, len(target_columns), figsize=(15, 5))

for i, target in enumerate(target_columns):
    valid_data = data[target].dropna()

    # With a single target plt.subplots returns one Axes, not an array
    ax = axes[i] if len(target_columns) > 1 else axes

    sns.histplot(valid_data, kde=True, ax=ax)
    ax.set(title=f'{target} 分布', xlabel=target, ylabel='频率')

plt.tight_layout()
plt.savefig('visualizations/target_distributions.png', dpi=300)
plt.show()

# Usage-frequency bar charts: base materials, modifiers, preparation methods

# Ten most frequent base materials
base_counts = data[base_material_column].value_counts().head(10)

# Modifier counts summed over all modifier columns, then the ten most frequent
mod_counts = pd.Series(dtype='int64')
for col in mod_material_columns:
    mod_counts = mod_counts.add(data[col].value_counts(), fill_value=0)
top_mods = mod_counts.sort_values(ascending=False).head(10)

# Ten most frequent preparation methods
method_counts = data[method_column].value_counts().head(10)

# One figure per category, drawn and saved the same way
for freq_series, chart_title, chart_path in (
    (base_counts, '最常用的10种基底材料', 'visualizations/base_material_frequency.png'),
    (top_mods, '最常用的10种改性材料', 'visualizations/mod_material_frequency.png'),
    (method_counts, '最常用的10种制备方法', 'visualizations/method_frequency.png'),
):
    plt.figure(figsize=(12, 6))
    sns.barplot(x=freq_series.index, y=freq_series.values)
    plt.title(chart_title)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(chart_path, dpi=300)
    plt.show()

# Scatter-plot matrix of the target variables (rows with any missing target dropped)
valid_targets = data[target_columns].dropna()
# sns.pairplot creates its own figure, so no plt.figure() call precedes it:
# the earlier call only left an empty extra figure behind.
sns.pairplot(valid_targets)
plt.suptitle('目标变量相关性散点图矩阵', y=1.02)
# bbox_inches='tight' keeps the title, placed above the figure at y=1.02,
# inside the saved image
plt.savefig('visualizations/target_correlations.png', dpi=300, bbox_inches='tight')
plt.show()
data.to_csv('data_exports/preprocessed_data.csv', index=False, encoding='utf-8-sig')
print("基础预处理完成，数据已保存到 data_exports/preprocessed_data.csv")

In [None]:
# ====================== Feature engineering ======================
# Banner announcing the feature-engineering stage
print("=" * 40)
print("正在进行特征工程...")
print("=" * 80)
from collections import defaultdict, Counter
import re
import os
import seaborn as sns

# ====================== 材料分类与编码系统 ======================
class MaterialEncoder:
    """材料编码系统：将各类材料映射为数值编码"""
    def __init__(self):
        # 初始化材料分类字典
        self.material_categories = {
            1: "无机纳米材料/金属氧化物",
            2: "有机高分子/聚合物",
            3: "表面改性剂/硅烷类物质",
            4: "碳基材料",
            5: "MOF/功能有机小分子/其他"
        }
        
        # 创建材料编码映射字典
        self.material_codes = {}
        self.material_combinations = {}
        
        # 填充编码字典
        self._init_inorganic_nanomaterials()  # 类别1
        self._init_organic_polymers()         # 类别2
        self._init_surface_modifiers()        # 类别3
        self._init_carbon_materials()         # 类别4
        self._init_mof_and_others()           # 类别5
    
    def _init_inorganic_nanomaterials(self):
        """初始化无机纳米材料/金属氧化物编码 (100-199)"""
        inorganic_nanomaterials = {
            "SiO2": 114,
            "SiO2 nanoparticles": 114,
            "TiO2": 122,
            "P25 TiO2 nanoparticles": 122,
            "TiO2 nanoparticles": 122,
            "Fe3O4 nanoparticles": 126,
            "Co3O4": 127,
            "Co3O4 nanoparticles": 127,
            "Au nanoparticles": 179,
            "CoO nanoparticles": 127,
            "Co nanoparticles": 127,
            "Ni": 128,
            "Cu2O": 129,
            "Cu2O nanoparticles": 129,
            "CuO nanoparticles": 129,
            "ZnO nanoparticles": 130,
            "硅藻土": 116,
            "diatomite": 116,
            "Mesoporous Silica Nanoparticles (MSNs)": 117,
            "Mesoporous Silica Nanoparticles": 117,
            "MSNs": 117,
            "Ni nanoparticles": 128,
            "Cu nanoparticles": 129,
            "Fe3O4 nanoparticals": 126,
            "Fe3O5 nanoparticles": 126,
            "MgFe2O4 nanoparticles": 132,
            "Ni-Co double layered oxides": 137,
            "vermiculite": 118,
            "Ag nanoparticles": 147,
            "Al2O3 nanoparticles": 113,
            "γ-AlOOH": 113,
            "DR-Al2O3 nanoparticles": 113,
            "BiVO4": 183,
            "MnO2 nanoparticles": 125,
            "膨润土纳米片": 115,
            "膨润土纳米片(Laponite RD)": 115,
            "Kaolin": 114,
            "LDH": 120,
            "Mg-calcite CaCO3颗粒": 120,
            "Sepiolite": 119,
            "CuNWs nanoparticles": 129,
            "CN nanoparticles": 107,
            "NaF": 111,
            "Sodium fluoride": 111,
            "MoS2": 142
        }
        
        self.material_codes.update(inorganic_nanomaterials)
    
    def _init_organic_polymers(self):
        """初始化有机高分子/聚合物编码 (200-299)"""
        organic_polymers = {
            "PVA": 210,
            "PVP": 210,
            "PS": 212,
            "PLA": 215,
            "PAN": 216,
            "PA": 220,
            "聚酰胺树脂": 220,
            "PI": 225,
            "polypyrrole": 242,
            "PVDF-HFP": 235,
            "polyether sulfone": 238,
            "Polyurethane": 224,
            "polyurethane": 224,
            "PU": 224,
            "Thermoplastic polyurethane": 225,
            "polymer SHMP-1": 252,
            "Poly(sulfobetaine methacrylate)": 258,
            "Polyhedral oligomeric silsesquioxane": 268,
            "POSS": 268,
            "lignin": 275,
            "Epoxy copolymer": 233,
            "Epoxy cross-linker": 233,
            "N,N'-亚甲基双丙烯酰胺": 228,
            "acrylamide": 207,
            "phenol-formaldehyde resin": 222,
            "PDMS": 230,
            "PFA": 235,
            "PTFE":235,
            "PANI": 240,
            "PPy": 242,
            "PDA": 245,
            "PEI": 246,
            "Polysiloxane": 230,
            "聚偏氟乙烯-六氟丙烯": 235,
            "Polysiloxane": 230,
            "BPEI": 246,
            "PBZ": 250,
            "Polybenzoxazine": 250,
            "Polymethylhydrogen silicone": 228,
            "PFR": 254,
            "Cellulose": 270,
            "CMC": 270,
            "Chitosan": 273,
            "chitosan": 273,
            "Lignin": 275,
            "Silk fibroin": 280,
            "HDPE": 204,
            "PEG": 206,
            "PMHS": 232,
            "PDVB": 249,
            "PZAF": 255,
            "spiropyran methacrylate derivative": 290,
            "Polydimethylsiloxane": 230,
            "Polystyrene": 212,
            "iPD": 226,
            "Styrene": 208
        }
        
        self.material_codes.update(organic_polymers)
    
    def _init_surface_modifiers(self):
        """初始化表面改性剂/硅烷类物质编码 (300-399)"""
        surface_modifiers = {
            # 氨基类硅烷
            "APTES": 333,
            "APTMS": 333,
            "(3-Aminopropyl)triethoxysilane": 333,
            "(3-氨基丙基)三乙氧基硅烷": 333,
            "3-氨基丙基三乙氧基硅烷": 333,
            "巯丙基三乙氧基硅烷": 353,
            
            # 环氧基硅烷
            "GPTMS": 343,
            "GPTS": 343,
            "(3-Glycidyloxypropyl)trimethoxysilane": 343,
            
            # 巯基硅烷
            "MPS": 353,
            "3-mercaptopropyltrisiloxane": 353,
            "硫醇": 359,
            "DDT": 359,
            "n-Dodecylthiol": 359,
            "n-十二烷基巯基": 359,
            "n-octadecylthiol": 359,
            
            # 乙烯基硅烷
            "VTMS": 362,
            "VTES": 362,
            "乙烯基三乙氧基硅烷": 362,
            
            # 甲基/短链烷基硅烷
            "TMCS": 311,
            "甲基三氯硅烷": 311,
            "MTMS": 311,
            "MTS": 311,
            "甲基三氯硅烷": 311,
            "CTMS": 311,
            "N-Octyltrichlorosilane":328,
            # 长链烷基硅烷
            "DTMS": 322,
            "DMTS": 322,
            "十六烷基三甲氧基硅烷": 329,
            "二甲基二甲氧基硅烷": 311,
            "C16TMS": 329,
            "HDTMS": 329,
            "HDTS": 329,
            "OTS": 329,
            "ODTS": 329,
            "ODT": 329,
            "OTMS": 329,
            "硬脂基三乙氧基硅烷": 329,
            "ODS": 329,
            "十八烷基三氯硅烷": 329,
            "十八烷基三甲氧基硅烷": 329,
            "三氯(十八烷基)硅烷": 329,
            
            # 全氟类硅烷
            "FDTS": 389,
            "PFDS": 389,
            "PFOTES": 388,
            "PFOTS": 388,
            "PFTOS": 388,
            "1H,1H,2H,2H-perfluorodecyltrimethoxysilane": 388,
            "TMHFDS": 389,
            "三甲氧基（1H,1H,2H,2H-十七氟癸基）硅烷": 389,
            "1H,1H,2H,2H-全氟癸基三氯硅烷": 389,
            "十二氟庚基丙基三甲氧基硅烷": 387,
            "FAS": 387,
            "FAS-17": 387,
            
            # 双硅基化合物
            "HMDS": 372,
            "1,1,1,3,3,3-hexamethyldisilazane": 372,
            
            # 特殊硅烷
            "TEOS": 304,
            "硅烷": 300,
            "Silane": 300,
            
            # 其他表面改性剂
            "DETA": 335,
            "ODA": 329,
            "OTAB": 328,
            "PFNA": 389,
            "Hydroxyl-terminated fluorosilicone": 384,

            "1H,1H,2H,2H-Perfluorodecyltrimethoxysilane": 389,
            "1H,1H,2H,2H-Perfluorodecyltrichlorosilane": 389,
            "全氟癸基三氯硅烷": 389,
            "1H,1H,2H,2H-perfluorodecyltrimethoxysilane": 389,
            "PFDTS": 389,
            "1H,1H,2H,2H-Perfluorodecanethiol": 389,
            "1H,1H,2H,2H-perfluorodecanethiol": 389,
            "1H, 1H, 2H, 2H-perfluorodecanethiol": 389,
            "3-Mercaptopropyltriethoxysilane": 353,
            "steary methacrylate": 319,
            "1-dodecanethiol": 359,
            "n-十二烷基硫醇": 359,
            "dodecafluoroheptyl-propyl-trimethoxysilane": 387,
            "octadecane thiol": 359,
            "n-十八烷基巯基": 359,
            "1H,1H,2H,2H-Perfluorooctyltriethoxysilane": 388,
            "Perfluorooctlytriethoxysilane": 388,
            "1H,1H,2H,2H-perfluorooctyltriethoxysilane": 388,
            "Methyltriethoxysilane": 311,
            "cetyltrimethoxysilane": 329,
            "n-hexadecyltriethoxysilane": 329,
            "octadecyltrichlorosilane": 329,
            "OMCTS": 301,
            "三甲基氯硅烷": 311,
            "hydrophobic silanes": 320,
        }
        
        self.material_codes.update(surface_modifiers)
    
    def _init_carbon_materials(self):
        """初始化碳基材料编码 (400-499)"""
        carbon_materials = {
            "Carbon black": 405,
            "Activated carbon": 415,
            "WPAC": 415,
            "CNT": 433,
            "CNTs": 433, 
            "CNTS": 433,
            "MWCNTs": 438,
            "CNC": 435,
            "Graphene": 459,
            "graphene": 459,
            "N-Graphene": 464,
            "MXene": 470,
            "MXene nanosheets": 470,
            "Ti3C2Tx": 470,
            "h-BN": 445,
            "hBN": 445,
            "Graphite": 465,
            "Nanodiamond": 410,
            "NDs-fPDA": 410,
            "Carbon nanofiber": 432,
            "碳海绵": 467,
            "Carbon-based material": 470,
            "DLC": 430,
            "CMP-TST": 447,
            "HCP": 460
        }
        
        self.material_codes.update(carbon_materials)
    
    def _init_mof_and_others(self):
        """初始化MOF/功能有机小分子/其他编码 (500-699)"""
        mof_and_others = {
            # MOF材料 (500-599)
            "AlMOF": 513,
            "FeMOF": 526,
            "CoMOF": 527,
            "Co-HHTP": 527,
            "CuMOF": 529,
            "MOF-199": 529,
            "ZnMOF": 530,
            "ZrMOF": 540,
            "DyMOF": 566,
            "CeMOF": 558,
            "NH2MOF": 550,
            "MOF-5": 555,
            "MOF-74": 557,
            "MOF-808": 560,
            "UiO-66-NH2": 545,
            "UiO-66-NH-C18": 546,
            "UIO-66(F4)": 547,
            "TFA-COF": 570,
            "TFB-COF": 571,
            "Copper terephthalat": 529,
            "CoFe-PBA": 528,
            "UiO-66-MOF": 544,
            "ZIF-8": 553,
            "ZIF-67": 553,
            "HKUST-type MOFs": 529,
            "COF AG1": 580,
                        
            # 有机小分子 (600-699)
            "SA": 618,
            "Stearic acid": 618,
            "丁烯酸酯": 615,
            "Lauric acid": 612,
            "Palmitic acid": 616,
            "palmitic acid": 616,
            "Ca stearate": 630,
            "Mg stearate": 631,
            "Fe stearate": 632,
            "Ni stearate": 633,
            "Zn stearate": 634,
            "stearate": 634,
            "BPO": 624,
            "Beeswax": 645,
            "Carnauba wax": 648,
            "SPMA": 663,
            "Fluoropolymer": 665,
            "Urushiol": 650,
            "TA": 670,
            "柴油": 625,
            "OPA": 661,
            "NDM": 658,
            "HCCP": 663,
            "TMPTA": 662,
            "STA": 667,
            "Stearoyl chloride": 638,
            "Palmitoyl chloride": 636,
            "Phosphoric acid": 610,
            "1,3-Oxazolidine": 607,
            "5-Acl": 608,
            "Econea": 685,
            "MMDI": 625,
            "Thiophene": 608,
            "HDS": 662,
            "HTFO": 664,
            "Pyrrole": 667,
            "SCA": 669,
            "SEP": 671,
            "azoisobutyronitrile": 627,
            "Polyoxometalates": 690,
            "POMs": 690,
            "phenol-amine": 609,
            "BPAF": 688,
            "12-aminododecanoic acid (NH2(CH2)11COOH)": 621,
            "Cucurbit[6]uril": 606,
            "pyrrole": 667,
            "n-tetradecylphosphonic acid": 628,
            "Tetradecylamine": 622,
            "aluminum diethylhypophosphite": 635,
            "citral": 614,
            "citronellal": 614,
            "citric acid": 613,
            "carnauba wax": 648,
            "beeswax": 645,
            "生物蜡乳液": 646,
            "Candelilla wax": 647,
            "Dodecyl methacrylate": 624,
            "LMA": 624,
            "oleic acid": 628,
            "Acetone": 606,
            "Tetrahydrofuran": 607,
            "dopamine hydrochloride": 619,
            "Spiropyran methacrylate": 675,
            "phytic acid": 666,
            "1, 2, 3, 4-butanetetracarboxylic acid": 623,
            "n-hexadecylamine": 623,
            "DTBP": 620,
            "Vinyl silica aerogel particles": 639,
            "Tea polyphenols": 637,
            "Fluorinated surfactant Capstone": 651,
            "席夫碱": 617,
            "5,6-dimethylbenzimidazole": 616,
            "NH3.H2O": 605
        }
        
        self.material_codes.update(mof_and_others)
    
    def get_material_code(self, material_name):
        """获取材料的编码"""
        if material_name in self.material_codes:
            return self.material_codes[material_name]
        return None
    
    def get_material_category(self, material_name):
        """获取材料所属的分类"""
        code = self.get_material_code(material_name)
        if code is None:
            return None
        
        # 根据编码范围确定分类
        if 100 <= code < 200:
            return 1  # 无机纳米材料/金属氧化物
        elif 200 <= code < 300:
            return 2  # 有机高分子/聚合物
        elif 300 <= code < 400:
            return 3  # 表面改性剂/硅烷类物质
        elif 400 <= code < 500:
            return 4  # 碳基材料
        elif 500 <= code < 700:
            return 5  # MOF/功能有机小分子/其他
        return None
    
    def register_combination(self, materials, combination_code):
        """
        注册新的材料组合编码
        
        参数:
        materials: 材料名称列表
        combination_code: 组合编码
        """
        # 转换为frozenset作为不可变键
        materials_set = frozenset(materials)
        self.material_combinations[materials_set] = combination_code
    
    def get_combination_code(self, materials):
        """
        获取材料组合的编码
        
        参数:
        materials: 材料名称列表或集合
        
        返回:
        组合编码，如果已注册则返回注册值，否则动态计算
        """
        # 转换为frozenset进行查找
        materials_set = frozenset(materials)
        
        # 检查是否已有注册编码
        if materials_set in self.material_combinations:
            return self.material_combinations[materials_set]
        
        # 如果未注册，则基于公式计算编码
        if len(materials) < 2:
            return None
            
        # 所有材料应当属于同一类别
        categories = set(self.get_material_category(material) for material in materials)
        if len(categories) != 1 or None in categories:
            return None
            
        category = categories.pop()
        material_codes = [self.get_material_code(material) for material in materials]
        
        # 根据类别应用适当的公式
        if category == 1:  # 无机纳米材料
            # 公式: 190 + |M₁-M₂|/10
            diff = abs(material_codes[0] - material_codes[1]) / 10
            code = 190 + round(diff)
        elif category == 2:  # 有机高分子
            # 公式: 290 + |P₁-P₂|/5
            diff = abs(material_codes[0] - material_codes[1]) / 5
            code = 290 + round(diff)
        elif category == 3:  # 表面改性剂
            # 公式: 390 + |F₁-F₂|
            f1 = (material_codes[0] % 100) // 10
            f2 = (material_codes[1] % 100) // 10
            code = 390 + abs(f1 - f2)
        elif category == 4:  # 碳基材料
            # 公式: 490 + |D₁-D₂| + (D₁+D₂)/2
            d1 = (material_codes[0] % 100) // 10
            d2 = (material_codes[1] % 100) // 10
            code = 490 + abs(d1 - d2) + round((d1 + d2) / 2)
        elif category == 5:  # MOF/功能有机小分子
            # 检查两种材料是否在同一子范围
            is_mof1 = 500 <= material_codes[0] < 600
            is_mof2 = 500 <= material_codes[1] < 600
            
            if is_mof1 and is_mof2:
                # 两种都是MOF: 590 + |M₁-M₂|/5
                diff = abs((material_codes[0] - 500) - (material_codes[1] - 500)) / 5
                code = 590 + round(diff)
            elif not is_mof1 and not is_mof2:
                # 两种都是有机小分子: 690 + |O₁-O₂|/5
                diff = abs((material_codes[0] - 600) - (material_codes[1] - 600)) / 5
                code = 690 + round(diff)
            else:
                # MOF和有机小分子混合: 690 + (M + O)/10
                m_val = material_codes[0] - 500 if is_mof1 else material_codes[1] - 500
                o_val = material_codes[0] - 600 if not is_mof1 else material_codes[1] - 600
                code = 690 + round((m_val + o_val) / 10)
        else:
            return None
        self.register_combination(materials, code)
        return code

# ====================== 初始化材料组合编码 ======================
def initialize_material_combinations(encoder):
    """Register the predefined material-pair codes on ``encoder``.

    Each entry is passed to ``encoder.register_combination`` in the order
    listed below; every code in the table is distinct.
    """
    predefined_combinations = (
        # Inorganic nanomaterial pairs
        (["Fe3O4 nanoparticles", "Ag nanoparticles"], 191),
        (["ZnO nanoparticles", "CuO nanoparticles"], 192),
        (["Fe3O4 nanoparticles", "SiO2 nanoparticles"], 193),
        (["ZnO nanoparticles", "Fe3O4 nanoparticles"], 194),
        (["Ag nanoparticles", "Ni"], 195),
        (["Sepiolite", "SiO2 nanoparticles"], 196),
        (["MnO2 nanoparticles", "Co3O4 nanoparticles"], 197),
        # Polymer pairs
        (["PDA", "PEI"], 291),
        (["PDMS", "PDA"], 292),
        (["Lignin", "PDMS"], 293),
        (["PPy", "PEG"], 294),
        (["PLA", "PDA"], 295),
        (["Chitosan", "PAN"], 296),
        (["PAN", "PDA"], 297),
        # Surface-modifier / silane pairs
        (["TEOS", "DTMS"], 391),
        (["TEOS", "HMDS"], 391.8),
        (["APTES", "HDTS"], 392.4),
        (["MTS", "APTES"], 393.2),
        (["DETA", "硫醇"], 394),
        (["GPTMS", "HDTMS"], 394.8),
        (["APTES", "HDTMS"], 395.6),
        (["OTAB", "OTS"], 396.4),
        (["GPTS", "APTES"], 397.2),
        (["VTES", "十六烷基三甲氧基硅烷"], 398),
        (["MTMS", "GPTMS"], 398.8),
        (["TEOS", "TMHFDS"], 399.6),
        # Carbon-material pairs
        (["Graphene", "Carbon black"], 492),
        (["Carbon black", "h-BN"], 494),
        (["Graphene", "CNTs"], 496),
        (["CMP-TST", "CNTS"], 498),
        (["Carbon-based material", "NDs-fPDA"], 500),
        # MOF / small-organic-molecule pairs
        (["ZnMOF", "CoMOF"], 594),
        (["HTFO", "BPO"], 691.2),
        (["Beeswax", "Carnauba wax"], 692.4),
        (["Zn stearate", "ZnMOF"], 693.6),
        (["MOF-5", "OPA"], 694.8),
        (["HCCP", "BPAF"], 696),
        (["Polyoxometalates", "MOF-808"], 697.2),
        (["palmitic acid", "Urushiol"], 698.4),
        (["1,3-Oxazolidine", "Stearoyl chloride"], 699.6),
        (["UiO-66-NH2", "ZrMOF"], 598),
    )

    for material_names, combination_code in predefined_combinations:
        encoder.register_combination(material_names, combination_code)

# ====================== 创建编码参照表 ======================
def create_encoding_reference_table(encoder, materials_by_category, combinations_by_category):
    """Build a DataFrame listing single-material and combination codes.

    Args:
        encoder: object exposing ``material_categories`` (id -> name) and
            ``get_material_code(name)``.
        materials_by_category: mapping of category id -> iterable of names.
        combinations_by_category: mapping of category id -> mapping of
            material group -> combination code.

    Returns:
        A DataFrame with one row per material or combination, sorted by
        category id and code when it is not empty.
    """
    rows = []

    # Rows for single materials.
    for category_id, names in materials_by_category.items():
        category_label = encoder.material_categories[category_id]
        rows.extend(
            {
                "材料名称": name,
                "分类ID": category_id,
                "分类名称": category_label,
                "编码值": encoder.get_material_code(name),
                "编码类型": "单一材料",
            }
            for name in names
        )

    # Rows for material combinations; member names are joined with " + ".
    for category_id, combos in combinations_by_category.items():
        category_label = encoder.material_categories[category_id]
        rows.extend(
            {
                "材料名称": " + ".join(list(members)),
                "分类ID": category_id,
                "分类名称": category_label,
                "编码值": combo_code,
                "编码类型": "材料组合",
            }
            for members, combo_code in combos.items()
        )

    table = pd.DataFrame(rows)
    if table.empty:
        return table
    return table.sort_values(by=["分类ID", "编码值"])

#特征工程函数
def process_material_features(data, encoder, mod_material_columns):
    """Encode the modifier-material columns into one numeric column per category.

    For every row, the non-empty values of ``mod_material_columns`` are
    grouped by encoder category. A category with one material gets that
    material's code; a category with several gets the combination code of
    its first two materials.

    Args:
        data: input DataFrame.
        encoder: object exposing ``material_categories``,
            ``get_material_category``, ``get_material_code`` and
            ``get_combination_code``.
        mod_material_columns: names of the columns holding modifier materials.

    Returns:
        ``(feature_data, reference_table)``: a copy of ``data`` with five
        added category columns, and the encoding reference table.
    """
    # NOTE(review): relies on `defaultdict` being in scope at module level.
    print("\n正在进行材料特征工程...")
    feature_data = data.copy()

    # One column per category, defaulting to 0 (no material of that category).
    for category_id in range(1, 6):
        feature_data[f"类别{category_id}_{encoder.material_categories[category_id]}"] = 0

    materials_by_category = defaultdict(set)
    combinations_by_category = defaultdict(dict)

    for idx, row in data.iterrows():
        materials = [
            row[col] for col in mod_material_columns
            if pd.notna(row[col]) and row[col]
        ]
        if not materials:
            continue

        # Group this row's materials by category, warning about unknown names.
        grouped = defaultdict(list)
        for material in materials:
            category = encoder.get_material_category(material)
            if not category:
                print(f"警告: 材料 '{material}' 在编码系统中未找到 (行 {idx+1})")
                continue
            grouped[category].append(material)
            materials_by_category[category].add(material)

        for category, members in grouped.items():
            column_name = f"类别{category}_{encoder.material_categories[category]}"

            if len(members) == 1:
                feature_data.loc[idx, column_name] = encoder.get_material_code(members[0])
                continue

            # Only the first two materials take part in the combination code.
            pair = members[:2]
            code = encoder.get_combination_code(pair)
            feature_data.loc[idx, column_name] = code
            combinations_by_category[category][frozenset(pair)] = code
            if len(members) > 2:
                print(f"注意: 行 {idx+1} 中类别 {category} 有超过2种材料，仅使用前两种计算组合编码。")

    reference_table = create_encoding_reference_table(
        encoder, materials_by_category, combinations_by_category)

    print(f"特征工程完成。已处理 {len(data)} 行数据。")
    print(f"已识别 {sum(len(materials) for materials in materials_by_category.values())} 种单一材料")
    print(f"已创建 {sum(len(combos) for combos in combinations_by_category.values())} 种材料组合编码")
    return feature_data, reference_table
# ====================== 制备方法和基底材料编码 ======================
def encode_method_and_base(data, method_column, base_material_column):
    """Add numeric codes for the preparation method and the base material.

    Args:
        data: input DataFrame.
        method_column: name of the column holding the preparation method.
        base_material_column: name of the column holding the base material.

    Returns:
        ``(feature_data, method_df, base_df)``: a copy of ``data`` with the
        columns ``制备方法_编码`` and ``基底材料_编码`` added (values missing
        from the mappings become 0), plus one reference DataFrame per mapping.
    """
    print("\n正在对制备方法和基底材料进行系统化编码...")
    feature_data = data.copy()

    # Base-material codes: material type (1-9) * 10 + property value (1-9).
    # The original literal listed "Polystyrene" twice (21, then 23) and
    # "CS sponge" twice (95, 95). A dict literal keeps the LAST value, so the
    # effective codes (23 and 95) are kept here and the dead entries dropped.
    # NOTE(review): confirm 23 (not 21) is the intended code for "Polystyrene".
    base_mapping = {
        "Polyurethane sponge": 11,
        "Polystyrene": 23,
        "Polystyrene sponge": 22,
        "Cellulose sponge": 31,
        "Ethyl Cellulose": 32,
        "Plant Fiber Sponge": 33,
        "kapok fibre sponge": 34,
        "Polyacrylonitrile sponge": 41,
        "PAN sponge": 42,
        "PVA sponge": 51,
        "Polyamide sponge": 61,
        "Melamine sponge": 71,
        "Melamine-formaldehyde sponge": 72,
        "PDMS sponge": 81,
        "silicone sponge": 82,
        "TEOS": 83,
        "nickel sponge": 91,
        "CuNWs nanoparticles": 92,
        "EcoFlex": 93,
        "carrageenan sponge": 94,
        "CS sponge": 95,
        "PET sponge": 96,
        # Later additions to the mapping
        "PLA sponge": 25,
        "Chitosan sponge": 95,
        "Luffa sponge": 35,
        "Polyethylene sponge": 24,
        "nylon fibrous sponge": 62,
        "rock wool": 37,
        "polyester fabric sponge": 27,
        "碳海绵": 98,  # special material class
        "TODB": 97
    }

    # Method codes: method type (1-9) * 10 + process variant (0-9).
    # The keys ending in "_x000D_" match raw cell values that carry that
    # literal suffix (presumably a carriage-return escape from Excel).
    method_mapping = {
        "浸渍法": 10,
        "PDA浸渍法": 11,
        "BPO浸渍法": 12,
        "APTES浸渍法": 13,
        "PFR浸渍法": 14,
        "丙烯酸树脂浸渍法": 15,
        "硅橡胶粘合剂浸渍法": 16,
        "PVDF浸渍法": 17,
        "OTS浸渍法": 18,
        "环氧树脂浸渍法": 19,
        "HDTMS浸渍法": 21,
        "MTS浸渍法": 22,
        "PDMS浸渍法": 23,
        "气相沉积法": 43,
        "硅橡胶浸渍法": 16,
        "原位生长": 32,
        "天然乳胶浸渍法": 24,
        "Silk ﬁbroin浸渍法": 25,
        "PVA浸渍法": 26,
        "PDA浸渍法_x000D_": 27,
        "PDMS浸渍法_x000D_": 28,
        "酚醛树脂浸渍法": 29,
        "模板法": 30,
        "原位聚合": 31,
        "溶剂热还原法": 40,
        "静电纺丝法": 41,
        "发泡法": 42,
        "涂覆法": 50
    }

    feature_data['制备方法_编码'] = data[method_column].map(method_mapping)
    feature_data['基底材料_编码'] = data[base_material_column].map(base_mapping)

    # Report every non-missing value that has no code (with its row number).
    for idx, row in data.iterrows():
        if pd.notna(row[method_column]) and row[method_column] not in method_mapping:
            print(f"警告: 制备方法 '{row[method_column]}' 在编码系统中未找到 (行 {idx+1})")
        if pd.notna(row[base_material_column]) and row[base_material_column] not in base_mapping:
            print(f"警告: 基底材料 '{row[base_material_column]}' 在编码系统中未找到 (行 {idx+1})")

    # Unmapped / missing values become 0. Assign the result back instead of
    # `df[col].fillna(0, inplace=True)`: that chained in-place call operates
    # on a temporary under pandas Copy-on-Write and leaves the NaNs in place.
    feature_data['制备方法_编码'] = feature_data['制备方法_编码'].fillna(0)
    feature_data['基底材料_编码'] = feature_data['基底材料_编码'].fillna(0)

    # Reference tables: code split into its tens part (type) and units part.
    method_df = pd.DataFrame([
        {'制备方法': method, '编码值': code, '方法类型': f"类型{code//10}", '工艺变体': code%10}
        for method, code in method_mapping.items()
    ])

    base_df = pd.DataFrame([
        {'基底材料': base, '编码值': code, '材质类型': f"类型{code//10}", '特性值': code%10}
        for base, code in base_mapping.items()
    ])

    print(f"完成制备方法编码，共 {len(method_mapping)} 种方法")
    print(f"完成基底材料编码，共 {len(base_mapping)} 种材料")

    return feature_data, method_df, base_df
# ====================== 执行特征工程 ======================
def run_feature_engineering(data, base_material_column, mod_material_columns, method_column, target_columns, year_column=None):
    """Run the whole feature-engineering pipeline and export the results.

    Args:
        data: raw input DataFrame.
        base_material_column: name of the base-material column.
        mod_material_columns: names of the modifier-material columns.
        method_column: name of the preparation-method column.
        target_columns: names of the target columns.
        year_column: optional name of the year column.

    Returns:
        ``(feature_data, feature_data_numeric, material_reference,
        method_reference, base_reference)``. Excel/CSV files are written to
        the ``data_exports`` directory as a side effect.
    """
    output_dir = "data_exports"
    os.makedirs(output_dir, exist_ok=True)

    encoder = MaterialEncoder()
    initialize_material_combinations(encoder)
    feature_data, material_reference = process_material_features(
        data, encoder, mod_material_columns)
    feature_data, method_reference, base_reference = encode_method_and_base(
        feature_data, method_column, base_material_column)

    # Carry the year column over when one was supplied and exists.
    if year_column and year_column in data.columns:
        feature_data[year_column] = data[year_column]

    # 1. Full feature table, original name columns included (for inspection).
    feature_data_with_names = feature_data.copy()
    feature_data_with_names.to_excel(f"{output_dir}/material_features_with_names.xlsx", index=False)

    # 2. Numeric-only table (for model training): encoded columns, optional
    #    year column, then the targets.
    encoded_column_names = ('制备方法_编码', '基底材料_编码')
    feature_cols = [
        col for col in feature_data.columns
        if col.startswith('类别') or col in encoded_column_names
    ]
    if year_column and year_column in feature_data.columns:
        feature_cols.append(year_column)

    target_data = data[target_columns].copy()
    feature_data_numeric = pd.concat([feature_data[feature_cols], target_data], axis=1)
    print("\n特征数据列顺序:")
    for position, col in enumerate(feature_data_numeric.columns, start=1):
        print(f"{position}. {col}")
    feature_data_numeric.to_excel(f"{output_dir}/material_features_numeric.xlsx", index=False)
    feature_data_numeric.to_csv(f"{output_dir}/material_features_numeric.csv", index=False)

    # 3. Encoding reference tables.
    material_reference.to_excel(f"{output_dir}/material_encoding_reference.xlsx", index=False)
    method_reference.to_excel(f"{output_dir}/method_encoding_reference.xlsx", index=False)
    base_reference.to_excel(f"{output_dir}/base_material_encoding_reference.xlsx", index=False)

    # 4. Complete table: every single material known to the encoder, grouped
    #    by category, followed by the combinations seen in the data.
    all_materials = [
        {
            "材料名称": material,
            "分类ID": category_id,
            "分类名称": encoder.material_categories[category_id],
            "编码值": code,
            "编码类型": "单一材料"
        }
        for category_id in range(1, 6)
        for material, code in encoder.material_codes.items()
        if encoder.get_material_category(material) == category_id
    ]

    combo_rows = material_reference[material_reference['编码类型'] == '材料组合']
    if not combo_rows.empty:
        all_materials.extend(combo_rows.to_dict('records'))

    all_materials_df = pd.DataFrame(all_materials)
    if not all_materials_df.empty:
        all_materials_df = all_materials_df.sort_values(by=["分类ID", "编码值"])
        all_materials_df.to_excel(f"{output_dir}/complete_material_encoding_table.xlsx", index=False)

    print(f"\n特征工程完成！")
    print(f"带有材料名称的特征数据已保存至: {output_dir}/material_features_with_names.xlsx")
    print(f"用于模型训练的特征数据已保存至: {output_dir}/material_features_numeric.xlsx")
    print(f"编码参照表已保存至: {output_dir}/material_encoding_reference.xlsx")
    print(f"完整的材料编码表已保存至: {output_dir}/complete_material_encoding_table.xlsx")

    return feature_data, feature_data_numeric, material_reference, method_reference, base_reference

# NOTE(review): despite its name, `excel_path` points at a CSV file and is read with read_csv.
excel_path = r"E:\360MoveData\Users\DELL\Desktop\代码\data_exports\preprocessed_data.csv"  # change this to your own file path

# Load the preprocessed data
data = pd.read_csv(excel_path)
print(f"成功加载数据文件: {excel_path}")
print(f"数据集包含 {len(data)} 行和 {len(data.columns)} 列")
# Column roles are picked by POSITION, so the input file's column order matters
base_material_column = data.columns[3]
mod_material_columns = data.columns[4:8].tolist()
method_column = data.columns[8]
target_columns = data.columns[10:13].tolist()  # target columns (water contact angle, number of reuse cycles, oil absorption capacity)
year_column = data.columns[13]  # the 14th column is the year column

# Run the full pipeline; it also writes its outputs under data_exports/
feature_data, feature_data_numeric, material_ref, method_ref, base_ref = run_feature_engineering(
    data, base_material_column, mod_material_columns, method_column, target_columns, year_column)

In [None]:

import seaborn as sns
import pickle
import warnings
import os
import time
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import tensorflow as tf

# Suppress UserWarning and FutureWarning messages for the rest of the session
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# 文件名净化函数
def sanitize_filename(filename):
    """Return ``filename`` with each of the characters / \\ : * ? " < > | replaced by an underscore."""
    replacement_table = str.maketrans({char: '_' for char in '/\\:*?"<>|'})
    return filename.translate(replacement_table)

# Per-target tolerance, expressed as a fraction of the true value
# (keys are the target column names and must stay as they are).
target_tolerance = {
    '水接触角': 0.2,  # water contact angle: keep the current tolerance
    '循环使用次数': 0.3,  # number of reuse cycles: wider tolerance
    '吸油能力': 0.3   # oil absorption capacity: wider tolerance
}

# 添加误差容忍的评估函数
def tolerance_r2_score(y_true, y_pred, tolerance=0.15, target=None):
    """Compute an R² in which errors inside a relative tolerance band count as zero.

    Args:
        y_true: true values.
        y_pred: predicted values.
        tolerance: band half-width as a fraction of ``|y_true|``.
        target: optional target name; when it is a key of
            ``target_tolerance`` that value replaces ``tolerance``.

    Returns:
        ``1 - RSS / TSS`` where RSS uses the residuals reduced by the
        tolerance band (floored at 0); ``0`` when TSS is 0.
    """
    # Flatten both inputs to 1-D arrays.
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    # Total sum of squares around the mean of the true values.
    tss = np.sum(np.square(actual - np.mean(actual)))

    # Residuals beyond the tolerance band; anything inside the band is 0.
    band = tolerance * np.abs(actual)
    excess_error = np.maximum(0, np.abs(actual - predicted) - band)
    rss = np.sum(np.square(excess_error))

    if tss == 0:
        return 0  # avoid division by zero
    return 1 - (rss / tss)

def prediction_within_tolerance(y_true, y_pred, tolerance=0.15, target=None):
    """Return the fraction of predictions within ``±tolerance * |y_true|`` of the true value.

    Args:
        y_true: true values.
        y_pred: predicted values.
        tolerance: band half-width as a fraction of ``|y_true|``.
        target: optional target name; when it is a key of
            ``target_tolerance`` that value replaces ``tolerance``.

    Returns:
        The mean of the element-wise "within tolerance" flags.
    """
    # Flatten both inputs, as tolerance_r2_score does. Without this, an
    # (n, 1) prediction array against an (n,) target broadcasts to (n, n)
    # and the ratio is computed over all pairs instead of matching elements.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # A target-specific tolerance overrides the argument.
    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    tolerance_values = tolerance * np.abs(y_true)
    within_tolerance = np.abs(y_true - y_pred) <= tolerance_values
    return np.mean(within_tolerance)

def make_tolerance_scorer(target):
    """Build a ``scorer(estimator, X, y)`` callable that returns the tolerance R² for ``target``."""
    def scorer_function(estimator, X, y):
        return tolerance_r2_score(y, estimator.predict(X), target=target)

    # Give the callable a name that identifies its target.
    scorer_function.__name__ = f'tolerance_scorer_{target}'
    return scorer_function

# 神经网络进度条回调
class MyProgressBar(tf.keras.callbacks.Callback):
    """Keras callback that prints a short progress line while training.

    It prints a message when training starts and ends, and the validation
    loss on every 10th epoch and on the final epoch.
    """

    def __init__(self, epochs):
        """Store the total number of epochs used to detect the final epoch."""
        super().__init__()
        self.epochs = epochs
        self.progress_bar = None  # never assigned elsewhere in this class; kept for compatibility

    def on_train_begin(self, logs=None):
        print("开始神经网络训练...")

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None in the signature, so guard before .get().
        logs = logs or {}
        # Falls back to 0 when no validation loss was recorded.
        val_loss = logs.get('val_loss', 0)
        if epoch % 10 == 0 or epoch == self.epochs - 1:
            print(f"epoch {epoch+1}/{self.epochs} - val_loss: {val_loss:.4f}")

    def on_train_end(self, logs=None):
        print("神经网络训练完成")

# LightGBM自定义封装器
class CustomLGBMRegressor:
    """Minimal fit/predict wrapper around ``lightgbm.train``.

    Keyword arguments given to the constructor are stored in ``params`` and
    passed to ``lgb.train``; ``n_estimators`` (default 100) is also used as
    the number of boosting rounds.
    """

    def __init__(self, **params):
        self.params = params
        self.model = None

    def fit(self, X, y):
        """Train on ``X``/``y`` using generated feature names ``f0, f1, ...``; returns ``self``."""
        import lightgbm as lgb

        column_labels = [f'f{index}' for index in range(X.shape[1])]
        # Use the underlying array when X exposes `.values` (e.g. a DataFrame).
        matrix = getattr(X, 'values', X)
        dataset = lgb.Dataset(matrix, label=y, feature_name=column_labels)
        boosting_rounds = self.params.get('n_estimators', 100)
        self.model = lgb.train(self.params, dataset, num_boost_round=boosting_rounds)
        return self

    def predict(self, X):
        """Predict with the trained model; raises ``ValueError`` if ``fit`` was not called."""
        booster = self.model
        if booster is None:
            raise ValueError("Model not trained. Call fit first.")

        # Same conversion as in fit, so training and prediction inputs match.
        return booster.predict(getattr(X, 'values', X))

# 自定义投票回归器
class CustomVotingRegressor:
    """Weighted average over the predictions of already-fitted models.

    ``estimators`` is a sequence of ``(name, model)`` pairs. A model is
    either an object with ``predict`` or a dict holding ``'model'`` and,
    optionally, ``'needs_scaling'`` and ``'scaler'``.
    """

    def __init__(self, estimators, weights=None):
        self.estimators = estimators
        # Equal weights unless the caller supplies their own.
        self.weights = [1] * len(estimators) if weights is None else weights

    def fit(self, X, y):
        """No-op; the member models are used as given. Returns ``self``."""
        return self

    def predict(self, X):
        """Return the weighted average of every member's prediction for ``X``."""
        member_outputs = []
        for _name, member in self.estimators:
            if hasattr(member, 'predict'):
                member_outputs.append(member.predict(X))
                continue
            if not (isinstance(member, dict) and 'model' in member):
                # Entries of any other shape contribute nothing.
                continue
            # Dict entry: apply its scaler first when it asks for one.
            if member.get('needs_scaling', False) and 'scaler' in member:
                inputs = member['scaler'].transform(X)
            else:
                inputs = X
            member_outputs.append(member['model'].predict(inputs))
        return np.average(member_outputs, axis=0, weights=self.weights)

# 绘制学习曲线来检测过拟合
def plot_learning_curves(model, X_train, y_train, X_test, y_test, model_name, target_name, cv=5):
    """Plot train/test R² against training-set size to spot overfitting.

    The model is refitted on growing prefixes of the training data (10% to
    100%); prefixes with fewer than 5 samples are skipped.

    Args:
        model: estimator exposing ``partial_fit`` or a truthy ``warm_start``;
            other models are reported as unsupported and nothing is plotted.
        X_train, y_train: training data.
        X_test, y_test: test data.
        model_name: model name used in messages and the title.
        target_name: target name used in the title.
        cv: unused; kept for interface compatibility.

    Returns:
        None. Shows a matplotlib figure and prints a fit diagnosis.
    """
    # Only models that support incremental learning are handled.
    if not (hasattr(model, 'partial_fit') or (hasattr(model, 'warm_start') and model.warm_start)):
        print(f"{model_name}模型不支持增量学习，无法绘制学习曲线")
        return

    train_fractions = np.linspace(0.1, 1.0, 10)
    # Sample counts are recorded alongside the scores so the x and y series
    # always have the same length, even when small prefixes are skipped
    # (plotting all 10 sizes against fewer scores raises in matplotlib).
    sample_counts = []
    train_scores = []
    test_scores = []

    for fraction in train_fractions:
        n_samples = int(len(X_train) * fraction)
        if n_samples < 5:  # need at least 5 samples to fit
            continue

        X_subset = X_train.iloc[:n_samples] if hasattr(X_train, 'iloc') else X_train[:n_samples]
        y_subset = y_train.iloc[:n_samples] if hasattr(y_train, 'iloc') else y_train[:n_samples]

        # Refit on this prefix, then score on the prefix and on the test set.
        model.fit(X_subset, y_subset)
        train_scores.append(r2_score(y_subset, model.predict(X_subset)))
        test_scores.append(r2_score(y_test, model.predict(X_test)))
        sample_counts.append(n_samples)

    plt.figure(figsize=(10, 6))
    plt.plot(sample_counts, train_scores, 'o-', color='r', label='训练集 R²')
    plt.plot(sample_counts, test_scores, 'o-', color='g', label='测试集 R²')
    plt.title(f'{target_name} - {model_name} 学习曲线')
    plt.xlabel('训练样本数')
    plt.ylabel('R² 分数')
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()

    # Diagnose the fit from the scores at the largest training size.
    if len(train_scores) > 0 and len(test_scores) > 0:
        train_final = train_scores[-1]
        test_final = test_scores[-1]
        gap = train_final - test_final

        print(f"最终训练集 R²: {train_final:.4f}")
        print(f"最终测试集 R²: {test_final:.4f}")
        print(f"训练集与测试集 R² 差距: {gap:.4f}")

        if gap > 0.2:
            print("警告: 模型可能存在过拟合 (训练集和测试集的R²差距大于0.2)")
        elif gap < 0:
            print("警告: 模型可能存在欠拟合 (测试集R²高于训练集)")
        else:
            print("模型拟合良好 (训练集和测试集的R²差距小于0.2)")

# 评估模型函数
def evaluate_model(model, X_train, y_train, X_test, y_test, target, model_name):
    """Score ``model`` on the train and test sets and plot predicted vs. actual.

    Computes the standard R², the tolerance R² and the share of predictions
    inside the tolerance band for both splits, prints a summary with a simple
    over/under-fitting diagnosis, and shows two scatter plots.

    Returns:
        Dict with keys ``train_r2``, ``train_tol_r2``, ``train_within_tol``,
        ``test_r2``, ``test_tol_r2`` and ``test_within_tol``.
    """
    # Target-specific tolerance, 0.15 when the target has no entry.
    current_tolerance = target_tolerance.get(target, 0.15)

    def _score_split(features, actual):
        # Predict, squeeze an (n, 1) output to 1-D, then compute the metrics.
        predicted = model.predict(features)
        if len(predicted.shape) > 1 and predicted.shape[1] == 1:
            predicted = predicted.flatten()
        plain_r2 = r2_score(actual, predicted)
        tol_r2 = tolerance_r2_score(actual, predicted, tolerance=current_tolerance, target=target)
        within = prediction_within_tolerance(actual, predicted, tolerance=current_tolerance, target=target)
        return predicted, plain_r2, tol_r2, within

    y_train_pred, train_r2, train_tol_r2, train_within_tol = _score_split(X_train, y_train)
    y_test_pred, test_r2, test_tol_r2, test_within_tol = _score_split(X_test, y_test)

    print(f"\n{model_name} 在 {target} 上的评估结果:")
    print(f"训练集: R²={train_r2:.4f}, 容忍度R²={train_tol_r2:.4f}, 在容忍范围内比例={train_within_tol:.2%}")
    print(f"测试集: R²={test_r2:.4f}, 容忍度R²={test_tol_r2:.4f}, 在容忍范围内比例={test_within_tol:.2%}")

    # Fit diagnosis based on the train/test R² gap.
    r2_gap = train_r2 - test_r2
    if r2_gap > 0.2:
        print(f"警告: 模型可能存在过拟合 (R²差距: {r2_gap:.4f})")
    elif r2_gap < -0.1:
        print(f"警告: 模型可能存在欠拟合 (R²差距: {r2_gap:.4f})")
    else:
        print(f"模型拟合良好 (R²差距: {r2_gap:.4f})")

    # Predicted-vs-actual scatter plots: train on the left, test on the right,
    # each with a dashed y = x reference line.
    plt.figure(figsize=(12, 5))
    panels = (
        (1, y_train, y_train_pred, f'训练集: R²={train_r2:.4f}'),
        (2, y_test, y_test_pred, f'测试集: R²={test_r2:.4f}'),
    )
    for position, actual, predicted, title in panels:
        plt.subplot(1, 2, position)
        plt.scatter(actual, predicted, alpha=0.5)
        lowest, highest = actual.min(), actual.max()
        plt.plot([lowest, highest], [lowest, highest], 'r--')
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(title)

    plt.tight_layout()
    plt.show()

    return {
        'train_r2': train_r2,
        'train_tol_r2': train_tol_r2,
        'train_within_tol': train_within_tol,
        'test_r2': test_r2,
        'test_tol_r2': test_tol_r2,
        'test_within_tol': test_within_tol
    }
# Path of the numeric feature table
feature_data_path = "data_exports/material_features_numeric.xlsx"

# Load the feature data
data = pd.read_excel(feature_data_path)
print(f"成功加载特征数据: {feature_data_path}")
print(f"数据集包含 {len(data)} 行和 {len(data.columns)} 列")
    
# Numeric preprocessing of the first five columns, addressed by POSITION.
# NOTE(review): assumes columns 0-4 are the five category-code columns in
# order — confirm against the exported file.
# NOTE(review): a constant column makes (max - min) zero and yields NaN.
# NOTE(review): rows holding 0 are shifted too, so each column's smallest
# value ends up at 0 after scaling; the `replace(0, NaN)` and `> 0` checks
# further down treat that value as "absent" — confirm this is intended.
# 1. First column: subtract 100, then min-max scale to [0, 20]
data.iloc[:, 0] = data.iloc[:, 0] - 100
data.iloc[:, 0] = 20*(data.iloc[:, 0] - data.iloc[:, 0].min()) / (data.iloc[:, 0].max() - data.iloc[:, 0].min())

# 2. Second column: subtract 200, then min-max scale to [0, 20]
data.iloc[:, 1] = data.iloc[:, 1] - 200
data.iloc[:, 1] = 20*(data.iloc[:, 1] - data.iloc[:, 1].min()) / (data.iloc[:, 1].max() - data.iloc[:, 1].min())

# 3. Third column: subtract 300, then min-max scale to [0, 20]
data.iloc[:, 2] = data.iloc[:, 2] - 300
data.iloc[:, 2] = 20*(data.iloc[:, 2] - data.iloc[:, 2].min()) / (data.iloc[:, 2].max() - data.iloc[:, 2].min())

# 4. Fourth column: subtract 400, then min-max scale to [0, 20]
data.iloc[:, 3] = data.iloc[:, 3] - 400
data.iloc[:, 3] = 20*(data.iloc[:, 3] - data.iloc[:, 3].min()) / (data.iloc[:, 3].max() - data.iloc[:, 3].min())

# 5. Fifth column: subtract 500 and halve, then min-max scale to [0, 20]
data.iloc[:, 4] = (data.iloc[:, 4] - 500) / 2
data.iloc[:, 4] = 20*(data.iloc[:, 4] - data.iloc[:, 4].min()) / (data.iloc[:, 4].max() - data.iloc[:, 4].min())

# Identify feature and target columns (targets are the LAST three columns)
category_columns = [col for col in data.columns if col.startswith('类别')]
method_column = '制备方法_编码'
base_material_column = '基底材料_编码'
target_columns = data.columns[-3:].tolist()
feature_columns = category_columns + [method_column, base_material_column]
# Snapshot of the normalised data, taken before the per-model variants are built
data_original = data.copy()

# ---------------------- Build one preprocessed dataset per model family ----------------------

# 1. Linear-model data: scale the material-category columns down.
# NOTE(review): the code divides by 100.0, while the log message below
# says 1000 — confirm which factor is intended.
data_linear = data.copy()
for col in category_columns:
    data_linear[col] = data_linear[col] / 100.0
print("\n线性模型数据预处理 - 材料类别列除以1000")
print(f"处理前范围: {data[category_columns].max().max():.1f}")
print(f"处理后范围: {data_linear[category_columns].max().max():.4f}")

# 2. Tree-model (XGBoost/LightGBM) data: replace 0 with NaN
data_tree = data.copy()
for col in category_columns:
    data_tree[col] = data_tree[col].replace(0, np.nan)
# Share of missing values across the category columns, in percent
nan_percent = data_tree[category_columns].isna().mean().mean() * 100
print(f"\n树模型数据预处理 - 将0替换为NaN")
print(f"缺失值百分比: {nan_percent:.1f}%")

# 3. Neural-network data: add presence flags
data_nn = data.copy()
# One flag column per category feature
for col in category_columns:
    # Flag is 1 when the value is greater than 0, otherwise 0
    flag_col = f"{col}_存在标志"
    data_nn[flag_col] = (data_nn[col] > 0).astype(int)

# Neural-network feature columns (base features plus the presence flags)
nn_feature_columns = feature_columns + [f"{col}_存在标志" for col in category_columns]

print(f"\n神经网络数据预处理 - 添加材料存在标志")
print(f"原始特征数: {len(feature_columns)}")
print(f"增加存在标志后特征数: {len(nn_feature_columns)}")

# Show the first row of each variant for a quick visual check
print("\n数据预处理示例（第一行）:")
print("原始数据:")
print(data[category_columns].iloc[0])
print("\n线性模型数据:")
print(data_linear[category_columns].iloc[0])
print("\n树模型数据:")
print(data_tree[category_columns].iloc[0])
print("\n神经网络数据（包含存在标志）:")
print(data_nn[[col for col in data_nn.columns if col.startswith('类别')]].iloc[0])

# Model-name groups, one list per model family
linear_models = ["LinearRegression", "Ridge", "Lasso", "ElasticNet", "HuberRegressor"]
tree_models_nan_support = ["XGBoost", "LightGBM", "HistGradientBoosting"]  # tree models that accept NaN
tree_models_no_nan = ["RandomForest", "GradientBoosting"]  # tree models that do not accept NaN
advanced_models = ["SVR", "GaussianProcess"]  # advanced models
ensemble_models = ["VotingEnsemble", "Stacking", "Bagging"]  # ensemble models
nn_models = ["DeepNN"]  # neural-network models

from sklearn.model_selection import train_test_split, KFold

# 数据分割函数
def create_bins(y, n_bins=5, random_state=None):
    """Bin a continuous variable into discrete labels for stratified sampling.

    Quantile bins (``pd.qcut``) are tried first, then equal-width bins
    (``pd.cut``). If neither works, random integer labels are returned so the
    caller still receives something of the right length.

    Args:
        y: 1-D array-like of values to bin.
        n_bins: Requested number of bins. Integer values below 1 skip binning
            and go straight to the random fallback.
        random_state: Optional seed for the random fallback. ``None`` (the
            default) keeps the previous behaviour of drawing from NumPy's
            global random state.

    Returns:
        The result of ``pd.qcut``/``pd.cut`` on ``y``, or a ``pd.Series`` of
        random integers in ``[0, min(5, len(y)//2 + 1))`` with ``len(y)`` rows.
    """
    # A non-positive bin count can never produce usable strata.
    skip_binning = isinstance(n_bins, (int, np.integer)) and n_bins < 1

    if not skip_binning:
        attempts = (
            (pd.qcut, {'q': n_bins, 'duplicates': 'drop'}),
            (pd.cut, {'bins': n_bins}),
        )
        for binner, options in attempts:
            try:
                return binner(y, **options)
            except Exception:
                # Catch Exception rather than a bare except so that
                # KeyboardInterrupt / SystemExit still propagate.
                continue

    print(f"为 {len(y)} 个样本创建分层变量失败，使用随机分层")
    n_labels = min(5, len(y)//2+1)
    if random_state is None:
        labels = np.random.randint(0, n_labels, size=len(y))
    else:
        labels = np.random.RandomState(random_state).randint(0, n_labels, size=len(y))
    return pd.Series(labels)

# Containers for the train/test splits, keyed by target name.
X_train = {}
X_test = {}
X_train_linear = {}
X_test_linear = {}
X_train_tree = {}
X_test_tree = {}
X_train_nn = {}
X_test_nn = {}
# NaN-free copies of the tree data, for tree models that do not accept NaN.
X_train_tree_filled = {}
X_test_tree_filled = {}
y_train = {}
y_test = {}

# Chronological split: rows before this year train, the rest test.
year_split_threshold = 2024
year_column = '年份'  # must match the name of the year column in `data`

# Fall back to a random split when the year column is absent.
use_year_split = year_column in data.columns
if use_year_split:
    print(f"使用年份分割: {year_split_threshold}年之前为训练集，{year_split_threshold}年及以后为测试集")
else:
    print(f"警告: 找不到年份列 '{year_column}'。如果年份列有其他名称，请相应调整代码。")
    print("将退回到随机分割数据。")

# Cross-validation is still configured for code paths that do not rely on the
# year split. KFold rejects n_splits < 2, which `int(len(data) * 0.1)` yields
# for fewer than 20 rows, so the fold count is clamped to at least 2.
cv_folds = max(2, min(5, int(len(data) * 0.1)))
print(f"交叉验证设置: {cv_folds} 折")
kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

# Split every dataset variant into train/test, one target at a time.
for target in target_columns:
    print(f"\n为目标 {target} 准备训练数据...")
    # Rows whose value for this target is missing are excluded.
    valid_mask = data[target].notna()

    # Feature matrices of each preprocessing variant, restricted to valid rows.
    X_orig = data_original.loc[valid_mask, feature_columns]
    y_orig = data_original.loc[valid_mask, target]
    X_lin = data_linear.loc[valid_mask, feature_columns]
    X_tree_data = data_tree.loc[valid_mask, feature_columns]
    X_nn_data = data_nn.loc[valid_mask, nn_feature_columns]
    # Zero-filled twin of the tree data for tree models that reject NaN.
    X_tree_filled_data = X_tree_data.fillna(0)

    # A chronological split is used only if both sides end up non-empty.
    split_by_year = False
    if use_year_split:
        years = data_original.loc[valid_mask, year_column]
        train_mask = years < year_split_threshold
        test_mask = ~train_mask
        n_train_rows = train_mask.sum()
        n_test_rows = test_mask.sum()

        if n_train_rows == 0:
            print(f"警告: 没有{year_split_threshold}年之前的数据。使用随机分割。")
        elif n_test_rows == 0:
            print(f"警告: 没有{year_split_threshold}年及之后的数据。使用随机分割。")
        else:
            split_by_year = True

    if split_by_year:
        # Boolean masks share the index of every variant, so they apply directly.
        X_train_orig, X_test_orig = X_orig[train_mask], X_orig[test_mask]
        y_train_orig, y_test_orig = y_orig[train_mask], y_orig[test_mask]
        print(f"使用年份分割: 训练集({n_train_rows}样本, <{year_split_threshold}年), 测试集({n_test_rows}样本, ≥{year_split_threshold}年)")

        X_train_lin, X_test_lin = X_lin[train_mask], X_lin[test_mask]
        X_train_tree_data, X_test_tree_data = X_tree_data[train_mask], X_tree_data[test_mask]
        X_train_tree_filled_data = X_tree_filled_data[train_mask]
        X_test_tree_filled_data = X_tree_filled_data[test_mask]
        X_train_nn_data, X_test_nn_data = X_nn_data[train_mask], X_nn_data[test_mask]
    else:
        # Random 80/20 split, stratified on a binned version of the target.
        strat_var = create_bins(y_orig, n_bins=min(3, len(y_orig)//5))
        X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
            X_orig, y_orig, test_size=0.2, random_state=42, stratify=strat_var
        )
        if not use_year_split:
            print("使用随机分层抽样分割数据")

        # Re-use the row labels chosen above so all variants stay aligned.
        train_index = X_train_orig.index
        test_index = X_test_orig.index
        X_train_lin, X_test_lin = X_lin.loc[train_index], X_lin.loc[test_index]
        X_train_tree_data, X_test_tree_data = X_tree_data.loc[train_index], X_tree_data.loc[test_index]
        X_train_tree_filled_data = X_tree_filled_data.loc[train_index]
        X_test_tree_filled_data = X_tree_filled_data.loc[test_index]
        X_train_nn_data, X_test_nn_data = X_nn_data.loc[train_index], X_nn_data.loc[test_index]

    # Store each piece under the current target.
    for store, piece in (
        (X_train, X_train_orig),
        (X_test, X_test_orig),
        (X_train_linear, X_train_lin),
        (X_test_linear, X_test_lin),
        (X_train_tree, X_train_tree_data),
        (X_test_tree, X_test_tree_data),
        (X_train_tree_filled, X_train_tree_filled_data),
        (X_test_tree_filled, X_test_tree_filled_data),
        (X_train_nn, X_train_nn_data),
        (X_test_nn, X_test_nn_data),
        (y_train, y_train_orig),
        (y_test, y_test_orig),
    ):
        store[target] = piece

    print(f"训练集: {len(y_train[target])} 样本, 测试集: {len(y_test[target])} 样本")
    print(f"训练集目标均值: {y_train[target].mean():.4f}, 测试集目标均值: {y_test[target].mean():.4f}")

# Registries for fitted preprocessors.
imputers = dict()
scalers = dict()

# Registry of trained models: one (initially empty) dict per target.
models = {target_name: {} for target_name in target_columns}

In [None]:
# ====================== Persist the preprocessed data ======================
banner = "=" * 50
print(banner)
print("保存预处理数据")
print(banner)

import os
import pickle

# Make sure the export folder exists.
export_folder = 'data_exports'
os.makedirs(export_folder, exist_ok=True)
print(f"创建/检查数据导出文件夹: {export_folder}")

print("\n保存数据分割结果...")

# Full-frame versions of each preprocessing variant, bundled into one file.
preprocessing_data = {
    'data_linear': data_linear,
    'data_tree': data_tree,
    'data_nn': data_nn
}

# (file name, object) pairs, written in this exact order.
export_items = [
    # raw train/test features
    ('X_train.pkl', X_train),
    ('X_test.pkl', X_test),
    # linear-model features
    ('X_train_linear.pkl', X_train_linear),
    ('X_test_linear.pkl', X_test_linear),
    # tree-model features (NaN kept)
    ('X_train_tree.pkl', X_train_tree),
    ('X_test_tree.pkl', X_test_tree),
    # tree-model features (NaN filled, for RandomForest etc.)
    ('X_train_tree_filled.pkl', X_train_tree_filled),
    ('X_test_tree_filled.pkl', X_test_tree_filled),
    # neural-network features
    ('X_train_nn.pkl', X_train_nn),
    ('X_test_nn.pkl', X_test_nn),
    # targets
    ('y_train.pkl', y_train),
    ('y_test.pkl', y_test),
    # column lists and settings
    ('target_columns.pkl', target_columns),
    ('feature_columns.pkl', feature_columns),
    ('nn_feature_columns.pkl', nn_feature_columns),
    ('target_tolerance.pkl', target_tolerance),
    # full frames
    ('data_original.pkl', data_original),
    ('preprocessing_data.pkl', preprocessing_data),
]
for file_name, payload in export_items:
    with open(os.path.join(export_folder, file_name), 'wb') as f:
        pickle.dump(payload, f)
    print(f"✓ 已保存 {file_name}")

# Summary of what was written.
print(f"\n数据保存完成! 保存位置: {export_folder}/")
print("\n数据统计信息:")
print(f"目标变量数量: {len(target_columns)}")
print(f"目标变量: {target_columns}")

for target in target_columns:
    summary_lines = [
        f"\n{target}:",
        f"  训练集样本数: {len(y_train[target])}",
        f"  测试集样本数: {len(y_test[target])}",
        f"  特征数量 (原始): {len(X_train[target].columns)}",
        f"  特征数量 (线性): {len(X_train_linear[target].columns)}",
        f"  特征数量 (树模型): {len(X_train_tree[target].columns)}",
        f"  特征数量 (神经网络): {len(X_train_nn[target].columns)}",
        f"  训练集目标均值: {y_train[target].mean():.4f}",
        f"  测试集目标均值: {y_test[target].mean():.4f}",
    ]
    for line in summary_lines:
        print(line)

# List everything now present in the export folder, with sizes in KB.
print("\n保存的文件列表:")
saved_files = os.listdir(export_folder)
for file in sorted(saved_files):
    file_path = os.path.join(export_folder, file)
    file_size = os.path.getsize(file_path) / 1024  # KB
    print(f"  {file} ({file_size:.1f} KB)")

print(f"\n总共保存了 {len(saved_files)} 个文件")



In [None]:
# 水接触角XGBoost贝叶斯超参数优化
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt.plots import plot_convergence
import warnings
# Suppress all warnings from this point on.
warnings.filterwarnings('ignore')

# Target handled by this cell.
target = '水接触角'

# Folder that receives the CSV exports used for visualisation.
save_folder = '模型可视化数据'
if not os.path.exists(save_folder):
    os.makedirs(save_folder)
    print(f"已创建文件夹：{save_folder}")

print(f"训练 {target} 的XGBoost模型...")

# Per-target tolerance, defaulting to 0.03 when the target has no entry.
current_tolerance = target_tolerance.get(target, 0.03)

# XGBoost gets the tree variant of the data (NaN kept as missing).
X_train_model, X_test_model = X_train_tree[target], X_test_tree[target]

# Full function definitions copied over from the earlier code.
def tolerance_r2_score(y_true, y_pred, tolerance=0.15, target=None):
    """Return an R²-style score that forgives errors inside a relative band.

    Each residual is reduced by ``tolerance * |y_true|`` (floored at zero)
    before the residual sum of squares is formed, so predictions inside the
    band contribute no error.

    Args:
        y_true: Observed values (flattened to 1-D).
        y_pred: Predicted values (flattened to 1-D).
        tolerance: Relative tolerance, used unless overridden via ``target``.
        target: Optional target name; when it has an entry in the module-level
            ``target_tolerance`` mapping, that entry replaces ``tolerance``.

    Returns:
        ``1 - rss/tss``, or ``0`` when ``y_true`` has zero variance.
    """
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    # A per-target setting takes precedence over the argument.
    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    # Only the part of each absolute error that exceeds the band counts.
    band = tolerance * np.abs(actual)
    excess_error = np.maximum(0, np.abs(actual - predicted) - band)

    total_ss = np.sum((actual - np.mean(actual)) ** 2)
    if total_ss == 0:
        return 0

    residual_ss = np.sum(excess_error ** 2)
    return 1 - (residual_ss / total_ss)

def prediction_within_tolerance(y_true, y_pred, tolerance=0.15, target=None):
    """Return the share of predictions within ``±tolerance * |y_true|``.

    Both inputs are flattened to 1-D first, matching ``tolerance_r2_score``.
    Without this, a column-vector prediction of shape (n, 1) would broadcast
    against a 1-D target into an (n, n) comparison and give a wrong ratio.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, same number of elements as ``y_true``.
        tolerance: Relative tolerance, used unless overridden via ``target``.
        target: Optional target name; when it has an entry in the module-level
            ``target_tolerance`` mapping, that entry replaces ``tolerance``.

    Returns:
        Fraction (0..1) of samples whose absolute error is inside the band.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # A per-target setting takes precedence over the argument.
    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    tolerance_values = tolerance * np.abs(y_true)
    within_tolerance = np.abs(y_true - y_pred) <= tolerance_values

    return np.mean(within_tolerance)

def make_tolerance_scorer(target_name):
    """Build a metric giving the share of predictions inside the tolerance band.

    The returned callable has the ``(y_true, y_pred) -> float`` shape expected
    by ``sklearn.metrics.make_scorer``. The tolerance is looked up in the
    module-level ``target_tolerance`` mapping at call time, defaulting to 0.03.

    The check is written as ``|y_true - y_pred| <= tolerance * |y_true|``
    instead of dividing by ``|y_true|``. This is the same criterion as
    ``prediction_within_tolerance`` and avoids the inf/NaN that the division
    produced for zero-valued targets (an exact prediction of 0 now counts as
    within tolerance).

    Args:
        target_name: Key used to look up the tolerance.

    Returns:
        The metric function.
    """
    def tolerance_score(y_true, y_pred):
        tolerance = target_tolerance.get(target_name, 0.03)
        actual = np.asarray(y_true).ravel()
        predicted = np.asarray(y_pred).ravel()
        within_tolerance = np.abs(actual - predicted) <= tolerance * np.abs(actual)
        return np.mean(within_tolerance)
    return tolerance_score

def evaluate_model(model, X_train, y_train, X_test, y_test, target, model_name):
    """Score a fitted model on the train and test splits, print and plot.

    For each split this computes standard R², the tolerance-adjusted R² and
    the share of predictions inside the tolerance band, prints a summary and
    draws a predicted-vs-actual scatter plot per split.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train, y_train: Training features and targets.
        X_test, y_test: Test features and targets.
        target: Target name; used for the tolerance lookup and in the report.
        model_name: Label used in the printed report.

    Returns:
        Dict with keys ``train_r2``, ``train_tol_r2``, ``train_within_tol``,
        ``test_r2``, ``test_tol_r2`` and ``test_within_tol``.
    """
    current_tolerance = target_tolerance.get(target, 0.15)

    def _score_split(features, actual):
        # Predict, squeeze an (n, 1) output to 1-D, then compute the metrics.
        predicted = model.predict(features)
        if len(predicted.shape) > 1 and predicted.shape[1] == 1:
            predicted = predicted.flatten()
        scores = (
            r2_score(actual, predicted),
            tolerance_r2_score(actual, predicted, tolerance=current_tolerance, target=target),
            prediction_within_tolerance(actual, predicted, tolerance=current_tolerance, target=target),
        )
        return predicted, scores

    y_train_pred, (train_r2, train_tol_r2, train_within_tol) = _score_split(X_train, y_train)
    y_test_pred, (test_r2, test_tol_r2, test_within_tol) = _score_split(X_test, y_test)

    print(f"\n{model_name} 在 {target} 上的评估结果:")
    for split_label, r2_value, tol_r2_value, within_value in (
        ('训练集', train_r2, train_tol_r2, train_within_tol),
        ('测试集', test_r2, test_tol_r2, test_within_tol),
    ):
        print(f"{split_label}: R²={r2_value:.4f}, 容忍度R²={tol_r2_value:.4f}, 在容忍范围内比例={within_value:.2%}")

    # Predicted-vs-actual scatter plots, train on the left and test on the right.
    plt.figure(figsize=(12, 5))
    panels = (
        (1, y_train, y_train_pred, f'训练集: R²={train_r2:.4f}'),
        (2, y_test, y_test_pred, f'测试集: R²={test_r2:.4f}'),
    )
    for position, actual, predicted, panel_title in panels:
        plt.subplot(1, 2, position)
        plt.scatter(actual, predicted, alpha=0.6, s=30)
        # Diagonal reference line spanning the joint value range.
        low = min(min(actual), min(predicted))
        high = max(max(actual), max(predicted))
        plt.plot([low, high], [low, high], 'r--', linewidth=2)
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(panel_title)
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return {
        'train_r2': train_r2,
        'train_tol_r2': train_tol_r2,
        'train_within_tol': train_within_tol,
        'test_r2': test_r2,
        'test_tol_r2': test_tol_r2,
        'test_within_tol': test_within_tol
    }

# Scorer for the current target: share of predictions inside the tolerance band.
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Cross-validation scheme shared by the optimisation and the final checks.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Fixed XGBoost settings that are not tuned.
base_params = dict(
    objective='reg:squarederror',
    tree_method='hist',
    n_jobs=-1,
    random_state=42,
    missing=np.nan,
)

# Bayesian-optimisation search space. Each range covers the discrete values
# of the earlier grid, which are quoted in the trailing comments.
dimensions = [
    Integer(50, 100, name='n_estimators'),       # earlier grid: [100, 50, 90, 80]
    Real(0.5, 0.8, name='learning_rate'),        # earlier grid: [0.7, 0.8, 0.6, 0.5]
    Integer(3, 6, name='max_depth'),             # earlier grid: [4, 5, 6, 3]
    Integer(2, 6, name='min_child_weight'),      # earlier grid: [5, 4, 6, 3, 2]
    Real(0.0, 0.2, name='gamma'),                # earlier grid: [0, 0.1, 0.2]
    Real(0.5, 0.6, name='subsample'),            # earlier grid: [0.6, 0.5]
    Real(0.8, 1.0, name='colsample_bytree'),     # earlier grid: [0.9, 1.0, 0.8]
    Real(0.0, 1.0, name='reg_alpha'),            # earlier grid: [0, 0.4, 0.5, 0.6, 1.0]
    Real(0.5, 1.0, name='reg_lambda'),           # earlier grid: [1.0, 0.5, 0.7, 0.8]
]

# Objective minimised by gp_minimize.
@use_named_args(dimensions=dimensions)
def objective(**params):
    """Return the negated mean CV tolerance score for one hyperparameter set.

    ``params`` holds one value per entry of ``dimensions``; the fixed
    ``base_params`` are merged in. The score is negated because
    ``gp_minimize`` minimises.

    Returns:
        ``-mean(cv_scores)``, or the penalty ``1.0`` (worse than any real
        score, which lies in ``[-1, 0]``) when evaluation raises or the mean
        is not finite.
    """
    model = XGBRegressor(**params, **base_params)

    try:
        cv_scores = cross_val_score(
            model, X_train_model, y_train[target],
            cv=kf, scoring=tol_scorer_wrapped
        )
    except Exception as exc:
        # Report the failure instead of hiding it behind a bare except;
        # KeyboardInterrupt / SystemExit now propagate.
        print(f"目标函数评估失败，返回惩罚值 1.0: {exc!r}")
        return 1.0

    mean_score = cv_scores.mean()
    # With sklearn's default error_score=nan a failed fold yields NaN rather
    # than an exception; never hand a non-finite value to the optimizer.
    if not np.isfinite(mean_score):
        print("目标函数得到非有限的CV得分，返回惩罚值 1.0")
        return 1.0
    return -mean_score

# Run the Bayesian optimisation.
print("执行贝叶斯优化...")
result = gp_minimize(
    func=objective,
    dimensions=dimensions,
    n_calls=50,
    n_initial_points=10,
    random_state=42,
    verbose=True
)

# Map the best point back to parameter names (same order as `dimensions`).
param_names = [dim.name for dim in dimensions]
best_params = dict(zip(param_names, result.x))
print(f"最佳参数: {best_params}")
print(f"最佳CV得分: {-result.fun:.4f}")

# Final model: tuned parameters plus the fixed base settings.
xgb_model = XGBRegressor(**best_params, **base_params)
xgb_model.fit(X_train_model, y_train[target])

# Cross-validated R² on the training split.
cv_scores = cross_val_score(xgb_model, X_train_model, y_train[target], cv=kf, scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated tolerance score.
# NOTE(review): the printed label says "tolerance R²", but the scorer built by
# make_tolerance_scorer is the share of predictions within tolerance.
tolerance_cv_scores = cross_val_score(
    xgb_model, X_train_model, y_train[target], cv=kf,
    scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Register the model in memory.
models[target]['XGBoost'] = xgb_model

# Persist the model to disk.
model_folder = '训练模型文件'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
    print(f"已创建模型文件夹：{model_folder}")

xgb_model_file = os.path.join(model_folder, f'{target}_XGBoost模型.pkl')
with open(xgb_model_file, 'wb') as f:
    pickle.dump(xgb_model, f)
print(f"XGBoost 模型已保存至 {xgb_model_file}")

# Train/test evaluation (prints metrics and draws the scatter plots).
results = evaluate_model(xgb_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "XGBoost")

# Predictions for both splits.
y_pred_train = xgb_model.predict(X_train_model)
y_pred_test = xgb_model.predict(X_test_model)

# One table per split: actual value, prediction, absolute error, split label.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
}).assign(**{'数据集': '训练集'})

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
}).assign(**{'数据集': '测试集'})

# Write each split to its own CSV file.
train_file = os.path.join(save_folder, f'{target}_XGBoost训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_XGBoost测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")

# Feature importances, largest first.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title(f'{target} - XGBoost特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

# Save the feature-importance table.
feature_importance_file = os.path.join(save_folder, f'{target}_XGBoost特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")

# Convergence plot of the optimisation.
plt.figure(figsize=(10, 6))
plot_convergence(result)
plt.title('贝叶斯优化收敛过程')
plt.show()

# Optimisation history: objective value per call and the running best so far.
running_best = []
for objective_value in result.func_vals:
    if running_best:
        running_best.append(min(running_best[-1], objective_value))
    else:
        running_best.append(objective_value)

optimization_history = pd.DataFrame({
    '迭代次数': range(1, len(result.func_vals) + 1),
    '目标函数值': result.func_vals,
    '最佳目标函数值': running_best
})

# One extra column per tuned parameter holding the value tried at each call.
for position, param_name in enumerate(param_names):
    optimization_history[f'参数_{param_name}'] = [point[position] for point in result.x_iters]

optimization_history_file = os.path.join(save_folder, f'{target}_贝叶斯优化历史.csv')
optimization_history.to_csv(optimization_history_file, index=False)
print(f"优化历史数据已保存至 {optimization_history_file}")

print("\n贝叶斯优化完成！")

In [None]:
#水接触角xgboost
from xgboost import XGBRegressor
import os

# Target handled by this cell.
target = '水接触角'

# Folder that receives the CSV exports used for visualisation.
save_folder = '模型可视化数据'
if not os.path.exists(save_folder):
    os.makedirs(save_folder)
    print(f"已创建文件夹：{save_folder}")

print(f"训练 {target} 的XGBoost模型...")

# Per-target tolerance, defaulting to 0.03 when the target has no entry.
current_tolerance = target_tolerance.get(target, 0.03)

# XGBoost gets the tree variant of the data (NaN kept as missing).
X_train_model, X_test_model = X_train_tree[target], X_test_tree[target]

# Scorer for the current target.
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Fixed XGBoost settings that are not tuned.
base_params = dict(
    objective='reg:squarederror',
    tree_method='hist',
    n_jobs=-1,
    random_state=42,
    missing=np.nan,
)

# Candidate values sampled by the randomised search.
param_dist = dict(
    n_estimators=[100, 50, 90, 80],
    learning_rate=[0.7, 0.8, 0.6, 0.5],
    max_depth=[4, 5, 6, 3],
    min_child_weight=[5, 4, 6, 3, 2],
    gamma=[0, 0.1, 0.2],
    subsample=[0.6, 0.5],
    colsample_bytree=[0.9, 1.0, 0.8],
    reg_alpha=[0, 0.4, 0.5, 0.6, 1.0],
    reg_lambda=[1.0, 0.5, 0.7, 0.8],
)

# Template estimator handed to the search; the search overrides the tuned fields.
base_model_defaults = dict(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1.0,
)
base_model = XGBRegressor(**base_model_defaults, **base_params)

# Randomised hyperparameter search scored with the tolerance scorer.
print("执行超参数优化...")
search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=1000,
    cv=kf,
    scoring=tol_scorer_wrapped,
    n_jobs=-1,
    random_state=42,
    verbose=1
)
search.fit(X_train_model, y_train[target])
xgb_model = search.best_estimator_
print(f"最佳参数: {search.best_params_}")
print(f"最佳CV得分: {search.best_score_:.4f}")

# Cross-validated R² of the selected estimator on the training split.
cv_scores = cross_val_score(xgb_model, X_train_model, y_train[target], cv=kf, scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated tolerance score.
# NOTE(review): the printed label says "tolerance R²", but the scorer built by
# make_tolerance_scorer is the share of predictions within tolerance.
tolerance_cv_scores = cross_val_score(
    xgb_model, X_train_model, y_train[target], cv=kf,
    scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Register the model in memory.
models[target]['XGBoost'] = xgb_model

# Persist the model to disk.
model_folder = '训练模型文件'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
    print(f"已创建模型文件夹：{model_folder}")

xgb_model_file = os.path.join(model_folder, f'{target}_XGBoost模型.pkl')
with open(xgb_model_file, 'wb') as f:
    pickle.dump(xgb_model, f)
print(f"XGBoost 模型已保存至 {xgb_model_file}")

# Train/test evaluation (prints metrics and draws the scatter plots).
results = evaluate_model(xgb_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "XGBoost")

# Predictions for both splits.
y_pred_train = xgb_model.predict(X_train_model)
y_pred_test = xgb_model.predict(X_test_model)

# One table per split: actual value, prediction, absolute error, split label.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
}).assign(**{'数据集': '训练集'})

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
}).assign(**{'数据集': '测试集'})

# Write each split to its own CSV file.
train_file = os.path.join(save_folder, f'{target}_XGBoost训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_XGBoost测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")

# Feature importances, largest first.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title(f'{target} - XGBoost特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

# Save the feature-importance table.
feature_importance_file = os.path.join(save_folder, f'{target}_XGBoost特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")

# Loss curve over boosting rounds for the tuned model.
# The refit previously copied `base_model.get_params()`, i.e. the untuned
# search template, so the saved "training progress" did not describe the model
# that was selected and pickled. It now copies the tuned `xgb_model` parameters.
eval_set = [(X_train_model, y_train[target]), (X_test_model, y_test[target])]
model_train = XGBRegressor(**{**xgb_model.get_params(), 'eval_metric': 'rmse'})
model_train.fit(X_train_model, y_train[target], eval_set=eval_set, verbose=False)

# validation_0 / validation_1 follow the order of `eval_set`: train, then test.
results = model_train.evals_result()
train_rmse_curve = results['validation_0']['rmse']
test_rmse_curve = results['validation_1']['rmse']
epochs = len(train_rmse_curve)
x_axis = range(0, epochs)

plt.figure(figsize=(10, 6))
plt.plot(x_axis, train_rmse_curve, label='训练集')
plt.plot(x_axis, test_rmse_curve, label='测试集')
plt.legend()
plt.ylabel('RMSE')
plt.xlabel('迭代次数')
plt.title('XGBoost训练进度')
plt.grid(True)
plt.show()

# Save the per-round RMSE values.
training_progress_data = pd.DataFrame({
    '迭代次数': x_axis,
    '训练集RMSE': train_rmse_curve,
    '测试集RMSE': test_rmse_curve
})
training_progress_file = os.path.join(save_folder, f'{target}_XGBoost训练进度.csv')
training_progress_data.to_csv(training_progress_file, index=False)
print(f"训练进度数据已保存至 {training_progress_file}")
# Learning-rate sensitivity: refit a fixed small model at several learning
# rates and track RMSE on the test split after every boosting round.
# NOTE(review): the test split is the only eval set here, so these curves
# should not be used to pick hyperparameters.
learning_rates = [0.005, 0.01, 0.03, 0.05, 0.1, 0.2]
plt.figure(figsize=(10, 6))

# Wide table: one row per boosting round, one column per learning rate.
lr_analysis_data = pd.DataFrame()

for lr in learning_rates:
    model = XGBRegressor(
        learning_rate=lr,
        n_estimators=500,
        max_depth=3,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='reg:squarederror',
        tree_method='hist',
        random_state=42
    )
    eval_set = [(X_test_model, y_test[target])]
    model.fit(X_train_model, y_train[target], eval_set=eval_set, verbose=False)
    results = model.evals_result()
    # Only one eval set was passed, so validation_0 is the test split.
    test_rmse_curve = results['validation_0']['rmse']

    # Add this learning rate's curve to the wide table.
    temp_df = pd.DataFrame({
        '迭代次数': range(len(test_rmse_curve)),
        f'学习率_{lr}': test_rmse_curve
    })

    if lr_analysis_data.empty:
        lr_analysis_data = temp_df
    else:
        lr_analysis_data = pd.merge(
            lr_analysis_data, temp_df, on='迭代次数', how='outer'
        )

    plt.plot(test_rmse_curve, label=f'学习率: {lr}')

# Finalise the figure. Previously the curves were drawn with labels but no
# legend, axis labels or plt.show(), so they could not be told apart and were
# never displayed outside an inline notebook backend.
plt.xlabel('迭代次数')
plt.ylabel('测试集RMSE')
plt.title(f'{target} - XGBoost学习率影响分析')
plt.legend()
plt.grid(True)
plt.show()

# Save the learning-rate analysis table.
lr_analysis_file = os.path.join(save_folder, f'{target}_XGBoost学习率分析.csv')
lr_analysis_data.to_csv(lr_analysis_file, index=False)
print(f"学习率分析数据已保存至 {lr_analysis_file}")


In [None]:
# lightGBM with Hyperparameter Optimization
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np

print(f"训练 {target} 的LightGBM模型...")

# Per-target tolerance, defaulting to 0.03 when the target has no entry.
current_tolerance = target_tolerance.get(target, 0.03)

# LightGBM gets the tree variant of the data (NaN kept as missing).
X_train_model, X_test_model = X_train_tree[target], X_test_tree[target]

# Fixed LightGBM settings that are not tuned.
base_params = dict(
    objective='regression',
    boosting_type='gbdt',
    metric='rmse',
    verbose=-1,
    n_jobs=-1,
    random_state=42,
)

# Current best hand-picked configuration, used as the baseline reference.
current_best_params = dict(
    n_estimators=8000,
    learning_rate=0.001,
    num_leaves=20,
    max_depth=11,
    min_child_samples=1,
    subsample=1,
    colsample_bytree=1,
    reg_alpha=10,
    reg_lambda=1.0,
)

# Hand-rolled cross-validation returning the mean R².
def manual_cross_validation_r2(params, X_data, y_data, cv_folds=5):
    """Return the mean R² of ``CustomLGBMRegressor(**params)`` over K folds.

    Folds come from ``KFold(n_splits=cv_folds, shuffle=True, random_state=42)``.
    A fresh model is fitted on each training part and scored on the held-out
    part.

    ``X_data`` and ``y_data`` are now sliced independently. Previously a single
    ``hasattr(X_data, 'iloc')`` check decided the indexing for both, which
    failed (or selected by label instead of position) when one was a pandas
    object and the other a NumPy array.

    Args:
        params: Keyword arguments for ``CustomLGBMRegressor``.
        X_data: Feature matrix (pandas DataFrame or array-like).
        y_data: Target vector (pandas Series or array-like).
        cv_folds: Number of folds.

    Returns:
        Mean of the per-fold R² scores.
    """
    from sklearn.model_selection import KFold
    from sklearn.metrics import r2_score

    def _take_rows(container, positions):
        # Positional row selection for pandas objects and arrays alike.
        if hasattr(container, 'iloc'):
            return container.iloc[positions]
        return np.asarray(container)[positions]

    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    cv_r2_scores = []

    for train_idx, val_idx in kf.split(X_data):
        # Split into training and validation parts for this fold.
        X_train_fold = _take_rows(X_data, train_idx)
        X_val_fold = _take_rows(X_data, val_idx)
        y_train_fold = _take_rows(y_data, train_idx)
        y_val_fold = _take_rows(y_data, val_idx)

        # Fit a fresh model on this fold's training part.
        fold_model = CustomLGBMRegressor(**params)
        fold_model.fit(X_train_fold, y_train_fold)

        # Score on the held-out part.
        y_pred = fold_model.predict(X_val_fold)
        cv_r2_scores.append(r2_score(y_val_fold, y_pred))

    return np.mean(cv_r2_scores)

# Measure the baseline first: cross-validated R² of the current configuration.
print("评估当前参数的基准性能...")
baseline_params = dict(current_best_params)
baseline_params.update(base_params)  # base settings win on any shared key
baseline_r2 = manual_cross_validation_r2(
    baseline_params,
    X_train_model,
    y_train[target],
)
print(f"当前参数的交叉验证R²: {baseline_r2:.6f}")

# 定义超参数优化的目标函数（最大化R²分数）
def objective(trial):
    """Optuna objective: cross-validated R² for one sampled LightGBM configuration.

    The search space is a narrow window around the current best parameters.
    Parameters are sampled one at a time, in a fixed order, then the shared
    ``base_params`` are layered on top before scoring.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The mean cross-validated R² on the training data (higher is better).
    """
    candidate = {}

    # Tree count: small window around 8000, in steps of 250.
    candidate['n_estimators'] = trial.suggest_int('n_estimators', 7000, 9000, step=250)
    # Learning rate: fine window around 0.001.
    candidate['learning_rate'] = trial.suggest_float('learning_rate', 0.0008, 0.0015)
    # Leaves per tree: small window around 20.
    candidate['num_leaves'] = trial.suggest_int('num_leaves', 18, 25)
    # Maximum depth: small window around 11.
    candidate['max_depth'] = trial.suggest_int('max_depth', 10, 13)
    # Minimum samples per leaf: close to 1.
    candidate['min_child_samples'] = trial.suggest_int('min_child_samples', 1, 3)
    # Row subsampling: slight variation below 1.0.
    candidate['subsample'] = trial.suggest_float('subsample', 0.95, 1.0)
    # Column subsampling: slight variation below 1.0.
    candidate['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0.95, 1.0)
    # L1 regularization: fine window around 10.
    candidate['reg_alpha'] = trial.suggest_float('reg_alpha', 8.0, 12.0)
    # L2 regularization: fine window around 1.0.
    candidate['reg_lambda'] = trial.suggest_float('reg_lambda', 0.8, 1.5)

    # Shared fixed settings go last so they take precedence on any overlap.
    candidate.update(base_params)

    # The study is created with direction='maximize', so the raw R² is returned.
    return manual_cross_validation_r2(candidate, X_train_model, y_train[target])

# Run the hyperparameter search.
print("开始超参数优化...")
print("这可能需要一些时间，请耐心等待...")

# Create the study: R² is maximized and the TPE sampler is seeded.
# NOTE(review): a MedianPruner is configured, but the objective defined in this cell never
# reports intermediate values, so pruning presumably never triggers - confirm.
study = optuna.create_study(
    direction='maximize',  # maximize the R² score
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)
)

# Run the optimization (adjust n_trials as needed).
study.optimize(objective, n_trials=50, timeout=1800)  # fewer trials, focused on fine-tuning

# Best sampled parameters and the score they achieved.
best_params = study.best_params
best_r2_score = study.best_value

print(f"超参数优化完成!")
print(f"基准R²分数: {baseline_r2:.6f}")
print(f"最佳交叉验证R²: {best_r2_score:.6f}")
print(f"R²提升幅度: {(best_r2_score - baseline_r2):.6f}")

# Adopt the new parameters only when they strictly beat the baseline.
if best_r2_score > baseline_r2:
    print("优化成功！使用新的参数配置")
    print("最优参数配置:")
    for param, value in best_params.items():
        print(f"  {param}: {value}")
    lgb_params = {**best_params, **base_params}
else:
    # Fall back to the baseline: the reported "best" values are reset to match it.
    print("优化未能改善性能，保持原有参数配置")
    lgb_params = baseline_params
    best_params = current_best_params
    best_r2_score = baseline_r2

print("使用最优参数训练最终模型...")
print("使用自定义LightGBM包装器训练模型")

# Remember the feature columns in use (None when the training data has no column labels).
feature_cols = X_train_model.columns.tolist() if hasattr(X_train_model, 'columns') else None

# Build and fit the final model with the selected parameters.
lgb_model = CustomLGBMRegressor(**lgb_params)
lgb_model.fit(X_train_model, y_train[target])

# Register the model in the in-memory registry.
models[target]['LightGBM'] = lgb_model
# Create the model output folder if it does not exist yet.
model_folder = '训练模型文件'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
    print(f"已创建模型文件夹：{model_folder}")

# Persist the fitted model with pickle.
lgb_model_file = os.path.join(model_folder, f'{target}_LightGBM模型.pkl')
with open(lgb_model_file, 'wb') as f:
    pickle.dump(lgb_model, f)
print(f"LightGBM 模型已保存至 {lgb_model_file}")

# Write the chosen parameters and the baseline/best scores to a text report.
params_file = os.path.join(model_folder, f'{target}_LightGBM最优参数.txt')
with open(params_file, 'w', encoding='utf-8') as f:
    f.write(f"目标变量: {target}\n")
    f.write(f"基准R²分数: {baseline_r2:.6f}\n")
    f.write(f"最佳交叉验证R²: {best_r2_score:.6f}\n")
    f.write(f"R²提升幅度: {(best_r2_score - baseline_r2):.6f}\n")
    f.write(f"优化试验次数: {len(study.trials)}\n\n")
    f.write("使用的参数配置:\n")
    # best_params holds only the tuned keys (base_params are not listed here).
    for param, value in best_params.items():
        f.write(f"{param}: {value}\n")
print(f"参数配置已保存至 {params_file}")

# Evaluate the model with the shared evaluate_model helper.
results = evaluate_model(lgb_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "LightGBM")

print("LightGBM模型训练成功")
# Predictions on both splits.
y_pred_train = lgb_model.predict(X_train_model)
y_pred_test = lgb_model.predict(X_test_model)

# Build one result table per split: actual value, prediction, absolute error.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Save each split to its own CSV file (the index is not written).
train_file = os.path.join(save_folder, f'{target}_LightGBM训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_LightGBM测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")

# Feature importance (gain-based): bar chart plus CSV export.
# The export lives inside the guard: `feature_importance` is only created when the
# wrapped model exposes `feature_importance` and column names are known. Saving it
# unconditionally would raise NameError, or silently write a stale table left over
# from another cell.
if hasattr(lgb_model.model, 'feature_importance') and feature_cols is not None:
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': lgb_model.model.feature_importance(importance_type='gain')
    })
    feature_importance = feature_importance.sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['Feature'], feature_importance['Importance'])
    plt.xlabel('增益重要性')
    plt.ylabel('特征')
    plt.title(f'{target} - LightGBM特征重要性 (优化后参数)')
    plt.grid(True, axis='x')
    plt.tight_layout()
    plt.show()

    # Save the importance table next to the other per-target results.
    feature_importance_file = os.path.join(save_folder, f'{target}_LightGBM特征重要性.csv')
    feature_importance.to_csv(feature_importance_file, index=False)
    print(f"特征重要性数据已保存至 {feature_importance_file}")
else:
    # Make the skip visible instead of failing later on an undefined name.
    print("无法获取特征重要性，跳过特征重要性的绘图与保存")

# Optional: visualize the optimization process.
print("\n生成超参数优化可视化图表...")
try:
    from optuna.visualization import plot_optimization_history, plot_param_importances
    
    # Optimization history, written as a standalone HTML file.
    fig_history = plot_optimization_history(study)
    fig_history.write_html(os.path.join(model_folder, f'{target}_优化历史.html'))
    
    # Hyperparameter importances, written as a standalone HTML file.
    fig_importance = plot_param_importances(study)
    fig_importance.write_html(os.path.join(model_folder, f'{target}_参数重要性.html'))
    
    print(f"优化可视化图表已保存至 {model_folder}")
except ImportError:
    # Only import failures are tolerated here; any other error still propagates.
    print("未安装plotly，跳过可视化生成")

# Analysis of poorly predicted test samples.
# Test-set predictions.
y_pred = lgb_model.predict(X_test_model)

# Use a plain array of true values when the container exposes one.
if hasattr(y_test[target], 'values'):
    y_true_values = y_test[target].values
else:
    y_true_values = y_test[target]

# Absolute error per test sample.
errors = np.abs(y_true_values - y_pred)

# Absolute-error threshold above which a prediction counts as inaccurate (adjust as needed).
tolerance = 5.0

# Positions of the samples whose error exceeds the threshold.
inaccurate_mask = errors > tolerance
inaccurate_indices = np.where(inaccurate_mask)[0]

# The share is taken relative to THIS target's number of test samples.
# len(y_test) measures the container that is indexed by target, which is only the
# sample count when that container happens to be a DataFrame; an empty test vector
# must not raise ZeroDivisionError either.
n_test_samples = len(y_true_values)
inaccurate_pct = len(inaccurate_indices) / n_test_samples * 100 if n_test_samples else 0.0

print(f"\n预测不准确的样本数量: {len(inaccurate_indices)} (占测试集的 {inaccurate_pct:.2f}%)")
print(f"使用的容忍度阈值: {tolerance}")

if len(inaccurate_indices) > 0:
    # Recover the original row labels: prefer the target's own index, then the
    # feature matrix index, and fall back to array positions.
    if isinstance(y_test[target], (pd.Series, pd.DataFrame)):
        original_indices = [y_test[target].index[i] for i in inaccurate_indices]
    elif isinstance(X_test_model, pd.DataFrame):
        original_indices = [X_test_model.index[i] for i in inaccurate_indices]
    else:
        original_indices = inaccurate_indices

    # One record per inaccurate sample.
    inaccurate_samples = []
    for original_index, idx in zip(original_indices, inaccurate_indices):
        actual = y_true_values[idx]
        inaccurate_samples.append({
            '样本索引': original_index,
            '实际值': actual,
            '预测值': y_pred[idx],
            '绝对误差': errors[idx],
            # The relative error is undefined for a zero actual value; report infinity.
            '相对误差(%)': (errors[idx] / np.abs(actual)) * 100 if actual != 0 else float('inf')
        })

    # Largest errors first.
    inaccurate_df = pd.DataFrame(inaccurate_samples)
    inaccurate_df = inaccurate_df.sort_values('绝对误差', ascending=False)

    print("\n预测不准确的样本详情 (按误差降序排列):")
    print(inaccurate_df)

    # NOTE(review): this file is written to the current working directory, unlike the
    # other result files of this cell, which go to save_folder - confirm this is intended.
    inaccurate_df.to_csv(f'{target}_不准确预测.csv', index=False)
else:
    print(f"\n没有发现预测不准确的样本 (容忍度阈值: {tolerance})")

In [None]:
#HistGradientBoosting
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
print(f"训练 {target} 的HistGradientBoosting模型...")

# Tolerance configured for this target; falls back to 0.03 when the target has no entry.
current_tolerance = target_tolerance.get(target, 0.03)

# Select this target's train/test feature matrices.
X_train_model = X_train_tree[target]
X_test_model = X_test_tree[target]

# Build the target-specific scoring function and wrap it as a scorer where higher is better.
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Candidate values for the randomized search.
param_dist = {
    'max_iter': [450, 400,350,440,460],
    'learning_rate': [0.01, 0.008, 0.011,0.009],
    'max_depth': [9, 11, 10,8,7,6,5],
    'min_samples_leaf': [1, 2, 4],
    'l2_regularization': [0, 0.1, 0.2]
}

# Base estimator. All five numeric settings below are also keys of param_dist, so they
# are used exactly as written only when the search is skipped and base_model is fitted directly.
base_model = HistGradientBoostingRegressor(
    max_iter=3000,
    learning_rate=1,
    max_depth=3,
    min_samples_leaf=4,
    l2_regularization=0.5,
    loss='squared_error',
    random_state=42
)

# Run the hyperparameter search only when there are at least 90 training rows.
if len(X_train_model) >= 90:
    print("执行超参数优化...")
    # NOTE(review): param_dist holds 5*4*7*3*3 = 1260 combinations, fewer than n_iter=2500;
    # scikit-learn presumably caps the run at the full grid and warns - confirm.
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=2500,
        cv=kf,
        scoring=tol_scorer_wrapped,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train_model, y_train[target])
    hgb_model = search.best_estimator_
    print(f"最佳参数: {search.best_params_}")
    print(f"最佳CV得分: {search.best_score_:.4f}")
else:
    # Too few rows: fit the predefined base model directly.
    print("使用预定义参数...")
    hgb_model = base_model
    hgb_model.fit(X_train_model, y_train[target])

# Cross-validated R² of the selected model on the training data.
cv_scores = cross_val_score(hgb_model, X_train_model, y_train[target], cv=kf, scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score using the tolerance-based scorer.
tolerance_cv_scores = cross_val_score(
    hgb_model, X_train_model, y_train[target], cv=kf, 
    scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Register the model in the in-memory registry.
models[target]['HistGradientBoosting'] = hgb_model
# Persist the model with pickle; model_folder must already exist in the notebook namespace.
hgb_model_file = os.path.join(model_folder, f'{target}_HistGradientBoosting模型.pkl')
with open(hgb_model_file, 'wb') as f:
    pickle.dump(hgb_model, f)
print(f"HistGradientBoosting模型已保存至 {hgb_model_file}")
# Evaluate the model with the shared evaluate_model helper.
results = evaluate_model(hgb_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "HistGradientBoosting")
# Predictions on both splits.
y_pred_train = hgb_model.predict(X_train_model)
y_pred_test = hgb_model.predict(X_test_model)

# Build one result table per split: actual value, prediction, absolute error.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Save each split to its own CSV file (the index is not written).
train_file = os.path.join(save_folder, f'{target}_HistGradientBoosting训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_HistGradientBoosting测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")
# HistGradientBoosting does not directly provide feature importances, so
# permutation importance is used instead (computed here on the test split).
from sklearn.inspection import permutation_importance

perm_importance = permutation_importance(
    hgb_model, X_test_model, y_test[target], 
    n_repeats=10, random_state=42, n_jobs=-1
)

# Mean importance over the 10 repeats, one row per feature, highest first.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': perm_importance.importances_mean
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('置换重要性')
plt.ylabel('特征')
plt.title(f'{target} - HistGradientBoosting特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()
# Save the importance table.
feature_importance_file = os.path.join(save_folder, f'{target}_HistGradientBoosting特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")
# Effect of learning rate and iteration count on train/test R².
learning_rates = [0.01, 0.008, 0.011,0.009]
max_iters = [50, 100, 200, 300]
fig, axs = plt.subplots(len(learning_rates), 1, figsize=(10, 4*len(learning_rates)), sharex=True)

# Rows for the exported table are recorded while each model is scored.
# train_scores/test_scores are reset for every learning rate, so building the table
# after the loop would attach the LAST learning rate's scores to every learning rate.
analysis_data = []

for i, lr in enumerate(learning_rates):
    train_scores = []
    test_scores = []
    for iter_count in max_iters:
        model = HistGradientBoostingRegressor(
            max_iter=iter_count,
            learning_rate=lr,
            max_depth=3,
            random_state=42
        )
        model.fit(X_train_model, y_train[target])
        train_score = r2_score(y_train[target], model.predict(X_train_model))
        test_score = r2_score(y_test[target], model.predict(X_test_model))
        train_scores.append(train_score)
        test_scores.append(test_score)
        analysis_data.append({
            '学习率': lr,
            '迭代次数': iter_count,
            '训练集R²': train_score,
            '测试集R²': test_score
        })

    # One subplot per learning rate.
    axs[i].plot(max_iters, train_scores, 'o-', label='训练集 R²')
    axs[i].plot(max_iters, test_scores, 'o-', label='测试集 R²')
    axs[i].set_title(f'学习率 = {lr}')
    axs[i].set_ylabel('R²')
    axs[i].grid(True)
    axs[i].legend()
# The x axis is shared, so the label is set once on the current (last) axes.
plt.xlabel('迭代次数')
plt.suptitle('HistGradientBoosting - 学习率和迭代次数影响')
plt.tight_layout()
plt.show()

# Convert the collected rows to a DataFrame and save them.
lr_iter_analysis = pd.DataFrame(analysis_data)
lr_analysis_file = os.path.join(save_folder, f'{target}_HistGradientBoosting学习率迭代分析.csv')
lr_iter_analysis.to_csv(lr_analysis_file, index=False)
print(f"学习率和迭代次数分析数据已保存至 {lr_analysis_file}")




In [None]:

#RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

# Announce the target being trained.
print(f"训练 {target} 的随机森林模型...")

# Tolerance configured for this target; falls back to 0.03 when the target has no entry.
current_tolerance = target_tolerance.get(target, 0.03)

# Select this target's train/test feature matrices from the *_tree_filled containers.
X_train_model = X_train_tree_filled[target]
X_test_model = X_test_tree_filled[target]

# Build the target-specific scoring function and wrap it as a scorer where higher is better.
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Hyperparameter search space. Best parameters recorded by the author:
# {'n_estimators': 900, 'min_samples_split': 5, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'max_depth': 4}
param_dist = {
    'n_estimators': [ 900,1000,800],
    'max_depth': [4, 5, 6,2,3,7],
    'min_samples_split': [ 5,4,6,7,8],
    'min_samples_leaf': [7, 4,8,9,5,6],
    'max_features': ['sqrt', 'log2', None]
}

# Base estimator; bootstrap sampling with out-of-bag scoring enabled.
base_model = RandomForestRegressor(
    n_estimators=800,
    max_depth=None,
    min_samples_split=3,
    min_samples_leaf=4,
    max_features='sqrt',
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42
)

# Run the hyperparameter search only when there are at least 90 training rows.
if len(X_train_model) >= 90:
    print("执行超参数优化...")
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=500,# author's note: larger values raised training R² but may take very long; recorded pairs: 2000/0.3569; 4000/0.2091; 3000/0.1221; 1000/0.1577
        cv=kf,
        scoring=tol_scorer_wrapped,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train_model, y_train[target])
    rf_model = search.best_estimator_
    print(f"最佳参数: {search.best_params_}")
    print(f"最佳CV得分: {search.best_score_:.4f}")
else:
    # Too few rows: fit the predefined base model directly.
    print("使用预定义参数...")
    rf_model = base_model
    rf_model.fit(X_train_model, y_train[target])

# Cross-validated R² of the selected model on the training data.
cv_scores = cross_val_score(rf_model, X_train_model, y_train[target], cv=kf, scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score using the tolerance-based scorer.
tolerance_cv_scores = cross_val_score(
    rf_model, X_train_model, y_train[target], cv=kf, 
    scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Print the out-of-bag score when the fitted model exposes one.
if hasattr(rf_model, 'oob_score_'):
    print(f"袋外评分 (OOB score): {rf_model.oob_score_:.4f}")

# Register the model in the in-memory registry.
models[target]['RandomForest'] = rf_model
# Persist the random forest with pickle; model_folder must already exist in the notebook namespace.
rf_model_file = os.path.join(model_folder, f'{target}_随机森林模型.pkl')
with open(rf_model_file, 'wb') as f:
    pickle.dump(rf_model, f)
print(f"随机森林模型已保存至 {rf_model_file}")
# Evaluate the model with the shared evaluate_model helper.
results = evaluate_model(rf_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "RandomForest")
# Predictions on both splits.
y_pred_train = rf_model.predict(X_train_model)
y_pred_test = rf_model.predict(X_test_model)

# Build one result table per split: actual value, prediction, absolute error.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Save each split to its own CSV file (the index is not written).
train_file = os.path.join(save_folder, f'{target}_随机森林训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_随机森林测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")
# Feature importances taken from the fitted forest, one row per feature, highest first.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title(f'{target} - RandomForest特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()
# Save the importance table.
feature_importance_file = os.path.join(save_folder, f'{target}_随机森林特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")
# Effect of the number of trees: refit a forest per tree count and record
# training, test and out-of-bag R².
n_estimators_range = [10, 50, 100, 200, 300, 400,800,1000]
train_scores = []
test_scores = []
oob_scores = []

for n_est in n_estimators_range:
    # Same settings as base_model above except for the tree count.
    rf = RandomForestRegressor(
        n_estimators=n_est,
        max_depth=None,
        min_samples_split=3,
        min_samples_leaf=4,
        max_features='sqrt',
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=42
    )
    rf.fit(X_train_model, y_train[target])
    train_scores.append(r2_score(y_train[target], rf.predict(X_train_model)))
    test_scores.append(r2_score(y_test[target], rf.predict(X_test_model)))
    oob_scores.append(rf.oob_score_)

plt.figure(figsize=(10, 6))
plt.plot(n_estimators_range, train_scores, 'o-', label='训练集 R²')
plt.plot(n_estimators_range, test_scores, 'o-', label='测试集 R²')
plt.plot(n_estimators_range, oob_scores, 'o-', label='OOB R²')
plt.xlabel('树的数量')
plt.ylabel('R²')
plt.title('RandomForest - 树数量对性能的影响')
plt.legend()
plt.grid(True)
plt.show()
# Save the tree-count analysis; the three score lists align with n_estimators_range.
trees_analysis_data = pd.DataFrame({
    '树的数量': n_estimators_range,
    '训练集R²': train_scores,
    '测试集R²': test_scores,
    '袋外评分': oob_scores
})
trees_analysis_file = os.path.join(save_folder, f'{target}_随机森林树数量分析.csv')
trees_analysis_data.to_csv(trees_analysis_file, index=False)
print(f"树数量影响分析数据已保存至 {trees_analysis_file}")




In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import time
import os
import pickle
import matplotlib.pyplot as plt
import matplotlib
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel as C, Matern
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance
import traceback

# Configure Matplotlib so that Chinese text renders correctly.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # 'SimHei' is a Hei-style (sans-serif) Chinese font
matplotlib.rcParams['axes.unicode_minus'] = False # fixes the rendering of the minus sign


# --- code starts here ---
# NOTE: this cell hard-codes the target instead of using the one set by earlier cells.
target="水接触角"
print(f"训练 {target} 的高斯过程回归模型...")

# Tolerance configured for this target; falls back to 0.15 when the target has no entry.
current_tolerance = target_tolerance.get(target, 0.15)

# Select this target's train/test feature matrices from X_train_linear / X_test_linear.
X_train_model = X_train_linear[target]
X_test_model = X_test_linear[target]

# Step 1: standardize the input features.
# The scaler is fitted on the training features only and then applied to the test features.
# NOTE(review): it is fitted on all training rows before the cross-validation below,
# so each CV fold's held-out rows have already influenced the scaling.
print("\n步骤1: 对输入特征进行标准化...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_model)
X_test_scaled = scaler.transform(X_test_model)
print("特征标准化完成。")

# Step 2: widen the kernel library and select/optimize automatically (core change).
print("\n步骤2: 扩展核函数库，进行更全面的自动化模型选择...")

# Candidate kernels with explicit optimization bounds. Each one is a scaled
# stationary kernel (ConstantKernel * kernel) plus a WhiteKernel noise term.
kernels_to_try = {
    "RBF": 
        C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) 
        + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 1e1)),
    
    "Matern (nu=1.5)": 
        C(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, length_scale_bounds=(1e-2, 1e2), nu=1.5) 
        + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 1e1)),
        
    "Matern (nu=2.5)": 
        C(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, length_scale_bounds=(1e-2, 1e2), nu=2.5) 
        + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 1e1)),
        
    "RationalQuadratic": 
        C(1.0, (1e-3, 1e3)) * RationalQuadratic(length_scale=1.0, alpha=0.1, length_scale_bounds=(1e-2, 1e2), alpha_bounds=(1e-2, 1e2)) 
        + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 1e1)),

    # Composite kernel example: RBF + Matern (a more complex model that may need more data to avoid overfitting).
    "RBF + Matern":
        C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
        + C(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, length_scale_bounds=(1e-2, 1e2), nu=1.5)
        + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 1e1))
}

# Track the best cross-validated kernel seen so far.
best_score = -np.inf
best_kernel_name = ""
best_model_from_cv = None

print("开始测试多个核函数，优中选优...")
for name, kernel in kernels_to_try.items():
    print(f"  > 正在测试核函数: {name}...")
    gp = GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=15, # restarts so the kernel hyperparameters are optimized thoroughly
        normalize_y=True,
        random_state=42
    )
    try:
        # Score this kernel with cross-validation: mean R² over min(3, cv_folds) folds.
        score = cross_val_score(gp, X_train_scaled, y_train[target], cv=min(3, cv_folds), scoring='r2').mean()
        print(f"    交叉验证 R² 平均分: {score:.4f}")

        if score > best_score:
            best_score = score
            best_kernel_name = name
            # Fit on the full (scaled) training set and keep the model for later use.
            gp.fit(X_train_scaled, y_train[target])
            best_model_from_cv = gp
            
    except Exception as e:
        # A failing kernel is reported and skipped; the remaining kernels are still tried.
        print(f"    核函数 {name} 训练失败: {e}")
        continue

# Abort when no kernel produced a fitted model.
if best_model_from_cv is None:
    raise RuntimeError("所有核函数都训练失败，请检查数据或核函数参数！")

print(f"\n[决策] 最佳核函数为: '{best_kernel_name}' (交叉验证最高分: {best_score:.4f})")

# Use the selected model as the final model.
best_model = best_model_from_cv
print(f"最终选定的模型核函数参数: {best_model.kernel_}")



# Step 3: evaluate and visualize the final model.
print(f"\n步骤3: 评估并可视化最终模型...")

# Predictions on the (scaled) training and test sets.
y_pred_train = best_model.predict(X_train_scaled)
y_pred_test = best_model.predict(X_test_scaled)

# Final R² scores.
train_r2 = r2_score(y_train[target], y_pred_train)
test_r2 = r2_score(y_test[target], y_pred_test)
print(f"训练集最终 R²: {train_r2:.4f}")
print(f"测试集最终 R²: {test_r2:.4f}")

# "Actual vs. predicted" scatter plots for the training and test sets.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
fig.suptitle(f'"{target}" 的模型预测效果 (最佳核: {best_kernel_name})', fontsize=16)

# Training-set panel; the dashed y=x line spans the joint range of actual and predicted values.
axes[0].scatter(y_train[target], y_pred_train, alpha=0.6)
min_val_train = min(y_train[target].min(), y_pred_train.min())
max_val_train = max(y_train[target].max(), y_pred_train.max())
axes[0].plot([min_val_train, max_val_train], [min_val_train, max_val_train], 'r--', lw=2, label='理想情况 (y=x)')
axes[0].set_title(f'训练集 (R² = {train_r2:.4f})')
axes[0].set_xlabel('实际值')
axes[0].set_ylabel('预测值')
axes[0].legend()
axes[0].grid(True)
axes[0].axis('equal')

# Test-set panel, built the same way.
axes[1].scatter(y_test[target], y_pred_test, alpha=0.6)
min_val_test = min(y_test[target].min(), y_pred_test.min())
max_val_test = max(y_test[target].max(), y_pred_test.max())
axes[1].plot([min_val_test, max_val_test], [min_val_test, max_val_test], 'r--', lw=2, label='理想情况 (y=x)')
axes[1].set_title(f'测试集 (R² = {test_r2:.4f})')
axes[1].set_xlabel('实际值')
axes[1].set_ylabel('预测值')
axes[1].legend()
axes[1].grid(True)
axes[1].axis('equal')

# Leave room at the top for the figure title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
# Step 4: save the model, the results and the feature importances.
print(f"\n步骤4: 保存模型与分析结果...")
# NOTE(review): only the GP estimator is registered and pickled here. It was fitted on
# StandardScaler-transformed features and the fitted scaler is not saved in this block,
# so any later caller must scale inputs identically - confirm the scaler is persisted elsewhere.
models[target]['GaussianProcess'] = best_model
gp_model_file = os.path.join(model_folder, f'{target}_高斯过程模型.pkl')
with open(gp_model_file, 'wb') as f:
    pickle.dump(best_model, f)
print(f"高斯过程回归模型已保存至 {gp_model_file}")

# Build and save the prediction tables: actual value, prediction, absolute error.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# One CSV per split (the index is not written).
train_file = os.path.join(save_folder, f'{target}_高斯过程训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_高斯过程测试集预测结果.csv')
train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)
print(f"训练集和测试集预测结果已保存。")


# Compute and visualize feature importance via permutation importance on the scaled test split.
perm_importance = permutation_importance(
    best_model, X_test_scaled, y_test[target],
    n_repeats=10, random_state=42, n_jobs=-1
)

# Mean importance over the 10 repeats; feature names come from the unscaled training
# frame, which has the same column order as the scaled array.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': perm_importance.importances_mean
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('置换重要性 (Permutation Importance)')
plt.ylabel('特征')
plt.title(f'{target} - GaussianProcess特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

# Save the importance table.
feature_importance_file = os.path.join(save_folder, f'{target}_高斯过程特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存。")


# Step 5: analyse the poorly predicted test samples.
print(f"\n步骤5: 分析预测不准确的样本...")

y_true_values = y_test[target].values
errors = np.abs(y_true_values - y_pred_test)
tolerance = 5.0  # absolute-error threshold; adjust as needed

# Positions of the samples whose error exceeds the threshold.
inaccurate_mask = errors > tolerance
inaccurate_indices = np.where(inaccurate_mask)[0]

# The share is taken relative to THIS target's number of test samples.
# len(y_test) measures the container that is indexed by target, which is only the
# sample count when that container happens to be a DataFrame; an empty test vector
# must not raise ZeroDivisionError either.
n_test_samples = len(y_true_values)
inaccurate_pct = len(inaccurate_indices) / n_test_samples * 100 if n_test_samples else 0.0

print(f"\n预测不准确的样本数量: {len(inaccurate_indices)} (占测试集的 {inaccurate_pct:.2f}%)，使用的容忍度阈值: {tolerance}")

if len(inaccurate_indices) > 0:
    # Map array positions back to the row labels of the test feature frame.
    original_indices = [X_test_model.index[i] for i in inaccurate_indices]

    # One record per inaccurate sample.
    inaccurate_samples = []
    for original_index, idx in zip(original_indices, inaccurate_indices):
        actual = y_true_values[idx]
        prediction = y_pred_test[idx]
        inaccurate_samples.append({
            '样本索引': original_index,
            '实际值': actual,
            '预测值': prediction,
            '绝对误差': errors[idx],
            # The relative error is undefined for a zero actual value; report infinity.
            '相对误差(%)': (errors[idx] / np.abs(actual)) * 100 if actual != 0 else float('inf'),
        })

    # Largest errors first.
    inaccurate_df = pd.DataFrame(inaccurate_samples).sort_values('绝对误差', ascending=False)

    print("\n预测不准确的样本详情 (按误差降序排列):")
    print(inaccurate_df)

    inaccurate_file_path = os.path.join(save_folder, f'{target}_GP不准确预测.csv')
    inaccurate_df.to_csv(inaccurate_file_path, index=False)
    print(f"不准确样本分析已保存至 {inaccurate_file_path}")
else:
    print(f"\n在容忍度阈值 {tolerance} 内，没有发现预测不准确的样本。")

print("\n高斯过程回归模型训练、评估和分析全部完成。")

In [None]:
# 直接运行的模型加载代码
import os
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# 如果需要加载神经网络模型，需要先定义KerasRegressorWrapper类
class KerasRegressorWrapper:
    """Lightweight holder for a neural-network regressor and its settings.

    The class only stores hyper-parameters and delegates prediction to
    ``self.model``; it does not build or train a network itself. It has to be
    defined before pickled wrapper instances can be loaded.

    Parameters
    ----------
    hidden_layers : list of int, optional
        Hidden layer sizes. Defaults to ``[128, 64, 32]``.
    dropout_rate : float
    learning_rate : float
    epochs : int
    batch_size : int
    """

    def __init__(self, hidden_layers=None, dropout_rate=0.3,
                 learning_rate=0.001, epochs=100, batch_size=32):
        # Use a None sentinel instead of a literal list default: a list in the
        # signature is created once and shared (and mutated) across instances.
        # A caller-supplied list is stored as-is, without copying.
        self.hidden_layers = [128, 64, 32] if hidden_layers is None else hidden_layers
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        # Not set here; presumably populated by training code or restored by
        # unpickling -- confirm against the code that creates these objects.
        self.model = None
        self.history = None

    def predict(self, X):
        """Return ``self.model.predict(X, verbose=0)`` flattened to 1-D.

        Raises AttributeError if ``self.model`` is still ``None``.
        """
        return self.model.predict(X, verbose=0).flatten()

    def get_params(self, deep=True):
        """Return the constructor hyper-parameters as a dict.

        ``deep`` is accepted but not used.
        """
        return {
            'hidden_layers': self.hidden_layers,
            'dropout_rate': self.dropout_rate,
            'learning_rate': self.learning_rate,
            'epochs': self.epochs,
            'batch_size': self.batch_size
        }

    def set_params(self, **params):
        """Set each keyword as an attribute (no validation) and return self."""
        for key, value in params.items():
            setattr(self, key, value)
        return self

# Folder that holds the pickled model files
model_folder = '训练模型文件'

# Reuse the registry if an earlier cell already created it
if 'models' not in locals():
    models = {}

print(f"为目标变量 {target} 加载模型...")

# Pick this target's pickles, skipping feature lists, ensembles and
# Keras/neural-network models.
target_prefix = f'{target}_'
excluded_markers = ('Ensemble', '集成', 'Keras', 'Neural', 'NN')
model_files = [
    file_name for file_name in os.listdir(model_folder)
    if file_name.startswith(target_prefix)
    and file_name.endswith('.pkl')
    and not file_name.endswith('_features.pkl')
    and not any(marker in file_name for marker in excluded_markers)
]

print(f"找到 {len(model_files)} 个模型文件: {model_files}")

# Make sure the registry has an entry for this target
if target not in models:
    models[target] = {}

# Load every selected model
for model_file in model_files:
    # Derive the model name by removing the target prefix from the front and
    # the '模型.pkl' (or plain '.pkl') suffix from the end only. str.replace
    # would also strip matching text in the middle of the name and would leave
    # '.pkl' in the key for files that lack the '模型' suffix.
    stem = model_file[len(target_prefix):]
    name_suffix = '模型.pkl' if stem.endswith('模型.pkl') else '.pkl'
    model_name = stem[:-len(name_suffix)]

    print(f"  加载模型: {model_name}")

    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # model files from a trusted location.
    model_path = os.path.join(model_folder, model_file)
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    models[target][model_name] = model
    print(f"    {model_name} 加载成功")

print(f"成功加载 {len(models[target])} 个模型")
print(f"可用模型: {list(models[target].keys())}")

In [None]:

# VotingEnsemble - weights are derived from each model's standard R² score
print(f"训练 {target} 的VotingEnsemble集成模型...")

# Tolerance configured for this target (0.15 when none is configured)
current_tolerance = target_tolerance.get(target, 0.15)

# Already-fitted models are reused; each one is scored once on its test set.
base_models = []     # (short name, model) pairs
model_scores = {}    # short name -> standard R² on the test set
model_datasets = {}  # short name -> kind of feature table the model uses

print("收集已有模型的性能评估结果...")

# (registry name, short name, dataset kind, lazy test-feature lookup).
# The lookups are lambdas so a feature table is only touched when the
# corresponding model is actually present.
voting_candidates = (
    ('XGBoost', 'xgb', 'tree', lambda: X_test_tree[target]),
    ('LightGBM', 'lgb', 'tree', lambda: X_test_tree[target]),
    ('HistGradientBoosting', 'hgb', 'tree', lambda: X_test_tree[target]),
    ('RandomForest', 'rf', 'tree_filled', lambda: X_test_tree_filled[target]),
    ('GaussianProcess', 'gp', 'linear', lambda: X_test_linear[target]),
)

for registry_name, short_name, dataset_kind, get_test_features in voting_candidates:
    if registry_name not in models[target]:
        continue

    model = models[target][registry_name]
    y_pred = model.predict(get_test_features())
    r2 = r2_score(y_test[target], y_pred)

    base_models.append((short_name, model))
    model_scores[short_name] = r2
    model_datasets[short_name] = dataset_kind
    print(f"  {registry_name} - R²: {r2:.4f}")

# Weights come from the standard R² scores, not from the tolerance-based R²
print("\n根据标准R²模型性能分配权重...")

# Each weight is the model's share of the summed R², scaled by the number of
# models; equal weights are used when the sum is not positive.
total_score = sum(model_scores.values())
weights = []
for name, _ in base_models:
    if total_score > 0:
        raw_weight = model_scores[name] / total_score * len(model_scores)
    else:
        raw_weight = 1.0
    weights.append(raw_weight)

print("  基于标准R²分配权重")

# Floor every weight at 0.5 so that no model is weighted too low
min_weight = 0.5
weights = [min_weight if min_weight > w else w for w in weights]

# Report the final weights
for (name, _), weight in zip(base_models, weights):
    print(f"  {name} 权重: {weight:.4f}")

# An ensemble needs at least two base models
if len(base_models) >= 2:
    try:
        class EnhancedVotingRegressor:
            """Weighted average over the predictions of already-fitted models.

            Parameters
            ----------
            estimators : list of (name, model)
                Fitted models exposing ``predict``.
            weights : sequence of float
                One weight per estimator, in the same order. They are
                normalised to sum to 1 on construction.
            datasets : dict
                Maps an estimator name to its dataset kind
                ('tree', 'tree_filled' or 'linear').
            target_name : str
                Name of the target; stored only.

            NOTE(review): ``predict`` passes the same ``X`` to every base
            model, while this cell scores the base models on separate feature
            tables (X_*_tree, X_*_tree_filled, X_*_linear). Confirm that a
            single ``X`` is valid for all base models before using
            ``predict`` on new data.
            """

            def __init__(self, estimators, weights, datasets, target_name):
                self.estimators = estimators
                self.datasets = datasets
                self.target_name = target_name

                # Normalise the weights so they sum to 1
                self.weights = np.array(weights)
                self.weights = self.weights / np.sum(self.weights)

            def predict(self, X):
                """Return the weighted average of all base-model predictions."""
                predictions = []

                for name, model in self.estimators:
                    dataset_type = self.datasets.get(name, 'standard')

                    # Only the 'tree_filled' kind changes the input: NaNs in a
                    # DataFrame are replaced by 0. Every other kind (and any
                    # non-DataFrame input) is passed through unchanged.
                    if dataset_type == 'tree_filled' and isinstance(X, pd.DataFrame):
                        X_model = X.fillna(0)
                    else:
                        X_model = X

                    predictions.append(model.predict(X_model))

                # Weighted sum; the weights are already normalised
                weighted_pred = np.zeros(predictions[0].shape)
                for weight, pred in zip(self.weights, predictions):
                    weighted_pred += weight * pred

                return weighted_pred

        # Build the voting ensemble
        print("创建投票集成模型...")
        voting_model = EnhancedVotingRegressor(
            estimators=base_models,
            weights=weights,
            datasets=model_datasets,
            target_name=target
        )

        # Per-model predictions on the train and test sets, each model on the
        # feature table that matches its dataset kind
        train_predictions = {}
        test_predictions = {}

        for name, model in base_models:
            if model_datasets[name] == 'tree':
                train_predictions[name] = model.predict(X_train_tree[target])
                test_predictions[name] = model.predict(X_test_tree[target])
            elif model_datasets[name] == 'tree_filled':
                train_predictions[name] = model.predict(X_train_tree_filled[target])
                test_predictions[name] = model.predict(X_test_tree_filled[target])
            elif model_datasets[name] == 'linear':
                train_predictions[name] = model.predict(X_train_linear[target])
                test_predictions[name] = model.predict(X_test_linear[target])

        # Weighted sum of the per-model predictions
        y_train_pred = np.zeros(len(y_train[target]))
        y_test_pred = np.zeros(len(y_test[target]))

        for i, (name, _) in enumerate(base_models):
            y_train_pred += weights[i] * train_predictions[name]
            y_test_pred += weights[i] * test_predictions[name]

        # Divide by the weight total to turn the sum into a weighted average
        total_weight = sum(weights)
        y_train_pred /= total_weight
        y_test_pred /= total_weight

        # Performance metrics
        train_r2 = r2_score(y_train[target], y_train_pred)
        test_r2 = r2_score(y_test[target], y_test_pred)

        train_tol_r2 = tolerance_r2_score(y_train[target], y_train_pred, tolerance=current_tolerance, target=target)
        test_tol_r2 = tolerance_r2_score(y_test[target], y_test_pred, tolerance=current_tolerance, target=target)

        train_within_tol = prediction_within_tolerance(y_train[target], y_train_pred, tolerance=current_tolerance, target=target)
        test_within_tol = prediction_within_tolerance(y_test[target], y_test_pred, tolerance=current_tolerance, target=target)

        # Tables with actual value, ensemble prediction and absolute error
        train_prediction = pd.DataFrame({
            '实际值': y_train[target],
            '集成预测值': y_train_pred,
            '误差': np.abs(y_train[target] - y_train_pred)
        })

        test_prediction = pd.DataFrame({
            '实际值': y_test[target],
            '集成预测值': y_test_pred,
            '误差': np.abs(y_test[target] - y_test_pred)
        })

        # Add one column per base model for comparison
        for name, _ in base_models:
            train_prediction[f'{name}预测值'] = train_predictions[name]
            test_prediction[f'{name}预测值'] = test_predictions[name]

        # Write both tables to CSV
        train_file = os.path.join(save_folder, f'{target}_投票集成模型训练集预测结果.csv')
        test_file = os.path.join(save_folder, f'{target}_投票集成模型测试集预测结果.csv')

        train_prediction.to_csv(train_file, index=False)
        test_prediction.to_csv(test_file, index=False)

        print(f"训练集预测结果已保存至 {train_file}")
        print(f"测试集预测结果已保存至 {test_file}")

        # Report the ensemble metrics
        print(f"\n投票集成模型性能:")
        print(f"训练集: R²={train_r2:.4f}, 容忍度R²={train_tol_r2:.4f}, 在容忍范围内比例={train_within_tol:.2%}")
        print(f"测试集: R²={test_r2:.4f}, 容忍度R²={test_tol_r2:.4f}, 在容忍范围内比例={test_within_tol:.2%}")

        # Compare the ensemble with every base model
        print("\n与各基础模型性能比较:")
        for name, _ in base_models:
            base_train_pred = train_predictions[name]
            base_test_pred = test_predictions[name]

            base_train_r2 = r2_score(y_train[target], base_train_pred)
            base_test_r2 = r2_score(y_test[target], base_test_pred)

            print(f"  vs {name}:")
            print(f"    训练集R²: {train_r2:.4f} vs {base_train_r2:.4f} (差异: {train_r2-base_train_r2:.4f})")
            print(f"    测试集R²: {test_r2:.4f} vs {base_test_r2:.4f} (差异: {test_r2-base_test_r2:.4f})")

        # Register the ensemble and persist it with pickle
        models[target]['VotingEnsemble'] = voting_model
        ensemble_model_file = os.path.join(model_folder, f'{target}_投票集成模型.pkl')
        with open(ensemble_model_file, 'wb') as f:
            pickle.dump(voting_model, f)
        print(f"投票集成模型已保存至 {ensemble_model_file}")

        # Plot: predicted vs actual values (train and test)
        plt.figure(figsize=(12, 5))

        # Train-set scatter with the identity line
        plt.subplot(1, 2, 1)
        plt.scatter(y_train[target], y_train_pred, alpha=0.5)
        plt.plot([y_train[target].min(), y_train[target].max()], [y_train[target].min(), y_train[target].max()], 'r--')
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(f'训练集: R²={train_r2:.4f}')

        # Test-set scatter with the identity line
        plt.subplot(1, 2, 2)
        plt.scatter(y_test[target], y_test_pred, alpha=0.5)
        plt.plot([y_test[target].min(), y_test[target].max()], [y_test[target].min(), y_test[target].max()], 'r--')
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(f'测试集: R²={test_r2:.4f}')

        plt.tight_layout()
        plt.show()

        # Plot: error distributions
        plt.figure(figsize=(12, 5))

        # Train-set errors
        plt.subplot(1, 2, 1)
        train_errors = y_train[target] - y_train_pred
        plt.hist(train_errors, bins=30, alpha=0.7)
        plt.axvline(x=0, color='r', linestyle='--')
        plt.xlabel('预测误差')
        plt.ylabel('频次')
        plt.title(f'训练集误差分布 (MAE={np.abs(train_errors).mean():.4f})')

        # Test-set errors
        plt.subplot(1, 2, 2)
        test_errors = y_test[target] - y_test_pred
        plt.hist(test_errors, bins=30, alpha=0.7)
        plt.axvline(x=0, color='r', linestyle='--')
        plt.xlabel('预测误差')
        plt.ylabel('频次')
        plt.title(f'测试集误差分布 (MAE={np.abs(test_errors).mean():.4f})')

        plt.tight_layout()
        plt.show()

        # Plot: weight per model (the un-normalised weights)
        plt.figure(figsize=(10, 6))
        model_names = [name for name, _ in base_models]
        plt.bar(model_names, weights)
        plt.xlabel('模型')
        plt.ylabel('权重')
        plt.title(f'{target} - 投票集成模型权重分布')
        plt.grid(True, axis='y')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Plot: distribution of test predictions, ensemble vs base models
        plt.figure(figsize=(12, 6))
        model_preds = [y_test_pred] + [test_predictions[name] for name, _ in base_models]
        model_labels = ['Voting'] + [name for name, _ in base_models]

        plt.boxplot(model_preds, labels=model_labels)
        plt.ylabel('预测值')
        plt.title('投票集成模型与各基础模型预测分布对比')
        plt.grid(True, axis='y')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Imported here so the handler cannot itself fail with a NameError
        # (and hide the real error) when traceback was not imported elsewhere.
        import traceback

        print(f"创建投票集成模型失败: {str(e)}")
        print(f"错误详情: {traceback.format_exc()}")
else:
    print("没有足够的基础模型来创建投票集成")



In [None]:

# 自适应集成模型 - 根据样本特征动态选择最佳模型
from sklearn.ensemble import RandomForestClassifier

print(f"训练 {target} 的自适应集成模型...")

# Tolerance configured for this target (0.15 when none is configured)
current_tolerance = target_tolerance.get(target, 0.15)

# Models that are available, and the train/test features each one expects
available_models = []
model_input_data = {}

# (registry name, lazy lookup of the matching train/test feature tables).
# The lookups are lambdas so a table is only touched when its model exists.
adaptive_candidates = (
    ('XGBoost', lambda: (X_train_tree, X_test_tree)),
    ('LightGBM', lambda: (X_train_tree, X_test_tree)),
    ('HistGradientBoosting', lambda: (X_train_tree, X_test_tree)),
    ('RandomForest', lambda: (X_train_tree_filled, X_test_tree_filled)),
    ('GaussianProcess', lambda: (X_train_linear, X_test_linear)),
)

for candidate_name, get_feature_tables in adaptive_candidates:
    if candidate_name not in models[target]:
        continue

    available_models.append(candidate_name)
    train_tables, test_tables = get_feature_tables()
    model_input_data[candidate_name] = {
        'train': train_tables[target],
        'test': test_tables[target]
    }

print(f"可用模型: {available_models}")

if len(available_models) < 2:
    print("自适应集成至少需要两个模型，目前可用模型不足")
else:
    try:
        # Step 1: predictions of every base model on its own feature table
        print("为每个样本生成所有模型的预测...")
        train_predictions = {}
        test_predictions = {}

        for model_name in available_models:
            model = models[target][model_name]
            train_predictions[model_name] = model.predict(model_input_data[model_name]['train'])
            test_predictions[model_name] = model.predict(model_input_data[model_name]['test'])

        # Step 2: absolute training error of every model on every sample
        print("计算各模型在每个样本上的预测误差...")
        y_train_values = y_train[target].values
        train_errors = {
            model_name: np.abs(y_train_values - train_predictions[model_name])
            for model_name in available_models
        }

        # Step 3: train a meta-model that picks a base model from the features
        print("训练元模型来决定每个样本应使用哪个模型...")

        model_name_to_idx = {name: idx for idx, name in enumerate(available_models)}

        # Label each training sample with the position (in available_models)
        # of the model with the smallest error; ties go to the earliest model.
        error_matrix = np.column_stack([train_errors[name] for name in available_models])
        best_model_indices = np.argmin(error_matrix, axis=1).astype(int)

        # Classifier that predicts the best model from the original features
        meta_classifier = RandomForestClassifier(
            n_estimators=200,
            max_depth=4,
            min_samples_split=2,
            n_jobs=-1,
            random_state=42
        )

        meta_classifier.fit(X_train[target], best_model_indices)

        # Step 4: let the meta-model choose a base model for every sample
        print("在训练集和测试集上应用元模型...")
        train_best_models = meta_classifier.predict(X_train[target])
        test_best_models = meta_classifier.predict(X_test[target])

        # Step 5: for each sample take the prediction of the chosen model.
        # Column j of each matrix holds the predictions of available_models[j].
        train_prediction_matrix = np.column_stack(
            [train_predictions[name] for name in available_models]
        )
        test_prediction_matrix = np.column_stack(
            [test_predictions[name] for name in available_models]
        )
        train_adaptive_predictions = train_prediction_matrix[
            np.arange(len(train_best_models)), train_best_models
        ].astype(float)
        test_adaptive_predictions = test_prediction_matrix[
            np.arange(len(test_best_models)), test_best_models
        ].astype(float)

        # Step 6: evaluate the adaptive ensemble
        train_r2 = r2_score(y_train[target], train_adaptive_predictions)
        train_tol_r2 = tolerance_r2_score(y_train[target], train_adaptive_predictions, tolerance=current_tolerance, target=target)
        train_within_tol = prediction_within_tolerance(y_train[target], train_adaptive_predictions, tolerance=current_tolerance, target=target)

        test_r2 = r2_score(y_test[target], test_adaptive_predictions)
        test_tol_r2 = tolerance_r2_score(y_test[target], test_adaptive_predictions, tolerance=current_tolerance, target=target)
        test_within_tol = prediction_within_tolerance(y_test[target], test_adaptive_predictions, tolerance=current_tolerance, target=target)

        print(f"\n自适应集成模型性能:")
        print(f"训练集: R²={train_r2:.4f}, 容忍度R²={train_tol_r2:.4f}, 在容忍范围内比例={train_within_tol:.2%}")
        print(f"测试集: R²={test_r2:.4f}, 容忍度R²={test_tol_r2:.4f}, 在容忍范围内比例={test_within_tol:.2%}")

        # Step 7: compare the adaptive ensemble with every base model
        print("\n与各基础模型性能比较:")
        for model_name in available_models:
            model_train_pred = train_predictions[model_name]
            model_test_pred = test_predictions[model_name]

            model_train_r2 = r2_score(y_train[target], model_train_pred)
            model_test_r2 = r2_score(y_test[target], model_test_pred)

            train_r2_diff = train_r2 - model_train_r2
            test_r2_diff = test_r2 - model_test_r2

            print(f"  vs {model_name}:")
            print(f"    训练集R²: {train_r2:.4f} vs {model_train_r2:.4f} (差异: {train_r2_diff:.4f})")
            print(f"    测试集R²: {test_r2:.4f} vs {model_test_r2:.4f} (差异: {test_r2_diff:.4f})")

        # Step 8: how often each model is selected
        train_model_selection_counts = np.bincount(train_best_models, minlength=len(available_models))
        train_model_selection_percent = train_model_selection_counts / len(train_best_models) * 100

        test_model_selection_counts = np.bincount(test_best_models, minlength=len(available_models))
        test_model_selection_percent = test_model_selection_counts / len(test_best_models) * 100

        print("\n各模型在训练集上的选择频率:")
        for i, model_name in enumerate(available_models):
            print(f"  {model_name}: {train_model_selection_counts[i]} 次 ({train_model_selection_percent[i]:.2f}%)")

        print("\n各模型在测试集上的选择频率:")
        for i, model_name in enumerate(available_models):
            print(f"  {model_name}: {test_model_selection_counts[i]} 次 ({test_model_selection_percent[i]:.2f}%)")

        # Step 9: wrap everything in a model object and save it
        class AdaptiveEnsembleModel:
            """Per-sample model selection driven by a meta-classifier.

            Parameters
            ----------
            meta_classifier : fitted classifier
                Predicts, for each row, an index into ``available_models``.
            models_dict : dict
                Maps model name to a fitted model exposing ``predict``.
            available_models : list of str
                Model names; the classifier's labels index into this list.
            model_input_data : dict
                Per-model train/test feature tables; stored only.
            feature_columns : sequence of str, optional
                Column names applied when ``predict`` receives a plain 2-D
                array. Captured at construction so ``predict`` does not
                depend on notebook globals after the model is unpickled.

            NOTE(review): ``predict`` feeds the same ``X`` to the
            meta-classifier and to the selected base model, while this cell
            evaluates base models on separate feature tables. Confirm that a
            single ``X`` is valid for all of them before using ``predict``
            on new data.
            """

            def __init__(self, meta_classifier, models_dict, available_models,
                         model_input_data, feature_columns=None):
                self.meta_classifier = meta_classifier
                self.models_dict = models_dict
                self.available_models = available_models
                self.model_input_data = model_input_data
                self.feature_columns = (
                    list(feature_columns) if feature_columns is not None else None
                )

                # Pre-processing kind expected by each known model
                self.data_type_map = {
                    'XGBoost': 'tree',
                    'LightGBM': 'tree',
                    'HistGradientBoosting': 'tree',
                    'RandomForest': 'tree_filled',
                    'GaussianProcess': 'linear'
                }

            def predict(self, X):
                """Predict each row with the model the meta-classifier selects."""
                # Give 2-D array input the training column names, if known.
                # getattr keeps instances pickled before feature_columns
                # existed usable.
                if (not isinstance(X, pd.DataFrame)
                        and hasattr(X, 'shape') and len(X.shape) == 2):
                    X = pd.DataFrame(X, columns=getattr(self, 'feature_columns', None))

                # Index of the model to use for every row
                model_choices = np.asarray(self.meta_classifier.predict(X))
                predictions = np.zeros(len(X))

                # Predict in one batch per selected model instead of making
                # one predict() call per row.
                for model_idx in np.unique(model_choices):
                    model_name = self.available_models[model_idx]
                    model = self.models_dict[model_name]
                    rows = np.flatnonzero(model_choices == model_idx)

                    if isinstance(X, pd.DataFrame):
                        x_subset = X.iloc[rows]
                    else:
                        x_subset = X[rows]

                    # Only 'tree_filled' models need NaNs replaced by 0;
                    # every other kind receives the rows unchanged.
                    data_type = self.data_type_map.get(model_name, 'standard')
                    if data_type == 'tree_filled':
                        if isinstance(x_subset, pd.DataFrame):
                            x_processed = x_subset.fillna(0)
                        else:
                            # nan must be passed by keyword: the second
                            # positional argument of nan_to_num is ``copy``.
                            x_processed = np.nan_to_num(x_subset, nan=0.0)
                    else:
                        x_processed = x_subset

                    predictions[rows] = np.asarray(model.predict(x_processed)).reshape(-1)

                return predictions

            def get_feature_importances(self):
                """Return the meta-classifier's feature importances, or None."""
                if hasattr(self.meta_classifier, 'feature_importances_'):
                    return self.meta_classifier.feature_importances_
                return None

        # Column names the meta-classifier was trained on (None for arrays)
        meta_feature_columns = (
            list(X_train[target].columns)
            if hasattr(X_train[target], 'columns') else None
        )

        # Instantiate the adaptive ensemble
        adaptive_model = AdaptiveEnsembleModel(
            meta_classifier=meta_classifier,
            models_dict=models[target],
            available_models=available_models,
            model_input_data=model_input_data,
            feature_columns=meta_feature_columns
        )

        # Register the ensemble and persist it with pickle.
        # NOTE(review): the pickle also contains model_input_data (the
        # train/test feature tables) and the whole models[target] dict.
        models[target]['AdaptiveEnsemble'] = adaptive_model
        adaptive_model_file = os.path.join(model_folder, f'{target}_自适应集成模型.pkl')
        with open(adaptive_model_file, 'wb') as f:
            pickle.dump(adaptive_model, f)
        print(f"自适应集成模型已保存至 {adaptive_model_file}")

        # Tables with actual value, adaptive prediction and absolute error
        train_prediction = pd.DataFrame({
            '实际值': y_train[target],
            '自适应集成预测值': train_adaptive_predictions,
            '误差': np.abs(y_train[target] - train_adaptive_predictions)
        })

        test_prediction = pd.DataFrame({
            '实际值': y_test[target],
            '自适应集成预测值': test_adaptive_predictions,
            '误差': np.abs(y_test[target] - test_adaptive_predictions)
        })

        # Add one column per base model for comparison
        for model_name in available_models:
            train_prediction[f'{model_name}预测值'] = train_predictions[model_name]
            test_prediction[f'{model_name}预测值'] = test_predictions[model_name]

        # Write both tables to CSV
        train_file = os.path.join(save_folder, f'{target}_自适应集成训练集预测结果.csv')
        test_file = os.path.join(save_folder, f'{target}_自适应集成测试集预测结果.csv')

        train_prediction.to_csv(train_file, index=False)
        test_prediction.to_csv(test_file, index=False)

        print(f"训练集预测结果已保存至 {train_file}")
        print(f"测试集预测结果已保存至 {test_file}")

        # Plot: predicted vs actual values (train and test)
        plt.figure(figsize=(12, 5))

        # Train-set scatter with the identity line
        plt.subplot(1, 2, 1)
        plt.scatter(y_train[target], train_adaptive_predictions, alpha=0.5)
        plt.plot([y_train[target].min(), y_train[target].max()], [y_train[target].min(), y_train[target].max()], 'r--')
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(f'训练集: R²={train_r2:.4f}')

        # Test-set scatter with the identity line
        plt.subplot(1, 2, 2)
        plt.scatter(y_test[target], test_adaptive_predictions, alpha=0.5)
        plt.plot([y_test[target].min(), y_test[target].max()], [y_test[target].min(), y_test[target].max()], 'r--')
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(f'测试集: R²={test_r2:.4f}')

        plt.tight_layout()
        plt.show()

        # Plot: error distributions
        plt.figure(figsize=(12, 5))

        # Train-set errors
        plt.subplot(1, 2, 1)
        train_errors_plot = y_train[target] - train_adaptive_predictions
        plt.hist(train_errors_plot, bins=30, alpha=0.7)
        plt.axvline(x=0, color='r', linestyle='--')
        plt.xlabel('预测误差')
        plt.ylabel('频次')
        plt.title(f'训练集误差分布 (MAE={np.abs(train_errors_plot).mean():.4f})')

        # Test-set errors
        plt.subplot(1, 2, 2)
        test_errors_plot = y_test[target] - test_adaptive_predictions
        plt.hist(test_errors_plot, bins=30, alpha=0.7)
        plt.axvline(x=0, color='r', linestyle='--')
        plt.xlabel('预测误差')
        plt.ylabel('频次')
        plt.title(f'测试集误差分布 (MAE={np.abs(test_errors_plot).mean():.4f})')

        plt.tight_layout()
        plt.show()

        # Feature importances of the meta-classifier, ranked
        feature_importance = pd.DataFrame({
            'Feature': X_train[target].columns,
            'Importance': meta_classifier.feature_importances_
        })
        feature_importance = feature_importance.sort_values('Importance', ascending=False)

        # Save the ranked importances
        importance_file = os.path.join(save_folder, f'{target}_自适应集成特征重要性.csv')
        feature_importance.to_csv(importance_file, index=False)
        print(f"特征重要性数据已保存至 {importance_file}")

        plt.figure(figsize=(10, 6))
        plt.barh(feature_importance['Feature'], feature_importance['Importance'])
        plt.xlabel('重要性')
        plt.ylabel('特征')
        plt.title(f'{target} - 自适应集成模型选择特征重要性')
        plt.grid(True, axis='x')
        plt.tight_layout()
        plt.show()

        # Plot: pie charts of the model-selection frequencies
        plt.figure(figsize=(12, 5))

        # Selection frequency on the train set
        plt.subplot(1, 2, 1)
        plt.pie(train_model_selection_counts, labels=available_models, autopct='%1.1f%%')
        plt.title(f'训练集 - 模型选择频率')

        # Selection frequency on the test set
        plt.subplot(1, 2, 2)
        plt.pie(test_model_selection_counts, labels=available_models, autopct='%1.1f%%')
        plt.title(f'测试集 - 模型选择频率')

        plt.tight_layout()
        plt.show()

        # Save the model-selection frequencies
        model_selection_data = pd.DataFrame({
            '模型': available_models,
            '训练集选择次数': train_model_selection_counts,
            '训练集选择百分比': train_model_selection_percent,
            '测试集选择次数': test_model_selection_counts,
            '测试集选择百分比': test_model_selection_percent
        })

        selection_file = os.path.join(save_folder, f'{target}_自适应集成模型选择频率.csv')
        model_selection_data.to_csv(selection_file, index=False)
        print(f"模型选择频率数据已保存至 {selection_file}")

        # Plot: distribution of test predictions, ensemble vs base models
        plt.figure(figsize=(12, 6))

        model_data = [test_adaptive_predictions] + [test_predictions[model] for model in available_models]
        model_labels = ['自适应集成'] + available_models

        plt.boxplot(model_data, labels=model_labels)
        plt.ylabel('预测值')
        plt.title('自适应集成模型与各基础模型预测分布对比')
        plt.grid(True, axis='y')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Imported here so the handler cannot itself fail with a NameError
        # (and hide the real error) when traceback was not imported elsewhere.
        import traceback

        print(f"创建自适应集成模型失败: {str(e)}")
        print(f"错误详情: {traceback.format_exc()}")



In [None]:

# Weighted-average ensemble - weights are optimised on the standard R²
print(f"训练 {target} 的加权平均集成模型...")

# Tolerance configured for this target (0.15 when none is configured)
current_tolerance = target_tolerance.get(target, 0.15)

# Available models and their train/test predictions
available_models = []
train_predictions = {}
test_predictions = {}

# (registry name, lazy lookup of the matching train/test feature tables).
# The lookups are lambdas so a table is only touched when its model exists.
weighted_candidates = (
    ('XGBoost', lambda: (X_train_tree, X_test_tree)),
    ('LightGBM', lambda: (X_train_tree, X_test_tree)),
    ('HistGradientBoosting', lambda: (X_train_tree, X_test_tree)),
    ('RandomForest', lambda: (X_train_tree_filled, X_test_tree_filled)),
    ('GaussianProcess', lambda: (X_train_linear, X_test_linear)),
)

for candidate_name, get_feature_tables in weighted_candidates:
    if candidate_name not in models[target]:
        continue

    model = models[target][candidate_name]
    train_tables, test_tables = get_feature_tables()
    train_pred = model.predict(train_tables[target])
    test_pred = model.predict(test_tables[target])

    available_models.append(candidate_name)
    train_predictions[candidate_name] = train_pred
    test_predictions[candidate_name] = test_pred

print(f"可用模型: {available_models}")

if len(available_models) < 2:
    print("加权平均集成至少需要两个模型，目前可用模型不足")
else:
    try:
        # 通过优化找到最优权重
        print("寻找最优权重组合...")
        from scipy.optimize import minimize
        
        # 定义自定义加权平均函数
        def weighted_prediction(weights, preds_list):
            weighted_preds = np.zeros(preds_list[0].shape)
            for i, preds in enumerate(preds_list):
                weighted_preds += weights[i] * preds
            return weighted_preds
        
        # 定义要优化的损失函数（最大化标准R²）- 修改为使用标准R²而非容忍度R²
        def neg_r2(weights, preds_list, y_true):
            # 归一化权重确保和为1
            weights = np.array(weights)
            weights = weights / np.sum(weights)
            
            weighted_preds = weighted_prediction(weights, preds_list)
            r2 = r2_score(y_true, weighted_preds)
            return -r2  # 最小化负的R²（即最大化R²）
        
        # Training-set predictions of every available model, in the same order
        # as `available_models`; this order defines the meaning of each weight.
        train_preds_list = [train_predictions[model_name] for model_name in available_models]
        
        # Starting point: equal weight for every model.
        initial_weights = np.ones(len(available_models)) / len(available_models)
        
        # Constraints: the weights must sum to 1 (equality constraint) and each
        # weight is bounded to [0, 1]. Note that neg_r2 also renormalizes the
        # weights internally before scoring.
        constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
        bounds = [(0, 1) for _ in range(len(available_models))]
        
        # Search for the weights with SLSQP. The objective is evaluated on the
        # training predictions and training targets only.
        print("优化权重中...")
        result = minimize(
            neg_r2, 
            initial_weights, 
            args=(train_preds_list, y_train[target]),
            bounds=bounds,
            constraints=constraints,
            method='SLSQP'
        )
        
        if result.success:
            # 获取最优权重并归一化
            optimal_weights = result.x
            optimal_weights = optimal_weights / np.sum(optimal_weights)
            
            print("\n找到最优权重组合:")
            for i, model_name in enumerate(available_models):
                print(f"  {model_name}: {optimal_weights[i]:.4f}")
                
            # 使用最优权重在训练集和测试集上评估性能
            train_weighted_preds = weighted_prediction(
                optimal_weights, 
                [train_predictions[model_name] for model_name in available_models]
            )
            
            test_weighted_preds = weighted_prediction(
                optimal_weights, 
                [test_predictions[model_name] for model_name in available_models]
            )
            
            # 计算性能指标
            train_r2 = r2_score(y_train[target], train_weighted_preds)
            train_tol_r2 = tolerance_r2_score(y_train[target], train_weighted_preds, tolerance=current_tolerance, target=target)
            train_within_tol = prediction_within_tolerance(y_train[target], train_weighted_preds, tolerance=current_tolerance, target=target)
            
            test_r2 = r2_score(y_test[target], test_weighted_preds)
            test_tol_r2 = tolerance_r2_score(y_test[target], test_weighted_preds, tolerance=current_tolerance, target=target)
            test_within_tol = prediction_within_tolerance(y_test[target], test_weighted_preds, tolerance=current_tolerance, target=target)
            
            print("\n加权平均集成性能:")
            print(f"  训练集 - R²: {train_r2:.4f}, 容忍度R²: {train_tol_r2:.4f}, 在容忍范围内: {train_within_tol:.2%}")
            print(f"  测试集 - R²: {test_r2:.4f}, 容忍度R²: {test_tol_r2:.4f}, 在容忍范围内: {test_within_tol:.2%}")
            
            # 与各个基础模型比较性能
            print("\n与各基础模型性能比较:")
            for model_name in available_models:
                model_test_pred = test_predictions[model_name]
                model_r2 = r2_score(y_test[target], model_test_pred)
                model_tol_r2 = tolerance_r2_score(y_test[target], model_test_pred, tolerance=current_tolerance, target=target)
                
                r2_diff = test_r2 - model_r2
                tol_r2_diff = test_tol_r2 - model_tol_r2
                
                print(f"  vs {model_name}:")
                print(f"    R² 差异: {r2_diff:.4f} ({'+' if r2_diff > 0 else ''}{r2_diff/max(0.0001, abs(model_r2))*100:.2f}%)")
                print(f"    容忍度R² 差异: {tol_r2_diff:.4f} ({'+' if tol_r2_diff > 0 else ''}{tol_r2_diff/max(0.0001, abs(model_tol_r2))*100:.2f}%)")
            
            # 创建加权平均集成模型
            class WeightedAverageEnsemble:
                """Fixed-weight blend of several already-fitted regressors.

                Attributes are stored exactly as passed in:
                    models_dict: mapping of model name -> fitted model with ``predict``.
                    model_names: names (keys of ``models_dict``) to blend, in weight order.
                    weights: one weight per entry of ``model_names``.
                    model_datasets: mapping of model name -> feature-set tag; kept for
                        reference only, ``predict`` does not consult it.
                """

                def __init__(self, models_dict, model_names, weights, model_datasets):
                    self.models_dict = models_dict
                    self.model_names = model_names
                    self.weights = weights
                    self.model_datasets = model_datasets

                def predict(self, X):
                    """Return the weighted sum of every member model's prediction on ``X``.

                    ``X`` is handed to each model unchanged, with one exception: for the
                    'RandomForest' member a DataFrame input has its NaNs replaced by 0
                    first. No other preprocessing is applied here, so the caller must
                    supply features in a form every member model accepts.
                    """
                    frame_input = isinstance(X, pd.DataFrame)

                    member_outputs = []
                    for name in self.model_names:
                        estimator = self.models_dict[name]
                        if name == 'RandomForest' and frame_input:
                            # Random forest cannot consume NaN; fill with 0.
                            features = X.fillna(0)
                        else:
                            features = X
                        member_outputs.append(estimator.predict(features))

                    # Blend: output takes the shape of the first member's prediction.
                    blended = np.zeros(member_outputs[0].shape)
                    for slot, output in enumerate(member_outputs):
                        blended += self.weights[slot] * output

                    return blended
            
            # 创建模型数据集字典
            model_datasets = {
                'XGBoost': 'tree',
                'LightGBM': 'tree',
                'HistGradientBoosting': 'tree',
                'RandomForest': 'tree_filled',
                'GaussianProcess': 'linear'
            }
            
            # 实例化加权平均集成模型
            weighted_model = WeightedAverageEnsemble(
                models_dict=models[target],
                model_names=available_models,
                weights=optimal_weights,
                model_datasets=model_datasets
            )
            
            # 保存模型
            models[target]['WeightedEnsemble'] = weighted_model
            # 使用pickle保存加权平均集成模型
            weighted_model_file = os.path.join(model_folder, f'{target}_加权平均集成模型.pkl')
            with open(weighted_model_file, 'wb') as f:
                pickle.dump(weighted_model, f)
            print(f"加权平均集成模型已保存至 {weighted_model_file}")

            # 保存训练集和测试集的预测结果
            train_prediction = pd.DataFrame({
                '实际值': y_train[target],
                '加权平均预测值': train_weighted_preds,
                '误差': np.abs(y_train[target] - train_weighted_preds)
            })

            test_prediction = pd.DataFrame({
                '实际值': y_test[target],
                '加权平均预测值': test_weighted_preds,
                '误差': np.abs(y_test[target] - test_weighted_preds)
            })

            # 添加各基础模型的预测结果以便比较
            for model_name in available_models:
                train_prediction[f'{model_name}预测值'] = train_predictions[model_name]
                test_prediction[f'{model_name}预测值'] = test_predictions[model_name]

            # 保存到文件
            train_file = os.path.join(save_folder, f'{target}_加权平均集成训练集预测结果.csv')
            test_file = os.path.join(save_folder, f'{target}_加权平均集成测试集预测结果.csv')

            train_prediction.to_csv(train_file, index=False)
            test_prediction.to_csv(test_file, index=False)

            print(f"训练集预测结果已保存至 {train_file}")
            print(f"测试集预测结果已保存至 {test_file}")

            # 可视化: 预测vs实际值散点图 (训练集和测试集)
            plt.figure(figsize=(12, 5))
            
            # 训练集散点图
            plt.subplot(1, 2, 1)
            plt.scatter(y_train[target], train_weighted_preds, alpha=0.5)
            plt.plot([y_train[target].min(), y_train[target].max()], [y_train[target].min(), y_train[target].max()], 'r--')
            plt.xlabel('实际值')
            plt.ylabel('预测值')
            plt.title(f'训练集: R²={train_r2:.4f}')
            
            # 测试集散点图
            plt.subplot(1, 2, 2)
            plt.scatter(y_test[target], test_weighted_preds, alpha=0.5)
            plt.plot([y_test[target].min(), y_test[target].max()], [y_test[target].min(), y_test[target].max()], 'r--')
            plt.xlabel('实际值')
            plt.ylabel('预测值')
            plt.title(f'测试集: R²={test_r2:.4f}')
            
            plt.tight_layout()
            plt.show()
            
            # 绘制误差分布
            plt.figure(figsize=(12, 5))
            
            # 训练集误差
            plt.subplot(1, 2, 1)
            train_errors = y_train[target] - train_weighted_preds
            plt.hist(train_errors, bins=30, alpha=0.7)
            plt.axvline(x=0, color='r', linestyle='--')
            plt.xlabel('预测误差')
            plt.ylabel('频次')
            plt.title(f'训练集误差分布 (MAE={np.abs(train_errors).mean():.4f})')
            
            # 测试集误差
            plt.subplot(1, 2, 2)
            test_errors = y_test[target] - test_weighted_preds
            plt.hist(test_errors, bins=30, alpha=0.7)
            plt.axvline(x=0, color='r', linestyle='--')
            plt.xlabel('预测误差')
            plt.ylabel('频次')
            plt.title(f'测试集误差分布 (MAE={np.abs(test_errors).mean():.4f})')
            
            plt.tight_layout()
            plt.show()
            
            # 绘制权重条形图
            plt.figure(figsize=(10, 6))
            plt.bar(available_models, optimal_weights)
            plt.xlabel('模型')
            plt.ylabel('权重')
            plt.title(f'{target} - 加权平均集成模型权重分布')
            plt.xticks(rotation=45)
            plt.grid(True, axis='y')
            plt.tight_layout()
            plt.show()
            
            # 绘制各模型与加权平均模型的预测对比图
            plt.figure(figsize=(12, 6))
            model_data = [test_weighted_preds] + [test_predictions[model] for model in available_models]
            model_labels = ['加权平均'] + available_models
            
            plt.boxplot(model_data, labels=model_labels)
            plt.ylabel('预测值')
            plt.title('加权平均模型与各基础模型预测分布对比')
            plt.grid(True, axis='y')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()
            
        else:
            print("权重优化失败:", result.message)
            
    except Exception as e:
        print(f"创建加权平均集成模型失败: {str(e)}")
        print(f"错误详情: {traceback.format_exc()}")

In [None]:
# 基于区间的集成模型 - 每个模型在不同区间的最佳表现
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

print(f"训练 {target} 的基于区间的集成模型...")

# 获取目标特定的容忍度
current_tolerance = target_tolerance.get(target, 0.15)

# Gather every fitted base model for this target together with its train/test
# predictions and the tag of the feature set it is evaluated on.
available_models = []
train_predictions = {}
test_predictions = {}
model_datasets = {}

# (model name, feature-set tag), in registration order.
base_model_specs = (
    ('XGBoost', 'tree'),
    ('LightGBM', 'tree'),
    ('HistGradientBoosting', 'tree'),
    ('RandomForest', 'tree_filled'),
    ('GaussianProcess', 'linear'),
)

for spec_name, spec_dataset in base_model_specs:
    if spec_name not in models[target]:
        continue

    # Resolve the per-target feature containers only for models that exist,
    # so a feature set that was never built is never touched.
    if spec_dataset == 'tree':
        train_features_by_target, test_features_by_target = X_train_tree, X_test_tree
    elif spec_dataset == 'tree_filled':
        train_features_by_target, test_features_by_target = X_train_tree_filled, X_test_tree_filled
    else:
        train_features_by_target, test_features_by_target = X_train_linear, X_test_linear

    model = models[target][spec_name]
    train_pred = model.predict(train_features_by_target[target])
    test_pred = model.predict(test_features_by_target[target])
    available_models.append(spec_name)
    train_predictions[spec_name] = train_pred
    test_predictions[spec_name] = test_pred
    model_datasets[spec_name] = spec_dataset

print(f"可用模型: {available_models}")

if len(available_models) < 2:
    print("基于区间的集成至少需要两个模型，目前可用模型不足")
else:
    try:
        # Step 1: decide how to split the target range into intervals.
        # Option 1: quantiles of the target variable
        # Option 2: distribution of the prediction error
        # Option 3: adaptive interval splitting
        # Only option 1 (quantile split) is implemented below.
        
        print("分析目标变量分布并确定区间划分...")
        
        # Inspect the distribution of the training target.
        # NOTE(review): y_min / y_max are not used anywhere in the visible code.
        y_train_values = y_train[target].values
        y_min, y_max = y_train_values.min(), y_train_values.max()
        
        # Try several interval counts and keep the best-scoring configuration.
        best_num_intervals = 3
        best_overall_r2 = -np.inf
        best_interval_config = None
        
        print("尝试不同的区间划分数量...")
        for num_intervals in range(3, 8):  # try 3 through 7 intervals
            print(f"\n尝试 {num_intervals} 个区间:")
            
            # Quantile-based boundaries: equal-frequency bins over the training target.
            quantiles = np.linspace(0, 1, num_intervals + 1)
            interval_boundaries = np.quantile(y_train_values, quantiles)
            
            # Drop duplicated boundaries (ties in the target collapse bins),
            # so the real number of intervals may be smaller than requested.
            interval_boundaries = np.unique(interval_boundaries)
            actual_num_intervals = len(interval_boundaries) - 1
            
            if actual_num_intervals < 2:
                continue
                
            # Find the best model for every interval.
            interval_best_models = {}
            interval_performances = {}
            
            valid_intervals = 0
            total_weighted_r2 = 0
            total_samples = 0
            
            for i in range(actual_num_intervals):
                lower_bound = interval_boundaries[i]
                upper_bound = interval_boundaries[i + 1] 
                
                # Select the training samples whose true target lies in this interval.
                if i == actual_num_intervals - 1:  # the last interval includes its upper bound
                    mask = (y_train_values >= lower_bound) & (y_train_values <= upper_bound)
                else:
                    mask = (y_train_values >= lower_bound) & (y_train_values < upper_bound)
                
                interval_samples = np.sum(mask)
                if interval_samples < 10:  # too few samples in this interval: skip it
                    continue
                
                # Score every model on the samples of this interval only.
                y_interval = y_train_values[mask]
                best_r2 = -np.inf
                best_model = None
                
                interval_r2_scores = {}
                for model_name in available_models:
                    pred_interval = train_predictions[model_name][mask]
                    r2 = r2_score(y_interval, pred_interval)
                    interval_r2_scores[model_name] = r2
                    
                    # Strict '>' keeps the earliest model in available_models on ties.
                    if r2 > best_r2:
                        best_r2 = r2
                        best_model = model_name
                
                if best_model:
                    # Keyed by the interval index i; skipped intervals leave gaps in the keys.
                    interval_best_models[i] = best_model
                    interval_performances[i] = {
                        'best_model': best_model,
                        'best_r2': best_r2,
                        'bounds': (lower_bound, upper_bound),
                        'samples': interval_samples,
                        'all_r2': interval_r2_scores
                    }
                    
                    # Accumulate the sample-weighted R² of the per-interval winners.
                    total_weighted_r2 += best_r2 * interval_samples
                    total_samples += interval_samples
                    valid_intervals += 1
            
            # A configuration is only eligible with at least two usable intervals.
            if valid_intervals >= 2 and total_samples > 0:
                overall_r2 = total_weighted_r2 / total_samples
                print(f"  有效区间数: {valid_intervals}, 总体加权R²: {overall_r2:.4f}")
                
                # Keep the configuration with the highest sample-weighted mean of
                # per-interval best R² (all measured on the training set).
                if overall_r2 > best_overall_r2:
                    best_overall_r2 = overall_r2
                    best_num_intervals = actual_num_intervals
                    best_interval_config = {
                        'boundaries': interval_boundaries,
                        'models': interval_best_models,
                        'performances': interval_performances,
                        'num_intervals': actual_num_intervals
                    }
        
        if best_interval_config is None:
            print("无法找到合适的区间划分，回退到简单集成")
        else:
            print(f"\n选择最佳配置: {best_num_intervals} 个区间, 总体R²: {best_overall_r2:.4f}")
            
            # 使用最佳配置
            interval_boundaries = best_interval_config['boundaries']
            interval_best_models = best_interval_config['models']
            interval_performances = best_interval_config['performances']
            
            # 打印每个区间的详细信息
            print("\n区间划分详情:")
            for i, perf in interval_performances.items():
                bounds = perf['bounds']
                best_model = perf['best_model']
                best_r2 = perf['best_r2']
                samples = perf['samples']
                
                print(f"区间 {i+1}: [{bounds[0]:.3f}, {bounds[1]:.3f}]")
                print(f"  最佳模型: {best_model} (R²: {best_r2:.4f})")
                print(f"  样本数: {samples}")
                print(f"  所有模型R²: {perf['all_r2']}")
                print()
            
            # 步骤2: 基于区间的预测函数
            def interval_based_prediction(y_values, is_training=True):
                """Pick, per sample, the prediction of the model assigned to its interval.

                Each entry of ``y_values`` is located in the intervals stored in
                ``interval_performances``. Intervals are half-open
                ``[lower, upper)``, except the one with the highest index, which
                also includes its upper bound. The prediction of that interval's
                best model for the same sample position is used.

                Samples that fall into no interval use a single fallback model:
                the one with the highest R² over the whole *training* set. The
                fallback is deliberately chosen from training data even when
                ``is_training`` is False, so that test labels never influence
                which model produces a test prediction. It is resolved at most
                once per call.

                Args:
                    y_values: 1-D array of values used to locate the interval of
                        each sample (true targets or an estimate of them).
                        Position ``i`` must correspond to row ``i`` of the
                        prediction arrays.
                    is_training: If True, predictions are taken from
                        ``train_predictions``; otherwise from ``test_predictions``.

                Returns:
                    Tuple ``(predictions, model_usage)``: a float array with one
                    prediction per entry of ``y_values``, and a dict mapping each
                    name in ``available_models`` to the number of samples it served.
                """
                source_predictions = train_predictions if is_training else test_predictions
                predictions = np.zeros(len(y_values))
                model_usage = {model: 0 for model in available_models}

                # Loop invariants: index of the closed (last) interval, and the
                # lazily resolved fallback model.
                last_interval_idx = max(interval_performances, default=None)
                fallback_model = None

                for i, y_val in enumerate(y_values):
                    # Locate the interval containing this value.
                    selected_model = None
                    for interval_idx, perf in interval_performances.items():
                        lower_bound, upper_bound = perf['bounds']

                        if interval_idx == last_interval_idx:
                            in_interval = lower_bound <= y_val <= upper_bound
                        else:
                            in_interval = lower_bound <= y_val < upper_bound

                        if in_interval:
                            selected_model = perf['best_model']
                            break

                    # Outside every interval: use the globally best model.
                    if selected_model is None:
                        if fallback_model is None:
                            y_all = y_train[target].values
                            fallback_model = max(
                                available_models,
                                key=lambda name: r2_score(y_all, train_predictions[name]),
                            )
                        selected_model = fallback_model

                    predictions[i] = source_predictions[selected_model][i]
                    model_usage[selected_model] += 1

                return predictions, model_usage
            
            # 步骤3: 在训练集和测试集上应用基于区间的预测
            print("应用基于区间的预测...")
            
            # 训练集预测（用真实值确定区间）
            train_interval_preds, train_model_usage = interval_based_prediction(
                y_train[target].values, is_training=True
            )
            
            # 测试集预测 - 这里需要特殊处理，因为我们不知道真实值
            # 方法1: 使用预测值的平均来估计区间
            # 方法2: 使用特征来预测区间
            # 方法3: 使用所有模型的平均预测值来估计区间
            
            print("为测试集确定区间...")
            # 使用所有模型预测的平均值来估计测试样本的区间
            test_avg_predictions = np.mean([test_predictions[model] for model in available_models], axis=0)
            test_interval_preds, test_model_usage = interval_based_prediction(
                test_avg_predictions, is_training=False
            )
            
            # 步骤4: 评估性能
            train_r2 = r2_score(y_train[target], train_interval_preds)
            train_tol_r2 = tolerance_r2_score(y_train[target], train_interval_preds, tolerance=current_tolerance, target=target)
            train_within_tol = prediction_within_tolerance(y_train[target], train_interval_preds, tolerance=current_tolerance, target=target)
            
            test_r2 = r2_score(y_test[target], test_interval_preds)
            test_tol_r2 = tolerance_r2_score(y_test[target], test_interval_preds, tolerance=current_tolerance, target=target)
            test_within_tol = prediction_within_tolerance(y_test[target], test_interval_preds, tolerance=current_tolerance, target=target)
            
            print(f"\n基于区间的集成模型性能:")
            print(f"训练集: R²={train_r2:.4f}, 容忍度R²={train_tol_r2:.4f}, 在容忍范围内比例={train_within_tol:.2%}")
            print(f"测试集: R²={test_r2:.4f}, 容忍度R²={test_tol_r2:.4f}, 在容忍范围内比例={test_within_tol:.2%}")
            
            # 步骤5: 分析模型使用频率
            print("\n训练集中各模型使用频率:")
            for model, count in train_model_usage.items():
                percentage = count / len(y_train[target]) * 100
                print(f"  {model}: {count} 次 ({percentage:.2f}%)")
            
            print("\n测试集中各模型使用频率:")
            for model, count in test_model_usage.items():
                percentage = count / len(y_test[target]) * 100
                print(f"  {model}: {count} 次 ({percentage:.2f}%)")
            
            # 步骤6: 与各基础模型比较性能
            print("\n与各基础模型性能比较:")
            for model_name in available_models:
                model_test_pred = test_predictions[model_name]
                model_r2 = r2_score(y_test[target], model_test_pred)
                r2_diff = test_r2 - model_r2
                
                print(f"  vs {model_name}:")
                print(f"    测试集R²: {test_r2:.4f} vs {model_r2:.4f} (差异: {r2_diff:.4f})")
            
            # 步骤7: 创建基于区间的集成模型类
            class IntervalBasedEnsemble:
                """Route each sample to the model that did best in its value interval.

                The interval of a sample is estimated from the mean of all member
                models' predictions, since the true target is unknown at predict time.

                Attributes:
                    models_dict: mapping of model name -> fitted model with ``predict``.
                    interval_boundaries: ``interval_config['boundaries']``, kept for reporting.
                    interval_performances: mapping of interval index -> dict with at
                        least 'bounds', 'best_model' and 'all_r2'.
                    available_models: names of the member models.
                    model_datasets: mapping of model name -> feature-set tag; stored
                        only, ``predict`` does not consult it.
                    global_best_model: fallback model, the one with the highest mean
                        of its per-interval R² values.
                """

                def __init__(self, models_dict, interval_config, available_models, model_datasets):
                    performances = interval_config['performances']

                    self.models_dict = models_dict
                    self.interval_boundaries = interval_config['boundaries']
                    self.interval_performances = performances
                    self.available_models = available_models
                    self.model_datasets = model_datasets

                    def mean_interval_r2(name):
                        # Average R² of this model across all recorded intervals.
                        return np.mean([perf['all_r2'][name] for perf in performances.values()])

                    # Pre-computed fallback for samples outside every interval.
                    self.global_best_model = max(available_models, key=mean_interval_r2)

                def predict(self, X):
                    """Predict by selecting, per sample, the best model of its interval.

                    ``X`` is passed to every member model unchanged, except that a
                    DataFrame input is NaN-filled with 0 for the 'RandomForest'
                    member. Intervals are ``[lower, upper)``; the interval with the
                    highest index also includes its upper bound. Samples whose
                    averaged prediction falls in no interval use ``global_best_model``.
                    """
                    frame_input = isinstance(X, pd.DataFrame)

                    # Predictions of every member model.
                    all_predictions = {}
                    for name in self.available_models:
                        estimator = self.models_dict[name]
                        if name == 'RandomForest' and frame_input:
                            features = X.fillna(0)
                        else:
                            features = X
                        all_predictions[name] = estimator.predict(features)

                    # Interval estimate: mean prediction across members.
                    avg_predictions = np.mean(list(all_predictions.values()), axis=0)

                    # Index of the closed (last) interval; constant for the whole call.
                    closed_interval_idx = max(self.interval_performances, default=None)

                    final_predictions = np.zeros(len(avg_predictions))
                    for row, pred_val in enumerate(avg_predictions):
                        selected_model = None
                        for interval_idx, perf in self.interval_performances.items():
                            lower_bound, upper_bound = perf['bounds']
                            if interval_idx == closed_interval_idx:
                                matched = lower_bound <= pred_val <= upper_bound
                            else:
                                matched = lower_bound <= pred_val < upper_bound
                            if matched:
                                selected_model = perf['best_model']
                                break

                        if selected_model is None:
                            selected_model = self.global_best_model

                        final_predictions[row] = all_predictions[selected_model][row]

                    return final_predictions

                def get_interval_info(self):
                    """Return the interval configuration and the fallback model name."""
                    info = {}
                    info['boundaries'] = self.interval_boundaries
                    info['performances'] = self.interval_performances
                    info['global_best'] = self.global_best_model
                    return info
            
            # 创建基于区间的集成模型
            interval_model = IntervalBasedEnsemble(
                models_dict=models[target],
                interval_config=best_interval_config,
                available_models=available_models,
                model_datasets=model_datasets
            )
            
            # 保存模型
            models[target]['IntervalEnsemble'] = interval_model
            
            # 使用pickle保存模型
            interval_model_file = os.path.join(model_folder, f'{target}_基于区间集成模型.pkl')
            with open(interval_model_file, 'wb') as f:
                pickle.dump(interval_model, f)
            print(f"基于区间的集成模型已保存至 {interval_model_file}")
            
            # 保存预测结果
            train_prediction = pd.DataFrame({
                '实际值': y_train[target],
                '区间集成预测值': train_interval_preds,
                '误差': np.abs(y_train[target] - train_interval_preds)
            })

            test_prediction = pd.DataFrame({
                '实际值': y_test[target],
                '区间集成预测值': test_interval_preds,
                '误差': np.abs(y_test[target] - test_interval_preds)
            })

            # 添加各基础模型的预测结果
            for model_name in available_models:
                train_prediction[f'{model_name}预测值'] = train_predictions[model_name]
                test_prediction[f'{model_name}预测值'] = test_predictions[model_name]

            # 保存到文件
            train_file = os.path.join(save_folder, f'{target}_基于区间集成训练集预测结果.csv')
            test_file = os.path.join(save_folder, f'{target}_基于区间集成测试集预测结果.csv')

            train_prediction.to_csv(train_file, index=False)
            test_prediction.to_csv(test_file, index=False)

            print(f"训练集预测结果已保存至 {train_file}")
            print(f"测试集预测结果已保存至 {test_file}")
            
            # 可视化部分
            # 1. 预测vs实际值散点图
            plt.figure(figsize=(12, 5))
            
            plt.subplot(1, 2, 1)
            plt.scatter(y_train[target], train_interval_preds, alpha=0.5)
            plt.plot([y_train[target].min(), y_train[target].max()], [y_train[target].min(), y_train[target].max()], 'r--')
            plt.xlabel('实际值')
            plt.ylabel('预测值')
            plt.title(f'训练集: R²={train_r2:.4f}')
            
            plt.subplot(1, 2, 2)
            plt.scatter(y_test[target], test_interval_preds, alpha=0.5)
            plt.plot([y_test[target].min(), y_test[target].max()], [y_test[target].min(), y_test[target].max()], 'r--')
            plt.xlabel('实际值')
            plt.ylabel('预测值')
            plt.title(f'测试集: R²={test_r2:.4f}')
            
            plt.tight_layout()
            plt.show()
            
            # 2. 区间划分和模型选择可视化
            plt.figure(figsize=(14, 8))
            
            # 子图1: 区间划分
            plt.subplot(2, 2, 1)
            plt.hist(y_train[target], bins=30, alpha=0.7, label='目标变量分布')
            for boundary in interval_boundaries[1:-1]:  # 不画最小和最大边界
                plt.axvline(x=boundary, color='red', linestyle='--', alpha=0.8)
            plt.xlabel('目标变量值')
            plt.ylabel('频次')
            plt.title('区间划分')
            plt.legend()
            
            # Subplot 2: best model and its R² for every interval.
            plt.subplot(2, 2, 2)
            interval_nums = list(interval_performances.keys())
            best_models = [interval_performances[i]['best_model'] for i in interval_nums]
            best_r2s = [interval_performances[i]['best_r2'] for i in interval_nums]
            
            bars = plt.bar(range(len(interval_nums)), best_r2s)
            plt.xlabel('区间编号')
            plt.ylabel('最佳R²')
            plt.title('各区间最佳模型性能')
            # Label each bar with the real interval index (the dict key), not the
            # bar position: intervals skipped during the search leave gaps in the
            # keys, and the printed details and the heat-map both number
            # intervals by key.
            plt.xticks(
                range(len(interval_nums)),
                [f'区间{interval_idx + 1}\n{winner}' for interval_idx, winner in zip(interval_nums, best_models)],
                rotation=45,
            )
            
            # 子图3: 模型使用频率对比
            plt.subplot(2, 2, 3)
            models_list = list(train_model_usage.keys())
            train_usage = [train_model_usage[m] for m in models_list]
            test_usage = [test_model_usage[m] for m in models_list]
            
            x = np.arange(len(models_list))
            width = 0.35
            
            plt.bar(x - width/2, train_usage, width, label='训练集', alpha=0.8)
            plt.bar(x + width/2, test_usage, width, label='测试集', alpha=0.8)
            plt.xlabel('模型')
            plt.ylabel('使用次数')
            plt.title('各模型使用频率对比')
            plt.xticks(x, models_list, rotation=45)
            plt.legend()
            
            # 子图4: 误差分布
            plt.subplot(2, 2, 4)
            test_errors = y_test[target] - test_interval_preds
            plt.hist(test_errors, bins=30, alpha=0.7)
            plt.axvline(x=0, color='r', linestyle='--')
            plt.xlabel('预测误差')
            plt.ylabel('频次')
            plt.title(f'测试集误差分布 (MAE={np.abs(test_errors).mean():.4f})')
            
            plt.tight_layout()
            plt.show()
            
            # 3. 各区间内所有模型的R²对比热力图
            plt.figure(figsize=(10, 6))
            
            # 准备热力图数据
            interval_labels = [f"区间{i+1}\n[{perf['bounds'][0]:.2f}, {perf['bounds'][1]:.2f}]" 
                             for i, perf in interval_performances.items()]
            
            r2_matrix = []
            for i, perf in interval_performances.items():
                r2_row = [perf['all_r2'][model] for model in available_models]
                r2_matrix.append(r2_row)
            
            r2_matrix = np.array(r2_matrix)
            
            im = plt.imshow(r2_matrix, cmap='RdYlGn', aspect='auto')
            plt.colorbar(im, label='R² Score')
            
            plt.xlabel('模型')
            plt.ylabel('区间')
            plt.title('各区间内所有模型的R²表现热力图')
            
            plt.xticks(range(len(available_models)), available_models, rotation=45)
            plt.yticks(range(len(interval_labels)), interval_labels)
            
            # 在每个格子中标注数值
            for i in range(len(interval_labels)):
                for j in range(len(available_models)):
                    plt.text(j, i, f'{r2_matrix[i, j]:.3f}', 
                           ha="center", va="center", color="black" if r2_matrix[i, j] < 0.5 else "white")
            
            plt.tight_layout()
            plt.show()

    except Exception as e:
        print(f"创建基于区间的集成模型失败: {str(e)}")
        print(f"错误详情: {traceback.format_exc()}")

In [None]:
# 循环使用次数XGBoost贝叶斯超参数优化
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt.plots import plot_convergence
import warnings
warnings.filterwarnings('ignore')

# Create the folder that receives the exported visualization data (CSV files).
save_folder = '模型可视化数据'
if not os.path.exists(save_folder):
    os.makedirs(save_folder)
    print(f"已创建文件夹：{save_folder}")

# Target variable this cell trains a model for.
target = '循环使用次数'
print(f"训练 {target} 的XGBoost模型...")

# Relative tolerance for this target; falls back to 0.03 when the target has
# no entry in target_tolerance (a dict defined outside this cell).
current_tolerance = target_tolerance.get(target, 0.03)

# Per-target feature matrices (presumably the variants prepared for
# tree-based models -- confirm against the cell that builds X_train_tree).
X_train_model = X_train_tree[target]
X_test_model = X_test_tree[target]

# 从您的代码中复制的完整函数定义
def tolerance_r2_score(y_true, y_pred, tolerance=0.15, target=None):
    """
    Compute an R² variant that forgives errors inside a relative tolerance band.

    Each absolute residual is reduced by ``tolerance * |y_true|`` (floored at
    zero) before the residual sum of squares is formed, so predictions inside
    the band contribute no error at all.

    Args:
        y_true: Actual values (any array-like; flattened to 1-D).
        y_pred: Predicted values (any array-like; flattened to 1-D).
        tolerance: Relative tolerance used when ``target`` has no entry in
            ``target_tolerance``.
        target: Optional target name; when it is a key of the module-level
            ``target_tolerance`` dict, that value overrides ``tolerance``.

    Returns:
        ``1 - RSS_adjusted / TSS``, or ``0`` when ``y_true`` has zero variance.
    """
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    # A per-target tolerance, when configured, wins over the argument.
    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    # Only the part of each absolute error that exceeds its band is penalised.
    band = tolerance * np.abs(actual)
    excess_error = np.maximum(0, np.abs(actual - predicted) - band)

    total_ss = np.sum((actual - np.mean(actual)) ** 2)
    if total_ss == 0:
        # Constant target: R² is undefined, report 0 instead of dividing by zero.
        return 0

    return 1 - (np.sum(excess_error ** 2) / total_ss)

def prediction_within_tolerance(y_true, y_pred, tolerance=0.15, target=None):
    """
    Return the fraction of predictions within ±tolerance·|y_true| of the target.

    Args:
        y_true: Actual values (any array-like; flattened to 1-D).
        y_pred: Predicted values (any array-like; flattened to 1-D).
        tolerance: Relative tolerance used when ``target`` has no entry in
            ``target_tolerance``.
        target: Optional target name; when it is a key of the module-level
            ``target_tolerance`` dict, that value overrides ``tolerance``.

    Returns:
        Mean of the element-wise "within tolerance" mask (a value in [0, 1]).
    """
    # Flatten both inputs, exactly as tolerance_r2_score does. Without this an
    # (n, 1) prediction column broadcasts against an (n,) target into an
    # (n, n) matrix and the returned fraction is silently wrong.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # A per-target tolerance, when configured, wins over the argument.
    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    tolerance_values = tolerance * np.abs(y_true)
    within_tolerance = np.abs(y_true - y_pred) <= tolerance_values

    return np.mean(within_tolerance)

def make_tolerance_scorer(target_name):
    """
    Build a metric function that scores the fraction of in-tolerance predictions.

    Args:
        target_name: Key looked up in the module-level ``target_tolerance``
            dict each time the metric is evaluated; 0.03 is used when absent.

    Returns:
        A ``tolerance_score(y_true, y_pred)`` callable returning a value in
        [0, 1], suitable for wrapping with ``sklearn.metrics.make_scorer``.
    """
    def tolerance_score(y_true, y_pred):
        tolerance = target_tolerance.get(target_name, 0.03)
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred).ravel()
        # Compare |error| with tolerance * |y_true| (the same form used by
        # prediction_within_tolerance) instead of dividing by |y_true|: the
        # division produced NaN for an exact prediction of a zero target,
        # which was then counted as a miss, and emitted divide warnings.
        within_tolerance = np.mean(np.abs(y_true - y_pred) <= tolerance * np.abs(y_true))
        return within_tolerance
    return tolerance_score

def evaluate_model(model, X_train, y_train, X_test, y_test, target, model_name):
    """
    Score a fitted model on the training and test splits and plot the results.

    For each split the standard R², the tolerance-adjusted R² and the fraction
    of predictions inside the tolerance band are computed, printed, and shown
    as predicted-vs-actual scatter plots with the identity line.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train, y_train: Training features and targets.
        X_test, y_test: Test features and targets.
        target: Target name; selects the tolerance from ``target_tolerance``
            (0.15 when absent).
        model_name: Label used in the printed report.

    Returns:
        Dict with keys ``train_r2``, ``train_tol_r2``, ``train_within_tol``,
        ``test_r2``, ``test_tol_r2`` and ``test_within_tol``.
    """
    current_tolerance = target_tolerance.get(target, 0.15)

    def _predict_flat(features):
        # Collapse an (n, 1) prediction column into a 1-D vector.
        preds = model.predict(features)
        if len(preds.shape) > 1 and preds.shape[1] == 1:
            preds = preds.flatten()
        return preds

    def _score(y_actual, y_hat):
        # (R², tolerance R², within-tolerance fraction) for one split.
        return (
            r2_score(y_actual, y_hat),
            tolerance_r2_score(y_actual, y_hat, tolerance=current_tolerance, target=target),
            prediction_within_tolerance(y_actual, y_hat, tolerance=current_tolerance, target=target),
        )

    # Training split
    y_train_pred = _predict_flat(X_train)
    train_r2, train_tol_r2, train_within_tol = _score(y_train, y_train_pred)

    # Test split
    y_test_pred = _predict_flat(X_test)
    test_r2, test_tol_r2, test_within_tol = _score(y_test, y_test_pred)

    print(f"\n{model_name} 在 {target} 上的评估结果:")
    print(f"训练集: R²={train_r2:.4f}, 容忍度R²={train_tol_r2:.4f}, 在容忍范围内比例={train_within_tol:.2%}")
    print(f"测试集: R²={test_r2:.4f}, 容忍度R²={test_tol_r2:.4f}, 在容忍范围内比例={test_within_tol:.2%}")

    # Predicted-vs-actual scatter plots, one panel per split.
    plt.figure(figsize=(12, 5))
    panels = (
        (1, y_train, y_train_pred, f'训练集: R²={train_r2:.4f}'),
        (2, y_test, y_test_pred, f'测试集: R²={test_r2:.4f}'),
    )
    for position, y_actual, y_hat, panel_title in panels:
        plt.subplot(1, 2, position)
        plt.scatter(y_actual, y_hat, alpha=0.6, s=30)
        # Identity line spanning the joint range of actual and predicted values.
        lower = min(min(y_actual), min(y_hat))
        upper = max(max(y_actual), max(y_hat))
        plt.plot([lower, upper], [lower, upper], 'r--', linewidth=2)
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(panel_title)
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return {
        'train_r2': train_r2,
        'train_tol_r2': train_tol_r2,
        'train_within_tol': train_within_tol,
        'test_r2': test_r2,
        'test_tol_r2': test_tol_r2,
        'test_within_tol': test_within_tol
    }

# Build the scorer for the current target. The wrapped metric is the fraction
# of predictions inside the tolerance band, so higher is better.
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Fixed XGBoost parameters shared by every candidate model.
base_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'n_jobs': -1,
    'random_state': 42,
    'missing': np.nan
}

# Search space for the Bayesian optimisation. Each range spans the values of
# the earlier hand-written grid quoted in the trailing comment.
dimensions = [
    Integer(50, 100, name='n_estimators'),           # original grid: [100, 50, 90, 80]
    Real(0.5, 0.8, name='learning_rate'),            # original grid: [0.7, 0.8, 0.6, 0.5]
    Integer(3, 6, name='max_depth'),                 # original grid: [4, 5, 6, 3]
    Integer(2, 6, name='min_child_weight'),          # original grid: [5, 4, 6, 3, 2]
    Real(0.0, 0.2, name='gamma'),                    # original grid: [0, 0.1, 0.2]
    Real(0.5, 0.6, name='subsample'),               # original grid: [0.6, 0.5]
    Real(0.8, 1.0, name='colsample_bytree'),        # original grid: [0.9, 1.0, 0.8]
    Real(0.0, 1.0, name='reg_alpha'),               # original grid: [0, 0.4, 0.5, 0.6, 1.0]
    Real(0.5, 1.0, name='reg_lambda')               # original grid: [1.0, 0.5, 0.7, 0.8]
]

# Cross-validation splitter: 5 shuffled folds with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 定义目标函数
@use_named_args(dimensions=dimensions)
def objective(**params):
    """
    Objective minimised by gp_minimize: negated mean CV within-tolerance score.

    Args:
        **params: One value per entry of ``dimensions``, keyed by dimension
            name (supplied by ``use_named_args``).

    Returns:
        ``-mean(cv_scores)`` for a usable configuration, or the penalty value
        ``1.0`` when the configuration cannot be evaluated.
    """
    # Real objective values lie in [-1, 0], so 1.0 is worse than any valid score.
    failure_penalty = 1.0

    model = XGBRegressor(**params, **base_params)

    try:
        cv_scores = cross_val_score(
            model, X_train_model, y_train[target],
            cv=kf, scoring=tol_scorer_wrapped
        )
    except Exception as exc:
        # Catch Exception, not everything: a bare ``except`` also swallowed
        # KeyboardInterrupt, making the optimisation impossible to stop.
        print(f"参数组合评估失败，返回惩罚值: {exc}")
        return failure_penalty

    mean_score = cv_scores.mean()
    # cross_val_score defaults to error_score=nan, so a failed fit shows up as
    # NaN rather than as an exception; never hand NaN to the optimiser.
    if not np.isfinite(mean_score):
        return failure_penalty

    return -mean_score

# Run the Bayesian optimisation over the search space.
print("执行贝叶斯优化...")
result = gp_minimize(
    objective,
    dimensions,
    n_calls=50,
    n_initial_points=10,
    random_state=42,
    verbose=True,
)

# Pair every optimised value with its dimension name. The objective is the
# negated score, so the sign is flipped back for reporting.
best_params = {dim.name: value for dim, value in zip(dimensions, result.x)}
print(f"最佳参数: {best_params}")
print(f"最佳CV得分: {-result.fun:.4f}")

# Final model: tuned parameters on top of the shared base parameters.
xgb_model = XGBRegressor(**best_params, **base_params)
xgb_model.fit(X_train_model, y_train[target])

# Cross-validated R² of the tuned configuration.
cv_scores = cross_val_score(
    xgb_model, X_train_model, y_train[target],
    cv=kf, scoring='r2',
)
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score under the custom tolerance scorer.
tolerance_cv_scores = cross_val_score(
    xgb_model, X_train_model, y_train[target],
    cv=kf, scoring=tol_scorer_wrapped,
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Register the fitted model in the shared per-target registry.
models[target]['XGBoost'] = xgb_model

# Make sure the model folder exists, then pickle the fitted model into it.
model_folder = '训练模型文件'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
    print(f"已创建模型文件夹：{model_folder}")

xgb_model_file = os.path.join(model_folder, f'{target}_XGBoost模型.pkl')
with open(xgb_model_file, 'wb') as f:
    pickle.dump(xgb_model, f)
print(f"XGBoost 模型已保存至 {xgb_model_file}")

# Report train/test metrics and scatter plots for the tuned model.
results = evaluate_model(xgb_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "XGBoost")

# Predictions on both splits.
y_pred_train = xgb_model.predict(X_train_model)
y_pred_test = xgb_model.predict(X_test_model)

# One table per split: actual value, prediction, absolute error, split label.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train),
    '数据集': '训练集',
})
test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test),
    '数据集': '测试集',
})

# Each split goes to its own CSV file inside the visualization-data folder.
train_file = os.path.join(save_folder, f'{target}_XGBoost训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_XGBoost测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")

# Feature importances of the tuned model, most important first.
feature_importance = (
    pd.DataFrame({
        'Feature': X_train_model.columns,
        'Importance': xgb_model.feature_importances_,
    })
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart of the importances.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title(f'{target} - XGBoost特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

# Export the importance table.
feature_importance_file = os.path.join(save_folder, f'{target}_XGBoost特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")

# Plot the convergence of the optimisation.
plt.figure(figsize=(10, 6))
plot_convergence(result)
plt.title('贝叶斯优化收敛过程')
plt.show()

# Export the optimisation history: objective value per call plus the best
# value seen so far. The running minimum is computed in a single pass with
# np.minimum.accumulate instead of re-scanning the prefix for every row.
optimization_history = pd.DataFrame({
    '迭代次数': range(1, len(result.func_vals) + 1),
    '目标函数值': result.func_vals,
    '最佳目标函数值': np.minimum.accumulate(result.func_vals)
})

# One extra column per tuned parameter holding the value tried at each call.
param_names = [dim.name for dim in dimensions]
for i, param_name in enumerate(param_names):
    optimization_history[f'参数_{param_name}'] = [x[i] for x in result.x_iters]

optimization_history_file = os.path.join(save_folder, f'{target}_贝叶斯优化历史.csv')
optimization_history.to_csv(optimization_history_file, index=False)
print(f"优化历史数据已保存至 {optimization_history_file}")

print(f"\n贝叶斯优化完成！")

In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
import os
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import train_test_split

# --- 前置准备工作 (请确保这些变量在您的代码中已定义) ---
# 假设您已经加载并准备好了以下数据和变量:
# X_train_tree, X_test_tree: 特征数据，类型为 pd.DataFrame
# y_train, y_test: 目标数据，类型为 pd.DataFrame 或 pd.Series
# target_tolerance: 一个字典，包含每个目标的容忍度，例如 {'循环使用次数': 0.03}
# make_tolerance_scorer: 一个函数，用于创建您的自定义评分器
# evaluate_model: 一个函数，用于评估模型并返回结果
# kf: 一个交叉验证折叠器，例如 KFold(n_splits=5, shuffle=True, random_state=42)

# 示例占位符 (如果您的代码中没有这些，请取消注释并按需修改)
# from sklearn.model_selection import KFold
# def make_tolerance_scorer(target, tolerance=0.03):
#     def tolerance_scorer(y_true, y_pred):
#         return np.mean(np.abs(y_true - y_pred) <= np.abs(y_true * tolerance))
#     return make_scorer(tolerance_scorer, greater_is_better=True)
#
# def evaluate_model(model, X_train, y_train, X_test, y_test, target_name, model_name):
#     y_pred_train = model.predict(X_train)
#     y_pred_test = model.predict(X_test)
#     r2_train = r2_score(y_train, y_pred_train)
#     r2_test = r2_score(y_test, y_pred_test)
#     print(f"--- {model_name} for {target_name} 评估结果 ---")
#     print(f"训练集 R²: {r2_train:.4f}")
#     print(f"测试集 R²: {r2_test:.4f}")
#     return {"R2_Train": r2_train, "R2_Test": r2_test}
#
# # 假设的数据 (用作示例)
# from sklearn.datasets import make_regression
# X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=42)
# X = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(20)])
# y = pd.Series(y, name='循环使用次数')
# y = (y - y.min()) * 10 # 确保y值为正
# X_train, X_test, y_train_full, y_test_full = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train_tree = {'循环使用次数': X_train}
# X_test_tree = {'循环使用次数': X_test}
# y_train = pd.DataFrame({'循环使用次数': y_train_full})
# y_test = pd.DataFrame({'循环使用次数': y_test_full})
# target_tolerance = {'循环使用次数': 0.05}
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# # --- 前置准备工作结束 ---


# --- Main training workflow ---

# 1. Initial setup: target name and output folder for exported data.
target = '循环使用次数'
save_folder = '模型可视化数据'
if not os.path.exists(save_folder):
    os.makedirs(save_folder)
    print(f"已创建文件夹：{save_folder}")

print(f"开始训练 {target} 的XGBoost模型...")

# 2. Data and scorer preparation.
# Tolerance falls back to 0.03 when the target has no entry in target_tolerance.
current_tolerance = target_tolerance.get(target, 0.03)
X_train_model = X_train_tree[target]
X_test_model = X_test_tree[target]
y_train_target = y_train[target]
y_test_target = y_test[target]

# Custom scorer for the current target (fraction of predictions in tolerance).
# tol_scorer = make_tolerance_scorer(target, tolerance=current_tolerance) # use this form if the factory takes both target and tolerance
tol_scorer = make_tolerance_scorer(target) # matches the call style used elsewhere in this notebook
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)


# 3. Validation set for early stopping.
# Hold out 20% of the original training data; it is meant to monitor model
# performance during the hyper-parameter search.
X_train_fit, X_val, y_train_fit, y_val = train_test_split(
    X_train_model, y_train_target, test_size=0.2, random_state=42
)
print(f"训练集大小: {X_train_fit.shape}, 验证集大小: {X_val.shape}")

# 4. Base model and hyper-parameter search space.
# Fixed parameters shared by every candidate model.
base_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'n_jobs': -1,
    'random_state': 42,
    'missing': np.nan,
}

# Base estimator.
# NOTE: n_estimators is deliberately large because early stopping is expected
# to pick the effective number of boosting rounds.
base_model = XGBRegressor(n_estimators=1000, **base_params)

# Candidate values sampled by the randomized search (n_estimators is not
# tuned here; it is left to early stopping).
param_dist = {
    'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0.5, 1.0, 1.5, 2.0]
}

# 5. Hyper-parameter search with early stopping.
print("\n执行带有提前停止的超参数优化...")
search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=150,  # number of sampled parameter combinations
    cv=kf,
    scoring=tol_scorer_wrapped,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

# Early-stopping settings forwarded to every XGBRegressor.fit call made by
# the search (each CV fit and the final refit).
# NOTE(review): passing early_stopping_rounds to fit() is the pre-2.0 XGBoost
# API; on XGBoost >= 2.0 it has to be set on the estimator instead -- confirm
# the installed version.
fit_params = {
    'early_stopping_rounds': 50,  # stop when the validation score has not improved for 50 rounds
    'eval_set': [(X_val, y_val)],
    'verbose': False
}

# Search on the reduced training split only. (X_val, y_val) was carved out of
# X_train_model, so searching on the full X_train_model would put the
# early-stopping rows into the training folds and the refit, which makes the
# stopping criterion monitor data the model has already seen.
search.fit(X_train_fit, y_train_fit, **fit_params)

# Best estimator, refit by the search with the best parameters.
xgb_model = search.best_estimator_

# n_estimators is only the configured upper bound (1000). The round actually
# selected by early stopping is best_iteration (0-based), when available.
best_iteration = getattr(xgb_model, 'best_iteration', None)
best_n_rounds = xgb_model.n_estimators if best_iteration is None else best_iteration + 1

print(f"\n最佳参数: {search.best_params_}")
print(f"通过提前停止找到的最佳迭代次数: {best_n_rounds}")
print(f"最佳CV得分 (使用自定义容忍度评分): {search.best_score_:.4f}")

# 6. Final evaluation of the best model.
# Cross-validated R². cross_val_score clones xgb_model and refits it per fold;
# no fit parameters are passed here, so these refits run without an eval_set.
cv_scores_r2 = cross_val_score(xgb_model, X_train_model, y_train_target, cv=kf, scoring='r2')
print(f"交叉验证 R² 分数: {cv_scores_r2.mean():.4f} ± {cv_scores_r2.std():.4f}")

# Cross-validated score under the custom tolerance scorer.
# NOTE(review): tol_scorer returns the fraction of predictions inside the
# tolerance band, not an R², although the printed label calls it one.
cv_scores_tol = cross_val_score(xgb_model, X_train_model, y_train_target, cv=kf, scoring=tol_scorer_wrapped)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {cv_scores_tol.mean():.4f} ± {cv_scores_tol.std():.4f}")

# Train/test metrics and scatter plots for the final model.
results = evaluate_model(xgb_model, X_train_model, y_train_target, X_test_model, y_test_target, target, "XGBoost")

# 7. Persist the model and its predictions.
model_folder = '训练模型文件'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
xgb_model_file = os.path.join(model_folder, f'{target}_XGBoost模型.pkl')
with open(xgb_model_file, 'wb') as f:
    pickle.dump(xgb_model, f)
print(f"\nXGBoost 模型已保存至 {xgb_model_file}")

# Predictions on both splits.
y_pred_train = xgb_model.predict(X_train_model)
y_pred_test = xgb_model.predict(X_test_model)

# One table per split: actual value, prediction, absolute error, split label.
train_prediction = pd.DataFrame({'实际值': y_train_target, '预测值': y_pred_train, '误差': np.abs(y_train_target - y_pred_train), '数据集': '训练集'})
test_prediction = pd.DataFrame({'实际值': y_test_target, '预测值': y_pred_test, '误差': np.abs(y_test_target - y_pred_test), '数据集': '测试集'})

train_file = os.path.join(save_folder, f'{target}_XGBoost训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_XGBoost测试集预测结果.csv')
train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)
print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")


# 8. Feature importance analysis: importances of the final model, sorted so
# the most important feature comes first.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

plt.figure(figsize=(12, 8))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title(f'{target} - XGBoost特征重要性')
plt.grid(True, axis='x')
plt.gca().invert_yaxis() # put the most important feature at the top of the chart
plt.tight_layout()
plt.show()

# Export the importance table.
feature_importance_file = os.path.join(save_folder, f'{target}_XGBoost特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")


# 9. Learning curve of the final configuration.
# Retrain once with the parameters of the selected model on the full training
# set, tracking RMSE on both the training and the test set to make
# over-fitting visible.
print("\n正在绘制最终模型的学习曲线...")
final_model_for_plot = XGBRegressor(**xgb_model.get_params())
# Configure the metric on the estimator rather than through fit():
# fit(eval_metric=...) was deprecated in XGBoost 1.6 and removed in 2.0.
final_model_for_plot.set_params(eval_metric='rmse')
eval_set_plot = [(X_train_model, y_train_target), (X_test_model, y_test_target)]

final_model_for_plot.fit(X_train_model, y_train_target,
                         eval_set=eval_set_plot,
                         verbose=False)

# validation_0 / validation_1 follow the order of eval_set_plot (train, test).
results_plot = final_model_for_plot.evals_result()
epochs = len(results_plot['validation_0']['rmse'])
x_axis = range(0, epochs)

plt.figure(figsize=(10, 6))
plt.plot(x_axis, results_plot['validation_0']['rmse'], label='训练集RMSE')
plt.plot(x_axis, results_plot['validation_1']['rmse'], label='测试集RMSE')
plt.legend()
plt.ylabel('RMSE')
plt.xlabel('迭代次数')
plt.title('XGBoost 最终模型训练进度')
plt.grid(True)
plt.show()

# Export the per-round RMSE values behind the plot.
training_progress_data = pd.DataFrame({
    '迭代次数': x_axis,
    '训练集RMSE': results_plot['validation_0']['rmse'],
    '测试集RMSE': results_plot['validation_1']['rmse']
})
training_progress_file = os.path.join(save_folder, f'{target}_XGBoost训练进度.csv')
training_progress_data.to_csv(training_progress_file, index=False)
print(f"训练进度数据已保存至 {training_progress_file}")


# 10. Learning-rate sensitivity analysis (independent of the search above):
# train one fixed-size model per learning rate and compare test-set RMSE
# round by round.
print("\n正在执行学习率影响分析...")
learning_rates = [0.01, 0.03, 0.05, 0.1, 0.2]
plt.figure(figsize=(10, 6))
# Column name -> per-round test RMSE; turned into a DataFrame once at the end
# instead of concatenating a new frame on every iteration.
lr_curves = {}

for lr in learning_rates:
    model_lr = XGBRegressor(
        learning_rate=lr,
        n_estimators=500, # fixed number of rounds so the curves are comparable
        max_depth=5,       # a reasonable fixed depth
        subsample=0.8,
        colsample_bytree=0.8,
        # Set on the estimator: fit(eval_metric=...) was deprecated in
        # XGBoost 1.6 and removed in 2.0.
        eval_metric='rmse',
        **base_params
    )
    eval_set_lr = [(X_test_model, y_test_target)]
    model_lr.fit(X_train_model, y_train_target, eval_set=eval_set_lr, verbose=False)

    results_lr = model_lr.evals_result()
    test_rmse = results_lr['validation_0']['rmse']

    plt.plot(test_rmse, label=f'学习率: {lr}')

    # Keep the curve for the CSV export.
    lr_curves[f'学习率_{lr}'] = test_rmse

lr_analysis_data = pd.DataFrame(lr_curves)
lr_analysis_data.insert(0, '迭代次数', range(len(lr_analysis_data)))

plt.title('不同学习率对测试集RMSE的影响')
plt.xlabel('迭代次数')
plt.ylabel('测试集 RMSE')
plt.legend()
plt.grid(True)
plt.show()

lr_analysis_file = os.path.join(save_folder, f'{target}_XGBoost学习率分析.csv')
lr_analysis_data.to_csv(lr_analysis_file, index=False)
print(f"学习率分析数据已保存至 {lr_analysis_file}")

In [None]:


# LightGBM
# Assign the target before it is used in the banner: printing first reported
# whatever target an earlier cell left behind (or raised NameError on a
# fresh kernel).
target = '循环使用次数'
print(f"训练 {target} 的LightGBM模型...")
# Relative tolerance for this target (0.03 when not configured).
current_tolerance = target_tolerance.get(target, 0.03)
# Per-target feature matrices.
X_train_model = X_train_tree[target]
X_test_model = X_test_tree[target]
# Fixed LightGBM parameters.
base_params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'verbose': -1,
    'n_jobs': -1,
    'random_state': 42
}
# Model-specific parameters layered on top of the base parameters.
lgb_params = {
    'n_estimators': 6000,
    'learning_rate': 0.001,
    'num_leaves': 20,
    'max_depth': 8,  # author's note "8shi0.72": presumably depth 8 scored 0.72
    'min_child_samples': 1,
    'subsample': 1,
    'colsample_bytree':1,
    'reg_alpha': 10,
    'reg_lambda': 1.0,
    **base_params
}

print("使用自定义LightGBM包装器训练模型")
# Remember the feature columns used for training (None when the input has no
# ``columns`` attribute).
feature_cols = X_train_model.columns.tolist() if hasattr(X_train_model, 'columns') else None

# Create and fit the model through the project's CustomLGBMRegressor wrapper
# (defined outside this cell).
lgb_model = CustomLGBMRegressor(**lgb_params)
lgb_model.fit(X_train_model, y_train[target])

# Register the fitted model in the shared per-target registry.
models[target]['LightGBM'] = lgb_model
# Create the model folder when it does not exist yet.
model_folder = '训练模型文件'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
    print(f"已创建模型文件夹：{model_folder}")

# Pickle the fitted model.
lgb_model_file = os.path.join(model_folder, f'{target}_LightGBM模型.pkl')
with open(lgb_model_file, 'wb') as f:
    pickle.dump(lgb_model, f)
print(f"LightGBM 模型已保存至 {lgb_model_file}")
# Train/test metrics and scatter plots.
results = evaluate_model(lgb_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "LightGBM")

print("LightGBM模型训练成功")
# Predictions on both splits.
y_pred_train = lgb_model.predict(X_train_model)
y_pred_test = lgb_model.predict(X_test_model)

# One table per split: actual value, prediction, absolute error, split label.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train),
    '数据集': '训练集',
})
test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test),
    '数据集': '测试集',
})

# Each split goes to its own CSV file inside the visualization-data folder.
train_file = os.path.join(save_folder, f'{target}_LightGBM训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_LightGBM测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")
# Feature importance (gain) of the LightGBM model: plot and export.
# The export lives inside the guard: when the importances cannot be computed,
# nothing is written, instead of saving a stale `feature_importance` left by
# an earlier cell under the LightGBM file name (or failing with NameError).
if hasattr(lgb_model.model, 'feature_importance') and feature_cols is not None:
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': lgb_model.model.feature_importance(importance_type='gain')
    })
    feature_importance = feature_importance.sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['Feature'], feature_importance['Importance'])
    plt.xlabel('增益重要性')
    plt.ylabel('特征')
    plt.title(f'{target} - LightGBM特征重要性')
    plt.grid(True, axis='x')
    plt.tight_layout()
    plt.show()

    # Export the importance table.
    feature_importance_file = os.path.join(save_folder, f'{target}_LightGBM特征重要性.csv')
    feature_importance.to_csv(feature_importance_file, index=False)
    print(f"特征重要性数据已保存至 {feature_importance_file}")
else:
    print("无法获取LightGBM特征重要性，已跳过特征重要性的绘制与保存")


# Analysis of poorly predicted test samples.
# Test-set predictions.
y_pred = lgb_model.predict(X_test_model)

# Work on a plain array of the true values when the target column offers one.
if hasattr(y_test[target], 'values'):
    y_true_values = y_test[target].values
else:
    y_true_values = y_test[target]

# Absolute error per test sample.
errors = np.abs(y_true_values - y_pred)

# Absolute error threshold (in target units, unlike the relative tolerances
# used by the scorers); adjust as needed.
tolerance = 5.0

# Positions of the samples whose error exceeds the threshold.
inaccurate_mask = errors > tolerance
inaccurate_indices = np.where(inaccurate_mask)[0]

print(f"\n预测不准确的样本数量: {len(inaccurate_indices)} (占测试集的 {len(inaccurate_indices)/len(y_test)*100:.2f}%)")
print(f"使用的容忍度阈值: {tolerance}")

# Build the report only when at least one sample is out of tolerance.
if len(inaccurate_indices) > 0:
    # Prefer the original index labels; fall back to array positions when
    # they are not available.
    try:
        if hasattr(y_test, 'index'):
            original_indices = [y_test.index[i] for i in inaccurate_indices]
        elif isinstance(X_test_model, pd.DataFrame) and hasattr(X_test_model, 'index'):
            original_indices = [X_test_model.index[i] for i in inaccurate_indices]
        else:
            # No index available: identify samples by array position.
            original_indices = inaccurate_indices
    except Exception as e:
        print(f"无法获取原始索引: {str(e)}")
        original_indices = inaccurate_indices
    
    # One record per inaccurate sample.
    inaccurate_samples = []
    for i, idx in enumerate(inaccurate_indices):
        # Read the actual value positionally, whatever the container type.
        if hasattr(y_test[target], 'iloc'):
            actual = y_test[target].iloc[idx]
        else:
            actual = y_true_values[idx]
        
        inaccurate_samples.append({
            '样本索引': original_indices[i],
            '实际值': actual,
            '预测值': y_pred[idx],
            '绝对误差': errors[idx],
            # Relative error is reported as infinity for a zero actual value.
            '相对误差(%)': (errors[idx] / np.abs(actual)) * 100 if actual != 0 else float('inf')
        })
    
    inaccurate_df = pd.DataFrame(inaccurate_samples)
    # Largest errors first.
    inaccurate_df = inaccurate_df.sort_values('绝对误差', ascending=False)
    
    # Print the report.
    print("\n预测不准确的样本详情 (按误差降序排列):")
    print(inaccurate_df)
    
    # Save the report.
    # NOTE(review): written to the working directory, not to save_folder like
    # the other exports of this cell -- confirm this is intended.
    inaccurate_df.to_csv(f'{target}_不准确预测.csv', index=False)
else:
    print(f"\n没有发现预测不准确的样本 (容忍度阈值: {tolerance})")




In [None]:

#HistGradientBoosting
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
# NOTE(review): `target` is not assigned in this cell; the value left by a
# previously executed cell is used.
print(f"训练 {target} 的HistGradientBoosting模型...")

# Relative tolerance for this target (0.03 when not configured).
current_tolerance = target_tolerance.get(target, 0.03)

# Per-target feature matrices.
X_train_model = X_train_tree[target]
X_test_model = X_test_tree[target]

# Custom scorer for the current target (fraction of predictions in tolerance).
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Hyper-parameter candidates. Parameter sets noted by the author (presumably
# earlier best results -- confirm):
# {'min_samples_leaf': 4, 'max_iter': 400, 'max_depth': 11, 'learning_rate': 0.011, 'l2_regularization': 0}
# {'min_samples_leaf': 4, 'max_iter': 460, 'max_depth': 7, 'learning_rate': 0.015, 'l2_regularization': 0}
param_dist = {
    'max_iter': [430, 400,380,440,460,600,500,800,900],
    'learning_rate': [0.01, 0.008, 0.011,0.012,0.015,0.009,0.1,0.02],
    'max_depth': [9, 11, 10,8,7,6,5,12,13],
    'min_samples_leaf': [1, 2, 4,5,6,3],
    'l2_regularization': [0, 0.1, 0.2]
}

# Base estimator. Its own hyper-parameters only matter when the search below
# is skipped; otherwise the sampled values from param_dist override them.
# NOTE(review): learning_rate=1 with max_iter=4000 lies far outside the
# searched ranges -- confirm these fallback values are intended.
base_model = HistGradientBoostingRegressor(
    max_iter=4000,
    learning_rate=1,
    max_depth=3,
    min_samples_leaf=4,
    l2_regularization=0.5,
    loss='squared_error',
    random_state=42
)

# Hyper-parameter search; only run when there are at least 90 training rows.
if len(X_train_model) >= 90:
    print("执行超参数优化...")
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=8000,
        cv=kf,
        scoring=tol_scorer_wrapped,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train_model, y_train[target])
    hgb_model = search.best_estimator_
    print(f"最佳参数: {search.best_params_}")
    print(f"最佳CV得分: {search.best_score_:.4f}")
else:
    # Too few samples: fit the base model with its predefined parameters.
    print("使用预定义参数...")
    hgb_model = base_model
    hgb_model.fit(X_train_model, y_train[target])

# Cross-validated R² of the selected configuration.
cv_scores = cross_val_score(hgb_model, X_train_model, y_train[target], cv=kf, scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score under the custom tolerance scorer.
# NOTE(review): tol_scorer returns the fraction of predictions inside the
# tolerance band, not an R², although the printed label calls it one.
tolerance_cv_scores = cross_val_score(
    hgb_model, X_train_model, y_train[target], cv=kf, 
    scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Register the fitted model in the shared per-target registry.
models[target]['HistGradientBoosting'] = hgb_model
# Pickle the model; `model_folder` must already be defined (and exist) from a
# previously executed cell.
hgb_model_file = os.path.join(model_folder, f'{target}_HistGradientBoosting模型.pkl')
with open(hgb_model_file, 'wb') as f:
    pickle.dump(hgb_model, f)
print(f"HistGradientBoosting模型已保存至 {hgb_model_file}")
# Train/test metrics and scatter plots.
results = evaluate_model(hgb_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "HistGradientBoosting")
# Predictions on both splits.
y_pred_train = hgb_model.predict(X_train_model)
y_pred_test = hgb_model.predict(X_test_model)

# One table per split: actual value, prediction, absolute error, split label.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train),
    '数据集': '训练集',
})
test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test),
    '数据集': '测试集',
})

# Each split goes to its own CSV file inside the visualization-data folder.
train_file = os.path.join(save_folder, f'{target}_HistGradientBoosting训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_HistGradientBoosting测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")
# Feature importance via permutation importance on the test set (used here
# in place of a built-in importance attribute).
from sklearn.inspection import permutation_importance

# 10 shuffles per feature with a fixed seed.
perm_importance = permutation_importance(
    hgb_model, X_test_model, y_test[target], 
    n_repeats=10, random_state=42, n_jobs=-1
)

# Mean importance per feature, most important first.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': perm_importance.importances_mean
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('置换重要性')
plt.ylabel('特征')
plt.title(f'{target} - HistGradientBoosting特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()
# Export the importance table.
feature_importance_file = os.path.join(save_folder, f'{target}_HistGradientBoosting特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")
# Effect of learning rate and iteration count on train/test R².
learning_rates = [0.01, 0.008, 0.011,0.009]
max_iters = [50, 100, 200, 300]
fig, axs = plt.subplots(len(learning_rates), 1, figsize=(10, 4*len(learning_rates)), sharex=True)

# Rows for the CSV export. They are recorded while the models are fitted:
# building them afterwards from train_scores/test_scores only saw the lists
# of the LAST learning rate and saved those scores for every learning rate.
analysis_data = []

for ax, lr in zip(axs, learning_rates):
    train_scores = []
    test_scores = []
    for iter_count in max_iters:
        model = HistGradientBoostingRegressor(
            max_iter=iter_count,
            learning_rate=lr,
            max_depth=3,
            random_state=42
        )
        model.fit(X_train_model, y_train[target])
        train_score = r2_score(y_train[target], model.predict(X_train_model))
        test_score = r2_score(y_test[target], model.predict(X_test_model))
        train_scores.append(train_score)
        test_scores.append(test_score)
        analysis_data.append({
            '学习率': lr,
            '迭代次数': iter_count,
            '训练集R²': train_score,
            '测试集R²': test_score
        })

    # One panel per learning rate: R² against the number of iterations.
    ax.plot(max_iters, train_scores, 'o-', label='训练集 R²')
    ax.plot(max_iters, test_scores, 'o-', label='测试集 R²')
    ax.set_title(f'学习率 = {lr}')
    ax.set_ylabel('R²')
    ax.grid(True)
    ax.legend()
# The x label is set on the last (bottom) panel; the x axis is shared.
plt.xlabel('迭代次数')
plt.suptitle('HistGradientBoosting - 学习率和迭代次数影响')
plt.tight_layout()
plt.show()

# Export the analysis table (one row per learning rate / iteration count).
lr_iter_analysis = pd.DataFrame(analysis_data)
lr_analysis_file = os.path.join(save_folder, f'{target}_HistGradientBoosting学习率迭代分析.csv')
lr_iter_analysis.to_csv(lr_analysis_file, index=False)
print(f"学习率和迭代次数分析数据已保存至 {lr_analysis_file}")



In [None]:


#RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

# Select the target variable BEFORE logging it, so the message names the
# target this cell actually trains rather than the one left by an earlier cell
target = '循环使用次数'
print(f"训练 {target} 的随机森林模型...")
# Target-specific tolerance; 0.03 is used when target_tolerance has no entry
current_tolerance = target_tolerance.get(target, 0.03)

# Pick the feature matrices prepared for tree models for this target
X_train_model = X_train_tree_filled[target]
X_test_model = X_test_tree_filled[target]

# Build the tolerance-based metric for this target and wrap it as a
# scikit-learn scorer (higher is better)
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Hyper-parameter candidates sampled by the randomized search below
param_dist = {
    'n_estimators': [ 900,1000,1200,1100],
    'max_depth': [4, 5, 6,8,9,10],
    'min_samples_split': [ 5,4,6,7,8],
    'min_samples_leaf': [7, 4,3,2,1,5,6],
    'max_features': ['sqrt', 'log2', None]
}

# Base model: template for the search, and the model that is trained as-is
# when the training set is too small for a search (else-branch below)
base_model = RandomForestRegressor(
    n_estimators=800,
    max_depth=None,
    min_samples_split=3,
    min_samples_leaf=4,
    max_features='sqrt',
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42
)

# Run hyper-parameter optimisation only with at least 90 training rows
if len(X_train_model) >= 90:
    print("执行超参数优化...")
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=1000,# author's note: larger values raised the training R² but may run very long; recorded n_iter/value pairs: 2000/0.3569, 4000/0.2091, 3000/0.1221, 1000/0.1577
        cv=kf,
        scoring=tol_scorer_wrapped,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train_model, y_train[target])
    rf_model = search.best_estimator_
    print(f"最佳参数: {search.best_params_}")
    print(f"最佳CV得分: {search.best_score_:.4f}")
else:
    # Use the predefined parameters
    print("使用预定义参数...")
    rf_model = base_model
    rf_model.fit(X_train_model, y_train[target])

# Cross-validated R² of the selected model on the training data
# (kf is the CV splitter defined outside this cell)
cv_scores = cross_val_score(rf_model, X_train_model, y_train[target], cv=kf, scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score using the tolerance-based scorer
tolerance_cv_scores = cross_val_score(
    rf_model, X_train_model, y_train[target], cv=kf, 
    scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Print the out-of-bag score when the fitted model exposes one
if hasattr(rf_model, 'oob_score_'):
    print(f"袋外评分 (OOB score): {rf_model.oob_score_:.4f}")

# Register the model in the in-memory registry
# (assumes models[target] already exists - it is not created in this cell)
models[target]['RandomForest'] = rf_model
# Persist the random-forest model with pickle
rf_model_file = os.path.join(model_folder, f'{target}_随机森林模型.pkl')
with open(rf_model_file, 'wb') as f:
    pickle.dump(rf_model, f)
print(f"随机森林模型已保存至 {rf_model_file}")
# Evaluate the model with the notebook's shared evaluate_model helper
results = evaluate_model(rf_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "RandomForest")
# Predictions on both splits
y_pred_train = rf_model.predict(X_train_model)
y_pred_test = rf_model.predict(X_test_model)

# Build one prediction table per split: actual value, prediction, absolute error
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Save both tables as CSV (the row index is not written)
train_file = os.path.join(save_folder, f'{target}_随机森林训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_随机森林测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")
# Feature importance taken from the fitted forest, sorted in descending order
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title(f'{target} - RandomForest特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()
# Save the feature-importance table
feature_importance_file = os.path.join(save_folder, f'{target}_随机森林特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")
# Effect of the number of trees on train / test / OOB R².
# The other hyper-parameters are the fixed values below, not the ones
# selected by the search.
n_estimators_range = [10, 50, 100, 200, 300, 400,800,1000]
train_scores = []
test_scores = []
oob_scores = []

for n_est in n_estimators_range:
    rf = RandomForestRegressor(
        n_estimators=n_est,
        max_depth=None,
        min_samples_split=3,
        min_samples_leaf=4,
        max_features='sqrt',
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=42
    )
    rf.fit(X_train_model, y_train[target])
    train_scores.append(r2_score(y_train[target], rf.predict(X_train_model)))
    test_scores.append(r2_score(y_test[target], rf.predict(X_test_model)))
    oob_scores.append(rf.oob_score_)

plt.figure(figsize=(10, 6))
plt.plot(n_estimators_range, train_scores, 'o-', label='训练集 R²')
plt.plot(n_estimators_range, test_scores, 'o-', label='测试集 R²')
plt.plot(n_estimators_range, oob_scores, 'o-', label='OOB R²')
plt.xlabel('树的数量')
plt.ylabel('R²')
plt.title('RandomForest - 树数量对性能的影响')
plt.legend()
plt.grid(True)
plt.show()
# Save the tree-count analysis table
trees_analysis_data = pd.DataFrame({
    '树的数量': n_estimators_range,
    '训练集R²': train_scores,
    '测试集R²': test_scores,
    '袋外评分': oob_scores
})
trees_analysis_file = os.path.join(save_folder, f'{target}_随机森林树数量分析.csv')
trees_analysis_data.to_csv(trees_analysis_file, index=False)
print(f"树数量影响分析数据已保存至 {trees_analysis_file}")





In [None]:
# 深度神经网络回归模型
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
target="水接触角"
# Announce which target this cell trains
print(f"训练 {target} 的深度神经网络模型...")

# Target-specific tolerance; 0.15 is used when target_tolerance has no entry
current_tolerance = target_tolerance.get(target, 0.15)

# Pick the feature matrices prepared for neural networks for this target
X_train_model = X_train_nn[target]
X_test_model = X_test_nn[target]

# Build the tolerance-based metric for this target and wrap it as a scorer
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_model)
X_test_scaled = scaler.transform(X_test_model)

print(f"训练数据形状: {X_train_scaled.shape}")
print(f"测试数据形状: {X_test_scaled.shape}")

# Factory for the feed-forward regression network
def create_nn_model(hidden_layers=None, dropout_rate=0.3, learning_rate=0.001,
                    input_dim=None):
    """Build and compile a fully connected regression network.

    Args:
        hidden_layers: Unit counts, one entry per hidden Dense layer.
            Defaults to ``[128, 64, 32]``.
        dropout_rate: Dropout rate applied after every hidden layer.
        learning_rate: Learning rate of the Adam optimizer.
        input_dim: Number of input features. When omitted, the width of the
            notebook-level ``X_train_scaled`` is used (the previous
            behaviour); pass it explicitly to avoid depending on whichever
            cell assigned ``X_train_scaled`` last.

    Returns:
        A compiled ``keras.Sequential`` model with one linear output unit,
        MSE loss and MAE as the tracked metric.
    """
    # A list literal as default argument would be shared between calls
    if hidden_layers is None:
        hidden_layers = [128, 64, 32]
    if input_dim is None:
        input_dim = X_train_scaled.shape[1]

    model = keras.Sequential()

    # First hidden layer also declares the input shape
    model.add(layers.Dense(hidden_layers[0],
                           activation='relu',
                           input_shape=(input_dim,)))
    model.add(layers.Dropout(dropout_rate))

    # Remaining hidden layers
    for units in hidden_layers[1:]:
        model.add(layers.Dense(units, activation='relu'))
        model.add(layers.Dropout(dropout_rate))

    # Output layer: a single linear unit for regression
    model.add(layers.Dense(1))

    # Compile the model
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,
                  loss='mse',
                  metrics=['mae'])

    return model


# Wrapper class exposing the scikit-learn estimator interface
class KerasRegressorWrapper:
    """Duck-typed scikit-learn regressor around ``create_nn_model``.

    Implements ``fit``, ``predict``, ``get_params`` and ``set_params`` so the
    object can be passed to ``RandomizedSearchCV`` and ``cross_val_score``.

    Attributes set by ``fit``:
        model: the trained Keras model (``None`` before fitting).
        history: the Keras ``History`` object returned by training.
    """

    def __init__(self, hidden_layers=None, dropout_rate=0.3,
                 learning_rate=0.001, epochs=100, batch_size=32):
        # The default list is created per instance instead of being shared as
        # a mutable default argument. A caller-supplied list is stored as-is
        # (no copy): scikit-learn's clone() checks that the constructor keeps
        # the very object it was given.
        self.hidden_layers = [128, 64, 32] if hidden_layers is None else hidden_layers
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None
        self.history = None

    def fit(self, X, y):
        """Build a fresh network sized for ``X`` and train it on ``(X, y)``.

        The last 20% of the samples are held out by Keras as validation data
        for early stopping and learning-rate reduction. Returns ``self``.
        """
        # Size the input layer from the data actually being fitted rather
        # than from a notebook global that other cells reassign
        self.model = create_nn_model(self.hidden_layers,
                                     self.dropout_rate,
                                     self.learning_rate,
                                     input_dim=np.shape(X)[1])

        # Early stopping and learning-rate scheduling
        callbacks = [
            keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
            keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=10, min_lr=1e-6)
        ]

        # Train the model
        self.history = self.model.fit(
            X, y,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_split=0.2,
            callbacks=callbacks,
            verbose=0
        )
        return self

    def predict(self, X):
        """Return the network's predictions for ``X`` as a 1-D array."""
        return self.model.predict(X, verbose=0).flatten()

    def get_params(self, deep=True):
        """Return the constructor hyper-parameters (``deep`` is accepted but unused)."""
        return {
            'hidden_layers': self.hidden_layers,
            'dropout_rate': self.dropout_rate,
            'learning_rate': self.learning_rate,
            'epochs': self.epochs,
            'batch_size': self.batch_size
        }

    def set_params(self, **params):
        """Set the given attributes on the instance and return ``self``."""
        for key, value in params.items():
            setattr(self, key, value)
        return self

# Hyper-parameter search space
param_dist = {
    'hidden_layers': [
        [64, 32],
        [128, 64],
        [128, 64, 32],
        [256, 128, 64],
        [128, 64, 32, 16],
        [256, 128, 64, 32]
    ],
    'dropout_rate': [0.2, 0.3, 0.4, 0.5],
    'learning_rate': [0.001, 0.005, 0.01, 0.0005],
    'batch_size': [16, 32, 64],
    'epochs': [150, 200, 250]
}

# Base model: template for the search, and the model trained as-is when the
# training set is too small for a search
base_model = KerasRegressorWrapper(
    hidden_layers=[128, 64, 32],
    dropout_rate=0.3,
    learning_rate=0.001,
    epochs=200,
    batch_size=32
)

# Run hyper-parameter optimisation only with at least 80 training rows
if len(X_train_model) >= 80:
    print("执行超参数优化...")
    # NOTE(review): this search is scored with plain 'r2'; tol_scorer_wrapped
    # is only used for the reporting cross-validation further down - confirm
    # this is intended.
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=30,  # fewer iterations because neural-network training is slow
        cv=min(3, cv_folds),
        scoring='r2',
        n_jobs=1,  # single process to avoid conflicts when training neural networks
        random_state=42,
        verbose=1
    )
    search.fit(X_train_scaled, y_train[target])
    nn_model = search.best_estimator_
    print(f"最佳参数: {search.best_params_}")
    print(f"最佳CV得分: {search.best_score_:.4f}")
else:
    # Use the predefined parameters
    print("使用预定义参数...")
    nn_model = base_model
    nn_model.fit(X_train_scaled, y_train[target])

# Cross-validated R² on the (already scaled) training data, at most 3 folds
print("执行交叉验证...")
cv_scores = cross_val_score(nn_model, X_train_scaled, y_train[target], 
                           cv=min(3, cv_folds), scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score using the tolerance-based scorer
tolerance_cv_scores = cross_val_score(
    nn_model, X_train_scaled, y_train[target], 
    cv=min(3, cv_folds), scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Retrain the final model on the full training set
# (fit() builds a new network, so earlier weights and history are replaced)
print("重新训练最终模型...")
nn_model.fit(X_train_scaled, y_train[target])

# Register the model together with its scaler
# (assumes models[target] already exists - it is not created in this cell)
models[target]['DeepNN'] = {'model': nn_model, 'scaler': scaler}
# Persist the model/scaler bundle with pickle
nn_model_file = os.path.join(model_folder, f'{target}_神经网络模型.pkl')
with open(nn_model_file, 'wb') as f:
    pickle.dump({'model': nn_model, 'scaler': scaler}, f)
print(f"神经网络模型已保存至 {nn_model_file}")

# Evaluate the model through a small adapter that accepts unscaled features
class NNEvaluationWrapper:
    """Predict-only adapter: scales raw features with ``scaler``, then delegates to ``model``."""

    def __init__(self, model, scaler):
        self.model = model
        self.scaler = scaler
    
    def predict(self, X):
        """Return ``model`` predictions for the unscaled feature matrix ``X``."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

nn_eval_model = NNEvaluationWrapper(nn_model, scaler)
results = evaluate_model(nn_eval_model, X_train_model, y_train[target], 
                        X_test_model, y_test[target], target, "DeepNN")

# Predictions on both splits (the wrapper applies the scaling)
y_pred_train = nn_eval_model.predict(X_train_model)
y_pred_test = nn_eval_model.predict(X_test_model)

# Build one prediction table per split: actual value, prediction, absolute error
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Save both tables as CSV (the row index is not written)
train_file = os.path.join(save_folder, f'{target}_神经网络训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_神经网络测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")



# Plot the training curves recorded by the most recent fit() of nn_model
if hasattr(nn_model, 'history') and nn_model.history is not None:
    plt.figure(figsize=(12, 4))
    
    # Left panel: training vs. validation loss per epoch
    plt.subplot(1, 2, 1)
    plt.plot(nn_model.history.history['loss'], label='训练损失')
    plt.plot(nn_model.history.history['val_loss'], label='验证损失')
    plt.title('模型损失')
    plt.xlabel('轮次')
    plt.ylabel('损失')
    plt.legend()
    plt.grid(True)
    
    # Right panel: training vs. validation MAE per epoch
    plt.subplot(1, 2, 2)
    plt.plot(nn_model.history.history['mae'], label='训练MAE')
    plt.plot(nn_model.history.history['val_mae'], label='验证MAE')
    plt.title('模型MAE')
    plt.xlabel('轮次')
    plt.ylabel('平均绝对误差')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()

# Effect of the network architecture on train/test R²
print("分析不同网络结构的影响...")
network_structures = [
    [32],
    [64, 32],
    [128, 64],
    [128, 64, 32],
    [256, 128, 64],
    [128, 64, 32, 16]
]

structure_train_scores = []
structure_test_scores = []
structure_names = []

for structure in network_structures:
    # Label such as "128-64-32", used on the chart and in the CSV
    structure_name = '-'.join(map(str, structure))
    structure_names.append(structure_name)
    print(f"测试网络结构: {structure_name}")
    
    # Fixed training settings for every architecture; only the layer sizes vary
    test_nn = KerasRegressorWrapper(
        hidden_layers=structure,
        dropout_rate=0.3,
        learning_rate=0.001,
        epochs=100,
        batch_size=32
    )
    
    test_nn.fit(X_train_scaled, y_train[target])
    
    train_pred = test_nn.predict(X_train_scaled)
    test_pred = test_nn.predict(X_test_scaled)
    
    train_r2 = r2_score(y_train[target], train_pred)
    test_r2 = r2_score(y_test[target], test_pred)
    
    structure_train_scores.append(train_r2)
    structure_test_scores.append(test_r2)

# Grouped bar chart: train and test R² side by side per architecture
plt.figure(figsize=(12, 6))
x = np.arange(len(structure_names))
width = 0.35

plt.bar(x - width/2, structure_train_scores, width, label='训练集 R²')
plt.bar(x + width/2, structure_test_scores, width, label='测试集 R²')

plt.xlabel('网络结构')
plt.ylabel('R²')
plt.title('DeepNN - 网络结构对性能的影响')
plt.xticks(x, structure_names, rotation=45)
plt.legend()
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()

# Save the architecture analysis table
structure_analysis_data = pd.DataFrame({
    '网络结构': structure_names,
    '训练集R²': structure_train_scores,
    '测试集R²': structure_test_scores
})
structure_analysis_file = os.path.join(save_folder, f'{target}_神经网络结构分析.csv')
structure_analysis_data.to_csv(structure_analysis_file, index=False)
print(f"网络结构影响分析数据已保存至 {structure_analysis_file}")

print("神经网络模型训练完成！")

In [None]:
# 支持向量机回归模型
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.inspection import permutation_importance
import numpy as np

# Announce the target being trained.
# NOTE: this cell does not assign `target`; it uses the value left by an earlier cell.
print(f"训练 {target} 的支持向量机回归模型...")

# Target-specific tolerance; 0.15 is used when target_tolerance has no entry
current_tolerance = target_tolerance.get(target, 0.15)

# Select the data set - SVR is sensitive to feature scale, so the data
# preprocessed for linear models is used
X_train_model = X_train_linear[target]
X_test_model = X_test_linear[target]

# Build the tolerance-based metric for this target and wrap it as a scorer
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# SVR needs standardised inputs: fit the scaler on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_model)
X_test_scaled = scaler.transform(X_test_model)

print(f"训练数据形状: {X_train_scaled.shape}")
print(f"测试数据形状: {X_test_scaled.shape}")

# Hyper-parameter search space
param_dist = {
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'epsilon': [0.01, 0.1, 0.2, 0.5, 1.0],
    'degree': [2, 3, 4]  # only relevant for the poly kernel
}

# Base model: template for the search, and the model trained as-is when the
# training set is too small for a search. max_iter caps the solver iterations.
base_model = SVR(
    kernel='rbf',
    C=100,
    gamma='scale',
    epsilon=0.1,
    max_iter=5000
)

# Run hyper-parameter optimisation only with at least 80 training rows
if len(X_train_model) >= 80:
    print("执行超参数优化...")
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=100,
        cv=min(5, cv_folds),
        scoring=tol_scorer_wrapped,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train_scaled, y_train[target])
    svr_model = search.best_estimator_
    print(f"最佳参数: {search.best_params_}")
    print(f"最佳CV得分: {search.best_score_:.4f}")

    # When the best kernel is polynomial, refine around the selected values
    if svr_model.kernel == 'poly':
        print("对多项式核进行精细优化...")
        poly_param_grid = {
            'C': [svr_model.C * 0.5, svr_model.C, svr_model.C * 2],
            # String gammas ('scale'/'auto') cannot be scaled numerically
            'gamma': [svr_model.gamma] if isinstance(svr_model.gamma, str) else [svr_model.gamma * 0.5, svr_model.gamma, svr_model.gamma * 2],
            'degree': [max(2, svr_model.degree-1), svr_model.degree, svr_model.degree+1],
            'epsilon': [svr_model.epsilon * 0.5, svr_model.epsilon, svr_model.epsilon * 2]
        }

        fine_search = GridSearchCV(
            # Keep the same solver iteration cap as the base model: without it
            # the refinement (larger C, higher degree) runs the solver unbounded,
            # unlike every other SVR fitted in this cell.
            SVR(kernel='poly', max_iter=base_model.max_iter),
            poly_param_grid,
            cv=min(3, cv_folds),
            scoring=tol_scorer_wrapped,
            n_jobs=-1
        )
        fine_search.fit(X_train_scaled, y_train[target])
        # NOTE(review): the refined model replaces the random-search model
        # unconditionally; the two searches use different fold counts, so
        # their best scores are not directly comparable.
        svr_model = fine_search.best_estimator_
        print(f"精细优化后的最佳参数: {fine_search.best_params_}")

else:
    # Use the predefined parameters
    print("使用预定义参数...")
    svr_model = base_model
    svr_model.fit(X_train_scaled, y_train[target])

# Cross-validated R² on the scaled training data, at most 5 folds
cv_scores = cross_val_score(svr_model, X_train_scaled, y_train[target], 
                           cv=min(5, cv_folds), scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score using the tolerance-based scorer
tolerance_cv_scores = cross_val_score(
    svr_model, X_train_scaled, y_train[target], 
    cv=min(5, cv_folds), scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Print the final model's settings
print(f"最终模型参数:")
print(f"  核函数: {svr_model.kernel}")
print(f"  C参数: {svr_model.C}")
print(f"  gamma参数: {svr_model.gamma}")
print(f"  epsilon参数: {svr_model.epsilon}")
if svr_model.kernel == 'poly':
    print(f"  多项式度数: {svr_model.degree}")
print(f"  支持向量数量: {svr_model.n_support_}")

# Register the model together with its scaler
# (assumes models[target] already exists - it is not created in this cell)
models[target]['SVR'] = {'model': svr_model, 'scaler': scaler}
# Persist the model/scaler bundle with pickle
svr_model_file = os.path.join(model_folder, f'{target}_支持向量机模型.pkl')
with open(svr_model_file, 'wb') as f:
    pickle.dump({'model': svr_model, 'scaler': scaler}, f)
print(f"支持向量机模型已保存至 {svr_model_file}")

# Evaluate the model through an adapter that takes care of the scaling
class SVREvaluationWrapper:
    """Predict-only adapter: scales raw features with ``scaler``, then delegates to ``model``.

    It exposes ``predict`` only (no ``fit`` or ``score``).
    """

    def __init__(self, model, scaler):
        self.model = model
        self.scaler = scaler
    
    def predict(self, X):
        """Return ``model`` predictions for the unscaled feature matrix ``X``."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

svr_eval_model = SVREvaluationWrapper(svr_model, scaler)
results = evaluate_model(svr_eval_model, X_train_model, y_train[target], 
                        X_test_model, y_test[target], target, "SVR")

# Predictions on both splits (the wrapper applies the scaling)
y_pred_train = svr_eval_model.predict(X_train_model)
y_pred_test = svr_eval_model.predict(X_test_model)

# Build one prediction table per split: actual value, prediction, absolute error
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Save both tables as CSV (the row index is not written)
train_file = os.path.join(save_folder, f'{target}_支持向量机训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_支持向量机测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")

# Feature importance (permutation importance)
print("计算特征重要性...")
# permutation_importance requires a real estimator: it rejects objects without
# `fit`, and with scoring=None it calls the estimator's own `score`. The
# predict-only evaluation wrapper provides neither, so the fitted SVR is used
# directly on the standardised test matrix. StandardScaler transforms each
# column independently, so shuffling a scaled column is the same as shuffling
# the raw column and scaling afterwards; the column order still matches
# X_train_model.columns.
perm_importance = permutation_importance(
    svr_model, X_test_scaled, y_test[target],
    n_repeats=10, random_state=42, n_jobs=-1
)

# Mean score drop per feature, sorted in descending order
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': perm_importance.importances_mean
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('置换重要性')
plt.ylabel('特征')
plt.title(f'{target} - SVR特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

# Save the feature-importance table
feature_importance_file = os.path.join(save_folder, f'{target}_支持向量机特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")

# Effect of the C parameter on train/test R² and on the number of support vectors
print("分析不同C参数的影响...")
C_range = [0.1, 1, 10, 100, 1000, 10000]
c_train_scores = []
c_test_scores = []
c_support_vectors = []

for C_val in C_range:
    print(f"测试C参数: {C_val}")
    # Same kernel settings as the selected model; only C varies
    test_svr = SVR(
        kernel=svr_model.kernel,
        C=C_val,
        gamma=svr_model.gamma,
        epsilon=svr_model.epsilon,
        degree=svr_model.degree if svr_model.kernel == 'poly' else 3,
        max_iter=5000
    )
    
    test_svr.fit(X_train_scaled, y_train[target])
    
    train_pred = test_svr.predict(X_train_scaled)
    test_pred = test_svr.predict(X_test_scaled)
    
    train_r2 = r2_score(y_train[target], train_pred)
    test_r2 = r2_score(y_test[target], test_pred)
    
    c_train_scores.append(train_r2)
    c_test_scores.append(test_r2)
    c_support_vectors.append(test_svr.n_support_.sum())

plt.figure(figsize=(12, 8))

# Top panel: R² versus C (logarithmic x axis)
plt.subplot(2, 1, 1)
plt.semilogx(C_range, c_train_scores, 'o-', label='训练集 R²')
plt.semilogx(C_range, c_test_scores, 'o-', label='测试集 R²')
plt.xlabel('C参数')
plt.ylabel('R²')
plt.title('SVR - C参数对性能的影响')
plt.legend()
plt.grid(True)

# Bottom panel: number of support vectors versus C
plt.subplot(2, 1, 2)
plt.semilogx(C_range, c_support_vectors, 'o-', color='green')
plt.xlabel('C参数')
plt.ylabel('支持向量数量')
plt.title('SVR - C参数对支持向量数量的影响')
plt.grid(True)

plt.tight_layout()
plt.show()

# Compare kernels using fixed settings (C=100, gamma='scale', epsilon=0.1)
print("比较不同核函数的性能...")
kernels = ['linear', 'rbf', 'poly', 'sigmoid']
kernel_scores = []
kernel_train_scores = []
kernel_names = []

for kernel in kernels:
    print(f"测试核函数: {kernel}")
    try:
        if kernel == 'poly':
            test_svr = SVR(kernel=kernel, C=100, gamma='scale', epsilon=0.1, degree=3, max_iter=5000)
        else:
            test_svr = SVR(kernel=kernel, C=100, gamma='scale', epsilon=0.1, max_iter=5000)
        
        test_svr.fit(X_train_scaled, y_train[target])
        
        train_pred = test_svr.predict(X_train_scaled)
        test_pred = test_svr.predict(X_test_scaled)
        
        train_r2 = r2_score(y_train[target], train_pred)
        test_r2 = r2_score(y_test[target], test_pred)
        
        # The three lists are appended together only after a successful run,
        # so they stay aligned when a kernel fails
        kernel_train_scores.append(train_r2)
        kernel_scores.append(test_r2)
        kernel_names.append(kernel)
        
    except Exception as e:
        # Best effort: report the failing kernel and carry on with the others
        print(f"核函数 {kernel} 训练失败: {str(e)}")
        continue

# Grouped bar chart: train and test R² side by side per kernel
plt.figure(figsize=(10, 6))
x = np.arange(len(kernel_names))
width = 0.35

plt.bar(x - width/2, kernel_train_scores, width, label='训练集 R²')
plt.bar(x + width/2, kernel_scores, width, label='测试集 R²')

plt.xlabel('核函数')
plt.ylabel('R²')
plt.title('SVR - 不同核函数性能比较')
plt.xticks(x, kernel_names)
plt.legend()
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()

# Save the C-parameter analysis table
c_analysis_data = pd.DataFrame({
    'C参数': C_range,
    '训练集R²': c_train_scores,
    '测试集R²': c_test_scores,
    '支持向量数量': c_support_vectors
})
c_analysis_file = os.path.join(save_folder, f'{target}_支持向量机C参数分析.csv')
c_analysis_data.to_csv(c_analysis_file, index=False)
print(f"C参数影响分析数据已保存至 {c_analysis_file}")

# Save the kernel comparison table
kernel_analysis_data = pd.DataFrame({
    '核函数': kernel_names,
    '训练集R²': kernel_train_scores,
    '测试集R²': kernel_scores
})
kernel_analysis_file = os.path.join(save_folder, f'{target}_支持向量机核函数分析.csv')
kernel_analysis_data.to_csv(kernel_analysis_file, index=False)
print(f"核函数比较分析数据已保存至 {kernel_analysis_file}")

print("支持向量机模型训练完成！")

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, RationalQuadratic
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, make_scorer
from sklearn.inspection import permutation_importance
import traceback
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import pickle
import os

# Announce the target being trained.
# NOTE: this cell assigns neither `target` nor X_train_model / X_test_model;
# it works on whatever an earlier cell left in those names.
print(f"训练 {target} 的高斯过程回归模型...")



# Build the tolerance-based metric for this target and wrap it as a scorer
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Hyper-parameter grid, adjusted for data on its original scale
param_grid = {
    'alpha': [1e-6, 1e-4, 1e-2, 1e-1, 1.0],  # regularisation range meant to suit larger feature values
    'normalize_y': [True],
    'n_restarts_optimizer': [2, 5]
}

# Step 1: compare different kernels
print("步骤1: 比较不同核函数的性能...")

# Candidate kernels, each with explicit bounds on the length scale
kernels = [
    1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)),
    1.0 * Matern(length_scale=1.0, nu=1.5, length_scale_bounds=(1e-2, 1e3)),
    1.0 * RationalQuadratic(length_scale=1.0, alpha=0.5, length_scale_bounds=(1e-2, 1e3)),
    1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(noise_level=0.1),
    1.0 * RationalQuadratic(length_scale=1.0, alpha=0.5, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(noise_level=0.1)
]
# Display names, in the same order as `kernels`
kernel_names = ['RBF', 'Matern', 'RationalQuadratic', 'RBF + WhiteNoise', 'RationalQuadratic + WhiteNoise']

train_scores = []
test_scores = []
fit_times = []

for kernel, name in zip(kernels, kernel_names):
    print(f"训练核函数: {name}")
    start_time = time.time()
    gp = GaussianProcessRegressor(
        kernel=kernel,
        alpha=1e-10,  # small value for this initial comparison
        normalize_y=True,
        n_restarts_optimizer=2,
        random_state=42
    )
    gp.fit(X_train_model, y_train[target])
    fit_time = time.time() - start_time
    
    train_score = r2_score(y_train[target], gp.predict(X_train_model))
    test_score = r2_score(y_test[target], gp.predict(X_test_model))
    
    train_scores.append(train_score)
    test_scores.append(test_score)
    fit_times.append(fit_time)
    
    print(f"  训练时间: {fit_time:.2f}秒, 训练集R²: {train_score:.4f}, 测试集R²: {test_score:.4f}")

# Grouped bar chart of train/test R² per kernel
plt.figure(figsize=(12, 6))
width = 0.35
x = np.arange(len(kernel_names))
plt.bar(x - width/2, train_scores, width, label='训练集 R²')
plt.bar(x + width/2, test_scores, width, label='测试集 R²')
plt.xticks(x, kernel_names, rotation=45, ha='right')
plt.xlabel('核函数')
plt.ylabel('R²')
plt.title('不同核函数的性能比较')
plt.legend()
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()

# Step 2: pick the best kernel
# NOTE(review): the kernel is chosen by its R² on the TEST split, so test data
# influences model selection and the test scores reported later are
# optimistic. Consider selecting on a cross-validated training score instead.
best_kernel_index = np.argmax(test_scores)
best_kernel = kernels[best_kernel_index]
best_kernel_name = kernel_names[best_kernel_index]
print(f"\n步骤2: 选择最佳核函数")
print(f"最佳核函数: {best_kernel_name}, 测试集R²: {test_scores[best_kernel_index]:.4f}")

# Step 3: hyper-parameter optimisation with the selected kernel
print(f"\n步骤3: 使用最佳核函数 {best_kernel_name} 进行超参数优化...")

# Base model built on the selected kernel
gpr_model = GaussianProcessRegressor(
    kernel=best_kernel,
    random_state=42
)

# Exhaustive grid search over param_grid, scored by R², at most 3 folds
grid_search = GridSearchCV(
    gpr_model,
    param_grid=param_grid,
    cv=min(3, cv_folds),
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

print("开始网格搜索...")
grid_search.fit(X_train_model, y_train[target])
print("网格搜索完成")

# Best model and its parameters
best_model = grid_search.best_estimator_
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证分数: {grid_search.best_score_:.4f}")

# Step 4: evaluate the final model
print(f"\n步骤4: 评估最终优化模型...")

# Cross-validated R² of the best model
cv_scores = cross_val_score(best_model, X_train_model, y_train[target], cv=min(3, cv_folds), scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score using the tolerance-based scorer
tolerance_cv_scores = cross_val_score(
    best_model, X_train_model, y_train[target], cv=min(3, cv_folds),
    scoring=tol_scorer_wrapped
)
# NOTE: current_tolerance is not assigned in this cell; the value printed is
# the one left by an earlier cell
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Refit the best model on the full training set for later use
best_model.fit(X_train_model, y_train[target])

# Register the model
# (assumes models[target] already exists - it is not created in this cell)
models[target]['GaussianProcess'] = best_model
# Persist the Gaussian-process model with pickle
gp_model_file = os.path.join(model_folder, f'{target}_高斯过程模型.pkl')
with open(gp_model_file, 'wb') as f:
    pickle.dump(best_model, f)
print(f"高斯过程回归模型已保存至 {gp_model_file}")

# Evaluate the model with the notebook's shared evaluate_model helper
results = evaluate_model(best_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "GaussianProcess")

# Predictions on both splits
y_pred_train = best_model.predict(X_train_model)
y_pred_test = best_model.predict(X_test_model)

# Build one prediction table per split: actual value, prediction, absolute error
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Save both tables as CSV (the row index is not written)
train_file = os.path.join(save_folder, f'{target}_高斯过程训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_高斯过程测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")

# Print the kernel with its fitted hyper-parameters
print("最终核函数参数:")
print(best_model.kernel_)

# Permutation importance of every feature, computed on the test split
perm_importance = permutation_importance(
    best_model, X_test_model, y_test[target], 
    n_repeats=10, random_state=42, n_jobs=-1
)

# Mean importance per feature, sorted in descending order
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': perm_importance.importances_mean
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('置换重要性')
plt.ylabel('特征')
plt.title(f'{target} - GaussianProcess特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

# Save the feature-importance table
feature_importance_file = os.path.join(save_folder, f'{target}_高斯过程特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")

# Step 6: analyse the test samples that were predicted inaccurately
print(f"\n步骤6: 分析预测不准确的样本...")

# The test predictions already exist; make sure they are a numpy array
if not isinstance(y_pred_test, np.ndarray):
    y_pred = np.array(y_pred_test)
else:
    y_pred = y_pred_test

# Use the underlying numpy values of y_test when it is a pandas object
if hasattr(y_test[target], 'values'):
    y_true_values = y_test[target].values
else:
    y_true_values = y_test[target]

# Absolute error per test sample
errors = np.abs(y_true_values - y_pred)

# Absolute-error threshold; adjust as needed.
# NOTE(review): this fixed value is independent of current_tolerance /
# target_tolerance used elsewhere - confirm that is intended.
tolerance = 5.0  # can be adjusted as needed

# Positions of the samples whose error exceeds the threshold
inaccurate_mask = errors > tolerance
inaccurate_indices = np.where(inaccurate_mask)[0]

print(f"\n预测不准确的样本数量: {len(inaccurate_indices)} (占测试集的 {len(inaccurate_indices)/len(y_test[target])*100:.2f}%)")
print(f"使用的容忍度阈值: {tolerance}")

# Build the analysis table for the inaccurate samples
if len(inaccurate_indices) > 0:
    # Try to recover the original index labels; fall back to array positions
    try:
        if hasattr(y_test[target], 'index'):
            original_indices = [y_test[target].index[i] for i in inaccurate_indices]
        elif isinstance(X_test_model, pd.DataFrame) and hasattr(X_test_model, 'index'):
            original_indices = [X_test_model.index[i] for i in inaccurate_indices]
        else:
            # No index available: identify samples by their array position
            original_indices = inaccurate_indices
    except Exception as e:
        print(f"无法获取原始索引: {str(e)}")
        original_indices = inaccurate_indices
    
    # One record per inaccurate sample
    inaccurate_samples = []
    for i, idx in enumerate(inaccurate_indices):
        # Read the actual value by position
        if hasattr(y_test[target], 'iloc'):
            actual = y_test[target].iloc[idx]
        else:
            actual = y_true_values[idx]
        
        # Predicted value for this sample (no confidence interval is computed here)
        prediction = y_pred[idx]

        
        inaccurate_samples.append({
            '样本索引': original_indices[i],
            '实际值': actual,
            '预测值': prediction,
            '绝对误差': errors[idx],
            # Relative error is reported as infinity when the actual value is 0
            '相对误差(%)': (errors[idx] / np.abs(actual)) * 100 if actual != 0 else float('inf'),

        })
    
    inaccurate_df = pd.DataFrame(inaccurate_samples)
    # Sort by absolute error, largest first
    inaccurate_df = inaccurate_df.sort_values('绝对误差', ascending=False)
    
    # Print the inaccurate samples
    print("\n预测不准确的样本详情 (按误差降序排列):")
    print(inaccurate_df)
    
    # Save the table as CSV
    inaccurate_file = os.path.join(save_folder, f'{target}_GP不准确预测.csv')
    inaccurate_df.to_csv(inaccurate_file, index=False)
    print(f"不准确预测分析已保存至 {inaccurate_file}")
else:
    print(f"\n没有发现预测不准确的样本 (容忍度阈值: {tolerance})")

print("\n高斯过程回归模型训练、评估和异常分析完成。")

In [None]:
# 直接运行的模型加载代码
import os
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# 如果需要加载神经网络模型，需要先定义KerasRegressorWrapper类
class KerasRegressorWrapper:
    """Minimal stand-in for the Keras regressor wrapper used at training time.

    It is defined in this cell so that pickled neural-network models that
    reference this class can be resolved when they are loaded. It only stores
    hyper-parameters and delegates prediction to the wrapped ``model``.

    Parameters
    ----------
    hidden_layers : list of int, optional
        Layer widths. Defaults to ``[128, 64, 32]``; a fresh list is created
        per instance so instances never share (and mutate) one default object.
    dropout_rate, learning_rate, epochs, batch_size
        Stored as given; not interpreted by this class.
    """

    def __init__(self, hidden_layers=None, dropout_rate=0.3,
                 learning_rate=0.001, epochs=100, batch_size=32):
        # A mutable default in the signature would be shared by all instances.
        self.hidden_layers = [128, 64, 32] if hidden_layers is None else hidden_layers
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        # Both are expected to be filled in by training code or by unpickling.
        self.model = None
        self.history = None

    def predict(self, X):
        """Return the wrapped model's predictions for ``X`` as a flat array.

        Fails with ``AttributeError`` while ``self.model`` is still ``None``.
        """
        return self.model.predict(X, verbose=0).flatten()

    def get_params(self, deep=True):
        """Return the stored hyper-parameters as a dict (``deep`` is ignored)."""
        return {
            'hidden_layers': self.hidden_layers,
            'dropout_rate': self.dropout_rate,
            'learning_rate': self.learning_rate,
            'epochs': self.epochs,
            'batch_size': self.batch_size
        }

    def set_params(self, **params):
        """Set each keyword as an attribute on the instance and return ``self``."""
        for key, value in params.items():
            setattr(self, key, value)
        return self

# Folder that holds the pickled models
model_folder = '训练模型文件'

# Reuse the notebook-level `models` registry when it exists, else create it
try:
    models
except NameError:
    models = {}

print(f"为目标变量 {target} 加载模型...")

# Candidate files: "<target>_*.pkl", excluding feature lists, ensembles and
# Keras / neural-network models (matched by substring in the file name).
target_prefix = f'{target}_'
excluded_markers = ('Ensemble', '集成', 'Keras', 'Neural', 'NN')
model_files = [
    fname for fname in os.listdir(model_folder)
    if fname.startswith(target_prefix)
    and fname.endswith('.pkl')
    and not fname.endswith('_features.pkl')
    and not any(marker in fname for marker in excluded_markers)
]

print(f"找到 {len(model_files)} 个模型文件: {model_files}")

# Make sure this target has its own sub-registry
models.setdefault(target, {})

for model_file in model_files:
    # Model name = file name without the target prefix and the '模型.pkl' suffix
    model_name = model_file.replace(target_prefix, '').replace('模型.pkl', '')

    print(f"  加载模型: {model_name}")

    model_path = os.path.join(model_folder, model_file)
    # NOTE: pickle.load executes code embedded in the file; only load model
    # files that come from a trusted source.
    with open(model_path, 'rb') as model_handle:
        model = pickle.load(model_handle)

    models[target][model_name] = model
    print(f"    {model_name} 加载成功")

print(f"成功加载 {len(models[target])} 个模型")
print(f"可用模型: {list(models[target].keys())}")

In [None]:

# VotingEnsemble - weights are derived from each model's standard R²
print(f"训练 {target} 的VotingEnsemble集成模型...")

# Target-specific tolerance (0.15 when the target has no entry)
current_tolerance = target_tolerance.get(target, 0.15)

# Collected from the already-loaded models; nothing is re-trained here
base_models = []      # (short alias, fitted model) pairs
model_scores = {}     # alias -> standard R² on the test split
model_datasets = {}   # alias -> tag of the feature matrix the model is scored on

print("收集已有模型的性能评估结果...")

# (registry name, short alias, dataset tag, deferred test-matrix lookup).
# The lookup is deferred so a feature matrix is only touched when the
# corresponding model is actually present.
voting_candidates = [
    ('XGBoost', 'xgb', 'tree', lambda: X_test_tree[target]),
    ('LightGBM', 'lgb', 'tree', lambda: X_test_tree[target]),
    ('HistGradientBoosting', 'hgb', 'tree', lambda: X_test_tree[target]),
    ('RandomForest', 'rf', 'tree_filled', lambda: X_test_tree_filled[target]),
    ('GaussianProcess', 'gp', 'linear', lambda: X_test_linear[target]),
]

for registry_name, alias, dataset_tag, fetch_test_matrix in voting_candidates:
    if registry_name not in models[target]:
        continue
    # Score the model once with the standard R²
    model = models[target][registry_name]
    y_pred = model.predict(fetch_test_matrix())
    r2 = r2_score(y_test[target], y_pred)
    base_models.append((alias, model))
    model_scores[alias] = r2
    model_datasets[alias] = dataset_tag
    print(f"  {registry_name} - R²: {r2:.4f}")

# Weights follow the standard R² rather than the tolerance-based R².
# NOTE(review): these scores come from the test split, so the ensemble's
# test metrics computed later are not independent of the weights — confirm
# this is intended.
print("\n根据标准R²模型性能分配权重...")

total_score = sum(model_scores.values())
if total_score > 0:
    # Share of the total score, scaled so the weights average to 1
    weights = [model_scores[name] / total_score * len(model_scores) for name, _ in base_models]
else:
    # Non-positive total: fall back to equal weights (also avoids dividing by zero)
    weights = [1.0 for _ in base_models]

print("  基于标准R²分配权重")

# Floor every weight at 0.5 so no model is almost ignored
min_weight = 0.5
weights = [max(w, min_weight) for w in weights]

for (name, _), weight in zip(base_models, weights):
    print(f"  {name} 权重: {weight:.4f}")

# Build the ensemble only when at least two base models were collected
if len(base_models) >= 2:
    try:
        class EnhancedVotingRegressor:
            """Weighted-average ensemble over already-fitted estimators.

            Parameters
            ----------
            estimators : list of (name, model)
                Fitted models exposing ``predict``.
            weights : sequence of float
                One weight per estimator; normalised to sum to 1 on construction.
            datasets : dict
                Maps estimator name to a dataset tag ('tree', 'tree_filled',
                'linear'). Only 'tree_filled' changes the input (NaN -> 0).
            target_name : str
                Name of the target this ensemble was built for (stored only).

            NOTE(review): ``predict`` passes the same ``X`` to every estimator,
            while the evaluation in this cell feeds each model its own feature
            matrix (tree / tree_filled / linear). Confirm those matrices share
            columns and preprocessing before using ``predict`` on new data.
            The class is defined in this cell, so it must be defined again
            before a pickled instance can be loaded in a new session.
            """

            def __init__(self, estimators, weights, datasets, target_name):
                self.estimators = estimators
                self.datasets = datasets
                self.target_name = target_name

                # Normalise the weights so they sum to 1
                self.weights = np.array(weights)
                self.weights = self.weights / np.sum(self.weights)

            def predict(self, X):
                """Return the weighted average of all estimators' predictions."""
                predictions = []

                for name, model in self.estimators:
                    dataset_type = self.datasets.get(name, 'standard')

                    if dataset_type == 'tree_filled':
                        # These models are tagged as not accepting NaN: fill with 0
                        if isinstance(X, pd.DataFrame):
                            X_model = X.fillna(0)
                        elif np.ndim(X) == 2:
                            # Same zero-fill for plain 2-D arrays, which were
                            # previously passed through with their NaN values
                            X_model = pd.DataFrame(X).fillna(0).to_numpy()
                        else:
                            X_model = X
                    else:
                        # 'tree', 'linear' and unknown tags use X unchanged
                        X_model = X

                    predictions.append(model.predict(X_model))

                # Weighted average of all predictions
                weighted_pred = np.zeros(predictions[0].shape)
                for weight, pred in zip(self.weights, predictions):
                    weighted_pred += weight * pred

                return weighted_pred

        # Create the voting ensemble
        print("创建投票集成模型...")
        voting_model = EnhancedVotingRegressor(
            estimators=base_models,
            weights=weights,
            datasets=model_datasets,
            target_name=target
        )

        # Per-model predictions on the train and test splits, each model on
        # the feature matrix that matches its dataset tag
        train_predictions = {}
        test_predictions = {}

        for name, model in base_models:
            if model_datasets[name] == 'tree':
                train_predictions[name] = model.predict(X_train_tree[target])
                test_predictions[name] = model.predict(X_test_tree[target])
            elif model_datasets[name] == 'tree_filled':
                train_predictions[name] = model.predict(X_train_tree_filled[target])
                test_predictions[name] = model.predict(X_test_tree_filled[target])
            elif model_datasets[name] == 'linear':
                train_predictions[name] = model.predict(X_train_linear[target])
                test_predictions[name] = model.predict(X_test_linear[target])

        # Weighted sum with the raw weights ...
        y_train_pred = np.zeros(len(y_train[target]))
        y_test_pred = np.zeros(len(y_test[target]))

        for i, (name, _) in enumerate(base_models):
            y_train_pred += weights[i] * train_predictions[name]
            y_test_pred += weights[i] * test_predictions[name]

        # ... then divide by the total weight to obtain the weighted average
        total_weight = sum(weights)
        y_train_pred /= total_weight
        y_test_pred /= total_weight

        # Performance metrics
        train_r2 = r2_score(y_train[target], y_train_pred)
        test_r2 = r2_score(y_test[target], y_test_pred)

        train_tol_r2 = tolerance_r2_score(y_train[target], y_train_pred, tolerance=current_tolerance, target=target)
        test_tol_r2 = tolerance_r2_score(y_test[target], y_test_pred, tolerance=current_tolerance, target=target)

        train_within_tol = prediction_within_tolerance(y_train[target], y_train_pred, tolerance=current_tolerance, target=target)
        test_within_tol = prediction_within_tolerance(y_test[target], y_test_pred, tolerance=current_tolerance, target=target)

        # Tables with actual value, ensemble prediction and absolute error
        train_prediction = pd.DataFrame({
            '实际值': y_train[target],
            '集成预测值': y_train_pred,
            '误差': np.abs(y_train[target] - y_train_pred)
        })

        test_prediction = pd.DataFrame({
            '实际值': y_test[target],
            '集成预测值': y_test_pred,
            '误差': np.abs(y_test[target] - y_test_pred)
        })

        # One extra column per base model
        for name, _ in base_models:
            train_prediction[f'{name}预测值'] = train_predictions[name]
            test_prediction[f'{name}预测值'] = test_predictions[name]

        # Write both tables to CSV
        train_file = os.path.join(save_folder, f'{target}_投票集成模型训练集预测结果.csv')
        test_file = os.path.join(save_folder, f'{target}_投票集成模型测试集预测结果.csv')

        train_prediction.to_csv(train_file, index=False)
        test_prediction.to_csv(test_file, index=False)

        print(f"训练集预测结果已保存至 {train_file}")
        print(f"测试集预测结果已保存至 {test_file}")

        # Print the ensemble metrics
        print(f"\n投票集成模型性能:")
        print(f"训练集: R²={train_r2:.4f}, 容忍度R²={train_tol_r2:.4f}, 在容忍范围内比例={train_within_tol:.2%}")
        print(f"测试集: R²={test_r2:.4f}, 容忍度R²={test_tol_r2:.4f}, 在容忍范围内比例={test_within_tol:.2%}")

        # Compare the ensemble with every base model
        print("\n与各基础模型性能比较:")
        for name, _ in base_models:
            base_train_pred = train_predictions[name]
            base_test_pred = test_predictions[name]

            base_train_r2 = r2_score(y_train[target], base_train_pred)
            base_test_r2 = r2_score(y_test[target], base_test_pred)

            print(f"  vs {name}:")
            print(f"    训练集R²: {train_r2:.4f} vs {base_train_r2:.4f} (差异: {train_r2-base_train_r2:.4f})")
            print(f"    测试集R²: {test_r2:.4f} vs {base_test_r2:.4f} (差异: {test_r2-base_test_r2:.4f})")

        # Register the ensemble and pickle it to the model folder
        models[target]['VotingEnsemble'] = voting_model
        ensemble_model_file = os.path.join(model_folder, f'{target}_投票集成模型.pkl')
        with open(ensemble_model_file, 'wb') as f:
            pickle.dump(voting_model, f)
        print(f"投票集成模型已保存至 {ensemble_model_file}")

        # Plot: predicted vs actual (train and test); the dashed red line is y = x
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.scatter(y_train[target], y_train_pred, alpha=0.5)
        plt.plot([y_train[target].min(), y_train[target].max()], [y_train[target].min(), y_train[target].max()], 'r--')
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(f'训练集: R²={train_r2:.4f}')

        plt.subplot(1, 2, 2)
        plt.scatter(y_test[target], y_test_pred, alpha=0.5)
        plt.plot([y_test[target].min(), y_test[target].max()], [y_test[target].min(), y_test[target].max()], 'r--')
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(f'测试集: R²={test_r2:.4f}')

        plt.tight_layout()
        plt.show()

        # Plot: signed error histograms (actual - predicted)
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        train_errors = y_train[target] - y_train_pred
        plt.hist(train_errors, bins=30, alpha=0.7)
        plt.axvline(x=0, color='r', linestyle='--')
        plt.xlabel('预测误差')
        plt.ylabel('频次')
        plt.title(f'训练集误差分布 (MAE={np.abs(train_errors).mean():.4f})')

        plt.subplot(1, 2, 2)
        test_errors = y_test[target] - y_test_pred
        plt.hist(test_errors, bins=30, alpha=0.7)
        plt.axvline(x=0, color='r', linestyle='--')
        plt.xlabel('预测误差')
        plt.ylabel('频次')
        plt.title(f'测试集误差分布 (MAE={np.abs(test_errors).mean():.4f})')

        plt.tight_layout()
        plt.show()

        # Plot: raw (un-normalised) weight per base model
        plt.figure(figsize=(10, 6))
        model_names = [name for name, _ in base_models]
        plt.bar(model_names, weights)
        plt.xlabel('模型')
        plt.ylabel('权重')
        plt.title(f'{target} - 投票集成模型权重分布')
        plt.grid(True, axis='y')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Plot: distribution of test predictions, ensemble vs each base model
        plt.figure(figsize=(12, 6))
        model_preds = [y_test_pred] + [test_predictions[name] for name, _ in base_models]
        model_labels = ['Voting'] + [name for name, _ in base_models]

        plt.boxplot(model_preds, labels=model_labels)
        plt.ylabel('预测值')
        plt.title('投票集成模型与各基础模型预测分布对比')
        plt.grid(True, axis='y')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Imported here so the handler cannot itself fail with a NameError
        # (and hide the real error) if `traceback` was never imported.
        import traceback
        print(f"创建投票集成模型失败: {str(e)}")
        print(f"错误详情: {traceback.format_exc()}")
else:
    print("没有足够的基础模型来创建投票集成")



In [None]:

# 自适应集成模型 - 根据样本特征动态选择最佳模型
from sklearn.ensemble import RandomForestClassifier

print(f"训练 {target} 的自适应集成模型...")

# Target-specific tolerance (0.15 when the target has no entry)
current_tolerance = target_tolerance.get(target, 0.15)

# Names of the usable base models and the feature matrices each one is fed
available_models = []
model_input_data = {}

# (registry name, deferred lookup returning the (train, test) matrices).
# The lookup is deferred so a feature matrix is only touched when the
# corresponding model is actually present.
adaptive_candidates = [
    ('XGBoost', lambda: (X_train_tree[target], X_test_tree[target])),
    ('LightGBM', lambda: (X_train_tree[target], X_test_tree[target])),
    ('HistGradientBoosting', lambda: (X_train_tree[target], X_test_tree[target])),
    ('RandomForest', lambda: (X_train_tree_filled[target], X_test_tree_filled[target])),
    ('GaussianProcess', lambda: (X_train_linear[target], X_test_linear[target])),
]

for candidate_name, fetch_matrices in adaptive_candidates:
    if candidate_name in models[target]:
        available_models.append(candidate_name)
        train_matrix, test_matrix = fetch_matrices()
        model_input_data[candidate_name] = {
            'train': train_matrix,
            'test': test_matrix
        }

print(f"可用模型: {available_models}")

if len(available_models) < 2:
    print("自适应集成至少需要两个模型，目前可用模型不足")
else:
    try:
        # Step 1: predictions of every base model on its own feature matrices
        print("为每个样本生成所有模型的预测...")
        train_predictions = {}
        test_predictions = {}

        for model_name in available_models:
            model = models[target][model_name]
            train_data = model_input_data[model_name]['train']
            test_data = model_input_data[model_name]['test']

            train_pred = model.predict(train_data)
            test_pred = model.predict(test_data)

            train_predictions[model_name] = train_pred
            test_predictions[model_name] = test_pred

        # Step 2: absolute error of every model on every training sample
        print("计算各模型在每个样本上的预测误差...")
        train_errors = {
            model_name: np.abs(y_train[target].values - train_predictions[model_name])
            for model_name in available_models
        }

        # Step 3: train a meta-model that picks a base model from the features
        print("训练元模型来决定每个样本应使用哪个模型...")

        # Name -> position lookup (not used further in this cell)
        model_name_to_idx = {name: idx for idx, name in enumerate(available_models)}

        # Label each training sample with the position of its lowest-error
        # model; ties go to the model listed first in available_models.
        # NOTE(review): labels come from errors on the training split — if the
        # base models were fitted on that split, the labels favour whichever
        # model memorises it best. Confirm this is acceptable.
        error_matrix = np.column_stack(
            [train_errors[model_name] for model_name in available_models]
        )
        best_model_indices = np.argmin(error_matrix, axis=1).astype(int)

        # Classifier mapping the original features to the best model's position
        meta_classifier = RandomForestClassifier(
            n_estimators=200,
            max_depth=4,
            min_samples_split=2,
            n_jobs=-1,
            random_state=42
        )

        meta_classifier.fit(X_train[target], best_model_indices)

        # Step 4: let the meta-model choose a base model for every sample
        print("在训练集和测试集上应用元模型...")
        train_best_models = meta_classifier.predict(X_train[target])
        test_best_models = meta_classifier.predict(X_test[target])

        # Step 5: take, per sample, the prediction of the chosen model
        def _select_predictions(predictions_by_model, choices):
            """Pick for each row the prediction of the model at ``choices[row]``."""
            stacked = np.column_stack([
                np.asarray(predictions_by_model[name]).ravel()
                for name in available_models
            ])
            return stacked[np.arange(len(choices)), choices].astype(float)

        train_adaptive_predictions = _select_predictions(train_predictions, train_best_models)
        test_adaptive_predictions = _select_predictions(test_predictions, test_best_models)

        # Step 6: evaluate the adaptive ensemble
        train_r2 = r2_score(y_train[target], train_adaptive_predictions)
        train_tol_r2 = tolerance_r2_score(y_train[target], train_adaptive_predictions, tolerance=current_tolerance, target=target)
        train_within_tol = prediction_within_tolerance(y_train[target], train_adaptive_predictions, tolerance=current_tolerance, target=target)

        test_r2 = r2_score(y_test[target], test_adaptive_predictions)
        test_tol_r2 = tolerance_r2_score(y_test[target], test_adaptive_predictions, tolerance=current_tolerance, target=target)
        test_within_tol = prediction_within_tolerance(y_test[target], test_adaptive_predictions, tolerance=current_tolerance, target=target)

        print(f"\n自适应集成模型性能:")
        print(f"训练集: R²={train_r2:.4f}, 容忍度R²={train_tol_r2:.4f}, 在容忍范围内比例={train_within_tol:.2%}")
        print(f"测试集: R²={test_r2:.4f}, 容忍度R²={test_tol_r2:.4f}, 在容忍范围内比例={test_within_tol:.2%}")

        # Step 7: compare the ensemble with every base model
        print("\n与各基础模型性能比较:")
        for model_name in available_models:
            model_train_pred = train_predictions[model_name]
            model_test_pred = test_predictions[model_name]

            model_train_r2 = r2_score(y_train[target], model_train_pred)
            model_test_r2 = r2_score(y_test[target], model_test_pred)

            train_r2_diff = train_r2 - model_train_r2
            test_r2_diff = test_r2 - model_test_r2

            print(f"  vs {model_name}:")
            print(f"    训练集R²: {train_r2:.4f} vs {model_train_r2:.4f} (差异: {train_r2_diff:.4f})")
            print(f"    测试集R²: {test_r2:.4f} vs {model_test_r2:.4f} (差异: {test_r2_diff:.4f})")

        # Step 8: how often each model was selected
        train_model_selection_counts = np.bincount(train_best_models, minlength=len(available_models))
        train_model_selection_percent = train_model_selection_counts / len(train_best_models) * 100

        test_model_selection_counts = np.bincount(test_best_models, minlength=len(available_models))
        test_model_selection_percent = test_model_selection_counts / len(test_best_models) * 100

        print("\n各模型在训练集上的选择频率:")
        for i, model_name in enumerate(available_models):
            print(f"  {model_name}: {train_model_selection_counts[i]} 次 ({train_model_selection_percent[i]:.2f}%)")

        print("\n各模型在测试集上的选择频率:")
        for i, model_name in enumerate(available_models):
            print(f"  {model_name}: {test_model_selection_counts[i]} 次 ({test_model_selection_percent[i]:.2f}%)")

        # Step 9: wrap everything in a model object and save it
        class AdaptiveEnsembleModel:
            """Route each sample to the base model chosen by a meta-classifier.

            Parameters
            ----------
            meta_classifier : fitted classifier
                Predicts, per sample, a position into ``available_models``.
            models_dict : dict
                Maps model name to a fitted model exposing ``predict``.
            available_models : list of str
                Model names in the order the meta-classifier's labels refer to.
            model_input_data : dict
                Per-model train/test matrices; stored but not used by
                ``predict``. NOTE(review): this places the data inside the
                pickled model — confirm it is needed.
            feature_names : sequence of str, optional
                Column names used to turn array input into a DataFrame, so the
                model does not depend on notebook globals after unpickling.

            NOTE(review): ``predict`` feeds the same ``X`` to the
            meta-classifier and to the selected base model (zero-filling NaN
            for 'tree_filled' models), while the evaluation in this cell uses
            per-model feature matrices. Confirm the matrices share columns and
            preprocessing before using ``predict`` on new data.
            """

            def __init__(self, meta_classifier, models_dict, available_models,
                         model_input_data, feature_names=None):
                self.meta_classifier = meta_classifier
                self.models_dict = models_dict
                self.available_models = available_models
                self.model_input_data = model_input_data
                self.feature_names = None if feature_names is None else list(feature_names)

                # Input handling per model: only 'tree_filled' alters the data
                self.data_type_map = {
                    'XGBoost': 'tree',
                    'LightGBM': 'tree',
                    'HistGradientBoosting': 'tree',
                    'RandomForest': 'tree_filled',
                    'GaussianProcess': 'linear'
                }

            def _as_frame(self, X):
                """Return ``X`` as a DataFrame; a 1-D input is one sample."""
                if isinstance(X, pd.DataFrame):
                    return X
                values = np.asarray(X)
                if values.ndim == 1:
                    values = values.reshape(1, -1)
                # getattr keeps instances pickled before `feature_names`
                # existed loadable; fall back to the names the classifier
                # recorded during fit, when it recorded any.
                columns = getattr(self, 'feature_names', None)
                if columns is None:
                    columns = getattr(self.meta_classifier, 'feature_names_in_', None)
                if columns is not None:
                    return pd.DataFrame(values, columns=list(columns))
                return pd.DataFrame(values)

            def predict(self, X):
                """Return one prediction per row of ``X`` from its selected model."""
                X = self._as_frame(X)

                # Which base model (by position) should handle each sample
                model_choices = np.asarray(self.meta_classifier.predict(X))

                predictions = np.zeros(len(X))

                # One batched predict call per selected model
                for model_idx in np.unique(model_choices):
                    model_name = self.available_models[model_idx]
                    model = self.models_dict[model_name]

                    rows = np.flatnonzero(model_choices == model_idx)
                    x_subset = X.iloc[rows]

                    if self.data_type_map.get(model_name, 'standard') == 'tree_filled':
                        # These models are tagged as not accepting NaN: fill with 0
                        x_subset = x_subset.fillna(0)

                    predictions[rows] = np.asarray(model.predict(x_subset)).ravel()

                return predictions

            def get_feature_importances(self):
                """Return the meta-classifier's feature importances, or None."""
                if hasattr(self.meta_classifier, 'feature_importances_'):
                    return self.meta_classifier.feature_importances_
                return None

        # Hand over only the base models: passing models[target] itself would
        # make the ensemble contain itself once it is registered below.
        adaptive_model = AdaptiveEnsembleModel(
            meta_classifier=meta_classifier,
            models_dict={name: models[target][name] for name in available_models},
            available_models=available_models,
            model_input_data=model_input_data,
            feature_names=(
                list(X_train[target].columns)
                if hasattr(X_train[target], 'columns') else None
            )
        )

        # Register the ensemble and pickle it to the model folder
        models[target]['AdaptiveEnsemble'] = adaptive_model
        adaptive_model_file = os.path.join(model_folder, f'{target}_自适应集成模型.pkl')
        with open(adaptive_model_file, 'wb') as f:
            pickle.dump(adaptive_model, f)
        print(f"自适应集成模型已保存至 {adaptive_model_file}")

        # Tables with actual value, ensemble prediction and absolute error
        train_prediction = pd.DataFrame({
            '实际值': y_train[target],
            '自适应集成预测值': train_adaptive_predictions,
            '误差': np.abs(y_train[target] - train_adaptive_predictions)
        })

        test_prediction = pd.DataFrame({
            '实际值': y_test[target],
            '自适应集成预测值': test_adaptive_predictions,
            '误差': np.abs(y_test[target] - test_adaptive_predictions)
        })

        # One extra column per base model for comparison
        for model_name in available_models:
            train_prediction[f'{model_name}预测值'] = train_predictions[model_name]
            test_prediction[f'{model_name}预测值'] = test_predictions[model_name]

        # Write both tables to CSV
        train_file = os.path.join(save_folder, f'{target}_自适应集成训练集预测结果.csv')
        test_file = os.path.join(save_folder, f'{target}_自适应集成测试集预测结果.csv')

        train_prediction.to_csv(train_file, index=False)
        test_prediction.to_csv(test_file, index=False)

        print(f"训练集预测结果已保存至 {train_file}")
        print(f"测试集预测结果已保存至 {test_file}")

        # Plot: predicted vs actual (train and test); the dashed red line is y = x
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.scatter(y_train[target], train_adaptive_predictions, alpha=0.5)
        plt.plot([y_train[target].min(), y_train[target].max()], [y_train[target].min(), y_train[target].max()], 'r--')
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(f'训练集: R²={train_r2:.4f}')

        plt.subplot(1, 2, 2)
        plt.scatter(y_test[target], test_adaptive_predictions, alpha=0.5)
        plt.plot([y_test[target].min(), y_test[target].max()], [y_test[target].min(), y_test[target].max()], 'r--')
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(f'测试集: R²={test_r2:.4f}')

        plt.tight_layout()
        plt.show()

        # Plot: signed error histograms (actual - predicted)
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        train_errors_plot = y_train[target] - train_adaptive_predictions
        plt.hist(train_errors_plot, bins=30, alpha=0.7)
        plt.axvline(x=0, color='r', linestyle='--')
        plt.xlabel('预测误差')
        plt.ylabel('频次')
        plt.title(f'训练集误差分布 (MAE={np.abs(train_errors_plot).mean():.4f})')

        plt.subplot(1, 2, 2)
        test_errors_plot = y_test[target] - test_adaptive_predictions
        plt.hist(test_errors_plot, bins=30, alpha=0.7)
        plt.axvline(x=0, color='r', linestyle='--')
        plt.xlabel('预测误差')
        plt.ylabel('频次')
        plt.title(f'测试集误差分布 (MAE={np.abs(test_errors_plot).mean():.4f})')

        plt.tight_layout()
        plt.show()

        # Feature importances of the meta-classifier, most important first
        feature_importance = pd.DataFrame({
            'Feature': X_train[target].columns,
            'Importance': meta_classifier.feature_importances_
        })
        feature_importance = feature_importance.sort_values('Importance', ascending=False)
        importance_file = os.path.join(save_folder, f'{target}_自适应集成特征重要性.csv')
        feature_importance.to_csv(importance_file, index=False)
        print(f"特征重要性数据已保存至 {importance_file}")
        plt.figure(figsize=(10, 6))
        plt.barh(feature_importance['Feature'], feature_importance['Importance'])
        plt.xlabel('重要性')
        plt.ylabel('特征')
        plt.title(f'{target} - 自适应集成模型选择特征重要性')
        plt.grid(True, axis='x')
        plt.tight_layout()
        plt.show()

        # Plot: pie charts of the model selection frequency
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.pie(train_model_selection_counts, labels=available_models, autopct='%1.1f%%')
        plt.title(f'训练集 - 模型选择频率')

        plt.subplot(1, 2, 2)
        plt.pie(test_model_selection_counts, labels=available_models, autopct='%1.1f%%')
        plt.title(f'测试集 - 模型选择频率')

        plt.tight_layout()
        plt.show()

        # Save the selection frequencies
        model_selection_data = pd.DataFrame({
            '模型': available_models,
            '训练集选择次数': train_model_selection_counts,
            '训练集选择百分比': train_model_selection_percent,
            '测试集选择次数': test_model_selection_counts,
            '测试集选择百分比': test_model_selection_percent
        })

        selection_file = os.path.join(save_folder, f'{target}_自适应集成模型选择频率.csv')
        model_selection_data.to_csv(selection_file, index=False)
        print(f"模型选择频率数据已保存至 {selection_file}")

        # Plot: distribution of test predictions, ensemble vs each base model
        plt.figure(figsize=(12, 6))

        model_data = [test_adaptive_predictions] + [test_predictions[model] for model in available_models]
        model_labels = ['自适应集成'] + available_models

        plt.boxplot(model_data, labels=model_labels)
        plt.ylabel('预测值')
        plt.title('自适应集成模型与各基础模型预测分布对比')
        plt.grid(True, axis='y')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Imported here so the handler cannot itself fail with a NameError
        # (and hide the real error) if `traceback` was never imported.
        import traceback
        print(f"创建自适应集成模型失败: {str(e)}")
        print(f"错误详情: {traceback.format_exc()}")



In [None]:

# Weighted-average ensemble - weights are optimised against the standard R²
print(f"训练 {target} 的加权平均集成模型...")

# Target-specific tolerance (0.15 when the target has no entry)
current_tolerance = target_tolerance.get(target, 0.15)

# Usable base models and their predictions on the train and test splits
available_models = []
train_predictions = {}
test_predictions = {}

# (registry name, deferred train-matrix lookup, deferred test-matrix lookup).
# Lookups are deferred so a feature matrix is only touched when the
# corresponding model is actually present.
weighted_candidates = [
    ('XGBoost', lambda: X_train_tree[target], lambda: X_test_tree[target]),
    ('LightGBM', lambda: X_train_tree[target], lambda: X_test_tree[target]),
    ('HistGradientBoosting', lambda: X_train_tree[target], lambda: X_test_tree[target]),
    ('RandomForest', lambda: X_train_tree_filled[target], lambda: X_test_tree_filled[target]),
    ('GaussianProcess', lambda: X_train_linear[target], lambda: X_test_linear[target]),
]

for candidate_name, fetch_train, fetch_test in weighted_candidates:
    if candidate_name not in models[target]:
        continue
    model = models[target][candidate_name]
    train_pred = model.predict(fetch_train())
    test_pred = model.predict(fetch_test())
    available_models.append(candidate_name)
    train_predictions[candidate_name] = train_pred
    test_predictions[candidate_name] = test_pred

print(f"可用模型: {available_models}")

if len(available_models) < 2:
    print("加权平均集成至少需要两个模型，目前可用模型不足")
else:
    try:
        # 通过优化找到最优权重
        print("寻找最优权重组合...")
        from scipy.optimize import minimize
        
        # 定义自定义加权平均函数
        def weighted_prediction(weights, preds_list):
            """Return the weighted sum ``sum(weights[k] * preds_list[k])``.

            The result is a new float array shaped like ``preds_list[0]``;
            the weights are applied as given (no normalisation here).
            """
            combined = np.zeros(preds_list[0].shape)
            for position, model_preds in enumerate(preds_list):
                np.add(combined, weights[position] * model_preds, out=combined)
            return combined
        
        # 定义要优化的损失函数（最大化标准R²）- 修改为使用标准R²而非容忍度R²
        def neg_r2(weights, preds_list, y_true):
            """Objective for the optimiser: negative standard R² of the blend.

            The weights are rescaled to sum to 1 before blending, so minimising
            the returned value maximises the R² of the weighted prediction.
            """
            normalized = np.array(weights)
            normalized = normalized / np.sum(normalized)

            blended = weighted_prediction(normalized, preds_list)
            return -r2_score(y_true, blended)
        
        # 准备用于优化的预测值列表
        train_preds_list = [train_predictions[model_name] for model_name in available_models]
        
        # 初始权重（均等）
        initial_weights = np.ones(len(available_models)) / len(available_models)
        
        # 约束：权重和为1，所有权重非负
        constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
        bounds = [(0, 1) for _ in range(len(available_models))]
        
        # 使用SLSQP优化算法寻找最优权重
        print("优化权重中...")
        result = minimize(
            neg_r2, 
            initial_weights, 
            args=(train_preds_list, y_train[target]),
            bounds=bounds,
            constraints=constraints,
            method='SLSQP'
        )
        
        if result.success:
            # 获取最优权重并归一化
            optimal_weights = result.x
            optimal_weights = optimal_weights / np.sum(optimal_weights)
            
            print("\n找到最优权重组合:")
            for i, model_name in enumerate(available_models):
                print(f"  {model_name}: {optimal_weights[i]:.4f}")
                
            # 使用最优权重在训练集和测试集上评估性能
            train_weighted_preds = weighted_prediction(
                optimal_weights, 
                [train_predictions[model_name] for model_name in available_models]
            )
            
            test_weighted_preds = weighted_prediction(
                optimal_weights, 
                [test_predictions[model_name] for model_name in available_models]
            )
            
            # 计算性能指标
            train_r2 = r2_score(y_train[target], train_weighted_preds)
            train_tol_r2 = tolerance_r2_score(y_train[target], train_weighted_preds, tolerance=current_tolerance, target=target)
            train_within_tol = prediction_within_tolerance(y_train[target], train_weighted_preds, tolerance=current_tolerance, target=target)
            
            test_r2 = r2_score(y_test[target], test_weighted_preds)
            test_tol_r2 = tolerance_r2_score(y_test[target], test_weighted_preds, tolerance=current_tolerance, target=target)
            test_within_tol = prediction_within_tolerance(y_test[target], test_weighted_preds, tolerance=current_tolerance, target=target)
            
            print("\n加权平均集成性能:")
            print(f"  训练集 - R²: {train_r2:.4f}, 容忍度R²: {train_tol_r2:.4f}, 在容忍范围内: {train_within_tol:.2%}")
            print(f"  测试集 - R²: {test_r2:.4f}, 容忍度R²: {test_tol_r2:.4f}, 在容忍范围内: {test_within_tol:.2%}")
            
            # 与各个基础模型比较性能
            print("\n与各基础模型性能比较:")
            for model_name in available_models:
                model_test_pred = test_predictions[model_name]
                model_r2 = r2_score(y_test[target], model_test_pred)
                model_tol_r2 = tolerance_r2_score(y_test[target], model_test_pred, tolerance=current_tolerance, target=target)
                
                r2_diff = test_r2 - model_r2
                tol_r2_diff = test_tol_r2 - model_tol_r2
                
                print(f"  vs {model_name}:")
                print(f"    R² 差异: {r2_diff:.4f} ({'+' if r2_diff > 0 else ''}{r2_diff/max(0.0001, abs(model_r2))*100:.2f}%)")
                print(f"    容忍度R² 差异: {tol_r2_diff:.4f} ({'+' if tol_r2_diff > 0 else ''}{tol_r2_diff/max(0.0001, abs(model_tol_r2))*100:.2f}%)")
            
            # 创建加权平均集成模型
            class WeightedAverageEnsemble:
                def __init__(self, models_dict, model_names, weights, model_datasets):
                    self.models_dict = models_dict
                    self.model_names = model_names
                    self.weights = weights
                    self.model_datasets = model_datasets
                    
                def predict(self, X):
                    predictions = []
                    
                    for i, model_name in enumerate(self.model_names):
                        model = self.models_dict[model_name]
                        
                        # 获取适当的数据格式
                        if model_name in ['XGBoost', 'LightGBM', 'HistGradientBoosting']:
                            if isinstance(X, pd.DataFrame):
                                # 假设X是原始数据框，需要应用适当的预处理
                                X_model = X  # 应该在实际应用中进行适当的预处理转换
                            else:
                                X_model = X
                        elif model_name == 'RandomForest':
                            if isinstance(X, pd.DataFrame):
                                # 对于RandomForest需要填充NaN
                                X_model = X.fillna(0)
                            else:
                                X_model = X
                        elif model_name == 'GaussianProcess':
                            if isinstance(X, pd.DataFrame):
                                # 假设X是原始数据框，需要应用适当的预处理
                                X_model = X  # 应该在实际应用中进行适当的预处理转换
                            else:
                                X_model = X
                        else:
                            X_model = X
                            
                        model_pred = model.predict(X_model)
                        predictions.append(model_pred)
                    
                    # 应用权重
                    weighted_preds = np.zeros(predictions[0].shape)
                    for i, preds in enumerate(predictions):
                        weighted_preds += self.weights[i] * preds
                        
                    return weighted_preds
            
            # 创建模型数据集字典
            model_datasets = {
                'XGBoost': 'tree',
                'LightGBM': 'tree',
                'HistGradientBoosting': 'tree',
                'RandomForest': 'tree_filled',
                'GaussianProcess': 'linear'
            }
            
            # 实例化加权平均集成模型
            weighted_model = WeightedAverageEnsemble(
                models_dict=models[target],
                model_names=available_models,
                weights=optimal_weights,
                model_datasets=model_datasets
            )
            
            # 保存模型
            models[target]['WeightedEnsemble'] = weighted_model
            # 使用pickle保存加权平均集成模型
            weighted_model_file = os.path.join(model_folder, f'{target}_加权平均集成模型.pkl')
            with open(weighted_model_file, 'wb') as f:
                pickle.dump(weighted_model, f)
            print(f"加权平均集成模型已保存至 {weighted_model_file}")

            # 保存训练集和测试集的预测结果
            train_prediction = pd.DataFrame({
                '实际值': y_train[target],
                '加权平均预测值': train_weighted_preds,
                '误差': np.abs(y_train[target] - train_weighted_preds)
            })

            test_prediction = pd.DataFrame({
                '实际值': y_test[target],
                '加权平均预测值': test_weighted_preds,
                '误差': np.abs(y_test[target] - test_weighted_preds)
            })

            # 添加各基础模型的预测结果以便比较
            for model_name in available_models:
                train_prediction[f'{model_name}预测值'] = train_predictions[model_name]
                test_prediction[f'{model_name}预测值'] = test_predictions[model_name]

            # 保存到文件
            train_file = os.path.join(save_folder, f'{target}_加权平均集成训练集预测结果.csv')
            test_file = os.path.join(save_folder, f'{target}_加权平均集成测试集预测结果.csv')

            train_prediction.to_csv(train_file, index=False)
            test_prediction.to_csv(test_file, index=False)

            print(f"训练集预测结果已保存至 {train_file}")
            print(f"测试集预测结果已保存至 {test_file}")

            # 可视化: 预测vs实际值散点图 (训练集和测试集)
            plt.figure(figsize=(12, 5))
            
            # 训练集散点图
            plt.subplot(1, 2, 1)
            plt.scatter(y_train[target], train_weighted_preds, alpha=0.5)
            plt.plot([y_train[target].min(), y_train[target].max()], [y_train[target].min(), y_train[target].max()], 'r--')
            plt.xlabel('实际值')
            plt.ylabel('预测值')
            plt.title(f'训练集: R²={train_r2:.4f}')
            
            # 测试集散点图
            plt.subplot(1, 2, 2)
            plt.scatter(y_test[target], test_weighted_preds, alpha=0.5)
            plt.plot([y_test[target].min(), y_test[target].max()], [y_test[target].min(), y_test[target].max()], 'r--')
            plt.xlabel('实际值')
            plt.ylabel('预测值')
            plt.title(f'测试集: R²={test_r2:.4f}')
            
            plt.tight_layout()
            plt.show()
            
            # 绘制误差分布
            plt.figure(figsize=(12, 5))
            
            # 训练集误差
            plt.subplot(1, 2, 1)
            train_errors = y_train[target] - train_weighted_preds
            plt.hist(train_errors, bins=30, alpha=0.7)
            plt.axvline(x=0, color='r', linestyle='--')
            plt.xlabel('预测误差')
            plt.ylabel('频次')
            plt.title(f'训练集误差分布 (MAE={np.abs(train_errors).mean():.4f})')
            
            # 测试集误差
            plt.subplot(1, 2, 2)
            test_errors = y_test[target] - test_weighted_preds
            plt.hist(test_errors, bins=30, alpha=0.7)
            plt.axvline(x=0, color='r', linestyle='--')
            plt.xlabel('预测误差')
            plt.ylabel('频次')
            plt.title(f'测试集误差分布 (MAE={np.abs(test_errors).mean():.4f})')
            
            plt.tight_layout()
            plt.show()
            
            # 绘制权重条形图
            plt.figure(figsize=(10, 6))
            plt.bar(available_models, optimal_weights)
            plt.xlabel('模型')
            plt.ylabel('权重')
            plt.title(f'{target} - 加权平均集成模型权重分布')
            plt.xticks(rotation=45)
            plt.grid(True, axis='y')
            plt.tight_layout()
            plt.show()
            
            # 绘制各模型与加权平均模型的预测对比图
            plt.figure(figsize=(12, 6))
            model_data = [test_weighted_preds] + [test_predictions[model] for model in available_models]
            model_labels = ['加权平均'] + available_models
            
            plt.boxplot(model_data, labels=model_labels)
            plt.ylabel('预测值')
            plt.title('加权平均模型与各基础模型预测分布对比')
            plt.grid(True, axis='y')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()
            
        else:
            print("权重优化失败:", result.message)
            
    except Exception as e:
        print(f"创建加权平均集成模型失败: {str(e)}")
        print(f"错误详情: {traceback.format_exc()}")

In [None]:
# 吸油能力XGBoost贝叶斯超参数优化
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer
import numpy as np
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt.plots import plot_convergence
import warnings
warnings.filterwarnings('ignore')

# Make sure the folder that receives the exported visualisation data exists.
save_folder = '模型可视化数据'
if not os.path.exists(save_folder):
    os.makedirs(save_folder)
    print(f"已创建文件夹：{save_folder}")

# Target variable handled by this cell.
target = '吸油能力'
print(f"训练 {target} 的XGBoost模型...")

# Target-specific tolerance, falling back to 0.03 when none is configured.
current_tolerance = target_tolerance.get(target, 0.03)

# Feature sets for this target (the "tree" variants are used for XGBoost).
X_train_model = X_train_tree[target]
X_test_model = X_test_tree[target]

# 从您的代码中复制的完整函数定义
def tolerance_r2_score(y_true, y_pred, tolerance=0.15, target=None):
    """
    Compute an R² variant that forgives errors inside a relative tolerance band.

    Each residual is reduced by ``tolerance * |y_true|`` and clipped at zero
    before the residual sum of squares is formed, so predictions inside the
    band contribute no error. When ``target`` is given and present in the
    global ``target_tolerance`` mapping, that entry replaces ``tolerance``.

    Returns 0 when ``y_true`` has zero variance, otherwise ``1 - RSS / TSS``.
    """
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    # A configured per-target tolerance takes precedence over the argument.
    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    # Only the part of each residual that exceeds its allowance is penalised.
    allowance = tolerance * np.abs(actual)
    excess = np.maximum(0, np.abs(actual - predicted) - allowance)

    total_ss = np.sum((actual - np.mean(actual)) ** 2)
    residual_ss = np.sum(excess ** 2)

    if total_ss == 0:
        return 0

    return 1 - (residual_ss / total_ss)

def prediction_within_tolerance(y_true, y_pred, tolerance=0.15, target=None):
    """
    Return the fraction of predictions within ``±tolerance * |y_true|`` of the truth.

    Both inputs are flattened to 1-D first, so a column vector paired with a
    flat array is compared element by element instead of being broadcast into
    an n×n matrix. When ``target`` is given and present in the global
    ``target_tolerance`` mapping, that entry replaces ``tolerance``.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # A configured per-target tolerance takes precedence over the argument.
    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    tolerance_values = tolerance * np.abs(y_true)
    within_tolerance = np.abs(y_true - y_pred) <= tolerance_values

    return np.mean(within_tolerance)

def make_tolerance_scorer(target_name):
    """
    Build a ``score(y_true, y_pred)`` function for ``target_name``.

    The returned function gives the fraction of predictions whose absolute
    error is at most ``tolerance * |y_true|``. The tolerance is looked up in
    the global ``target_tolerance`` mapping at call time, defaulting to 0.03.
    """
    def tolerance_score(y_true, y_pred):
        tolerance = target_tolerance.get(target_name, 0.03)
        actual = np.asarray(y_true).ravel()
        predicted = np.asarray(y_pred).ravel()
        # Compare against tolerance * |y_true| rather than dividing by
        # |y_true|: no division by zero, and an exact hit on a zero target
        # counts as inside the band, as in prediction_within_tolerance.
        return np.mean(np.abs(actual - predicted) <= tolerance * np.abs(actual))
    return tolerance_score

def evaluate_model(model, X_train, y_train, X_test, y_test, target, model_name):
    """
    Score a fitted model on the train and test splits and plot the fit.

    For each split the standard R², the tolerance R² and the share of
    predictions inside the tolerance band are computed, a summary is printed
    and a two-panel predicted-vs-actual scatter plot is shown.

    Returns a dict with the keys ``train_r2``, ``train_tol_r2``,
    ``train_within_tol``, ``test_r2``, ``test_tol_r2`` and ``test_within_tol``.
    """
    tol = target_tolerance.get(target, 0.15)

    def _predict_flat(features):
        # Collapse an (n, 1) prediction matrix into a flat vector.
        preds = model.predict(features)
        if len(preds.shape) > 1 and preds.shape[1] == 1:
            preds = preds.flatten()
        return preds

    def _split_metrics(actual, preds):
        # Evaluated left to right: R², tolerance R², within-tolerance share.
        return (
            r2_score(actual, preds),
            tolerance_r2_score(actual, preds, tolerance=tol, target=target),
            prediction_within_tolerance(actual, preds, tolerance=tol, target=target),
        )

    def _scatter_panel(position, actual, preds, title):
        # One predicted-vs-actual panel with the y = x reference line.
        plt.subplot(1, 2, position)
        plt.scatter(actual, preds, alpha=0.6, s=30)
        lower = min(min(actual), min(preds))
        upper = max(max(actual), max(preds))
        plt.plot([lower, upper], [lower, upper], 'r--', linewidth=2)
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(title)
        plt.grid(True, alpha=0.3)

    # Training split first, then the test split.
    y_train_pred = _predict_flat(X_train)
    train_r2, train_tol_r2, train_within_tol = _split_metrics(y_train, y_train_pred)

    y_test_pred = _predict_flat(X_test)
    test_r2, test_tol_r2, test_within_tol = _split_metrics(y_test, y_test_pred)

    print(f"\n{model_name} 在 {target} 上的评估结果:")
    print(f"训练集: R²={train_r2:.4f}, 容忍度R²={train_tol_r2:.4f}, 在容忍范围内比例={train_within_tol:.2%}")
    print(f"测试集: R²={test_r2:.4f}, 容忍度R²={test_tol_r2:.4f}, 在容忍范围内比例={test_within_tol:.2%}")

    # Side-by-side scatter plots of predicted against actual values.
    plt.figure(figsize=(12, 5))
    _scatter_panel(1, y_train, y_train_pred, f'训练集: R²={train_r2:.4f}')
    _scatter_panel(2, y_test, y_test_pred, f'测试集: R²={test_r2:.4f}')
    plt.tight_layout()
    plt.show()

    return {
        'train_r2': train_r2,
        'train_tol_r2': train_tol_r2,
        'train_within_tol': train_within_tol,
        'test_r2': test_r2,
        'test_tol_r2': test_tol_r2,
        'test_within_tol': test_within_tol,
    }

# Build the scorer for the current target and wrap it for scikit-learn.
# The wrapped scorer reports the share of predictions inside the tolerance
# band (see make_tolerance_scorer); higher is better.
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# XGBoost settings shared by every candidate model.
base_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'n_jobs': -1,
    'random_state': 42,
    'missing': np.nan
}

# Search space for the Bayesian optimisation. The trailing notes record the
# discrete grids of the earlier hand-written search each range was derived from.
dimensions = [
    Integer(50, 100, name='n_estimators'),           # earlier grid: [100, 50, 90, 80]
    Real(0.5, 0.8, name='learning_rate'),            # earlier grid: [0.7, 0.8, 0.6, 0.5]
    Integer(3, 6, name='max_depth'),                 # earlier grid: [4, 5, 6, 3]
    Integer(2, 6, name='min_child_weight'),          # earlier grid: [5, 4, 6, 3, 2]
    Real(0.0, 0.2, name='gamma'),                    # earlier grid: [0, 0.1, 0.2]
    Real(0.5, 0.6, name='subsample'),               # earlier grid: [0.6, 0.5]
    Real(0.8, 1.0, name='colsample_bytree'),        # earlier grid: [0.9, 1.0, 0.8]
    Real(0.0, 1.0, name='reg_alpha'),               # earlier grid: [0, 0.4, 0.5, 0.6, 1.0]
    Real(0.5, 1.0, name='reg_lambda')               # earlier grid: [1.0, 0.5, 0.7, 0.8]
]

# Cross-validation splitter: 5 shuffled folds with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 定义目标函数
@use_named_args(dimensions=dimensions)
def objective(**params):
    """
    Objective minimised by the Bayesian optimiser.

    Builds an XGBRegressor from the sampled hyperparameters plus
    ``base_params``, cross-validates it with the tolerance scorer and returns
    the negated mean score, since the optimiser minimises.

    Returns the penalty value 1.0 when cross-validation raises, so a single
    bad configuration does not abort the whole search.
    """
    # ``params`` holds exactly the named dimensions of the search space.
    model = XGBRegressor(**params, **base_params)

    try:
        cv_scores = cross_val_score(
            model, X_train_model, y_train[target],
            cv=kf, scoring=tol_scorer_wrapped
        )
    except Exception as exc:
        # Catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit still stop the search, and report what went wrong.
        print(f"参数组合评估失败，返回惩罚值 1.0: {exc}")
        return 1.0

    return -cv_scores.mean()

# Run the Bayesian optimisation: 50 objective evaluations, 10 of them initial points.
print("执行贝叶斯优化...")
result = gp_minimize(
    func=objective,
    dimensions=dimensions,
    n_calls=50,
    n_initial_points=10,
    random_state=42,
    verbose=True
)

# Map the best point back to parameter names; the objective is negated, so
# the best CV score is -result.fun.
best_params = dict(zip([dim.name for dim in dimensions], result.x))
print(f"最佳参数: {best_params}")
print(f"最佳CV得分: {-result.fun:.4f}")

# Build the final model from the best parameters.
xgb_model = XGBRegressor(
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    min_child_weight=best_params['min_child_weight'],
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    reg_alpha=best_params['reg_alpha'],
    reg_lambda=best_params['reg_lambda'],
    **base_params
)

xgb_model.fit(X_train_model, y_train[target])

# Cross-validated standard R² of the final configuration.
cv_scores = cross_val_score(xgb_model, X_train_model, y_train[target], cv=kf, scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated tolerance score.
# NOTE(review): the printed label says "tolerance R²", but tol_scorer_wrapped
# wraps make_tolerance_scorer, which returns the share of predictions inside
# the tolerance band — confirm which metric is meant to be reported here.
tolerance_cv_scores = cross_val_score(
    xgb_model, X_train_model, y_train[target], cv=kf, 
    scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Register the model in the shared registry.
models[target]['XGBoost'] = xgb_model

# Make sure the folder for pickled models exists.
model_folder = '训练模型文件'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
    print(f"已创建模型文件夹：{model_folder}")

# Persist the fitted model with pickle.
xgb_model_file = os.path.join(model_folder, f'{target}_XGBoost模型.pkl')
with open(xgb_model_file, 'wb') as f:
    pickle.dump(xgb_model, f)
print(f"XGBoost 模型已保存至 {xgb_model_file}")

# Evaluate on both splits (prints metrics and shows scatter plots).
results = evaluate_model(xgb_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "XGBoost")

# Predictions for the CSV export.
y_pred_train = xgb_model.predict(X_train_model)
y_pred_test = xgb_model.predict(X_test_model)

# One table per split: actual value, prediction, absolute error, split label.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Write each split to its own file.
train_file = os.path.join(save_folder, f'{target}_XGBoost训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_XGBoost测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")

# Feature importances, sorted from most to least important.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': xgb_model.feature_importances_
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title(f'{target} - XGBoost特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

# Export the feature-importance table.
feature_importance_file = os.path.join(save_folder, f'{target}_XGBoost特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")

# Convergence plot of the optimisation.
plt.figure(figsize=(10, 6))
plot_convergence(result)
plt.title('贝叶斯优化收敛过程')
plt.show()

# Optimisation history: per-call objective value and running best value.
optimization_history = pd.DataFrame({
    '迭代次数': range(1, len(result.func_vals) + 1),
    '目标函数值': result.func_vals,
    '最佳目标函数值': [min(result.func_vals[:i+1]) for i in range(len(result.func_vals))]
})

# Add one column per hyperparameter with the value tried at each call.
param_names = [dim.name for dim in dimensions]
for i, param_name in enumerate(param_names):
    optimization_history[f'参数_{param_name}'] = [x[i] for x in result.x_iters]

optimization_history_file = os.path.join(save_folder, f'{target}_贝叶斯优化历史.csv')
optimization_history.to_csv(optimization_history_file, index=False)
print(f"优化历史数据已保存至 {optimization_history_file}")

print(f"\n贝叶斯优化完成！")

In [None]:
#xgboost
from xgboost import XGBRegressor
import os

# Make sure the folder that receives the exported visualisation data exists.
save_folder = '模型可视化数据'
if not os.path.exists(save_folder):
    os.makedirs(save_folder)
    print(f"已创建文件夹：{save_folder}")
# Target variable handled by this cell.
target = '吸油能力'
print(f"训练 {target} 的XGBoost模型...")

# Target-specific tolerance, falling back to 0.03 when none is configured.
current_tolerance = target_tolerance.get(target, 0.03)

# Feature sets for this target (the "tree" variants are used for XGBoost).
X_train_model = X_train_tree[target]
X_test_model = X_test_tree[target]

# Scorer for the current target, wrapped for scikit-learn (higher is better).
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# XGBoost settings shared by every candidate model.
base_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'n_jobs': -1,
    'random_state': 42,
    'missing': np.nan
}

# Discrete hyperparameter grid sampled by the randomised search.
param_dist = {
    'n_estimators': [100, 200, 300,500,600,800,1000],
    'learning_rate': [0.01, 0.03, 0.05,0.02,0.1,0.5],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'reg_alpha': [0, 0.5, 1.0],
    'reg_lambda': [1.0, 1.5, 2.0]
}

# Starting estimator; the search overrides the parameters listed in param_dist.
base_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    reg_lambda=1.0,
    **base_params
)

# Randomised hyperparameter search scored with the tolerance scorer.
# `kf` is the KFold splitter created in an earlier cell.
print("执行超参数优化...")
search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=1000,
    cv=kf,
    scoring=tol_scorer_wrapped,
    n_jobs=-1,
    random_state=42,
    verbose=1
)
search.fit(X_train_model, y_train[target])
xgb_model = search.best_estimator_
print(f"最佳参数: {search.best_params_}")
print(f"最佳CV得分: {search.best_score_:.4f}")
# Cross-validated standard R² of the best configuration.
cv_scores = cross_val_score(xgb_model, X_train_model, y_train[target], cv=kf, scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated tolerance score.
# NOTE(review): the printed label says "tolerance R²", but tol_scorer_wrapped
# wraps make_tolerance_scorer — confirm which metric is meant to be reported.
tolerance_cv_scores = cross_val_score(
    xgb_model, X_train_model, y_train[target], cv=kf, 
    scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")
# Register the model in the shared registry.
models[target]['XGBoost'] = xgb_model
# Make sure the folder for pickled models exists.
model_folder = '训练模型文件'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
    print(f"已创建模型文件夹：{model_folder}")

# Persist the fitted model with pickle.
xgb_model_file = os.path.join(model_folder, f'{target}_XGBoost模型.pkl')
with open(xgb_model_file, 'wb') as f:
    pickle.dump(xgb_model, f)
print(f"XGBoost 模型已保存至 {xgb_model_file}")
# Evaluate on both splits (prints metrics and shows scatter plots).
results = evaluate_model(xgb_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "XGBoost")
# Predictions for the CSV export.
y_pred_train = xgb_model.predict(X_train_model)
y_pred_test = xgb_model.predict(X_test_model)

# One table per split: actual value, prediction, absolute error, split label.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Write each split to its own file.
train_file = os.path.join(save_folder, f'{target}_XGBoost训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_XGBoost测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")
# Feature importances, sorted from most to least important.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': xgb_model.feature_importances_
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title(f'{target} - XGBoost特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()
# Export the feature-importance table.
feature_importance_file = os.path.join(save_folder, f'{target}_XGBoost特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")

# Loss curves over boosting rounds on the training and test sets.
# NOTE(review): this refits a model built from base_model's parameters, not
# from the tuned xgb_model — confirm that the untuned configuration is the
# one whose training progress should be plotted.
eval_set = [(X_train_model, y_train[target]), (X_test_model, y_test[target])]
model_train = XGBRegressor(**{**base_model.get_params(), 'eval_metric': 'rmse'})
model_train.fit(X_train_model, y_train[target], eval_set=eval_set, verbose=False)

# validation_0 is the first eval_set entry (train), validation_1 the second (test).
# Note that `results` from evaluate_model above is overwritten here.
results = model_train.evals_result()
epochs = len(results['validation_0']['rmse'])
x_axis = range(0, epochs)

plt.figure(figsize=(10, 6))
plt.plot(x_axis, results['validation_0']['rmse'], label='训练集')
plt.plot(x_axis, results['validation_1']['rmse'], label='测试集')
plt.legend()
plt.ylabel('RMSE')
plt.xlabel('迭代次数')
plt.title('XGBoost训练进度')
plt.grid(True)
plt.show()
# Export the per-round RMSE values.
training_progress_data = pd.DataFrame({
    '迭代次数': x_axis,
    '训练集RMSE': results['validation_0']['rmse'],
    '测试集RMSE': results['validation_1']['rmse']
})
training_progress_file = os.path.join(save_folder, f'{target}_XGBoost训练进度.csv')
training_progress_data.to_csv(training_progress_file, index=False)
print(f"训练进度数据已保存至 {training_progress_file}")
# Learning-rate sensitivity analysis: fit one XGBoost model per learning rate
# and record its per-round RMSE on the test set.
learning_rates = [0.005, 0.01, 0.03, 0.05, 0.1, 0.2]
plt.figure(figsize=(10, 6))

# Wide table: one row per boosting round, one RMSE column per learning rate.
lr_analysis_data = pd.DataFrame()

for lr in learning_rates:
    model = XGBRegressor(
        learning_rate=lr,
        n_estimators=500,
        max_depth=3,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='reg:squarederror',
        tree_method='hist',
        random_state=42
    )
    # The test set is the only evaluation set, so it is reported as 'validation_0'.
    eval_set = [(X_test_model, y_test[target])]
    model.fit(X_train_model, y_train[target], eval_set=eval_set, verbose=False)
    results = model.evals_result()
    test_rmse = results['validation_0']['rmse']

    # Column for the current learning rate, keyed by boosting round.
    temp_df = pd.DataFrame({
        '迭代次数': range(len(test_rmse)),
        f'学习率_{lr}': test_rmse
    })

    if lr_analysis_data.empty:
        lr_analysis_data = temp_df
    else:
        lr_analysis_data = pd.merge(
            lr_analysis_data, temp_df, on='迭代次数', how='outer'
        )

    plt.plot(test_rmse, label=f'学习率: {lr}')

# Finish the figure: without a legend the per-curve labels set above are
# never drawn, and the axes would be unlabelled.
plt.xlabel('迭代次数')
plt.ylabel('测试集RMSE')
plt.title(f'{target} - XGBoost学习率影响分析')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Export the learning-rate analysis table.
lr_analysis_file = os.path.join(save_folder, f'{target}_XGBoost学习率分析.csv')
lr_analysis_data.to_csv(lr_analysis_file, index=False)
print(f"学习率分析数据已保存至 {lr_analysis_file}")


In [None]:


# LightGBM: train, persist and evaluate a model for the current target.
# `target` and `save_folder` are taken from the cells run before this one.
print(f"训练 {target} 的LightGBM模型...")
# Target-specific tolerance, falling back to 0.03 when none is configured.
current_tolerance = target_tolerance.get(target, 0.03)
# Feature sets for this target (the "tree" variants).
X_train_model = X_train_tree[target]
X_test_model = X_test_tree[target]
# Settings shared by every LightGBM configuration.
base_params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'verbose': -1,
    'n_jobs': -1,
    'random_state': 42
}
# Hand-picked hyperparameters: many rounds at a very small learning rate.
lgb_params = {
    'n_estimators': 12000,
    'learning_rate': 0.001,
    'num_leaves': 20,
    'max_depth': 20,  # author's note "8shi0.72": presumably max_depth=8 scored 0.72 — confirm
    'min_child_samples': 1,
    'subsample': 1,
    'colsample_bytree':1,
    'reg_alpha': 10,
    'reg_lambda': 1.0,
    **base_params
}

print("使用自定义LightGBM包装器训练模型")
# Remember the feature column names when the training data exposes them.
feature_cols = X_train_model.columns.tolist() if hasattr(X_train_model, 'columns') else None

# Create and fit the model. CustomLGBMRegressor is defined elsewhere in the notebook.
lgb_model = CustomLGBMRegressor(**lgb_params)
lgb_model.fit(X_train_model, y_train[target])

# Register the model in the shared registry.
models[target]['LightGBM'] = lgb_model
# Make sure the folder for pickled models exists.
model_folder = '训练模型文件'
if not os.path.exists(model_folder):
    os.makedirs(model_folder)
    print(f"已创建模型文件夹：{model_folder}")

# Persist the fitted model with pickle.
lgb_model_file = os.path.join(model_folder, f'{target}_LightGBM模型.pkl')
with open(lgb_model_file, 'wb') as f:
    pickle.dump(lgb_model, f)
print(f"LightGBM 模型已保存至 {lgb_model_file}")
# Evaluate on both splits (prints metrics and shows scatter plots).
results = evaluate_model(lgb_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "LightGBM")

print("LightGBM模型训练成功")
# Predictions for the CSV export.
y_pred_train = lgb_model.predict(X_train_model)
y_pred_test = lgb_model.predict(X_test_model)

# One table per split: actual value, prediction, absolute error, split label.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Write each split to its own file.
train_file = os.path.join(save_folder, f'{target}_LightGBM训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_LightGBM测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")
# Plot and export the LightGBM gain-based feature importances. Both steps
# only run when the wrapped model exposes `feature_importance` and the
# feature names are known; otherwise nothing is written, so a stale
# `feature_importance` table from another model can never be saved under
# the LightGBM file name.
if hasattr(lgb_model.model, 'feature_importance') and feature_cols is not None:
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': lgb_model.model.feature_importance(importance_type='gain')
    })
    feature_importance = feature_importance.sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['Feature'], feature_importance['Importance'])
    plt.xlabel('增益重要性')
    plt.ylabel('特征')
    plt.title(f'{target} - LightGBM特征重要性')
    plt.grid(True, axis='x')
    plt.tight_layout()
    plt.show()

    # Export the feature-importance table.
    feature_importance_file = os.path.join(save_folder, f'{target}_LightGBM特征重要性.csv')
    feature_importance.to_csv(feature_importance_file, index=False)
    print(f"特征重要性数据已保存至 {feature_importance_file}")
else:
    print("无法获取LightGBM特征重要性，已跳过绘图和保存")


# Analyse the test samples whose LightGBM prediction misses the actual value
# by more than an absolute threshold.
y_pred = lgb_model.predict(X_test_model)

# Work on the raw values when the target is a pandas object.
if hasattr(y_test[target], 'values'):
    y_true_values = y_test[target].values
else:
    y_true_values = y_test[target]

# Absolute error per test sample.
errors = np.abs(y_true_values - y_pred)

# Absolute error threshold (in target units); adjust as needed.
tolerance = 5.0

# Positions of the samples whose error exceeds the threshold.
inaccurate_mask = errors > tolerance
inaccurate_indices = np.where(inaccurate_mask)[0]

# The share is relative to the number of test samples of this target, not to
# the length of the per-target `y_test` container.
n_test_samples = len(y_true_values)
inaccurate_share = len(inaccurate_indices) / n_test_samples * 100 if n_test_samples else 0.0

print(f"\n预测不准确的样本数量: {len(inaccurate_indices)} (占测试集的 {inaccurate_share:.2f}%)")
print(f"使用的容忍度阈值: {tolerance}")

if len(inaccurate_indices) > 0:
    # Prefer the row labels of the target values themselves, then those of the
    # feature frame; fall back to plain array positions.
    try:
        if hasattr(y_test[target], 'index'):
            original_indices = [y_test[target].index[i] for i in inaccurate_indices]
        elif isinstance(X_test_model, pd.DataFrame):
            original_indices = [X_test_model.index[i] for i in inaccurate_indices]
        else:
            original_indices = inaccurate_indices
    except (TypeError, IndexError, KeyError) as e:
        # e.g. a plain list has an `index` method that cannot be subscripted.
        print(f"无法获取原始索引: {str(e)}")
        original_indices = inaccurate_indices

    # One record per inaccurate sample.
    inaccurate_samples = []
    for label, idx in zip(original_indices, inaccurate_indices):
        actual = y_true_values[idx]
        inaccurate_samples.append({
            '样本索引': label,
            '实际值': actual,
            '预测值': y_pred[idx],
            '绝对误差': errors[idx],
            # The relative error is undefined for a zero target; report infinity.
            '相对误差(%)': (errors[idx] / np.abs(actual)) * 100 if actual != 0 else float('inf')
        })

    # Largest errors first.
    inaccurate_df = pd.DataFrame(inaccurate_samples)
    inaccurate_df = inaccurate_df.sort_values('绝对误差', ascending=False)

    print("\n预测不准确的样本详情 (按误差降序排列):")
    print(inaccurate_df)

    # Saved relative to the working directory, as before.
    inaccurate_df.to_csv(f'{target}_不准确预测.csv', index=False)
else:
    print(f"\n没有发现预测不准确的样本 (容忍度阈值: {tolerance})")



In [None]:


# HistGradientBoosting: train a HistGradientBoostingRegressor for the current `target`.
# Relies on notebook state defined in earlier cells (target, target_tolerance,
# X_train_tree / X_test_tree, y_train, make_tolerance_scorer, kf).
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
print(f"训练 {target} 的HistGradientBoosting模型...")

# Target-specific tolerance; falls back to 0.03 when the target has no entry.
current_tolerance = target_tolerance.get(target, 0.03)

# Pick the feature matrices prepared for tree models, keyed by target.
X_train_model = X_train_tree[target]
X_test_model = X_test_tree[target]

# Build the scorer for this target and wrap it for scikit-learn
# (greater_is_better=True: larger scores are treated as better).
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Hyper-parameter candidates sampled by the randomized search.
# NOTE(review): the lists below span 5*4*7*3*3 = 1260 combinations, fewer than
# the n_iter=2500 requested further down — confirm the intended search budget.
param_dist = {
    'max_iter': [450, 400,350,440,460],
    'learning_rate': [0.01, 0.008, 0.011,0.009],
    'max_depth': [9, 11, 10,8,7,6,5],
    'min_samples_leaf': [1, 2, 4],
    'l2_regularization': [0, 0.1, 0.2]
}

# Base estimator. Every key listed in param_dist is overridden during the
# search; these values are used unchanged only in the small-data branch below.
base_model = HistGradientBoostingRegressor(
    max_iter=3000,
    learning_rate=1,
    max_depth=3,
    min_samples_leaf=4,
    l2_regularization=0.5,
    loss='squared_error',
    random_state=42
)

# Run the hyper-parameter search only when at least 90 training rows exist.
if len(X_train_model) >= 90:
    print("执行超参数优化...")
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=2500,
        cv=kf,
        scoring=tol_scorer_wrapped,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train_model, y_train[target])
    hgb_model = search.best_estimator_
    print(f"最佳参数: {search.best_params_}")
    print(f"最佳CV得分: {search.best_score_:.4f}")
else:
    # Too few rows for a search: fit the predefined base model directly.
    print("使用预定义参数...")
    hgb_model = base_model
    hgb_model.fit(X_train_model, y_train[target])

# Cross-validated R² of the selected model on the training data (folds from kf).
cv_scores = cross_val_score(hgb_model, X_train_model, y_train[target], cv=kf, scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score using the target-specific tolerance scorer.
tolerance_cv_scores = cross_val_score(
    hgb_model, X_train_model, y_train[target], cv=kf, 
    scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Register the model in the in-memory registry for this target.
models[target]['HistGradientBoosting'] = hgb_model
# Persist the fitted model with pickle under model_folder.
hgb_model_file = os.path.join(model_folder, f'{target}_HistGradientBoosting模型.pkl')
with open(hgb_model_file, 'wb') as f:
    pickle.dump(hgb_model, f)
print(f"HistGradientBoosting模型已保存至 {hgb_model_file}")
# Evaluate on train and test data via the notebook's evaluate_model helper
# (defined elsewhere; its return value is kept in `results`).
results = evaluate_model(hgb_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "HistGradientBoosting")
# Predictions for both splits.
y_pred_train = hgb_model.predict(X_train_model)
y_pred_test = hgb_model.predict(X_test_model)

# One prediction table per split: actual value, prediction, absolute error,
# plus a column naming the split.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Write both tables to CSV under save_folder (the row index is not written).
train_file = os.path.join(save_folder, f'{target}_HistGradientBoosting训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_HistGradientBoosting测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")
# HistGradientBoosting does not expose feature importances directly, so they
# are estimated with permutation importance on the test split (10 repeats).
from sklearn.inspection import permutation_importance

perm_importance = permutation_importance(
    hgb_model, X_test_model, y_test[target], 
    n_repeats=10, random_state=42, n_jobs=-1
)

# Pair each training column with its mean importance, most important first.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': perm_importance.importances_mean
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Horizontal bar chart of the importances.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('置换重要性')
plt.ylabel('特征')
plt.title(f'{target} - HistGradientBoosting特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()
# Save the importance table to CSV under save_folder.
feature_importance_file = os.path.join(save_folder, f'{target}_HistGradientBoosting特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")
# Effect of learning rate and iteration count on train/test R².
# One subplot per learning rate; each curve sweeps max_iters.
learning_rates = [0.01, 0.008, 0.011,0.009]
max_iters = [50, 100, 200, 300]
fig, axs = plt.subplots(len(learning_rates), 1, figsize=(10, 4*len(learning_rates)), sharex=True)

# One record per (learning rate, iteration count) pair, appended at the moment
# the scores are computed. train_scores / test_scores are reset for every
# learning rate, so building the table after the loop would only ever see the
# scores of the last learning rate.
analysis_data = []

for i, lr in enumerate(learning_rates):
    train_scores = []
    test_scores = []
    for iter_count in max_iters:
        model = HistGradientBoostingRegressor(
            max_iter=iter_count,
            learning_rate=lr,
            max_depth=3,
            random_state=42
        )
        model.fit(X_train_model, y_train[target])
        train_score = r2_score(y_train[target], model.predict(X_train_model))
        test_score = r2_score(y_test[target], model.predict(X_test_model))
        train_scores.append(train_score)
        test_scores.append(test_score)
        analysis_data.append({
            '学习率': lr,
            '迭代次数': iter_count,
            '训练集R²': train_score,
            '测试集R²': test_score
        })

    axs[i].plot(max_iters, train_scores, 'o-', label='训练集 R²')
    axs[i].plot(max_iters, test_scores, 'o-', label='测试集 R²')
    axs[i].set_title(f'学习率 = {lr}')
    axs[i].set_ylabel('R²')
    axs[i].grid(True)
    axs[i].legend()
# The x-label is set on the current (last) axes; the x-axis is shared.
plt.xlabel('迭代次数')
plt.suptitle('HistGradientBoosting - 学习率和迭代次数影响')
plt.tight_layout()
plt.show()

# Convert the collected records to a DataFrame and save them under save_folder.
lr_iter_analysis = pd.DataFrame(analysis_data)
lr_analysis_file = os.path.join(save_folder, f'{target}_HistGradientBoosting学习率迭代分析.csv')
lr_iter_analysis.to_csv(lr_analysis_file, index=False)
print(f"学习率和迭代次数分析数据已保存至 {lr_analysis_file}")




In [None]:

# RandomForestRegressor: train a random forest for the current `target`.
# Relies on notebook state defined in earlier cells (target, target_tolerance,
# X_train_tree_filled / X_test_tree_filled, y_train, make_tolerance_scorer, kf).
from sklearn.ensemble import RandomForestRegressor

# Announce which target is being trained.
print(f"训练 {target} 的随机森林模型...")

# Target-specific tolerance; falls back to 0.03 when the target has no entry.
current_tolerance = target_tolerance.get(target, 0.03)

# Pick the "filled" tree-model feature matrices, keyed by target
# (presumably the imputed variants — confirm against the preprocessing cell).
X_train_model = X_train_tree_filled[target]
X_test_model = X_test_tree_filled[target]

# Build the scorer for this target and wrap it for scikit-learn.
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Hyper-parameter candidates sampled by the randomized search.
param_dist = {
    'n_estimators': [ 900,1000,1200,1100],
    'max_depth': [4, 5, 6,8,9,10],
    'min_samples_split': [ 5,4,6,7,8],
    'min_samples_leaf': [7, 4,3,2,1,5,6],
    'max_features': ['sqrt', 'log2', None]
}

# Base estimator. Keys listed in param_dist are overridden during the search;
# bootstrap/oob_score/n_jobs/random_state always come from here.
base_model = RandomForestRegressor(
    n_estimators=800,
    max_depth=None,
    min_samples_split=3,
    min_samples_leaf=4,
    max_features='sqrt',
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42
)

# Run the hyper-parameter search only when at least 90 training rows exist.
if len(X_train_model) >= 90:
    print("执行超参数优化...")
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=500,# author's note: larger values raised the training-set R² but can take very long; recorded pairs 2000/0.3569; 4000/0.2091; 3000/0.1221; 1000/0.1577 (presumably n_iter/result — meaning not recorded, TODO confirm)
        cv=kf,
        scoring=tol_scorer_wrapped,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train_model, y_train[target])
    rf_model = search.best_estimator_
    print(f"最佳参数: {search.best_params_}")
    print(f"最佳CV得分: {search.best_score_:.4f}")
else:
    # Too few rows for a search: fit the predefined base model directly.
    print("使用预定义参数...")
    rf_model = base_model
    rf_model.fit(X_train_model, y_train[target])

# Cross-validated R² of the selected forest on the training data (folds from kf).
cv_scores = cross_val_score(rf_model, X_train_model, y_train[target], cv=kf, scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score using the target-specific tolerance scorer.
tolerance_cv_scores = cross_val_score(
    rf_model, X_train_model, y_train[target], cv=kf, 
    scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Print the out-of-bag score when the fitted model exposes one.
if hasattr(rf_model, 'oob_score_'):
    print(f"袋外评分 (OOB score): {rf_model.oob_score_:.4f}")

# Register the model in the in-memory registry for this target.
models[target]['RandomForest'] = rf_model
# Persist the fitted forest with pickle under model_folder.
rf_model_file = os.path.join(model_folder, f'{target}_随机森林模型.pkl')
with open(rf_model_file, 'wb') as f:
    pickle.dump(rf_model, f)
print(f"随机森林模型已保存至 {rf_model_file}")
# Evaluate on train and test data via the notebook's evaluate_model helper
# (defined elsewhere; its return value is kept in `results`).
results = evaluate_model(rf_model, X_train_model, y_train[target], X_test_model, y_test[target], target, "RandomForest")
# Predictions for both splits.
y_pred_train = rf_model.predict(X_train_model)
y_pred_test = rf_model.predict(X_test_model)

# One prediction table per split: actual value, prediction, absolute error,
# plus a column naming the split.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Write both tables to CSV under save_folder (the row index is not written).
train_file = os.path.join(save_folder, f'{target}_随机森林训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_随机森林测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")
# Feature importances taken from the fitted forest's feature_importances_,
# sorted with the most important feature first.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Horizontal bar chart of the importances.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('重要性')
plt.ylabel('特征')
plt.title(f'{target} - RandomForest特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()
# Save the importance table to CSV under save_folder.
feature_importance_file = os.path.join(save_folder, f'{target}_随机森林特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")
# Effect of the number of trees: refit a forest for each n_estimators value
# (all other settings fixed, not the tuned ones) and record train, test and
# out-of-bag R².
n_estimators_range = [10, 50, 100, 200, 300, 400,800,1000]
train_scores = []
test_scores = []
oob_scores = []

for n_est in n_estimators_range:
    rf = RandomForestRegressor(
        n_estimators=n_est,
        max_depth=None,
        min_samples_split=3,
        min_samples_leaf=4,
        max_features='sqrt',
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=42
    )
    rf.fit(X_train_model, y_train[target])
    train_scores.append(r2_score(y_train[target], rf.predict(X_train_model)))
    test_scores.append(r2_score(y_test[target], rf.predict(X_test_model)))
    oob_scores.append(rf.oob_score_)

# Plot the three score curves against the number of trees.
plt.figure(figsize=(10, 6))
plt.plot(n_estimators_range, train_scores, 'o-', label='训练集 R²')
plt.plot(n_estimators_range, test_scores, 'o-', label='测试集 R²')
plt.plot(n_estimators_range, oob_scores, 'o-', label='OOB R²')
plt.xlabel('树的数量')
plt.ylabel('R²')
plt.title('RandomForest - 树数量对性能的影响')
plt.legend()
plt.grid(True)
plt.show()
# Save the tree-count analysis table to CSV under save_folder.
trees_analysis_data = pd.DataFrame({
    '树的数量': n_estimators_range,
    '训练集R²': train_scores,
    '测试集R²': test_scores,
    '袋外评分': oob_scores
})
trees_analysis_file = os.path.join(save_folder, f'{target}_随机森林树数量分析.csv')
trees_analysis_data.to_csv(trees_analysis_file, index=False)
print(f"树数量影响分析数据已保存至 {trees_analysis_file}")



In [None]:
# 深度神经网络回归模型
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Announce which target the deep neural network is trained for.
print(f"训练 {target} 的深度神经网络模型...")

# Target-specific tolerance; falls back to 0.15 when the target has no entry.
current_tolerance = target_tolerance.get(target, 0.15)

# Pick the feature matrices prepared for neural networks, keyed by target.
X_train_model = X_train_nn[target]
X_test_model = X_test_nn[target]

# Build the scorer for this target and wrap it for scikit-learn.
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Standardise the features: the scaler is fitted on the training split only
# and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_model)
X_test_scaled = scaler.transform(X_test_model)

print(f"训练数据形状: {X_train_scaled.shape}")
print(f"测试数据形状: {X_test_scaled.shape}")

# 创建神经网络模型的函数
def create_nn_model(hidden_layers=(128, 64, 32), dropout_rate=0.3, learning_rate=0.001,
                    input_dim=None):
    """Build and compile a fully connected regression network.

    Each hidden layer is a ReLU ``Dense`` layer followed by ``Dropout``; the
    output is a single linear unit. The model is compiled with Adam, MSE loss
    and an MAE metric.

    Args:
        hidden_layers: Non-empty sequence of layer widths, first layer first.
            The default is a tuple so it cannot be mutated between calls.
        dropout_rate: Dropout rate applied after every hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.
        input_dim: Number of input features. When ``None`` (the default, and
            the original behaviour) it is read from the module-level
            ``X_train_scaled`` at call time.

    Returns:
        The compiled ``keras.Sequential`` model.

    Raises:
        ValueError: If ``hidden_layers`` is empty.
    """
    if len(hidden_layers) == 0:
        raise ValueError("hidden_layers must contain at least one layer size")

    # Fall back to the notebook-level training matrix when no width is given.
    if input_dim is None:
        input_dim = X_train_scaled.shape[1]

    model = keras.Sequential()

    # First hidden layer; it also declares the input shape.
    model.add(layers.Dense(hidden_layers[0],
                          activation='relu',
                          input_shape=(input_dim,)))
    model.add(layers.Dropout(dropout_rate))

    # Remaining hidden layers.
    for units in hidden_layers[1:]:
        model.add(layers.Dense(units, activation='relu'))
        model.add(layers.Dropout(dropout_rate))

    # Single linear output unit for regression.
    model.add(layers.Dense(1))

    # Compile with Adam / MSE, tracking MAE.
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,
                 loss='mse',
                 metrics=['mae'])

    return model

# 包装器类以兼容sklearn接口
class KerasRegressorWrapper:
    """Minimal scikit-learn style regressor around the network from create_nn_model.

    Provides the ``fit`` / ``predict`` / ``get_params`` / ``set_params`` methods
    that this notebook's search and cross-validation calls rely on.

    Args:
        hidden_layers: Sequence of hidden-layer widths. The default is a tuple
            so default-constructed instances never share a mutable list.
        dropout_rate: Dropout rate forwarded to create_nn_model.
        learning_rate: Adam learning rate forwarded to create_nn_model.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size used for training.
    """

    # Constructor parameter names; get_params/set_params are limited to these.
    _PARAM_NAMES = ('hidden_layers', 'dropout_rate', 'learning_rate', 'epochs', 'batch_size')

    def __init__(self, hidden_layers=(128, 64, 32), dropout_rate=0.3,
                 learning_rate=0.001, epochs=100, batch_size=32):
        self.hidden_layers = hidden_layers
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        # Set by fit(): the trained Keras model and its History object.
        self.model = None
        self.history = None

    def fit(self, X, y):
        """Build a fresh network and train it on (X, y); returns self.

        The last 20% of the data passed in is held out by Keras as validation
        data (validation_split=0.2) for the two callbacks.
        """
        self.model = create_nn_model(self.hidden_layers,
                                   self.dropout_rate,
                                   self.learning_rate)

        # Early stopping (restoring the best weights) and learning-rate decay.
        callbacks = [
            keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
            keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=10, min_lr=1e-6)
        ]

        # Train silently and keep the training history.
        self.history = self.model.fit(
            X, y,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_split=0.2,
            callbacks=callbacks,
            verbose=0
        )
        return self

    def predict(self, X):
        """Return the network's predictions for X as a flat 1-D array."""
        return self.model.predict(X, verbose=0).flatten()

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is accepted but unused)."""
        return {name: getattr(self, name) for name in self._PARAM_NAMES}

    def set_params(self, **params):
        """Set constructor parameters; returns self.

        Raises:
            ValueError: If a name is not a constructor parameter. Rejecting
                unknown names stops a typo from silently creating an unused
                attribute or overwriting ``model`` / ``history``.
        """
        for key, value in params.items():
            if key not in self._PARAM_NAMES:
                raise ValueError(
                    f"Invalid parameter {key!r} for {type(self).__name__}; "
                    f"valid parameters are {list(self._PARAM_NAMES)}"
                )
            setattr(self, key, value)
        return self

# Hyper-parameter search space for the network wrapper.
param_dist = {
    'hidden_layers': [
        [64, 32],
        [128, 64],
        [128, 64, 32],
        [256, 128, 64],
        [128, 64, 32, 16],
        [256, 128, 64, 32]
    ],
    'dropout_rate': [0.2, 0.3, 0.4, 0.5],
    'learning_rate': [0.001, 0.005, 0.01, 0.0005],
    'batch_size': [16, 32, 64],
    'epochs': [150, 200, 250]
}

# Base estimator; used unchanged only in the small-data branch below.
base_model = KerasRegressorWrapper(
    hidden_layers=[128, 64, 32],
    dropout_rate=0.3,
    learning_rate=0.001,
    epochs=200,
    batch_size=32
)

# Run the hyper-parameter search only when at least 80 training rows exist.
# Unlike the tree-model cells, this search is scored with plain 'r2'; the
# tolerance scorer is only used for the report further down.
if len(X_train_model) >= 80:
    print("执行超参数优化...")
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=30,  # fewer iterations because network training is slow
        cv=min(3, cv_folds),
        scoring='r2',
        n_jobs=1,  # single process to avoid conflicts between networks
        random_state=42,
        verbose=1
    )
    search.fit(X_train_scaled, y_train[target])
    nn_model = search.best_estimator_
    print(f"最佳参数: {search.best_params_}")
    print(f"最佳CV得分: {search.best_score_:.4f}")
else:
    # Too few rows for a search: fit the predefined base model directly.
    print("使用预定义参数...")
    nn_model = base_model
    nn_model.fit(X_train_scaled, y_train[target])

# Cross-validated R² on the scaled training data (at most 3 folds).
print("执行交叉验证...")
cv_scores = cross_val_score(nn_model, X_train_scaled, y_train[target], 
                           cv=min(3, cv_folds), scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score using the target-specific tolerance scorer.
tolerance_cv_scores = cross_val_score(
    nn_model, X_train_scaled, y_train[target], 
    cv=min(3, cv_folds), scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Refit the final model on the full scaled training set; this also replaces
# nn_model.history with the history of this last run.
print("重新训练最终模型...")
nn_model.fit(X_train_scaled, y_train[target])

# Register model and scaler together in the in-memory registry for this target.
models[target]['DeepNN'] = {'model': nn_model, 'scaler': scaler}
# Persist model and scaler with pickle under model_folder.
# NOTE(review): this pickles the wrapper together with its Keras model and
# History object — confirm that this works with the installed Keras version.
nn_model_file = os.path.join(model_folder, f'{target}_神经网络模型.pkl')
with open(nn_model_file, 'wb') as f:
    pickle.dump({'model': nn_model, 'scaler': scaler}, f)
print(f"神经网络模型已保存至 {nn_model_file}")

# 评估模型
class NNEvaluationWrapper:
    """Adapter that scales raw features and then predicts with a fitted model.

    Lets code that works on unscaled feature frames (evaluate_model,
    permutation_importance) use a model that was trained on scaled inputs.
    ``fit`` and ``score`` exist because scikit-learn's permutation_importance
    only accepts objects exposing ``fit`` and, when no ``scoring`` is given,
    uses the estimator's own ``score``.

    Args:
        model: Fitted object with a ``predict`` method taking scaled features.
        scaler: Fitted object with a ``transform`` method.
    """

    def __init__(self, model, scaler):
        self.model = model
        self.scaler = scaler

    def fit(self, X, y=None):
        """No-op: model and scaler are already fitted. Returns self."""
        return self

    def predict(self, X):
        """Scale X with the stored scaler and return the model's predictions."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def score(self, X, y):
        """Return the R² of the predictions for X against y."""
        return r2_score(y, self.predict(X))

# Wrap model and scaler so the unscaled feature frames can be passed directly,
# then evaluate via the notebook's evaluate_model helper (defined elsewhere).
nn_eval_model = NNEvaluationWrapper(nn_model, scaler)
results = evaluate_model(nn_eval_model, X_train_model, y_train[target], 
                        X_test_model, y_test[target], target, "DeepNN")

# Predictions for both splits (scaling happens inside the wrapper).
y_pred_train = nn_eval_model.predict(X_train_model)
y_pred_test = nn_eval_model.predict(X_test_model)

# One prediction table per split: actual value, prediction, absolute error,
# plus a column naming the split.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Write both tables to CSV under save_folder (the row index is not written).
train_file = os.path.join(save_folder, f'{target}_神经网络训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_神经网络测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")

# Feature importance via permutation importance on the test split (10 repeats).
print("计算特征重要性...")
perm_importance = permutation_importance(
    nn_eval_model, X_test_model, y_test[target], 
    n_repeats=10, random_state=42, n_jobs=-1
)

# Pair each training column with its mean importance, most important first.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': perm_importance.importances_mean
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Horizontal bar chart of the importances.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('置换重要性')
plt.ylabel('特征')
plt.title(f'{target} - DeepNN特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

# Save the importance table to CSV under save_folder.
feature_importance_file = os.path.join(save_folder, f'{target}_神经网络特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")

# Training curves: loss and MAE per epoch for the training and validation
# data, drawn side by side. Skipped when the model carries no history.
if getattr(nn_model, 'history', None) is not None:
    plt.figure(figsize=(12, 4))

    # (train key, validation key, train label, validation label, title, y-label)
    curve_panels = [
        ('loss', 'val_loss', '训练损失', '验证损失', '模型损失', '损失'),
        ('mae', 'val_mae', '训练MAE', '验证MAE', '模型MAE', '平均绝对误差'),
    ]

    for panel_no, (train_key, val_key, train_label, val_label, panel_title, y_label) in enumerate(curve_panels, start=1):
        plt.subplot(1, 2, panel_no)
        plt.plot(nn_model.history.history[train_key], label=train_label)
        plt.plot(nn_model.history.history[val_key], label=val_label)
        plt.title(panel_title)
        plt.xlabel('轮次')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()

# Effect of the network architecture: train one network per layer layout
# (fixed dropout, learning rate, epochs and batch size) and record train/test R².
print("分析不同网络结构的影响...")
network_structures = [
    [32],
    [64, 32],
    [128, 64],
    [128, 64, 32],
    [256, 128, 64],
    [128, 64, 32, 16]
]

structure_train_scores = []
structure_test_scores = []
structure_names = []

for structure in network_structures:
    # Label such as "128-64-32", used on the x axis and in the CSV.
    structure_name = '-'.join(map(str, structure))
    structure_names.append(structure_name)
    print(f"测试网络结构: {structure_name}")
    
    test_nn = KerasRegressorWrapper(
        hidden_layers=structure,
        dropout_rate=0.3,
        learning_rate=0.001,
        epochs=100,
        batch_size=32
    )
    
    test_nn.fit(X_train_scaled, y_train[target])
    
    train_pred = test_nn.predict(X_train_scaled)
    test_pred = test_nn.predict(X_test_scaled)
    
    train_r2 = r2_score(y_train[target], train_pred)
    test_r2 = r2_score(y_test[target], test_pred)
    
    structure_train_scores.append(train_r2)
    structure_test_scores.append(test_r2)

# Grouped bar chart: train vs. test R² per architecture.
plt.figure(figsize=(12, 6))
x = np.arange(len(structure_names))
width = 0.35

plt.bar(x - width/2, structure_train_scores, width, label='训练集 R²')
plt.bar(x + width/2, structure_test_scores, width, label='测试集 R²')

plt.xlabel('网络结构')
plt.ylabel('R²')
plt.title('DeepNN - 网络结构对性能的影响')
plt.xticks(x, structure_names, rotation=45)
plt.legend()
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()

# Save the architecture analysis table to CSV under save_folder.
structure_analysis_data = pd.DataFrame({
    '网络结构': structure_names,
    '训练集R²': structure_train_scores,
    '测试集R²': structure_test_scores
})
structure_analysis_file = os.path.join(save_folder, f'{target}_神经网络结构分析.csv')
structure_analysis_data.to_csv(structure_analysis_file, index=False)
print(f"网络结构影响分析数据已保存至 {structure_analysis_file}")

print("神经网络模型训练完成！")

In [None]:
# 支持向量机回归模型
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.inspection import permutation_importance
import numpy as np

# Announce which target the support vector regression is trained for.
print(f"训练 {target} 的支持向量机回归模型...")

# Target-specific tolerance; falls back to 0.15 when the target has no entry.
current_tolerance = target_tolerance.get(target, 0.15)

# SVR is sensitive to feature scale, so the feature matrices prepared for
# linear models are used (keyed by target).
X_train_model = X_train_linear[target]
X_test_model = X_test_linear[target]

# Build the scorer for this target and wrap it for scikit-learn.
tol_scorer = make_tolerance_scorer(target)
tol_scorer_wrapped = make_scorer(tol_scorer, greater_is_better=True)

# Standardise the features: the scaler is fitted on the training split only
# and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_model)
X_test_scaled = scaler.transform(X_test_model)

print(f"训练数据形状: {X_train_scaled.shape}")
print(f"测试数据形状: {X_test_scaled.shape}")

# Hyper-parameter search space.
param_dist = {
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'epsilon': [0.01, 0.1, 0.2, 0.5, 1.0],
    'degree': [2, 3, 4]  # only used by the poly kernel
}

# Base estimator; the solver is capped at 5000 iterations.
base_model = SVR(
    kernel='rbf',
    C=100,
    gamma='scale',
    epsilon=0.1,
    max_iter=5000
)

# Run the hyper-parameter search only when at least 80 training rows exist.
if len(X_train_model) >= 80:
    print("执行超参数优化...")
    search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=100,
        cv=min(5, cv_folds),
        scoring=tol_scorer_wrapped,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train_scaled, y_train[target])
    svr_model = search.best_estimator_
    print(f"最佳参数: {search.best_params_}")
    print(f"最佳CV得分: {search.best_score_:.4f}")

    # When the best kernel is polynomial, refine around the selected values
    # with a small exhaustive grid (half / same / double for C, gamma and
    # epsilon; neighbouring degrees, never below 2).
    if svr_model.kernel == 'poly':
        print("对多项式核进行精细优化...")
        poly_param_grid = {
            'C': [svr_model.C * 0.5, svr_model.C, svr_model.C * 2],
            # String gammas ('scale' / 'auto') cannot be scaled; keep them as is.
            'gamma': [svr_model.gamma] if isinstance(svr_model.gamma, str) else [svr_model.gamma * 0.5, svr_model.gamma, svr_model.gamma * 2],
            'degree': [max(2, svr_model.degree-1), svr_model.degree, svr_model.degree+1],
            'epsilon': [svr_model.epsilon * 0.5, svr_model.epsilon, svr_model.epsilon * 2]
        }

        fine_search = GridSearchCV(
            # Carry over the base model's iteration cap. Without it the refined
            # model is trained with an unbounded solver, unlike every other SVR
            # fitted in this cell.
            SVR(kernel='poly', max_iter=base_model.max_iter),
            poly_param_grid,
            cv=min(3, cv_folds),
            scoring=tol_scorer_wrapped,
            n_jobs=-1
        )
        fine_search.fit(X_train_scaled, y_train[target])
        svr_model = fine_search.best_estimator_
        print(f"精细优化后的最佳参数: {fine_search.best_params_}")

else:
    # Too few rows for a search: fit the predefined base model directly.
    print("使用预定义参数...")
    svr_model = base_model
    svr_model.fit(X_train_scaled, y_train[target])

# Cross-validated R² on the scaled training data (at most 5 folds).
cv_scores = cross_val_score(svr_model, X_train_scaled, y_train[target], 
                           cv=min(5, cv_folds), scoring='r2')
print(f"交叉验证R²分数: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Cross-validated score using the target-specific tolerance scorer.
tolerance_cv_scores = cross_val_score(
    svr_model, X_train_scaled, y_train[target], 
    cv=min(5, cv_folds), scoring=tol_scorer_wrapped
)
print(f"容忍度R²分数 (容忍度={current_tolerance:.2f}): {tolerance_cv_scores.mean():.4f} ± {tolerance_cv_scores.std():.4f}")

# Print a summary of the final model: kernel, C, gamma, epsilon, the degree
# (poly kernel only) and the support-vector count.
print(f"最终模型参数:")
print(f"  核函数: {svr_model.kernel}")
print(f"  C参数: {svr_model.C}")
print(f"  gamma参数: {svr_model.gamma}")
print(f"  epsilon参数: {svr_model.epsilon}")
if svr_model.kernel == 'poly':
    print(f"  多项式度数: {svr_model.degree}")
print(f"  支持向量数量: {svr_model.n_support_}")

# Register model and scaler together in the in-memory registry for this target.
models[target]['SVR'] = {'model': svr_model, 'scaler': scaler}
# Persist model and scaler with pickle under model_folder.
svr_model_file = os.path.join(model_folder, f'{target}_支持向量机模型.pkl')
with open(svr_model_file, 'wb') as f:
    pickle.dump({'model': svr_model, 'scaler': scaler}, f)
print(f"支持向量机模型已保存至 {svr_model_file}")

# 评估模型 - 创建包装器以处理标准化
class SVREvaluationWrapper:
    """Adapter that scales raw features and then predicts with a fitted SVR.

    Lets code that works on unscaled feature frames (evaluate_model,
    permutation_importance) use a model that was trained on scaled inputs.
    ``fit`` and ``score`` exist because scikit-learn's permutation_importance
    only accepts objects exposing ``fit`` and, when no ``scoring`` is given,
    uses the estimator's own ``score``.

    Args:
        model: Fitted object with a ``predict`` method taking scaled features.
        scaler: Fitted object with a ``transform`` method.
    """

    def __init__(self, model, scaler):
        self.model = model
        self.scaler = scaler

    def fit(self, X, y=None):
        """No-op: model and scaler are already fitted. Returns self."""
        return self

    def predict(self, X):
        """Scale X with the stored scaler and return the model's predictions."""
        X_scaled = self.scaler.transform(X)
        return self.model.predict(X_scaled)

    def score(self, X, y):
        """Return the R² of the predictions for X against y."""
        return r2_score(y, self.predict(X))

# Wrap model and scaler so the unscaled feature frames can be passed directly,
# then evaluate via the notebook's evaluate_model helper (defined elsewhere).
svr_eval_model = SVREvaluationWrapper(svr_model, scaler)
results = evaluate_model(svr_eval_model, X_train_model, y_train[target], 
                        X_test_model, y_test[target], target, "SVR")

# Predictions for both splits (scaling happens inside the wrapper).
y_pred_train = svr_eval_model.predict(X_train_model)
y_pred_test = svr_eval_model.predict(X_test_model)

# One prediction table per split: actual value, prediction, absolute error,
# plus a column naming the split.
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

# Write both tables to CSV under save_folder (the row index is not written).
train_file = os.path.join(save_folder, f'{target}_支持向量机训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_支持向量机测试集预测结果.csv')

train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)

print(f"训练集预测结果已保存至 {train_file}")
print(f"测试集预测结果已保存至 {test_file}")

# Feature importance via permutation importance on the test split (10 repeats).
print("计算特征重要性...")
perm_importance = permutation_importance(
    svr_eval_model, X_test_model, y_test[target], 
    n_repeats=10, random_state=42, n_jobs=-1
)

# Pair each training column with its mean importance, most important first.
feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': perm_importance.importances_mean
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Horizontal bar chart of the importances.
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('置换重要性')
plt.ylabel('特征')
plt.title(f'{target} - SVR特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

# Save the importance table to CSV under save_folder.
feature_importance_file = os.path.join(save_folder, f'{target}_支持向量机特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存至 {feature_importance_file}")

# Effect of the C parameter: refit an SVR for each C value, keeping the
# selected model's kernel, gamma and epsilon (and degree for the poly kernel),
# and record train/test R² together with the support-vector count.
print("分析不同C参数的影响...")
C_range = [0.1, 1, 10, 100, 1000, 10000]
c_train_scores = []
c_test_scores = []
c_support_vectors = []

for C_val in C_range:
    print(f"测试C参数: {C_val}")
    test_svr = SVR(
        kernel=svr_model.kernel,
        C=C_val,
        gamma=svr_model.gamma,
        epsilon=svr_model.epsilon,
        degree=svr_model.degree if svr_model.kernel == 'poly' else 3,
        max_iter=5000
    )
    
    test_svr.fit(X_train_scaled, y_train[target])
    
    train_pred = test_svr.predict(X_train_scaled)
    test_pred = test_svr.predict(X_test_scaled)
    
    train_r2 = r2_score(y_train[target], train_pred)
    test_r2 = r2_score(y_test[target], test_pred)
    
    c_train_scores.append(train_r2)
    c_test_scores.append(test_r2)
    c_support_vectors.append(test_svr.n_support_.sum())

# Two stacked panels on a logarithmic C axis: R² on top, support-vector count below.
plt.figure(figsize=(12, 8))

plt.subplot(2, 1, 1)
plt.semilogx(C_range, c_train_scores, 'o-', label='训练集 R²')
plt.semilogx(C_range, c_test_scores, 'o-', label='测试集 R²')
plt.xlabel('C参数')
plt.ylabel('R²')
plt.title('SVR - C参数对性能的影响')
plt.legend()
plt.grid(True)

plt.subplot(2, 1, 2)
plt.semilogx(C_range, c_support_vectors, 'o-', color='green')
plt.xlabel('C参数')
plt.ylabel('支持向量数量')
plt.title('SVR - C参数对支持向量数量的影响')
plt.grid(True)

plt.tight_layout()
plt.show()

# Kernel comparison: fit one SVR per kernel with fixed settings (C=100,
# gamma='scale', epsilon=0.1, 5000 iterations max) and record train/test R².
print("比较不同核函数的性能...")
kernels = ['linear', 'rbf', 'poly', 'sigmoid']
kernel_scores = []
kernel_train_scores = []
kernel_names = []

for kernel in kernels:
    print(f"测试核函数: {kernel}")
    try:
        if kernel == 'poly':
            test_svr = SVR(kernel=kernel, C=100, gamma='scale', epsilon=0.1, degree=3, max_iter=5000)
        else:
            test_svr = SVR(kernel=kernel, C=100, gamma='scale', epsilon=0.1, max_iter=5000)
        
        test_svr.fit(X_train_scaled, y_train[target])
        
        train_pred = test_svr.predict(X_train_scaled)
        test_pred = test_svr.predict(X_test_scaled)
        
        train_r2 = r2_score(y_train[target], train_pred)
        test_r2 = r2_score(y_test[target], test_pred)
        
        # All three lists are appended only after both scores succeeded, so
        # they stay aligned when a kernel fails.
        kernel_train_scores.append(train_r2)
        kernel_scores.append(test_r2)
        kernel_names.append(kernel)
        
    except Exception as e:
        # Best effort: report the failing kernel and leave it out of the comparison.
        print(f"核函数 {kernel} 训练失败: {str(e)}")
        continue

# Grouped bar chart: train vs. test R² per kernel.
plt.figure(figsize=(10, 6))
x = np.arange(len(kernel_names))
width = 0.35

plt.bar(x - width/2, kernel_train_scores, width, label='训练集 R²')
plt.bar(x + width/2, kernel_scores, width, label='测试集 R²')

plt.xlabel('核函数')
plt.ylabel('R²')
plt.title('SVR - 不同核函数性能比较')
plt.xticks(x, kernel_names)
plt.legend()
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()

# Save the C-parameter analysis table (lists computed in the C sweep above).
c_analysis_data = pd.DataFrame({
    'C参数': C_range,
    '训练集R²': c_train_scores,
    '测试集R²': c_test_scores,
    '支持向量数量': c_support_vectors
})
c_analysis_file = os.path.join(save_folder, f'{target}_支持向量机C参数分析.csv')
c_analysis_data.to_csv(c_analysis_file, index=False)
print(f"C参数影响分析数据已保存至 {c_analysis_file}")

# Save the kernel comparison table.
kernel_analysis_data = pd.DataFrame({
    '核函数': kernel_names,
    '训练集R²': kernel_train_scores,
    '测试集R²': kernel_scores
})
kernel_analysis_file = os.path.join(save_folder, f'{target}_支持向量机核函数分析.csv')
kernel_analysis_data.to_csv(kernel_analysis_file, index=False)
print(f"核函数比较分析数据已保存至 {kernel_analysis_file}")

print("支持向量机模型训练完成！")

In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import time
import os
import pickle
import matplotlib.pyplot as plt
import matplotlib
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel as C, Matern
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance
import traceback

# Configure Matplotlib so Chinese labels and the minus sign render correctly
# ('SimHei' is a CJK sans-serif font).
matplotlib.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})


# --- Script starts here ---
target = "吸油能力"
print(f"训练 {target} 的高斯过程回归模型...")

# Per-target tolerance, falling back to 0.15 when the target has no entry.
current_tolerance = target_tolerance.get(target, 0.15)

# Feature matrices for this target (the "linear" preprocessing variant).
X_train_model, X_test_model = X_train_linear[target], X_test_linear[target]

# Step 1: standardise the input features. The scaler is fitted on the training
# split only and then applied to both splits.
print("\n步骤1: 对输入特征进行标准化...")
scaler = StandardScaler().fit(X_train_model)
X_train_scaled = scaler.transform(X_train_model)
X_test_scaled = scaler.transform(X_test_model)
print("特征标准化完成。")

# Step 2: try several kernels and keep the one with the best cross-validated R².
print("\n步骤2: 扩展核函数库，进行更全面的自动化模型选择...")

# This cell's own import line omits RationalQuadratic; import it here so the
# cell does not depend on the notebook's first cell having been run.
from sklearn.gaussian_process.kernels import RationalQuadratic

# Candidate kernels. Every candidate is "signal kernel(s) + WhiteKernel" and all
# hyper-parameters carry explicit optimisation bounds.
kernels_to_try = {
    "RBF":
        C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
        + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 1e1)),

    "Matern (nu=1.5)":
        C(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, length_scale_bounds=(1e-2, 1e2), nu=1.5)
        + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 1e1)),

    "Matern (nu=2.5)":
        C(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, length_scale_bounds=(1e-2, 1e2), nu=2.5)
        + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 1e1)),

    "RationalQuadratic":
        C(1.0, (1e-3, 1e3)) * RationalQuadratic(length_scale=1.0, alpha=0.1, length_scale_bounds=(1e-2, 1e2), alpha_bounds=(1e-2, 1e2))
        + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 1e1)),

    # Composite example: RBF + Matern (a more flexible model that may need more
    # data to avoid over-fitting).
    "RBF + Matern":
        C(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
        + C(1.0, (1e-3, 1e3)) * Matern(length_scale=1.0, length_scale_bounds=(1e-2, 1e2), nu=1.5)
        + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 1e1))
}

best_score = -np.inf
best_kernel_name = ""
best_model_from_cv = None

print("开始测试多个核函数，优中选优...")
for name, kernel in kernels_to_try.items():
    print(f"  > 正在测试核函数: {name}...")
    gp = GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=15,  # several optimiser restarts per fit
        normalize_y=True,
        random_state=42
    )
    try:
        # Mean R² over at most 3 folds is the selection criterion.
        score = cross_val_score(gp, X_train_scaled, y_train[target], cv=min(3, cv_folds), scoring='r2').mean()
        print(f"    交叉验证 R² 平均分: {score:.4f}")

        if score > best_score:
            # Fit on the full training split BEFORE recording this kernel as the
            # winner: if the fit raises, score/name/model must keep describing
            # the previously stored model instead of drifting apart.
            gp.fit(X_train_scaled, y_train[target])
            best_score = score
            best_kernel_name = name
            best_model_from_cv = gp

    except Exception as e:
        # Best-effort: a failing kernel is reported and skipped.
        print(f"    核函数 {name} 训练失败: {e}")
        continue

if best_model_from_cv is None:
    raise RuntimeError("所有核函数都训练失败，请检查数据或核函数参数！")

print(f"\n[决策] 最佳核函数为: '{best_kernel_name}' (交叉验证最高分: {best_score:.4f})")

# The winning, already fitted regressor becomes the final model.
best_model = best_model_from_cv
print(f"最终选定的模型核函数参数: {best_model.kernel_}")



# Step 3: score the selected model and plot actual vs. predicted values.
print(f"\n步骤3: 评估并可视化最终模型...")

# Predictions on both splits (the model was fitted on the scaled features).
y_pred_train = best_model.predict(X_train_scaled)
y_pred_test = best_model.predict(X_test_scaled)

# Final R² on both splits.
train_r2 = r2_score(y_train[target], y_pred_train)
test_r2 = r2_score(y_test[target], y_pred_test)
print(f"训练集最终 R²: {train_r2:.4f}")
print(f"测试集最终 R²: {test_r2:.4f}")

# Side-by-side "actual vs. predicted" scatter plots, one panel per split.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
fig.suptitle(f'"{target}" 的模型预测效果 (最佳核: {best_kernel_name})', fontsize=16)

# End points of the y = x reference line: the joint range of actual and
# predicted values of each split.
min_val_train = min(y_train[target].min(), y_pred_train.min())
max_val_train = max(y_train[target].max(), y_pred_train.max())
min_val_test = min(y_test[target].min(), y_pred_test.min())
max_val_test = max(y_test[target].max(), y_pred_test.max())

_panels = (
    (axes[0], y_train[target], y_pred_train, '训练集', train_r2, min_val_train, max_val_train),
    (axes[1], y_test[target], y_pred_test, '测试集', test_r2, min_val_test, max_val_test),
)
for _ax, _actual, _predicted, _split_label, _split_r2, _lo, _hi in _panels:
    _ax.scatter(_actual, _predicted, alpha=0.6)
    _ax.plot([_lo, _hi], [_lo, _hi], 'r--', lw=2, label='理想情况 (y=x)')
    _ax.set_title(f'{_split_label} (R² = {_split_r2:.4f})')
    _ax.set_xlabel('实际值')
    _ax.set_ylabel('预测值')
    _ax.legend()
    _ax.grid(True)
    _ax.axis('equal')

# Leave head-room for the figure-level title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
# Step 4: persist the model, the predictions and the feature importances.
print(f"\n步骤4: 保存模型与分析结果...")

from sklearn.pipeline import Pipeline

# The regressor was fitted on scaler-transformed features, while the ensemble
# cells of this notebook call models[target]['GaussianProcess'].predict() on the
# raw X_*_linear[target] frames. Bundling the fitted scaler with the fitted
# regressor makes the stored/pickled model accept those raw features and apply
# the same standardisation that was used for training.
# NOTE(review): code that feeds *pre-scaled* data to the stored model, or reads
# attributes such as kernel_ from it, must now go through
# gp_pipeline.named_steps['gp'] — confirm no such caller exists.
gp_pipeline = Pipeline([('scaler', scaler), ('gp', best_model)])

models[target]['GaussianProcess'] = gp_pipeline
gp_model_file = os.path.join(model_folder, f'{target}_高斯过程模型.pkl')
with open(gp_model_file, 'wb') as f:
    pickle.dump(gp_pipeline, f)
print(f"高斯过程回归模型已保存至 {gp_model_file}")

# Per-sample prediction tables (actual value, prediction, absolute error).
train_prediction = pd.DataFrame({
    '实际值': y_train[target],
    '预测值': y_pred_train,
    '误差': np.abs(y_train[target] - y_pred_train)
})
train_prediction['数据集'] = '训练集'

test_prediction = pd.DataFrame({
    '实际值': y_test[target],
    '预测值': y_pred_test,
    '误差': np.abs(y_test[target] - y_pred_test)
})
test_prediction['数据集'] = '测试集'

train_file = os.path.join(save_folder, f'{target}_高斯过程训练集预测结果.csv')
test_file = os.path.join(save_folder, f'{target}_高斯过程测试集预测结果.csv')
train_prediction.to_csv(train_file, index=False)
test_prediction.to_csv(test_file, index=False)
print(f"训练集和测试集预测结果已保存。")


# Permutation importance on the test split. It is computed against the bare
# regressor with the already scaled test features, exactly as it was trained.
perm_importance = permutation_importance(
    best_model, X_test_scaled, y_test[target],
    n_repeats=10, random_state=42, n_jobs=-1
)

feature_importance = pd.DataFrame({
    'Feature': X_train_model.columns,
    'Importance': perm_importance.importances_mean
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('置换重要性 (Permutation Importance)')
plt.ylabel('特征')
plt.title(f'{target} - GaussianProcess特征重要性')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

feature_importance_file = os.path.join(save_folder, f'{target}_高斯过程特征重要性.csv')
feature_importance.to_csv(feature_importance_file, index=False)
print(f"特征重要性数据已保存。")


# Step 5: list the test samples whose absolute error exceeds a fixed threshold.
print(f"\n步骤5: 分析预测不准确的样本...")

y_true_values = y_test[target].values
errors = np.abs(y_true_values - y_pred_test)
tolerance = 5.0  # absolute-error threshold; adjust as needed

inaccurate_mask = errors > tolerance
inaccurate_indices = np.where(inaccurate_mask)[0]

# The share is relative to THIS target's test samples. len(y_test) would
# measure the container that is indexed by target name, which is not
# guaranteed to equal the number of test samples. Also guard the empty case.
n_test_samples = len(y_true_values)
inaccurate_share = len(inaccurate_indices) / n_test_samples * 100 if n_test_samples else 0.0

print(f"\n预测不准确的样本数量: {len(inaccurate_indices)} (占测试集的 {inaccurate_share:.2f}%)，使用的容忍度阈值: {tolerance}")

if len(inaccurate_indices) > 0:
    # Map positional indices back to the index labels of the test frame.
    original_indices = [X_test_model.index[i] for i in inaccurate_indices]

    inaccurate_samples = [
        {
            '样本索引': label,
            '实际值': y_true_values[pos],
            '预测值': y_pred_test[pos],
            '绝对误差': errors[pos],
            # Relative error is undefined for a zero actual value -> inf.
            '相对误差(%)': (errors[pos] / np.abs(y_true_values[pos])) * 100
                          if y_true_values[pos] != 0 else float('inf'),
        }
        for label, pos in zip(original_indices, inaccurate_indices)
    ]

    inaccurate_df = pd.DataFrame(inaccurate_samples).sort_values('绝对误差', ascending=False)

    print("\n预测不准确的样本详情 (按误差降序排列):")
    print(inaccurate_df)

    inaccurate_file_path = os.path.join(save_folder, f'{target}_GP不准确预测.csv')
    inaccurate_df.to_csv(inaccurate_file_path, index=False)
    print(f"不准确样本分析已保存至 {inaccurate_file_path}")
else:
    print(f"\n在容忍度阈值 {tolerance} 内，没有发现预测不准确的样本。")

print("\n高斯过程回归模型训练、评估和分析全部完成。")

In [None]:
# 直接运行的模型加载代码
import os
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# 如果需要加载神经网络模型，需要先定义KerasRegressorWrapper类
class KerasRegressorWrapper:
    """Thin holder around a Keras model exposing a scikit-learn-like surface.

    The class must exist under this name so that pickled wrapper instances can
    be loaded; this definition provides only ``predict`` and the parameter
    accessors, no training logic.

    Parameters
    ----------
    hidden_layers : list of int, optional
        Layer widths; defaults to ``[128, 64, 32]``. A fresh list is created per
        instance so instances never share (and mutate) one default object.
    dropout_rate, learning_rate, epochs, batch_size
        Stored as given.
    """

    def __init__(self, hidden_layers=None, dropout_rate=0.3,
                 learning_rate=0.001, epochs=100, batch_size=32):
        # A mutable list as the default value would be shared by every instance.
        self.hidden_layers = [128, 64, 32] if hidden_layers is None else hidden_layers
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.model = None      # underlying Keras model; not created here
        self.history = None

    def predict(self, X):
        """Return the wrapped model's predictions for ``X`` as a flat array."""
        return self.model.predict(X, verbose=0).flatten()

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict (``deep`` is ignored)."""
        return {
            'hidden_layers': self.hidden_layers,
            'dropout_rate': self.dropout_rate,
            'learning_rate': self.learning_rate,
            'epochs': self.epochs,
            'batch_size': self.batch_size
        }

    def set_params(self, **params):
        """Set each given keyword as an attribute and return ``self``."""
        for key, value in params.items():
            setattr(self, key, value)
        return self

# Folder that holds the pickled models.
model_folder = '训练模型文件'

# Create the models registry only if an earlier cell has not done so already.
if 'models' not in locals():
    models = {}
target = "吸油能力"
print(f"为目标变量 {target} 加载模型...")

# Pick this target's pickles, skipping feature lists, ensembles and anything
# that looks like a Keras / neural-network model.
_excluded_tokens = ('Ensemble', '集成', 'Keras', 'Neural', 'NN')
model_files = []
for _fname in os.listdir(model_folder):
    if not (_fname.startswith(f'{target}_') and _fname.endswith('.pkl')):
        continue
    if _fname.endswith('_features.pkl'):
        continue
    if any(_token in _fname for _token in _excluded_tokens):
        continue
    model_files.append(_fname)

print(f"找到 {len(model_files)} 个模型文件: {model_files}")

# Make sure the target has its own sub-dictionary.
models.setdefault(target, {})

# Load every selected file and register it under the name derived from the
# file name (target prefix and the '模型.pkl' suffix removed).
# NOTE(review): the Gaussian-process file written by the training cell is named
# '<target>_高斯过程模型.pkl', so it is registered as '高斯过程', whereas the
# ensemble cells look up 'GaussianProcess' — confirm the intended key.
# NOTE(review): pickle.load executes code from the file; only load trusted files.
for model_file in model_files:
    model_name = model_file.replace(f'{target}_', '').replace('模型.pkl', '')

    print(f"  加载模型: {model_name}")

    model_path = os.path.join(model_folder, model_file)
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    models[target][model_name] = model
    print(f"    {model_name} 加载成功")

print(f"成功加载 {len(models[target])} 个模型")
print(f"可用模型: {list(models[target].keys())}")

In [None]:

# VotingEnsemble - weights are derived from each base model's plain R².
# The error handler at the end of this cell calls traceback.format_exc(); the
# module is otherwise only imported by the Gaussian-process training cell, so
# import it here to keep this cell runnable after the model-loading cell alone.
import traceback

print(f"训练 {target} 的VotingEnsemble集成模型...")

# Per-target tolerance, falling back to 0.15.
current_tolerance = target_tolerance.get(target, 0.15)

base_models = []      # (short name, fitted model) pairs, in registration order
model_scores = {}     # short name -> R² on the test split
model_datasets = {}   # short name -> which feature-set variant the model uses

print("收集已有模型的性能评估结果...")
# (key in models[target], short name, feature-set variant)
_candidates = (
    ('XGBoost', 'xgb', 'tree'),
    ('LightGBM', 'lgb', 'tree'),
    ('HistGradientBoosting', 'hgb', 'tree'),
    ('RandomForest', 'rf', 'tree_filled'),
    ('GaussianProcess', 'gp', 'linear'),
)
for _model_key, _short_name, _dataset_kind in _candidates:
    if _model_key not in models[target]:
        continue
    # Look the feature frame up lazily so that only the variants actually
    # needed by the available models have to exist.
    if _dataset_kind == 'tree':
        _X_eval = X_test_tree[target]
    elif _dataset_kind == 'tree_filled':
        _X_eval = X_test_tree_filled[target]
    else:
        _X_eval = X_test_linear[target]

    model = models[target][_model_key]
    y_pred = model.predict(_X_eval)
    r2 = r2_score(y_test[target], y_pred)
    base_models.append((_short_name, model))
    model_scores[_short_name] = r2
    model_datasets[_short_name] = _dataset_kind
    print(f"  {_model_key} - R²: {r2:.4f}")

# Weights are based on plain R², not on the tolerance-based R².
# NOTE(review): these R² values come from the test split, so test-set metrics
# reported for the resulting ensemble are optimistic.
print("\n根据标准R²模型性能分配权重...")

total_score = sum(model_scores.values())
if total_score > 0:  # avoid dividing by zero / a non-positive total
    # Scaled so that the weights average to 1 across the models.
    weights = [model_scores[name] / total_score * len(model_scores) for name, _ in base_models]
else:
    weights = [1.0 for _ in base_models]  # fall back to equal weights

print("  基于标准R²分配权重")

# Floor every weight at 0.5 so that no model is weighted too low.
min_weight = 0.5
weights = [max(w, min_weight) for w in weights]

for (name, _), weight_value in zip(base_models, weights):
    print(f"  {name} 权重: {weight_value:.4f}")

# 检查是否有足够的模型可用于集成
if len(base_models) >= 2:
    try:
        # Weighted-average wrapper around already fitted base models.
        class EnhancedVotingRegressor:
            """Weighted average of the predictions of pre-fitted estimators.

            Parameters
            ----------
            estimators : list of (name, model)
                Fitted models exposing ``predict``.
            weights : sequence of float
                One weight per estimator; normalised here to sum to 1.
            datasets : dict
                name -> feature-set variant. Only ``'tree_filled'`` changes the
                input (NaN replaced by 0); every other variant receives ``X``
                unchanged.
            target_name : str
                Stored for reference only.

            NOTE(review): every estimator receives the same ``X``; if the base
            models were trained on different feature frames, the caller has to
            supply an ``X`` all of them accept.
            """

            def __init__(self, estimators, weights, datasets, target_name):
                self.estimators = estimators
                self.datasets = datasets
                self.target_name = target_name

                # Normalise the weights so they sum to 1.
                self.weights = np.array(weights)
                self.weights = self.weights / np.sum(self.weights)

            def _prepare_input(self, name, X):
                """Return ``X`` in the form expected by estimator ``name``."""
                if self.datasets.get(name, 'standard') != 'tree_filled':
                    return X
                if isinstance(X, pd.DataFrame):
                    return X.fillna(0)
                # Array-like input: fill NaN as well, so models that cannot
                # handle NaN behave the same for arrays and DataFrames.
                values = np.asarray(X)
                if np.issubdtype(values.dtype, np.floating):
                    values = np.where(np.isnan(values), 0.0, values)
                return values

            def predict(self, X):
                """Return the weighted average of all estimators' predictions."""
                predictions = [
                    model.predict(self._prepare_input(name, X))
                    for name, model in self.estimators
                ]

                weighted_pred = np.zeros(predictions[0].shape)
                for weight, pred in zip(self.weights, predictions):
                    weighted_pred += weight * pred

                return weighted_pred
        
        # Build the ensemble object from the collected models and weights.
        print("创建投票集成模型...")
        voting_model = EnhancedVotingRegressor(
            estimators=base_models,
            weights=weights,
            datasets=model_datasets,
            target_name=target
        )

        # Per-model predictions on both splits, each model being fed the
        # feature-set variant recorded for it in model_datasets.
        train_predictions = {}
        test_predictions = {}

        for name, model in base_models:
            _kind = model_datasets[name]
            if _kind == 'tree':
                _train_X, _test_X = X_train_tree[target], X_test_tree[target]
            elif _kind == 'tree_filled':
                _train_X, _test_X = X_train_tree_filled[target], X_test_tree_filled[target]
            elif _kind == 'linear':
                _train_X, _test_X = X_train_linear[target], X_test_linear[target]
            else:
                # Unknown variant: no predictions are stored for this model.
                continue
            train_predictions[name] = model.predict(_train_X)
            test_predictions[name] = model.predict(_test_X)

        # Weighted sum of the base predictions ...
        y_train_pred = np.zeros(len(y_train[target]))
        y_test_pred = np.zeros(len(y_test[target]))

        for _weight, (name, _) in zip(weights, base_models):
            y_train_pred += _weight * train_predictions[name]
            y_test_pred += _weight * test_predictions[name]

        # ... divided by the weight total, i.e. a weighted average.
        total_weight = sum(weights)
        y_train_pred /= total_weight
        y_test_pred /= total_weight

        # Plain R², tolerance-based R² and share of predictions within tolerance.
        train_r2 = r2_score(y_train[target], y_train_pred)
        test_r2 = r2_score(y_test[target], y_test_pred)

        train_tol_r2 = tolerance_r2_score(
            y_train[target], y_train_pred, tolerance=current_tolerance, target=target)
        test_tol_r2 = tolerance_r2_score(
            y_test[target], y_test_pred, tolerance=current_tolerance, target=target)

        train_within_tol = prediction_within_tolerance(
            y_train[target], y_train_pred, tolerance=current_tolerance, target=target)
        test_within_tol = prediction_within_tolerance(
            y_test[target], y_test_pred, tolerance=current_tolerance, target=target)
        # Prediction tables: actual value, ensemble prediction, absolute error.
        _train_abs_error = np.abs(y_train[target] - y_train_pred)
        train_prediction = pd.DataFrame({
            '实际值': y_train[target],
            '集成预测值': y_train_pred,
            '误差': _train_abs_error
        })

        _test_abs_error = np.abs(y_test[target] - y_test_pred)
        test_prediction = pd.DataFrame({
            '实际值': y_test[target],
            '集成预测值': y_test_pred,
            '误差': _test_abs_error
        })

        # One extra column per base model with that model's own predictions.
        for name, _ in base_models:
            _column = f'{name}预测值'
            train_prediction[_column] = train_predictions[name]
            test_prediction[_column] = test_predictions[name]

        # Write both tables to CSV.
        train_file = os.path.join(save_folder, f'{target}_投票集成模型训练集预测结果.csv')
        test_file = os.path.join(save_folder, f'{target}_投票集成模型测试集预测结果.csv')

        train_prediction.to_csv(train_file, index=False)
        test_prediction.to_csv(test_file, index=False)

        print(f"训练集预测结果已保存至 {train_file}")
        print(f"测试集预测结果已保存至 {test_file}")

        # Report the ensemble's metrics.
        print(f"\n投票集成模型性能:")
        print(f"训练集: R²={train_r2:.4f}, 容忍度R²={train_tol_r2:.4f}, 在容忍范围内比例={train_within_tol:.2%}")
        print(f"测试集: R²={test_r2:.4f}, 容忍度R²={test_tol_r2:.4f}, 在容忍范围内比例={test_within_tol:.2%}")

        # Compare the ensemble's R² with each base model's R².
        print("\n与各基础模型性能比较:")
        for name, _ in base_models:
            base_train_pred = train_predictions[name]
            base_test_pred = test_predictions[name]

            base_train_r2 = r2_score(y_train[target], base_train_pred)
            base_test_r2 = r2_score(y_test[target], base_test_pred)

            print(f"  vs {name}:")
            print(f"    训练集R²: {train_r2:.4f} vs {base_train_r2:.4f} (差异: {train_r2-base_train_r2:.4f})")
            print(f"    测试集R²: {test_r2:.4f} vs {base_test_r2:.4f} (差异: {test_r2-base_test_r2:.4f})")

        # Register the ensemble and pickle it next to the other models.
        models[target]['VotingEnsemble'] = voting_model
        ensemble_model_file = os.path.join(model_folder, f'{target}_投票集成模型.pkl')
        with open(ensemble_model_file, 'wb') as f:
            pickle.dump(voting_model, f)
        print(f"投票集成模型已保存至 {ensemble_model_file}")
        # Figure 1: predicted vs. actual scatter, one panel per split, with a
        # dashed y = x line spanning the range of the actual values.
        plt.figure(figsize=(12, 5))
        _scatter_panels = (
            (y_train[target], y_train_pred, f'训练集: R²={train_r2:.4f}'),
            (y_test[target], y_test_pred, f'测试集: R²={test_r2:.4f}'),
        )
        for _slot, (_actual, _predicted, _caption) in enumerate(_scatter_panels, start=1):
            plt.subplot(1, 2, _slot)
            plt.scatter(_actual, _predicted, alpha=0.5)
            _lo, _hi = _actual.min(), _actual.max()
            plt.plot([_lo, _hi], [_lo, _hi], 'r--')
            plt.xlabel('实际值')
            plt.ylabel('预测值')
            plt.title(_caption)

        plt.tight_layout()
        plt.show()

        # Figure 2: histogram of the signed errors (actual - predicted) per
        # split, with the mean absolute error in the title.
        train_errors = y_train[target] - y_train_pred
        test_errors = y_test[target] - y_test_pred

        plt.figure(figsize=(12, 5))
        _error_panels = ((train_errors, '训练集'), (test_errors, '测试集'))
        for _slot, (_residuals, _split_label) in enumerate(_error_panels, start=1):
            plt.subplot(1, 2, _slot)
            plt.hist(_residuals, bins=30, alpha=0.7)
            plt.axvline(x=0, color='r', linestyle='--')
            plt.xlabel('预测误差')
            plt.ylabel('频次')
            plt.title(f'{_split_label}误差分布 (MAE={np.abs(_residuals).mean():.4f})')

        plt.tight_layout()
        plt.show()

        # Figure 3: the (un-normalised) weight assigned to each base model.
        model_names = [name for name, _ in base_models]
        plt.figure(figsize=(10, 6))
        plt.bar(model_names, weights)
        plt.xlabel('模型')
        plt.ylabel('权重')
        plt.title(f'{target} - 投票集成模型权重分布')
        plt.grid(True, axis='y')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Figure 4: box plots of the test-set predictions of the ensemble and
        # of every base model.
        model_preds = [y_test_pred]
        model_labels = ['Voting']
        model_preds += [test_predictions[name] for name, _ in base_models]
        model_labels += [name for name, _ in base_models]

        plt.figure(figsize=(12, 6))
        plt.boxplot(model_preds, labels=model_labels)
        plt.ylabel('预测值')
        plt.title('投票集成模型与各基础模型预测分布对比')
        plt.grid(True, axis='y')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
        
    except Exception as e:
        print(f"创建投票集成模型失败: {str(e)}")
        print(f"错误详情: {traceback.format_exc()}")
else:
    print("没有足够的基础模型来创建投票集成")



In [None]:

# Adaptive ensemble - a meta-model picks, per sample, which base model to use.
from sklearn.ensemble import RandomForestClassifier

print(f"训练 {target} 的自适应集成模型...")

# Per-target tolerance, falling back to 0.15.
current_tolerance = target_tolerance.get(target, 0.15)

# Names of the base models that are present, and for each one the train/test
# feature frames it has to be fed with.
available_models = []
model_input_data = {}

# (key in models[target], feature-set variant)
_known_models = (
    ('XGBoost', 'tree'),
    ('LightGBM', 'tree'),
    ('HistGradientBoosting', 'tree'),
    ('RandomForest', 'tree_filled'),
    ('GaussianProcess', 'linear'),
)
for _model_key, _dataset_kind in _known_models:
    if _model_key not in models[target]:
        continue
    available_models.append(_model_key)
    # Frames are looked up only for models that exist, so unused feature-set
    # variants need not be defined.
    if _dataset_kind == 'tree':
        model_input_data[_model_key] = {
            'train': X_train_tree[target],
            'test': X_test_tree[target]
        }
    elif _dataset_kind == 'tree_filled':
        model_input_data[_model_key] = {
            'train': X_train_tree_filled[target],
            'test': X_test_tree_filled[target]
        }
    else:
        model_input_data[_model_key] = {
            'train': X_train_linear[target],
            'test': X_test_linear[target]
        }

print(f"可用模型: {available_models}")

if len(available_models) < 2:
    print("自适应集成至少需要两个模型，目前可用模型不足")
else:
    try:
        # Step 1: predictions of every base model on both splits, each model
        # being fed the frames registered for it in model_input_data.
        print("为每个样本生成所有模型的预测...")
        train_predictions = {}
        test_predictions = {}

        for model_name in available_models:
            model = models[target][model_name]
            train_data = model_input_data[model_name]['train']
            test_data = model_input_data[model_name]['test']

            train_pred = model.predict(train_data)
            test_pred = model.predict(test_data)

            train_predictions[model_name] = train_pred
            test_predictions[model_name] = test_pred

        # Step 2: absolute training error of every model on every sample.
        print("计算各模型在每个样本上的预测误差...")
        _y_train_values = y_train[target].values
        train_errors = {
            model_name: np.abs(_y_train_values - train_predictions[model_name])
            for model_name in available_models
        }

        # Step 3: label each training sample with the index (into
        # available_models) of the model with the smallest error, then train a
        # classifier that predicts this label from the features.
        # NOTE(review): labels come from in-sample errors of models fitted on
        # this same split, so models that over-fit are favoured.
        print("训练元模型来决定每个样本应使用哪个模型...")

        model_name_to_idx = {name: idx for idx, name in enumerate(available_models)}

        # Rows = models, columns = samples; argmin over rows picks the first
        # model with the minimal error, like a per-sample np.argmin would.
        _error_matrix = np.vstack([train_errors[name] for name in available_models])
        best_model_indices = np.argmin(_error_matrix, axis=0).astype(int)

        meta_classifier = RandomForestClassifier(
            n_estimators=200,
            max_depth=4,
            min_samples_split=2,
            n_jobs=-1,
            random_state=42
        )

        meta_classifier.fit(X_train[target], best_model_indices)

        # Step 4: let the meta-model choose a base model for every sample.
        print("在训练集和测试集上应用元模型...")
        train_best_models = meta_classifier.predict(X_train[target])
        test_best_models = meta_classifier.predict(X_test[target])

        # Step 5: for every sample take the prediction of the chosen model
        # (row = chosen model index, column = sample position).
        _train_pred_matrix = np.vstack([train_predictions[name] for name in available_models])
        _test_pred_matrix = np.vstack([test_predictions[name] for name in available_models])

        train_adaptive_predictions = _train_pred_matrix[
            train_best_models, np.arange(_train_pred_matrix.shape[1])
        ].astype(float)
        test_adaptive_predictions = _test_pred_matrix[
            test_best_models, np.arange(_test_pred_matrix.shape[1])
        ].astype(float)
        
        # Step 6: plain R², tolerance-based R² and share within tolerance for
        # the adaptive predictions on both splits.
        train_r2 = r2_score(y_train[target], train_adaptive_predictions)
        train_tol_r2 = tolerance_r2_score(
            y_train[target], train_adaptive_predictions,
            tolerance=current_tolerance, target=target)
        train_within_tol = prediction_within_tolerance(
            y_train[target], train_adaptive_predictions,
            tolerance=current_tolerance, target=target)

        test_r2 = r2_score(y_test[target], test_adaptive_predictions)
        test_tol_r2 = tolerance_r2_score(
            y_test[target], test_adaptive_predictions,
            tolerance=current_tolerance, target=target)
        test_within_tol = prediction_within_tolerance(
            y_test[target], test_adaptive_predictions,
            tolerance=current_tolerance, target=target)

        print(f"\n自适应集成模型性能:")
        print(f"训练集: R²={train_r2:.4f}, 容忍度R²={train_tol_r2:.4f}, 在容忍范围内比例={train_within_tol:.2%}")
        print(f"测试集: R²={test_r2:.4f}, 容忍度R²={test_tol_r2:.4f}, 在容忍范围内比例={test_within_tol:.2%}")

        # Step 7: R² of the adaptive ensemble against each base model.
        print("\n与各基础模型性能比较:")
        for model_name in available_models:
            model_train_pred = train_predictions[model_name]
            model_test_pred = test_predictions[model_name]

            model_train_r2 = r2_score(y_train[target], model_train_pred)
            model_test_r2 = r2_score(y_test[target], model_test_pred)

            train_r2_diff = train_r2 - model_train_r2
            test_r2_diff = test_r2 - model_test_r2

            print(f"  vs {model_name}:")
            print(f"    训练集R²: {train_r2:.4f} vs {model_train_r2:.4f} (差异: {train_r2_diff:.4f})")
            print(f"    测试集R²: {test_r2:.4f} vs {model_test_r2:.4f} (差异: {test_r2_diff:.4f})")

        # Step 8: how often the meta-model selected each base model.
        # minlength makes sure never-selected models still get a zero count.
        _n_models = len(available_models)
        train_model_selection_counts = np.bincount(train_best_models, minlength=_n_models)
        train_model_selection_percent = train_model_selection_counts / len(train_best_models) * 100

        test_model_selection_counts = np.bincount(test_best_models, minlength=_n_models)
        test_model_selection_percent = test_model_selection_counts / len(test_best_models) * 100

        for _split_label, _counts, _shares in (
            ('训练集', train_model_selection_counts, train_model_selection_percent),
            ('测试集', test_model_selection_counts, test_model_selection_percent),
        ):
            print(f"\n各模型在{_split_label}上的选择频率:")
            for i, model_name in enumerate(available_models):
                print(f"  {model_name}: {_counts[i]} 次 ({_shares[i]:.2f}%)")
        
        # Step 9: self-contained, picklable wrapper for the adaptive ensemble.
        class AdaptiveEnsembleModel:
            """Route every sample to the base model chosen by a meta-classifier.

            Parameters
            ----------
            meta_classifier
                Fitted classifier whose predicted labels are indices into
                ``available_models``.
            models_dict : dict
                Model name -> fitted model exposing ``predict``.
            available_models : list of str
                Model names in the order encoded by the classifier's labels.
            model_input_data : dict
                Stored as given; not used by ``predict``.
                NOTE(review): it is pickled together with the model.

            NOTE(review): the same ``X`` is passed to the meta-classifier and to
            every base model (only NaN filling differs); if they were trained on
            different feature frames the caller must supply a compatible ``X``.
            """

            def __init__(self, meta_classifier, models_dict, available_models, model_input_data):
                self.meta_classifier = meta_classifier
                self.models_dict = models_dict
                self.available_models = available_models
                self.model_input_data = model_input_data

                # Input variant per model; only 'tree_filled' alters the input.
                self.data_type_map = {
                    'XGBoost': 'tree',
                    'LightGBM': 'tree',
                    'HistGradientBoosting': 'tree',
                    'RandomForest': 'tree_filled',
                    'GaussianProcess': 'linear'
                }

            def _as_frame(self, X):
                """Return ``X`` as a DataFrame, restoring column names if known.

                Column names are taken from the meta-classifier's
                ``feature_names_in_`` (when present and of matching length)
                rather than from notebook globals, so a model loaded from a
                pickle in a fresh session still works.
                """
                if isinstance(X, pd.DataFrame):
                    return X
                values = np.asarray(X)
                if values.ndim == 1:
                    # A single sample given as a flat vector.
                    values = values.reshape(1, -1)
                columns = getattr(self.meta_classifier, 'feature_names_in_', None)
                if columns is not None and len(columns) == values.shape[1]:
                    return pd.DataFrame(values, columns=list(columns))
                return pd.DataFrame(values)

            def predict(self, X):
                """Return one prediction per row of ``X``.

                Rows are grouped by the model the meta-classifier selects, so
                each base model is called once on its rows instead of once per
                row.
                """
                X = self._as_frame(X)
                predictions = np.zeros(len(X))
                if len(X) == 0:
                    return predictions

                # Label i means "use available_models[i]".
                model_choices = np.asarray(self.meta_classifier.predict(X))

                for model_idx in np.unique(model_choices):
                    rows = np.flatnonzero(model_choices == model_idx)
                    model_name = self.available_models[int(model_idx)]
                    model = self.models_dict[model_name]

                    x_subset = X.iloc[rows]
                    if self.data_type_map.get(model_name, 'standard') == 'tree_filled':
                        # Models that cannot handle NaN get NaN replaced by 0.
                        x_subset = x_subset.fillna(0)

                    predictions[rows] = np.asarray(model.predict(x_subset), dtype=float).reshape(-1)

                return predictions

            def get_feature_importances(self):
                """Return the meta-classifier's ``feature_importances_`` or None."""
                if hasattr(self.meta_classifier, 'feature_importances_'):
                    return self.meta_classifier.feature_importances_
                return None
        
        # Wrap the meta-classifier and the base models in one object.
        adaptive_model = AdaptiveEnsembleModel(
            meta_classifier=meta_classifier,
            models_dict=models[target],
            available_models=available_models,
            model_input_data=model_input_data
        )

        # Register the ensemble and pickle it next to the other models.
        models[target]['AdaptiveEnsemble'] = adaptive_model
        adaptive_model_file = os.path.join(model_folder, f'{target}_自适应集成模型.pkl')
        with open(adaptive_model_file, 'wb') as f:
            pickle.dump(adaptive_model, f)
        print(f"自适应集成模型已保存至 {adaptive_model_file}")

        # Prediction tables: actual value, adaptive prediction, absolute error.
        _train_abs_error = np.abs(y_train[target] - train_adaptive_predictions)
        train_prediction = pd.DataFrame({
            '实际值': y_train[target],
            '自适应集成预测值': train_adaptive_predictions,
            '误差': _train_abs_error
        })

        _test_abs_error = np.abs(y_test[target] - test_adaptive_predictions)
        test_prediction = pd.DataFrame({
            '实际值': y_test[target],
            '自适应集成预测值': test_adaptive_predictions,
            '误差': _test_abs_error
        })

        # One extra column per base model for side-by-side comparison.
        for model_name in available_models:
            _column = f'{model_name}预测值'
            train_prediction[_column] = train_predictions[model_name]
            test_prediction[_column] = test_predictions[model_name]

        # Write both tables to CSV.
        train_file = os.path.join(save_folder, f'{target}_自适应集成训练集预测结果.csv')
        test_file = os.path.join(save_folder, f'{target}_自适应集成测试集预测结果.csv')

        train_prediction.to_csv(train_file, index=False)
        test_prediction.to_csv(test_file, index=False)

        print(f"训练集预测结果已保存至 {train_file}")
        print(f"测试集预测结果已保存至 {test_file}")

        # 可视化: 预测vs实际值散点图 (训练集和测试集)
        plt.figure(figsize=(12, 5))
        
        # 训练集散点图
        plt.subplot(1, 2, 1)
        plt.scatter(y_train[target], train_adaptive_predictions, alpha=0.5)
        plt.plot([y_train[target].min(), y_train[target].max()], [y_train[target].min(), y_train[target].max()], 'r--')
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(f'训练集: R²={train_r2:.4f}')
        
        # 测试集散点图
        plt.subplot(1, 2, 2)
        plt.scatter(y_test[target], test_adaptive_predictions, alpha=0.5)
        plt.plot([y_test[target].min(), y_test[target].max()], [y_test[target].min(), y_test[target].max()], 'r--')
        plt.xlabel('实际值')
        plt.ylabel('预测值')
        plt.title(f'测试集: R²={test_r2:.4f}')
        
        plt.tight_layout()
        plt.show()
        
        # 绘制误差分布
        plt.figure(figsize=(12, 5))
        
        # 训练集误差
        plt.subplot(1, 2, 1)
        train_errors_plot = y_train[target] - train_adaptive_predictions
        plt.hist(train_errors_plot, bins=30, alpha=0.7)
        plt.axvline(x=0, color='r', linestyle='--')
        plt.xlabel('预测误差')
        plt.ylabel('频次')
        plt.title(f'训练集误差分布 (MAE={np.abs(train_errors_plot).mean():.4f})')
        
        # 测试集误差
        plt.subplot(1, 2, 2)
        test_errors_plot = y_test[target] - test_adaptive_predictions
        plt.hist(test_errors_plot, bins=30, alpha=0.7)
        plt.axvline(x=0, color='r', linestyle='--')
        plt.xlabel('预测误差')
        plt.ylabel('频次')
        plt.title(f'测试集误差分布 (MAE={np.abs(test_errors_plot).mean():.4f})')
        
        plt.tight_layout()
        plt.show()
        
        # 保存元分类器的特征重要性
        feature_importance = pd.DataFrame({
            'Feature': X_train[target].columns,
            'Importance': meta_classifier.feature_importances_
        })
        feature_importance = feature_importance.sort_values('Importance', ascending=False)
        # 保存特征重要性数据
        importance_file = os.path.join(save_folder, f'{target}_自适应集成特征重要性.csv')
        feature_importance.to_csv(importance_file, index=False)
        print(f"特征重要性数据已保存至 {importance_file}")
        plt.figure(figsize=(10, 6))
        plt.barh(feature_importance['Feature'], feature_importance['Importance'])
        plt.xlabel('重要性')
        plt.ylabel('特征')
        plt.title(f'{target} - 自适应集成模型选择特征重要性')
        plt.grid(True, axis='x')
        plt.tight_layout()
        plt.show()
        
        # 绘制模型选择频率饼图
        plt.figure(figsize=(12, 5))
        
        # 训练集上的模型选择频率
        plt.subplot(1, 2, 1)
        plt.pie(train_model_selection_counts, labels=available_models, autopct='%1.1f%%')
        plt.title(f'训练集 - 模型选择频率')
        
        # 测试集上的模型选择频率
        plt.subplot(1, 2, 2)
        plt.pie(test_model_selection_counts, labels=available_models, autopct='%1.1f%%')
        plt.title(f'测试集 - 模型选择频率')
        
        plt.tight_layout()
        plt.show()
        # 保存模型选择频率数据
        model_selection_data = pd.DataFrame({
            '模型': available_models,
            '训练集选择次数': train_model_selection_counts,
            '训练集选择百分比': train_model_selection_percent,
            '测试集选择次数': test_model_selection_counts,
            '测试集选择百分比': test_model_selection_percent
        })

        selection_file = os.path.join(save_folder, f'{target}_自适应集成模型选择频率.csv')
        model_selection_data.to_csv(selection_file, index=False)
        print(f"模型选择频率数据已保存至 {selection_file}")
        # 绘制误差分布与模型选择关系
        plt.figure(figsize=(12, 6))
        
        # 对比测试集上各模型的预测结果
        model_data = [test_adaptive_predictions] + [test_predictions[model] for model in available_models]
        model_labels = ['自适应集成'] + available_models
        
        plt.boxplot(model_data, labels=model_labels)
        plt.ylabel('预测值')
        plt.title('自适应集成模型与各基础模型预测分布对比')
        plt.grid(True, axis='y')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
        
    except Exception as e:
        print(f"创建自适应集成模型失败: {str(e)}")
        print(f"错误详情: {traceback.format_exc()}")



In [None]:

# Weighted-average ensemble: the weights are optimised against the standard R².
print(f"训练 {target} 的加权平均集成模型...")

# Target-specific tolerance (0.15 when the target has no entry).
current_tolerance = target_tolerance.get(target, 0.15)

# Names of the base models available for this target and their predictions.
available_models = []
train_predictions = {}
test_predictions = {}

for model_name in ('XGBoost', 'LightGBM', 'HistGradientBoosting',
                   'RandomForest', 'GaussianProcess'):
    if model_name not in models[target]:
        continue
    model = models[target][model_name]
    # Each model family predicts on its own preprocessed feature set.
    if model_name == 'RandomForest':
        train_pred = model.predict(X_train_tree_filled[target])
        test_pred = model.predict(X_test_tree_filled[target])
    elif model_name == 'GaussianProcess':
        train_pred = model.predict(X_train_linear[target])
        test_pred = model.predict(X_test_linear[target])
    else:
        train_pred = model.predict(X_train_tree[target])
        test_pred = model.predict(X_test_tree[target])
    available_models.append(model_name)
    train_predictions[model_name] = train_pred
    test_predictions[model_name] = test_pred

print(f"可用模型: {available_models}")

# A weighted average needs at least two base models to combine.
if len(available_models) < 2:
    print("加权平均集成至少需要两个模型，目前可用模型不足")
else:
    try:
        # Find the optimal weights through numerical optimisation
        print("寻找最优权重组合...")
        from scipy.optimize import minimize
        
        # Weighted sum of per-model predictions
        def weighted_prediction(weights, preds_list):
            """Return ``sum(weights[k] * preds_list[k])`` as a float array.

            The result has the shape of the first prediction array; the weights
            are used as given (no normalisation happens here).
            """
            combined = np.zeros(preds_list[0].shape)
            for position, model_preds in enumerate(preds_list):
                combined += weights[position] * model_preds
            return combined
        
        # Objective for the optimiser: the negative *standard* R² (not the
        # tolerance R²), so that minimising it maximises R².
        def neg_r2(weights, preds_list, y_true):
            """Return ``-R²`` of the weighted blend of ``preds_list`` against ``y_true``.

            The weights are rescaled to sum to 1 before blending.
            """
            normalised = np.array(weights)
            normalised = normalised / np.sum(normalised)
            blended = weighted_prediction(normalised, preds_list)
            return -r2_score(y_true, blended)
        
        # Training-set predictions, in the same order as available_models
        train_preds_list = [train_predictions[model_name] for model_name in available_models]
        
        # Initial weights (uniform)
        initial_weights = np.ones(len(available_models)) / len(available_models)
        
        # Constraints: weights sum to 1 (equality constraint) and each weight
        # lies in [0, 1] (bounds)
        constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
        bounds = [(0, 1) for _ in range(len(available_models))]
        
        # Search for the optimal weights with the SLSQP solver
        print("优化权重中...")
        result = minimize(
            neg_r2, 
            initial_weights, 
            args=(train_preds_list, y_train[target]),
            bounds=bounds,
            constraints=constraints,
            method='SLSQP'
        )
        
        if result.success:
            # Take the optimal weights and renormalise them to sum to 1
            optimal_weights = result.x
            optimal_weights = optimal_weights / np.sum(optimal_weights)
            
            print("\n找到最优权重组合:")
            for i, model_name in enumerate(available_models):
                print(f"  {model_name}: {optimal_weights[i]:.4f}")
                
            # Blend the base-model predictions with the optimal weights on
            # both the training and the test set
            train_weighted_preds = weighted_prediction(
                optimal_weights, 
                [train_predictions[model_name] for model_name in available_models]
            )
            
            test_weighted_preds = weighted_prediction(
                optimal_weights, 
                [test_predictions[model_name] for model_name in available_models]
            )
            
            # Performance metrics: standard R², tolerance R² and the share of
            # predictions inside the tolerance band
            train_r2 = r2_score(y_train[target], train_weighted_preds)
            train_tol_r2 = tolerance_r2_score(y_train[target], train_weighted_preds, tolerance=current_tolerance, target=target)
            train_within_tol = prediction_within_tolerance(y_train[target], train_weighted_preds, tolerance=current_tolerance, target=target)
            
            test_r2 = r2_score(y_test[target], test_weighted_preds)
            test_tol_r2 = tolerance_r2_score(y_test[target], test_weighted_preds, tolerance=current_tolerance, target=target)
            test_within_tol = prediction_within_tolerance(y_test[target], test_weighted_preds, tolerance=current_tolerance, target=target)
            
            print("\n加权平均集成性能:")
            print(f"  训练集 - R²: {train_r2:.4f}, 容忍度R²: {train_tol_r2:.4f}, 在容忍范围内: {train_within_tol:.2%}")
            print(f"  测试集 - R²: {test_r2:.4f}, 容忍度R²: {test_tol_r2:.4f}, 在容忍范围内: {test_within_tol:.2%}")
            
            # Compare the ensemble with every base model on the test set
            print("\n与各基础模型性能比较:")
            for model_name in available_models:
                model_test_pred = test_predictions[model_name]
                model_r2 = r2_score(y_test[target], model_test_pred)
                model_tol_r2 = tolerance_r2_score(y_test[target], model_test_pred, tolerance=current_tolerance, target=target)
                
                r2_diff = test_r2 - model_r2
                tol_r2_diff = test_tol_r2 - model_tol_r2
                
                # Relative change is taken against |base score|, floored at
                # 0.0001 to avoid dividing by zero
                print(f"  vs {model_name}:")
                print(f"    R² 差异: {r2_diff:.4f} ({'+' if r2_diff > 0 else ''}{r2_diff/max(0.0001, abs(model_r2))*100:.2f}%)")
                print(f"    容忍度R² 差异: {tol_r2_diff:.4f} ({'+' if tol_r2_diff > 0 else ''}{tol_r2_diff/max(0.0001, abs(model_tol_r2))*100:.2f}%)")
            # Weighted-average ensemble wrapper around the fitted base models
            class WeightedAverageEnsemble:
                """Blend the predictions of several base models with fixed weights.

                ``models_dict`` maps model name -> fitted model, ``model_names``
                fixes the order, ``weights`` holds one weight per name, and
                ``model_datasets`` is stored but not consulted by ``predict``.
                """

                def __init__(self, models_dict, model_names, weights, model_datasets):
                    self.models_dict = models_dict
                    self.model_names = model_names
                    self.weights = weights
                    self.model_datasets = model_datasets

                def predict(self, X):
                    """Return the weighted sum of every base model's prediction on ``X``."""
                    predictions = []
                    for model_name in self.model_names:
                        base_model = self.models_dict[model_name]
                        # Only RandomForest alters its input: NaNs in a DataFrame
                        # are replaced by 0. Every other model receives X as-is;
                        # any further preprocessing is left to the caller.
                        if model_name == 'RandomForest' and isinstance(X, pd.DataFrame):
                            X_model = X.fillna(0)
                        else:
                            X_model = X
                        predictions.append(base_model.predict(X_model))

                    # Apply the weights
                    weighted_preds = np.zeros(predictions[0].shape)
                    for position, preds in enumerate(predictions):
                        weighted_preds += self.weights[position] * preds
                    return weighted_preds
            
            # Map each base model to the dataset flavour it expects
            model_datasets = {
                'XGBoost': 'tree',
                'LightGBM': 'tree',
                'HistGradientBoosting': 'tree',
                'RandomForest': 'tree_filled',
                'GaussianProcess': 'linear'
            }
            
            # Instantiate the weighted-average ensemble
            weighted_model = WeightedAverageEnsemble(
                models_dict=models[target],
                model_names=available_models,
                weights=optimal_weights,
                model_datasets=model_datasets
            )
            
            # Register the ensemble next to the base models
            models[target]['WeightedEnsemble'] = weighted_model
            # Persist the ensemble with pickle
            weighted_model_file = os.path.join(model_folder, f'{target}_加权平均集成模型.pkl')
            with open(weighted_model_file, 'wb') as f:
                pickle.dump(weighted_model, f)
            print(f"加权平均集成模型已保存至 {weighted_model_file}")

            # Collect train/test predictions (actual, ensemble prediction, absolute error)
            train_prediction = pd.DataFrame({
                '实际值': y_train[target],
                '加权平均预测值': train_weighted_preds,
                '误差': np.abs(y_train[target] - train_weighted_preds)
            })

            test_prediction = pd.DataFrame({
                '实际值': y_test[target],
                '加权平均预测值': test_weighted_preds,
                '误差': np.abs(y_test[target] - test_weighted_preds)
            })

            # Add each base model's predictions for comparison
            for model_name in available_models:
                train_prediction[f'{model_name}预测值'] = train_predictions[model_name]
                test_prediction[f'{model_name}预测值'] = test_predictions[model_name]

            # Write both tables to CSV
            train_file = os.path.join(save_folder, f'{target}_加权平均集成训练集预测结果.csv')
            test_file = os.path.join(save_folder, f'{target}_加权平均集成测试集预测结果.csv')

            train_prediction.to_csv(train_file, index=False)
            test_prediction.to_csv(test_file, index=False)

            print(f"训练集预测结果已保存至 {train_file}")
            print(f"测试集预测结果已保存至 {test_file}")

            # Visualisation: predicted vs. actual scatter plots (train and test)
            plt.figure(figsize=(12, 5))
            
            # Training-set scatter; the dashed red line is the y = x reference
            plt.subplot(1, 2, 1)
            plt.scatter(y_train[target], train_weighted_preds, alpha=0.5)
            plt.plot([y_train[target].min(), y_train[target].max()], [y_train[target].min(), y_train[target].max()], 'r--')
            plt.xlabel('实际值')
            plt.ylabel('预测值')
            plt.title(f'训练集: R²={train_r2:.4f}')
            
            # Test-set scatter
            plt.subplot(1, 2, 2)
            plt.scatter(y_test[target], test_weighted_preds, alpha=0.5)
            plt.plot([y_test[target].min(), y_test[target].max()], [y_test[target].min(), y_test[target].max()], 'r--')
            plt.xlabel('实际值')
            plt.ylabel('预测值')
            plt.title(f'测试集: R²={test_r2:.4f}')
            
            plt.tight_layout()
            plt.show()
            
            # Error distributions
            plt.figure(figsize=(12, 5))
            
            # Training-set errors (signed: actual - predicted)
            plt.subplot(1, 2, 1)
            train_errors = y_train[target] - train_weighted_preds
            plt.hist(train_errors, bins=30, alpha=0.7)
            plt.axvline(x=0, color='r', linestyle='--')
            plt.xlabel('预测误差')
            plt.ylabel('频次')
            plt.title(f'训练集误差分布 (MAE={np.abs(train_errors).mean():.4f})')
            
            # Test-set errors
            plt.subplot(1, 2, 2)
            test_errors = y_test[target] - test_weighted_preds
            plt.hist(test_errors, bins=30, alpha=0.7)
            plt.axvline(x=0, color='r', linestyle='--')
            plt.xlabel('预测误差')
            plt.ylabel('频次')
            plt.title(f'测试集误差分布 (MAE={np.abs(test_errors).mean():.4f})')
            
            plt.tight_layout()
            plt.show()
            
            # Bar chart of the optimised weights
            plt.figure(figsize=(10, 6))
            plt.bar(available_models, optimal_weights)
            plt.xlabel('模型')
            plt.ylabel('权重')
            plt.title(f'{target} - 加权平均集成模型权重分布')
            plt.xticks(rotation=45)
            plt.grid(True, axis='y')
            plt.tight_layout()
            plt.show()
            
            # Box plot comparing the test-set prediction distribution of the
            # ensemble with that of each base model
            plt.figure(figsize=(12, 6))
            model_data = [test_weighted_preds] + [test_predictions[model] for model in available_models]
            model_labels = ['加权平均'] + available_models
            
            # NOTE(review): newer matplotlib releases rename `labels` to
            # `tick_labels` — confirm against the installed version.
            plt.boxplot(model_data, labels=model_labels)
            plt.ylabel('预测值')
            plt.title('加权平均模型与各基础模型预测分布对比')
            plt.grid(True, axis='y')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()
            
        else:
            # The optimiser did not converge: report its message, build nothing
            print("权重优化失败:", result.message)
            
    except Exception as e:
        print(f"创建加权平均集成模型失败: {str(e)}")
        print(f"错误详情: {traceback.format_exc()}")

In [None]:
# ====================== Model evaluation and visualisation ======================
# NOTE(review): the two banner rules have different widths (40 vs 80) — confirm intent.
print("=" * 40)
print("模型评估阶段")
print("=" * 80)

import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# Configure the plotting theme
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
colors = sns.color_palette("viridis", 8)

class CustomLGBMRegressor:
    """Minimal fit/predict wrapper that trains a booster through ``lgb.train``."""

    def __init__(self, **params):
        self.params = params
        self.model = None

    def fit(self, X, y):
        """Train on ``X``/``y`` using generic feature names ``f0, f1, ...``; return ``self``."""
        column_names = [f'f{idx}' for idx in range(X.shape[1])]
        matrix = getattr(X, 'values', X)
        dataset = lgb.Dataset(matrix, label=y, feature_name=column_names)
        # The boosting-round count comes from 'n_estimators' (100 when absent).
        rounds = self.params.get('n_estimators', 100)
        self.model = lgb.train(self.params, dataset, num_boost_round=rounds)
        return self

    def predict(self, X):
        """Predict with the trained booster; raise ``ValueError`` before ``fit``."""
        if self.model is None:
            raise ValueError("Model not trained.")
        matrix = getattr(X, 'values', X)
        return self.model.predict(matrix)

class EnhancedVotingRegressor:
    """Container for a voting ensemble; real predictions go through ``predict_model_unified``."""

    def __init__(self, estimators, weights, datasets, target_name):
        self.estimators = estimators
        self.datasets = datasets
        self.target_name = target_name
        # Normalise the weights to sum to 1 unless their total is not positive.
        normalised = np.array(weights)
        total = np.sum(normalised)
        if total > 0:
            normalised = normalised / total
        self.weights = normalised

    def predict(self, X):
        """Placeholder: print a warning and return zeros, one per row of ``X``."""
        print("警告：直接调用 EnhancedVotingRegressor 的 predict 方法可能导致结果不准，请使用 predict_model_unified 函数。")
        n_rows = len(X) if hasattr(X, '__len__') else 1
        return np.zeros(n_rows)

class AdaptiveEnsembleModel:
    """Container for an adaptive ensemble; real predictions go through ``predict_model_unified``."""

    def __init__(self, meta_classifier, models_dict, available_models, model_input_data):
        self.meta_classifier = meta_classifier
        self.models_dict = models_dict
        self.available_models = available_models
        self.model_input_data = model_input_data
        # Dataset flavour each base model is fed with.
        self.data_type_map = {
            'XGBoost': 'tree',
            'LightGBM': 'tree',
            'HistGradientBoosting': 'tree',
            'RandomForest': 'tree_filled',
            'GaussianProcess': 'linear',
        }

    def predict(self, X):
        """Placeholder: print a warning and return zeros, one per row of ``X``."""
        print("警告：直接调用 AdaptiveEnsembleModel 的 predict 方法可能导致结果不准，请使用 predict_model_unified 函数。")
        n_rows = len(X) if hasattr(X, '__len__') else 1
        return np.zeros(n_rows)

class WeightedAverageEnsemble:
    """Container for a weighted-average ensemble; real predictions go through ``predict_model_unified``."""

    def __init__(self, models_dict, model_names, weights, model_datasets):
        self.models_dict, self.model_names = models_dict, model_names
        self.weights, self.model_datasets = weights, model_datasets

    def predict(self, X):
        """Placeholder: print a warning and return zeros, one per row of ``X``."""
        print("警告：直接调用 WeightedAverageEnsemble 的 predict 方法可能导致结果不准，请使用 predict_model_unified 函数。")
        n_rows = len(X) if hasattr(X, '__len__') else 1
        return np.zeros(n_rows)

# --- 2. Unified, reliable prediction function ---
def predict_model_unified(model_obj, model_name, target, data_dict):
    """Predict with any saved model, feeding every base model its own dataset.

    Args:
        model_obj: A fitted base model or one of the ensemble containers.
        model_name: Model name (English key or the Chinese name taken from the
            model file name).
        target: Target variable; used as the key into each dataset.
        data_dict: Maps a dataset flavour ('original', 'linear', 'tree',
            'tree_filled') to an object indexable by ``target``.

    Returns:
        The predictions, or ``None`` when ``target`` is missing from
        ``data_dict['original']``.

    The ensemble kind is taken from ``model_name`` first. When the name is not
    recognised (for example a copy of an ensemble saved under another name such
    as '最佳综合'), it falls back to the object's class name, so the ensemble is
    not sent to its placeholder ``predict`` that only returns zeros.
    """
    if target not in data_dict['original']:
        return None

    # Dataset flavour for each known base model; anything else uses 'original'.
    base_dataset_map = {
        'XGBoost': 'tree',
        'LightGBM': 'tree',
        'HistGradientBoosting': 'tree',
        'RandomForest': 'tree_filled',
        'GaussianProcess': 'linear',
    }
    name_to_ensemble = {
        'VotingEnsemble': 'VotingEnsemble', '投票集成': 'VotingEnsemble',
        'WeightedEnsemble': 'WeightedEnsemble', '加权平均集成': 'WeightedEnsemble',
        'AdaptiveEnsemble': 'AdaptiveEnsemble', '自适应集成': 'AdaptiveEnsemble',
    }
    class_to_ensemble = {
        'EnhancedVotingRegressor': 'VotingEnsemble',
        'WeightedAverageEnsemble': 'WeightedEnsemble',
        'AdaptiveEnsembleModel': 'AdaptiveEnsemble',
    }

    ensemble_type = name_to_ensemble.get(model_name)
    if ensemble_type is None:
        ensemble_type = class_to_ensemble.get(type(model_obj).__name__)

    if ensemble_type == 'VotingEnsemble':
        base_models, weights, model_datasets = model_obj.estimators, model_obj.weights, model_obj.datasets
        predictions = {}
        for name, base_model in base_models:
            md_type = model_datasets.get(name, 'original')
            predictions[name] = base_model.predict(data_dict[md_type][target])
        return np.average([predictions[name] for name, _ in base_models], axis=0, weights=weights)

    if ensemble_type == 'WeightedEnsemble':
        model_names, weights, models_dict = model_obj.model_names, model_obj.weights, model_obj.models_dict
        predictions = []
        for mn in model_names:
            md_type = model_obj.model_datasets.get(mn, 'original')
            predictions.append(models_dict[mn].predict(data_dict[md_type][target]))
        return np.average(predictions, axis=0, weights=weights)

    if ensemble_type == 'AdaptiveEnsemble':
        meta_classifier = model_obj.meta_classifier
        models_dict = model_obj.models_dict
        available_models = model_obj.available_models
        # Unpickling restores the saved attributes without running __init__,
        # so an instance pickled by a class version that did not set
        # data_type_map would lack it; fall back to the default mapping.
        data_type_map = getattr(model_obj, 'data_type_map', None) or base_dataset_map

        # The meta-classifier picks, per sample, the index of the base model to trust.
        model_choices = meta_classifier.predict(data_dict['original'][target])

        predictions_dict = {}
        for mn in available_models:
            md_type = data_type_map.get(mn, 'original')
            predictions_dict[mn] = models_dict[mn].predict(data_dict[md_type][target])

        return np.array([predictions_dict[available_models[choice]][i] for i, choice in enumerate(model_choices)])

    # Plain (non-ensemble) model
    data_type_key = base_dataset_map.get(model_name, 'original')
    return model_obj.predict(data_dict[data_type_key][target])
        
# Tolerance-aware evaluation metric
def tolerance_r2_score(y_true, y_pred, tolerance=0.15, target=None):
    """
    Compute an R² in which errors inside a relative tolerance band count as zero.

    The band for each sample is ``tolerance * |y_true|``. When ``target`` is
    given and has an entry in ``target_tolerance``, that entry replaces
    ``tolerance``. Returns 0 when ``y_true`` has no variance.
    """
    # Flatten both inputs so they are compared element by element
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    # Only the part of each absolute error that exceeds the band is penalised
    allowed = tolerance * np.abs(actual)
    excess = np.maximum(0, np.abs(actual - predicted) - allowed)

    total_ss = np.sum((actual - np.mean(actual)) ** 2)
    if total_ss == 0:
        return 0  # avoid dividing by zero

    residual_ss = np.sum(excess ** 2)
    return 1 - (residual_ss / total_ss)

def prediction_within_tolerance(y_true, y_pred, tolerance=0.15, target=None):
    """
    Return the fraction of predictions that lie within ``tolerance * |y_true|``
    of the true value.

    When ``target`` is given and has an entry in ``target_tolerance``, that
    entry replaces ``tolerance``.
    """
    # Flatten both inputs, as tolerance_r2_score does. Without this, comparing
    # a (n,) array with a (n, 1) array broadcasts to (n, n) and the returned
    # ratio is silently wrong.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # A target-specific tolerance overrides the argument
    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    # Width of the tolerance band for each sample
    tolerance_values = tolerance * np.abs(y_true)

    # True where the absolute error stays inside the band
    within_tolerance = np.abs(y_true - y_pred) <= tolerance_values

    return np.mean(within_tolerance)

def make_tolerance_scorer(target):
    """Build an ``(estimator, X, y)`` scorer returning the tolerance R² for ``target``."""

    def scorer_function(estimator, X, y):
        predicted = estimator.predict(X)
        return tolerance_r2_score(y, predicted, target=target)

    # Give the scorer a name that identifies its target
    setattr(scorer_function, '__name__', f'tolerance_scorer_{target}')
    return scorer_function

def sanitize_filename(filename):
    """Replace every character that is invalid in a file name with an underscore."""
    replacements = str.maketrans({ch: '_' for ch in '/\\:*?"<>|'})
    return filename.translate(replacements)

def save_plot_data(data, filename, description=""):
    """Write ``data`` to ``plot_data/<filename>.csv`` (UTF-8 with BOM, no index) and report it."""
    os.makedirs('plot_data', exist_ok=True)
    csv_path = f'plot_data/{filename}.csv'
    data.to_csv(csv_path, index=False, encoding='utf-8-sig')
    print(f"已保存{description}数据到 {csv_path}")

# Target-specific relative tolerances (fraction of the true value)
target_tolerance = {
    '水接触角': 0.05,
    '循环使用次数': 0.1,
    '吸油能力': 0.1
}

# Model name -> dataset flavour / model group
model_types = {
    "XGBoost": "tree", 
    "LightGBM": "tree",
    "HistGradientBoosting": "tree",
    "RandomForest": "tree_filled",
    "GaussianProcess": "linear",
    "VotingEnsemble": "ensemble",
    "AdaptiveEnsemble": "ensemble",
    "WeightedEnsemble": "ensemble"
}

# Evaluation results, filled in per target and per model
evaluation_results = {}

# Target variables to evaluate
target_columns = ['水接触角', '循环使用次数', '吸油能力']
valid_targets = target_columns  # assumes every target variable is valid

# Load the test data (the train/test split is assumed to be done already)
print("加载测试数据...")


def _load_pickle(path):
    """Load one pickled object and close the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


try:
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project exported itself.
    X_test = _load_pickle('data_exports/X_test.pkl')
    X_test_linear = _load_pickle('data_exports/X_test_linear.pkl')
    X_test_tree = _load_pickle('data_exports/X_test_tree.pkl')
    X_test_tree_filled = _load_pickle('data_exports/X_test_tree_filled.pkl')
    y_test = _load_pickle('data_exports/y_test.pkl')
    print("测试数据加载成功")
except Exception as e:
    print(f"加载测试数据失败: {str(e)}")
    # Best effort: no fallback loader is implemented here, so the code below
    # relies on these variables already existing in the notebook session.
    print("尝试使用调参预处理脚本中的数据...")

# Create the evaluation output folder and the scatter-plot subfolder the
# evaluation loop saves its figures into (savefig does not create folders).
os.makedirs('模型评估结果', exist_ok=True)
os.makedirs(os.path.join('模型评估结果', '散点图'), exist_ok=True)

# 评估每个目标变量的模型
for target in valid_targets:
    print(f"\n评估 {target} 的模型性能...")
    
    # 获取目标特定的容忍度
    current_tolerance = target_tolerance.get(target, 0.15)
    
    # 初始化目标的评估结果字典
    evaluation_results[target] = {}
    
    # 加载模型文件夹中的所有模型
    model_folder = '训练模型文件'
    model_files = [f for f in os.listdir(model_folder) if f.startswith(f'{target}_') and f.endswith('.pkl') and not f.endswith('_features.pkl')]
    
    # 如果没有找到模型文件，跳过这个目标
    if not model_files:
        print(f"没有找到 {target} 的模型文件")
        continue
    
    # 遍历所有模型文件并评估

    # 在进入循环前，加载一次所有测试数据
    X_test_dict = {
        'original': X_test,
        'linear': X_test_linear,
        'tree': X_test_tree,
        'tree_filled': X_test_tree_filled,
    }

    # 遍历所有模型文件并评估
    for model_file in model_files:
        # 从文件名提取模型名称
        model_name = model_file.replace(f'{target}_', '').replace('模型.pkl', '')
        print(f"\n评估 {model_name} 模型...")
        
        # 加载模型
        model_path = os.path.join(model_folder, model_file)
        try:
            with open(model_path, 'rb') as f:
                model = pickle.load(f)
            
            # 关键修改：使用统一的预测函数，替换掉原来复杂的 if/else 和 model.predict
            y_pred = predict_model_unified(model, model_name, target, X_test_dict)
            
            # --- 以下所有功能代码与您原来保持一致，确保功能不丢失 ---

            # 计算评估指标
            r2 = r2_score(y_test[target], y_pred)
            rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
            mae = mean_absolute_error(y_test[target], y_pred)
            
            # 计算容忍度指标
            tol_r2 = tolerance_r2_score(y_test[target], y_pred, tolerance=current_tolerance, target=target)
            within_tol = prediction_within_tolerance(y_test[target], y_pred, tolerance=current_tolerance, target=target)
            
            # 存储结果
            evaluation_results[target][model_name] = {
                'r2': r2,
                'rmse': rmse,
                'mae': mae,
                'tolerance_r2': tol_r2,
                'within_tolerance': within_tol,
                'y_pred': y_pred
            }
            
            print(f"  {model_name} - R²: {r2:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}")
            print(f"  容忍度R² (容忍度={current_tolerance:.2f}): {tol_r2:.4f}, 预测在容忍范围内比例: {within_tol:.2%}")
            
            # 生成预测对比散点图
            plt.figure(figsize=(10, 8))
            plt.scatter(y_test[target], y_pred, alpha=0.5, c='darkblue')
            
            min_val = min(min(y_test[target]), min(y_pred))
            max_val = max(max(y_test[target]), max(y_pred))
            margin = (max_val - min_val) * 0.1
            plt.plot([min_val - margin, max_val + margin], [min_val - margin, max_val + margin], 'r--', 
                    label='理想预测线')
            
            if current_tolerance > 0:
                plt.fill_between([min_val - margin, max_val + margin], 
                                [(min_val - margin) * (1 - current_tolerance), (max_val + margin) * (1 - current_tolerance)], 
                                [(min_val - margin) * (1 + current_tolerance), (max_val + margin) * (1 + current_tolerance)], 
                                color='green', alpha=0.1, label=f'±{current_tolerance:.0%}容忍范围')
            
            plt.title(f'{target} - {model_name}模型预测值与实际值对比', fontsize=14, fontweight='bold')
            plt.xlabel('实际值', fontsize=12)
            plt.ylabel('预测值', fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.legend()
            
            metrics_text = f'R² = {r2:.4f}\nRMSE = {rmse:.4f}\n容忍度R² = {tol_r2:.4f}\n在容忍范围内比例 = {within_tol:.2%}'
            plt.annotate(metrics_text, xy=(0.05, 0.95), xycoords='axes fraction', 
                        fontsize=11, fontweight='bold', 
                        bbox=dict(boxstyle="round,pad=0.5", fc="white", ec="gray", alpha=0.8))
            
            # 保存图表
            plt.tight_layout()
            plt.savefig(f'模型评估结果/散点图/{sanitize_filename(target)}_{model_name}_prediction.png', dpi=300)
            plt.close()
            
        except Exception as e:
            print(f"  评估 {model_name} 失败: {str(e)}")

    
    # 确定最佳模型 (使用三种标准)
    if evaluation_results[target]:
        # 标准R²最佳模型
        best_r2_model = max(evaluation_results[target], 
                          key=lambda m: evaluation_results[target][m]['r2'])
        best_r2 = evaluation_results[target][best_r2_model]['r2']
        
        # 容忍度R²最佳模型
        best_tol_r2_model = max(evaluation_results[target], 
                              key=lambda m: evaluation_results[target][m]['tolerance_r2'])
        best_tol_r2 = evaluation_results[target][best_tol_r2_model]['tolerance_r2']
        
        # 容忍范围内预测最佳模型
        best_within_tol_model = max(evaluation_results[target], 
                                  key=lambda m: evaluation_results[target][m]['within_tolerance'])
        best_within_tol = evaluation_results[target][best_within_tol_model]['within_tolerance']
        
        print(f"\n{target} 的最佳模型:")
        print(f"  标准R²最佳模型: {best_r2_model} (R²={best_r2:.4f})")
        print(f"  容忍度R²最佳模型: {best_tol_r2_model} (容忍度R²={best_tol_r2:.4f})")
        print(f"  预测在容忍范围内比例最佳模型: {best_within_tol_model} (在容忍范围内={best_within_tol:.2%})")
        # 综合评价筛选最佳模型
        print(f"\n使用综合评价标准筛选{target}的最佳模型:")
        # 定义权重
        weights = {
            'r2': 0.5,           # 标准R²权重最大
            'tolerance_r2': 0.3,  # 容忍度R²权重居中
            'within_tolerance': 0.2  # 在容忍范围内比例权重最小
        }
        print(f"评价标准权重分配: 标准R²={weights['r2']}, 容忍度R²={weights['tolerance_r2']}, 容忍范围内比例={weights['within_tolerance']}")

        # 计算每个模型的综合得分
        composite_scores = {}
        for model_name in evaluation_results[target]:
            # 获取各指标的值
            r2_score_val = evaluation_results[target][model_name]['r2']
            tol_r2_val = evaluation_results[target][model_name]['tolerance_r2']
            within_tol_val = evaluation_results[target][model_name]['within_tolerance']
            
            # 计算综合得分
            composite_score = (weights['r2'] * r2_score_val + 
                            weights['tolerance_r2'] * tol_r2_val + 
                            weights['within_tolerance'] * within_tol_val)
            
            composite_scores[model_name] = composite_score

        # 找出综合得分最高的模型
        best_composite_model = max(composite_scores, key=composite_scores.get)
        best_composite_score = composite_scores[best_composite_model]

        print(f"正在为{target}创建最佳综合模型的副本...")

        try:
            # 构建源模型路径
            source_model_path = os.path.join(model_folder, f'{target}_{best_composite_model}模型.pkl')
            
            # 在模型评估结果目录中创建"最佳模型"文件夹
            best_model_folder = os.path.join('模型评估结果', '最佳模型')
            if not os.path.exists(best_model_folder):
                os.makedirs(best_model_folder)
                print(f"已创建最佳模型文件夹：{best_model_folder}")
            
            # 构建目标模型路径 - 在最佳模型文件夹中
            best_model_path = os.path.join(best_model_folder, f'{target}_最佳模型.pkl')
            
            # 复制最佳模型文件
            if os.path.exists(source_model_path):
                # 使用shutil复制文件
                import shutil
                shutil.copy2(source_model_path, best_model_path)
                print(f"已复制最佳模型({best_composite_model})到: {best_model_path}")
                
                # 同时在训练模型文件夹中创建一个带有"最佳"标识的副本，确保解释脚本能找到
                best_model_in_train_folder = os.path.join(model_folder, f'{target}_最佳综合模型.pkl')
                shutil.copy2(source_model_path, best_model_in_train_folder)
                print(f"已复制最佳模型到训练模型文件夹: {best_model_in_train_folder}")
            else:
                print(f"警告: 无法找到源模型文件 {source_model_path}")
        except Exception as e:
            print(f"复制最佳模型时出错: {str(e)}")
        print(f"综合评价最佳模型: {best_composite_model} (得分={best_composite_score:.4f})")
        print(f"  - 标准R²: {evaluation_results[target][best_composite_model]['r2']:.4f}")
        print(f"  - 容忍度R²: {evaluation_results[target][best_composite_model]['tolerance_r2']:.4f}")
        print(f"  - 在容忍范围内比例: {evaluation_results[target][best_composite_model]['within_tolerance']:.2%}")
        # 将最佳模型信息保存到文件
        best_models_info = {
            'best_r2_model': best_r2_model,
            'best_r2': best_r2,
            'best_tol_r2_model': best_tol_r2_model,
            'best_tol_r2': best_tol_r2,
            'best_within_tol_model': best_within_tol_model,
            'best_within_tol': best_within_tol,
            'best_composite_model': best_composite_model,
            'best_composite_score': best_composite_score
        }
        if best_composite_model in ["XGBoost", "LightGBM", "HistGradientBoosting"]:
            best_models_info['model_type'] = 'tree'
        elif best_composite_model == "RandomForest":
            best_models_info['model_type'] = 'tree_filled'
        elif best_composite_model == "GaussianProcess":
            best_models_info['model_type'] = 'linear'
        elif best_composite_model in ["LinearRegression", "Ridge", "Lasso", "ElasticNet", "HuberRegressor"]:
            best_models_info['model_type'] = 'linear'
        elif best_composite_model in ["VotingEnsemble", "AdaptiveEnsemble", "WeightedEnsemble"]:
            best_models_info['model_type'] = 'ensemble'
        elif best_composite_model in ["DeepNN"]:
            best_models_info['model_type'] = 'nn'
        else:
            best_models_info['model_type'] = 'tree'  # 默认为tree类型

        print(f"最佳模型类型: {best_models_info['model_type']}")
        with open(f'模型评估结果/{sanitize_filename(target)}_best_models.pkl', 'wb') as f:
            pickle.dump(best_models_info, f)
    else:
        print(f"没有 {target} 的有效评估结果")

# Persist the complete evaluation_results mapping so that later steps can
# reload it from '模型评估结果/evaluation_results.pkl'.
with open('模型评估结果/evaluation_results.pkl', 'wb') as f:
    pickle.dump(evaluation_results, f)

print("评估结果已保存")

# ====================== Result visualisation ======================
print("\n生成评估结果可视化...")


def _save_ranked_barh(frame, column, ascending, xlabel, title, xlim, value_fmt, out_path):
    """Draw one horizontal bar chart of `column` per model and save it as a PNG.

    `frame` must contain a '模型' column plus `column`. `xlim` is either a
    (low, high) tuple or None to keep matplotlib's automatic range, and
    `value_fmt` is the format spec used for the number printed beside each bar.
    """
    plt.figure(figsize=(12, 8))
    ordered = frame.sort_values(by=column, ascending=ascending)

    bars = plt.barh(ordered['模型'], ordered[column])
    plt.xlabel(xlabel, fontsize=12)
    plt.title(title, fontsize=14, fontweight='bold')
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    if xlim is not None:
        plt.xlim(*xlim)

    for bar in bars:
        width = bar.get_width()
        # Labels of zero/negative bars are pinned at x=0.01 so they stay readable
        plt.text(max(width, 0.01), bar.get_y() + bar.get_height()/2,
                 format(width, value_fmt), ha='left', va='center',
                 fontweight='bold', fontsize=10)

    plt.tight_layout()
    plt.savefig(out_path, dpi=300)
    plt.close()


# One set of comparison charts per target variable
for target, target_results in evaluation_results.items():
    if not target_results:
        continue

    # Target-specific tolerance (falls back to 0.15)
    current_tolerance = target_tolerance.get(target, 0.15)
    tol_r2_column = f'容忍度R² (容忍度={current_tolerance:.2f})'
    within_tol_column = f'在容忍范围内比例 (容忍度={current_tolerance:.2f})'

    # Collect the metrics of every evaluated model into a ranking table
    model_names = list(target_results.keys())
    model_ranks = pd.DataFrame({
        '模型': model_names,
        'R²': [target_results[m]['r2'] for m in model_names],
        'RMSE': [target_results[m]['rmse'] for m in model_names],
        'MAE': [target_results[m]['mae'] for m in model_names],
        tol_r2_column: [target_results[m]['tolerance_r2'] for m in model_names],
        within_tol_column: [target_results[m]['within_tolerance'] for m in model_names],
    }).sort_values(by='R²', ascending=False)

    # Export the ranking data
    save_plot_data(model_ranks, f"{sanitize_filename(target)}_model_rankings", "模型排名")

    # Output directory for the charts
    os.makedirs('模型评估结果/性能对比图', exist_ok=True)
    file_prefix = f'模型评估结果/性能对比图/{sanitize_filename(target)}'

    # Standard R² (higher is better)
    _save_ranked_barh(
        model_ranks, 'R²', False,
        'R² 分数',
        f'{target} - 模型标准R²性能对比',
        (-0.1, 1.0), '.4f',
        f'{file_prefix}_r2_comparison.png',
    )

    # Tolerance R² (higher is better)
    _save_ranked_barh(
        model_ranks, tol_r2_column, False,
        f'容忍度R² (容忍度={current_tolerance:.2f})',
        f'{target} - 模型容忍度R²性能对比',
        (-0.1, 1.0), '.4f',
        f'{file_prefix}_tolerance_r2_comparison.png',
    )

    # Share of predictions inside the tolerance band (higher is better)
    _save_ranked_barh(
        model_ranks, within_tol_column, False,
        f'在容忍范围内预测比例 (容忍度={current_tolerance:.2f})',
        f'{target} - 模型预测在容忍范围内比例对比',
        (0, 1.0), '.2%',
        f'{file_prefix}_within_tolerance_comparison.png',
    )

    # RMSE (lower is better, so sorted ascending and without a fixed x range)
    _save_ranked_barh(
        model_ranks, 'RMSE', True,
        'RMSE 值',
        f'{target} - 模型RMSE性能对比 (越低越好)',
        None, '.4f',
        f'{file_prefix}_rmse_comparison.png',
    )

# Make sure the heat-map output directory exists
os.makedirs('模型评估结果/热图', exist_ok=True)

# Metrics shown as one heat map each (targets x models)
metrics = ['r2', 'tolerance_r2', 'within_tolerance']
metric_names = {
    'r2': 'R²',
    'tolerance_r2': '容忍度R²',
    'within_tolerance': '在容忍范围内比例'
}

# Union of all model names over all targets, plus the list of targets
all_models = set()
for target in evaluation_results:
    all_models.update(evaluation_results[target].keys())
all_models = sorted(all_models)

all_targets = list(evaluation_results.keys())

for metric in metrics:
    # Start from NaN so that (target, model) pairs without a result stay blank
    data_matrix = np.full((len(all_targets), len(all_models)), np.nan)

    for i, target in enumerate(all_targets):
        for j, model in enumerate(all_models):
            model_metrics = evaluation_results[target].get(model)
            if model_metrics is not None and metric in model_metrics:
                data_matrix[i, j] = model_metrics[metric]

    df = pd.DataFrame(data_matrix, index=all_targets, columns=all_models)

    # Export the raw matrix
    save_plot_data(df, f"global_{metric}_heatmap", f"全局{metric_names[metric]}热图数据")

    # seaborn cannot draw an empty matrix, and an all-NaN one carries no information
    if df.empty or df.isna().all().all():
        print(f"跳过{metric_names[metric]}热图: 没有可用的评估数据")
        continue

    cmap = sns.color_palette("YlGnBu", as_cmap=True)
    if metric == 'within_tolerance':
        # A proportion: fixed 0..1 scale, shown as a percentage
        vmin, vmax, fmt = 0, 1.0, '.1%'
    else:
        # R²-type score: clamp the colour scale to [0, 1] (R² may be negative)
        vmin = max(0, df.min().min())
        vmax = min(1.0, df.max().max())
        fmt = '.3f'
        if not vmax > vmin:
            # e.g. every score is negative: clamping would give vmin > vmax
            vmin, vmax = 0.0, 1.0

    plt.figure(figsize=(14, 8))
    ax = sns.heatmap(df, annot=True, cmap=cmap, vmin=vmin, vmax=vmax,
                     linewidths=0.5, fmt=fmt,
                     cbar_kws={'label': metric_names[metric]})

    plt.title(f'所有目标变量的{metric_names[metric]}性能对比', fontsize=14, fontweight='bold')
    plt.xlabel('模型', fontsize=12)
    plt.ylabel('目标变量', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'模型评估结果/热图/global_{metric}_heatmap.png', dpi=300)
    plt.close()

# Build the model-comparison summary table and store, for every evaluated
# model, which feature columns / data variant it was evaluated on.
summary_data = []

# The "best model" feature file is written below; make sure its folder exists
os.makedirs('模型评估结果/最佳模型', exist_ok=True)

for target in all_targets:
    target_results = evaluation_results[target]
    if not target_results:
        continue

    # Composite winner of THIS target. It is read back from the per-target
    # info file written during evaluation; relying on the global
    # `best_composite_model` would use whatever the last evaluated target left behind.
    best_info_path = f'模型评估结果/{sanitize_filename(target)}_best_models.pkl'
    try:
        with open(best_info_path, 'rb') as f:
            target_best_model = pickle.load(f).get('best_composite_model')
    except (OSError, EOFError, pickle.UnpicklingError) as e:
        target_best_model = None
        print(f"警告: 无法读取 {best_info_path}，跳过最佳模型特征信息保存: {e}")

    for model_name in target_results:
        # Map the model name to its data variant and the matching test columns
        if model_name in ["XGBoost", "LightGBM", "HistGradientBoosting"]:
            model_type = 'tree'
            features_used = X_test_tree[target].columns.tolist()
        elif model_name == "RandomForest":
            model_type = 'tree_filled'
            features_used = X_test_tree_filled[target].columns.tolist()
        elif model_name in ["LinearRegression", "Ridge", "Lasso", "ElasticNet", "HuberRegressor", "GaussianProcess"]:
            model_type = 'linear'
            features_used = X_test_linear[target].columns.tolist()
        else:
            model_type = 'ensemble'
            features_used = X_test[target].columns.tolist()

        feature_info = {
            'features': features_used,
            'model_type': model_type
        }

        feature_path = os.path.join('训练模型文件', f'{target}_{model_name}_features.pkl')
        with open(feature_path, 'wb') as f:
            pickle.dump(feature_info, f)

        # Keep a second copy next to the exported best model of this target
        if model_name == target_best_model:
            best_feature_path = os.path.join('模型评估结果/最佳模型', f'{target}_最佳模型_features.pkl')
            with open(best_feature_path, 'wb') as f:
                pickle.dump(feature_info, f)

    # Best model per individual metric
    best_r2_model = max(target_results, key=lambda m: target_results[m]['r2'])
    best_r2 = target_results[best_r2_model]['r2']

    best_tol_r2_model = max(target_results, key=lambda m: target_results[m]['tolerance_r2'])
    best_tol_r2 = target_results[best_tol_r2_model]['tolerance_r2']

    best_within_tol_model = max(target_results, key=lambda m: target_results[m]['within_tolerance'])
    best_within_tol = target_results[best_within_tol_model]['within_tolerance']

    summary_data.append({
        '目标变量': target,
        '最佳R²模型': best_r2_model,
        'R²值': best_r2,
        '最佳容忍度R²模型': best_tol_r2_model,
        '容忍度R²值': best_tol_r2,
        '最佳容忍范围内模型': best_within_tol_model,
        '容忍范围内比例': best_within_tol
    })

# Write the summary table (plot-data export plus an Excel sheet)
if summary_data:
    summary_df = pd.DataFrame(summary_data)
    save_plot_data(summary_df, "model_performance_summary", "模型性能汇总")

    summary_df.to_excel('模型评估结果/模型性能汇总.xlsx', index=False)
    print("已创建模型性能汇总表")

print("\n评估结果可视化完成")
print("\n材料性能预测模型评估完成！")

In [None]:
# ==============================================================================
#           最终、功能对齐、完整解释性的 1-解释性机器学习.py 脚本
# ==============================================================================

# ====================== 1. 导入所需库 ======================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import shap
import os
import traceback
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import lightgbm as lgb

# ====================== 2. 核心修复与增强模块 ======================

# --- 2.1 所有自定义模型类的定义 (保持不变) ---
class CustomLGBMRegressor:
    """Minimal fit/predict wrapper around a native LightGBM booster.

    The keyword arguments given to the constructor are forwarded unchanged to
    `lgb.train`; 'n_estimators' (default 100) is also used as the number of
    boosting rounds.
    """

    def __init__(self, **params):
        self.params = params
        self.model = None

    def fit(self, X, y):
        """Train the booster on X (DataFrame or array) and y; returns self."""
        features = X.values if hasattr(X, 'values') else X
        # Columns get positional names f0, f1, ... regardless of the input names
        positional_names = [f'f{i}' for i in range(X.shape[1])]
        train_data = lgb.Dataset(features, label=y, feature_name=positional_names)
        self.model = lgb.train(
            self.params,
            train_data,
            num_boost_round=self.params.get('n_estimators', 100),
        )
        return self

    def predict(self, X):
        """Return booster predictions; raises ValueError before `fit`."""
        if self.model is None:
            raise ValueError("Model not trained.")
        features = X.values if hasattr(X, 'values') else X
        return self.model.predict(features)

class EnhancedVotingRegressor:
    """Container for a weighted voting ensemble.

    Holds the (name, model) pairs, their normalised weights, a name -> data
    variant mapping and the target name. `predict` here only returns zeros;
    the actual ensemble prediction is computed by `predict_model_unified`
    from these attributes.
    # NOTE(review): presumably this definition exists so pickled ensembles can be loaded — confirm.
    """

    def __init__(self, estimators, weights, datasets, target_name):
        self.estimators = estimators
        # Force float dtype: with integer weights the in-place division below
        # would raise a casting TypeError.
        self.weights = np.array(weights, dtype=float)
        self.datasets = datasets
        self.target_name = target_name
        total = np.sum(self.weights)
        if total > 0:
            self.weights /= total

    def predict(self, X):
        """Placeholder: one zero per row of X (a single zero if X has no length)."""
        return np.zeros(len(X) if hasattr(X, '__len__') else 1)

class AdaptiveEnsembleModel:  # (other custom classes belong in this section as well)
    """Container for an ensemble that picks one base model per sample.

    Stores the meta classifier, the base models, the ordered list of model
    names and the model input data, plus a fixed model-name -> data-variant
    table. `predict` is a placeholder that returns zeros.
    """

    def __init__(self, meta_classifier, models_dict, available_models, model_input_data):
        self.meta_classifier = meta_classifier
        self.models_dict = models_dict
        self.available_models = available_models
        self.model_input_data = model_input_data
        # Which preprocessed data variant each known base model expects
        self.data_type_map = {
            'XGBoost': 'tree',
            'LightGBM': 'tree',
            'HistGradientBoosting': 'tree',
            'RandomForest': 'tree_filled',
            'GaussianProcess': 'linear',
        }

    def predict(self, X):
        """Placeholder: one zero per row of X (a single zero if X has no length)."""
        n_rows = len(X) if hasattr(X, '__len__') else 1
        return np.zeros(n_rows)

class WeightedAverageEnsemble:
    """Container for a weighted-average ensemble.

    Stores the base models, their names, the weights and the per-model
    dataset information exactly as given. `predict` is a placeholder that
    returns zeros.
    """

    def __init__(self, models_dict, model_names, weights, model_datasets):
        self.models_dict = models_dict
        self.model_names = model_names
        self.weights = weights
        self.model_datasets = model_datasets

    def predict(self, X):
        """Placeholder: one zero per row of X (a single zero if X has no length)."""
        n_rows = len(X) if hasattr(X, '__len__') else 1
        return np.zeros(n_rows)


# --- 2.2 统一预测函数 (保持不变) ---
def predict_model_unified(model_obj, model_name, target, data_dict):
    """Predict `target` with `model_obj`, giving every model its own data variant.

    Args:
        model_obj: a plain model with `.predict`, or one of the ensemble
            containers (voting / weighted / adaptive).
        model_name: name used to select the ensemble logic or the data variant.
        target: key of the target inside each data variant.
        data_dict: {'original' | 'tree' | 'tree_filled' | 'linear': {target: X}}.

    Returns:
        The predictions, or None when `target` is missing from
        data_dict['original'].
    """
    if target not in data_dict['original']:
        return None

    ensemble_map = {'VotingEnsemble': 'Voting', '投票集成': 'Voting', 'WeightedEnsemble': 'Weighted', '加权平均集成': 'Weighted', 'AdaptiveEnsemble': 'Adaptive', '自适应集成': 'Adaptive'}
    ensemble_type = ensemble_map.get(model_name)

    if ensemble_type == 'Voting':
        base_models, weights, datasets = model_obj.estimators, model_obj.weights, model_obj.datasets
        preds = {name: model.predict(data_dict[datasets.get(name, 'original')][target]) for name, model in base_models}
        return np.average([preds[name] for name, _ in base_models], axis=0, weights=weights)

    if ensemble_type == 'Weighted':
        # Without this branch the call fell through to the container's own
        # placeholder `predict`, which only returns zeros.
        # NOTE(review): assumes `weights` is aligned with `model_names` (or is a
        # name -> weight dict) and `model_datasets` maps name -> data variant key — confirm.
        names = list(model_obj.model_names)
        weights = model_obj.weights
        if isinstance(weights, dict):
            weights = [weights[name] for name in names]
        datasets = model_obj.model_datasets if isinstance(model_obj.model_datasets, dict) else {}
        preds = []
        for name in names:
            key = datasets.get(name, 'original')
            if not (isinstance(key, str) and key in data_dict):
                key = 'original'
            preds.append(model_obj.models_dict[name].predict(data_dict[key][target]))
        return np.average(preds, axis=0, weights=weights)

    if ensemble_type == 'Adaptive':
        available_models = model_obj.available_models
        data_type_map = model_obj.data_type_map
        # The meta classifier picks, per sample, the index of the base model to trust
        model_choices = model_obj.meta_classifier.predict(data_dict['original'][target])
        predictions_dict = {
            name: model_obj.models_dict[name].predict(data_dict[data_type_map.get(name, 'original')][target])
            for name in available_models
        }
        return np.array([predictions_dict[available_models[choice]][i] for i, choice in enumerate(model_choices)])

    # Plain model (or an ensemble without dedicated handling)
    model_map = {'XGBoost': 'tree', 'LightGBM': 'tree', 'HistGradientBoosting': 'tree', 'RandomForest': 'tree_filled', 'GaussianProcess': 'linear'}
    return model_obj.predict(data_dict[model_map.get(model_name, 'original')][target])
        
# --- 2.3 全新：通用预测包装器 (用于 SHAP 和 排列重要性) ---
class UnifiedPredictor:
    """Estimator-like adapter that lets model-agnostic tools call any saved model.

    It accepts a plain numeric matrix, rebuilds the named DataFrame variants
    that `predict_model_unified` expects and forwards the call. `fit` and
    `score` are provided because sklearn's `permutation_importance` (with
    `scoring=None`) requires an object that has both.
    """

    def __init__(self, model_obj, model_name, target, feature_names):
        self.model = model_obj
        self.model_name = model_name
        self.target = target
        self.feature_names = feature_names

    def fit(self, X=None, y=None):
        """No-op: the wrapped model is already trained. Returns self."""
        return self

    def predict(self, X_numpy):
        """Predict for a 2-D array whose columns follow `feature_names`."""
        # Re-attach the feature names to the raw matrix passed in by SHAP /
        # permutation importance
        X_df = pd.DataFrame(X_numpy, columns=self.feature_names)

        # Build every preprocessed variant on the fly
        linear_df = X_df.copy()
        # Mimic the linear-model preprocessing: columns starting with '类别' are divided by 100
        # NOTE(review): this mirrors the upstream preprocessing by assumption — confirm.
        for col in [c for c in X_df.columns if c.startswith('类别')]:
            linear_df[col] /= 100.0

        data_dict = {
            'original': {self.target: X_df},
            'tree': {self.target: X_df.copy().replace(0, np.nan)},
            'tree_filled': {self.target: X_df.copy().fillna(0)},
            'linear': {self.target: linear_df},
        }
        return predict_model_unified(self.model, self.model_name, self.target, data_dict)

    def score(self, X, y):
        """Return the coefficient of determination R² of `predict(X)` against y."""
        y_true = np.asarray(y, dtype=float).ravel()
        y_pred = np.asarray(self.predict(X), dtype=float).ravel()
        ss_res = float(np.sum((y_true - y_pred) ** 2))
        ss_tot = float(np.sum((y_true - y_true.mean()) ** 2))
        if ss_tot == 0.0:
            # Constant target: perfect fit scores 1, anything else 0
            return 1.0 if ss_res == 0.0 else 0.0
        return 1.0 - ss_res / ss_tot

def sanitize_filename(filename):
    """Keep only alphanumeric characters, spaces and underscores; trim trailing whitespace."""
    extra_allowed = (' ', '_')
    cleaned = "".join(ch for ch in filename if ch in extra_allowed or ch.isalnum())
    return cleaned.rstrip()

# ====================== 3. Global settings and data loading ======================
plt.style.use('ggplot')
sns.set(style="whitegrid", font='SimHei')
plt.rcParams['axes.unicode_minus'] = False


def _load_pickle(folder, filename):
    """Load one pickle file and close the handle deterministically.

    NOTE: pickle executes code on load; only use this on files produced by
    the upstream steps of this pipeline.
    """
    with open(os.path.join(folder, filename), 'rb') as fh:
        return pickle.load(fh)


print("\n" + "="*20, "加载分析所需数据", "="*20)
try:
    data_folder, model_folder, eval_folder = 'data_exports', '训练模型文件', '模型评估结果'
    # Test features in every preprocessing variant
    X_test_dict = {
        'original': _load_pickle(data_folder, 'X_test.pkl'),
        'linear': _load_pickle(data_folder, 'X_test_linear.pkl'),
        'tree': _load_pickle(data_folder, 'X_test_tree.pkl'),
        'tree_filled': _load_pickle(data_folder, 'X_test_tree_filled.pkl'),
    }
    y_test = _load_pickle(data_folder, 'y_test.pkl')
    feature_columns = _load_pickle(data_folder, 'feature_columns.pkl')
    target_columns = _load_pickle(data_folder, 'target_columns.pkl')
    evaluation_results = _load_pickle(eval_folder, 'evaluation_results.pkl')
    print("✓ 所有数据加载成功")
except Exception as e:
    print(f"!!! 加载数据失败: {e}. 请确保已运行上游脚本。")
    # Raise SystemExit directly: `exit()` is only injected by the `site`
    # module and would report success (status 0) on this failure path.
    raise SystemExit(1)

# Output folders for the explanation artefacts
for _out_dir in ('模型解释', '模型解释/图表', '模型解释/数据'):
    os.makedirs(_out_dir, exist_ok=True)


# ====================== 4. 分析函数定义 (已统一化) ======================

def analyze_prediction_errors(y_true, y_pred, target, model_name):
    """Save a predicted-vs-actual scatter plot annotated with R², RMSE and MAE.

    Both inputs are flattened to 1-D arrays. The figure is written to
    '模型解释/图表/<target>_<model_name>_prediction_vs_actual.png'.
    """
    print("  - 正在分析预测误差...")
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    r2 = r2_score(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)

    plt.figure(figsize=(8, 8))
    plt.scatter(actual, predicted, alpha=0.6, color='blue')

    # Diagonal spanning the joint range of actual and predicted values
    lower = min(actual.min(), predicted.min())
    upper = max(actual.max(), predicted.max())
    plt.plot([lower, upper], [lower, upper], 'r--', label='理想预测线 (y=x)')

    plt.title(f'{target} - {model_name}\n预测值 vs. 实际值', fontsize=16, fontweight='bold')
    plt.xlabel('实际值', fontsize=12)
    plt.ylabel('预测值', fontsize=12)

    stats_text = "\n".join([
        f'$R^2$ = {r2:.4f}',
        f'RMSE = {rmse:.4f}',
        f'MAE = {mae:.4f}',
    ])
    box_style = dict(boxstyle="round,pad=0.5", fc="white", ec="gray", alpha=0.8)
    plt.annotate(stats_text, xy=(0.05, 0.95), xycoords='axes fraction',
                 fontsize=12, fontweight='bold', bbox=box_style,
                 verticalalignment='top')

    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    out_path = f'模型解释/图表/{sanitize_filename(target)}_{model_name}_prediction_vs_actual.png'
    plt.savefig(out_path, dpi=300)
    plt.close()

def analyze_feature_importance(predictor, X, y, feature_names, target, model_name):
    """Plot the 20 largest permutation importances of `predictor` on (X, y).

    Any exception is caught and reported on stdout, so a failure here does
    not interrupt the caller.
    """
    print("  - 正在分析特征重要性 (使用排列重要性)...")
    try:
        perm = permutation_importance(predictor, X, y, n_repeats=10, random_state=42, n_jobs=-1)

        # Rank by mean importance and keep the top 20 features
        ranking = (
            pd.DataFrame({'特征': feature_names, '重要性': perm.importances_mean})
            .sort_values(by='重要性', ascending=False)
            .head(20)
        )

        plt.figure(figsize=(10, 8))
        bar_colors = sns.color_palette("viridis", len(ranking))
        plt.barh(ranking['特征'], ranking['重要性'], color=bar_colors)
        plt.xlabel('特征重要性 (Permutation Importance)', fontsize=12)
        plt.ylabel('特征', fontsize=12)
        plt.title(f'{target} - {model_name} 特征重要性', fontsize=16, fontweight='bold')
        # Most important feature at the top
        plt.gca().invert_yaxis()
        plt.tight_layout()
        out_path = f'模型解释/图表/{sanitize_filename(target)}_{model_name}_feature_importance.png'
        plt.savefig(out_path, dpi=300)
        plt.close()
    except Exception as e:
        print(f"    分析特征重要性失败: {e}")

def analyze_shap_values(predictor, X, feature_names, target, model_name):
    """Compute KernelExplainer SHAP values for X and save two summary plots.

    A 50-sample draw from X serves as background data. Two figures are
    written: the default summary plot and the bar plot of mean |SHAP|
    values. Any exception is caught and reported on stdout.
    """
    print("  - 正在进行SHAP值分析 (这可能需要一些时间)...")
    try:
        background = shap.sample(X, 50)  # 50 samples as background data
        explainer = shap.KernelExplainer(predictor.predict, background)
        shap_values = explainer.shap_values(X)

        file_stem = f'模型解释/图表/{sanitize_filename(target)}_{model_name}'
        # (extra summary_plot kwargs, title suffix, file suffix)
        plot_specs = (
            ({}, 'SHAP 值摘要', 'shap_summary'),
            ({'plot_type': "bar"}, '平均 |SHAP| 值', 'shap_bar'),
        )
        for extra_kwargs, title_tail, file_tail in plot_specs:
            shap.summary_plot(shap_values, X, feature_names=feature_names,
                              show=False, **extra_kwargs)
            plt.title(f'{target} - {model_name}\n{title_tail}', fontsize=16)
            plt.tight_layout()
            plt.savefig(f'{file_stem}_{file_tail}.png', dpi=300)
            plt.close()
    except Exception as e:
        print(f"    SHAP分析失败: {e}")

# ====================== 5. 主分析流程 ======================
def main_analysis_flow():
    """Run error, permutation-importance and SHAP analysis for every target.

    For each target in `target_columns` the model with the highest R² in
    `evaluation_results` is loaded from `model_folder` and analysed on the
    'original' test features. Targets without results or without a saved
    model file are skipped.
    """
    print("\n" + "="*80 + "\n启动模型解释性分析流程\n" + "="*80)
    for target in target_columns:
        print(f"\n{'='*30} 分析目标: {target} {'='*30}")
        if not evaluation_results.get(target):
            continue

        try:
            best_model_name = max(evaluation_results[target], key=lambda m: evaluation_results[target][m]['r2'])
            print(f"分析R²最佳模型: {best_model_name}")
        except (ValueError, KeyError):
            continue

        model_path = os.path.join(model_folder, f'{target}_{best_model_name}模型.pkl')
        if not os.path.exists(model_path):
            # Report the skip instead of dropping the target silently
            print(f"  未找到模型文件，跳过: {model_path}")
            continue

        with open(model_path, 'rb') as f:
            model = pickle.load(f)

        y_true = y_test[target]
        X_for_analysis = X_test_dict['original'][target]

        # Generic wrapper handed to all model-agnostic explanation tools
        predictor = UnifiedPredictor(model, best_model_name, target, feature_columns)

        # Predictions and error analysis
        y_pred = predictor.predict(X_for_analysis.values)
        analyze_prediction_errors(y_true, y_pred, target, best_model_name)

        # Feature importance and SHAP. Keyword arguments are used on purpose:
        # positionally, `y` and `feature_names` are easy to swap.
        analyze_feature_importance(
            predictor,
            X_for_analysis.values,
            y=y_true,
            feature_names=feature_columns,
            target=target,
            model_name=best_model_name,
        )
        analyze_shap_values(predictor, X_for_analysis, feature_columns, target, best_model_name)

        print(f"对 {target} 的分析完成。")

    print("\n" + "="*80 + "\n所有分析流程已结束。\n" + "="*80)

# --- Script entry point ---
if __name__ == '__main__':
    main_analysis_flow()

In [None]:
# ==============================================================================
#           最终、功能完整、基于 gcastle 的 0-因果机器学习.py 脚本
# ==============================================================================

# ====================== 1. 导入所需库 ======================
print("="*20, "导入所需库", "="*20)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import traceback
import networkx as nx

# Import gCastle for causal discovery.
# The PyPI distribution is called `gcastle`, but the importable package is `castle`.
try:
    from castle.algorithms import PC
    from castle.common import GraphDAG
except ImportError:
    print("!!! 错误: 找不到 gcastle 库。请先运行 'pip install gcastle' 进行安装。")
    raise SystemExit(1)

# `cit` is kept as a best-effort alias for the conditional-independence tests
# so that the name stays defined for any later code.
# NOTE(review): the previously imported `pc_pc_lingam_cit` does not exist in
# gCastle; confirm that CITest is what downstream code expects.
try:
    from castle.common.independence_tests import CITest as cit
except ImportError:
    cit = None

from sklearn.metrics import r2_score
import lightgbm as lgb
print("✓ 库导入完成")


# ====================== 2. 核心模块 (包含所有类定义和统一预测函数) ======================
class CustomLGBMRegressor:
    """Thin fit/predict wrapper around a native LightGBM booster.

    Constructor keyword arguments are passed unchanged to `lgb.train`;
    'n_estimators' (default 100) doubles as the number of boosting rounds.
    """

    def __init__(self, **params):
        self.params = params
        self.model = None

    def fit(self, X, y):
        """Train on X (DataFrame or array) and y; returns self."""
        matrix = X.values if hasattr(X, 'values') else X
        # Features are always named positionally: f0, f1, ...
        generic_names = [f'f{i}' for i in range(X.shape[1])]
        dataset = lgb.Dataset(matrix, label=y, feature_name=generic_names)
        self.model = lgb.train(
            self.params,
            dataset,
            num_boost_round=self.params.get('n_estimators', 100),
        )
        return self

    def predict(self, X):
        """Return booster predictions; raises ValueError if `fit` was never called."""
        if self.model is None:
            raise ValueError("Model not trained.")
        matrix = X.values if hasattr(X, 'values') else X
        return self.model.predict(matrix)

class EnhancedVotingRegressor:
    """Container for a weighted voting ensemble.

    Keeps the (name, model) pairs, the normalised weights, a name -> data
    variant mapping and the target name. `predict` only returns zeros; the
    real ensemble prediction is assembled by `predict_model_unified`.
    """

    def __init__(self, estimators, weights, datasets, target_name):
        self.estimators = estimators
        # Float dtype is required: in-place division of an integer array
        # raises a casting TypeError.
        self.weights = np.array(weights, dtype=float)
        self.datasets = datasets
        self.target_name = target_name
        weight_sum = np.sum(self.weights)
        if weight_sum > 0:
            self.weights /= weight_sum

    def predict(self, X):
        """Placeholder: one zero per row of X (a single zero if X has no length)."""
        return np.zeros(len(X) if hasattr(X, '__len__') else 1)

class AdaptiveEnsembleModel:
    """Container for an ensemble that selects one base model per sample.

    Stores the meta classifier, the base models, the ordered model names and
    the model input data, together with a fixed model-name -> data-variant
    table. `predict` is a placeholder that returns zeros.
    """

    def __init__(self, meta_classifier, models_dict, available_models, model_input_data):
        self.meta_classifier = meta_classifier
        self.models_dict = models_dict
        self.available_models = available_models
        self.model_input_data = model_input_data
        # Data variant expected by each known base model
        self.data_type_map = {
            'XGBoost': 'tree',
            'LightGBM': 'tree',
            'HistGradientBoosting': 'tree',
            'RandomForest': 'tree_filled',
            'GaussianProcess': 'linear',
        }

    def predict(self, X):
        """Placeholder: one zero per row of X (a single zero if X has no length)."""
        row_count = len(X) if hasattr(X, '__len__') else 1
        return np.zeros(row_count)

class WeightedAverageEnsemble:
    """Container for a weighted-average ensemble.

    Keeps the base models, their names, the weights and the per-model
    dataset information as passed in. `predict` is a placeholder that
    returns zeros.
    """

    def __init__(self, models_dict, model_names, weights, model_datasets):
        self.models_dict = models_dict
        self.model_names = model_names
        self.weights = weights
        self.model_datasets = model_datasets

    def predict(self, X):
        """Placeholder: one zero per row of X (a single zero if X has no length)."""
        row_count = len(X) if hasattr(X, '__len__') else 1
        return np.zeros(row_count)

def predict_model_unified(model_obj, model_name, target, data_dict):
    """Predict `target` with `model_obj`, giving every model its own data variant.

    Args:
        model_obj: a plain model with `.predict`, or one of the ensemble
            containers (voting / weighted / adaptive).
        model_name: name used to select the ensemble logic or the data variant.
        target: key of the target inside each data variant.
        data_dict: {'original' | 'tree' | 'tree_filled' | 'linear': {target: X}}.

    Returns:
        The predictions, or None when `target` is missing from
        data_dict['original'].
    """
    if target not in data_dict['original']:
        return None

    # Weighted and adaptive ensembles need explicit handling as well: the
    # `predict` of their container classes in this script only returns zeros.
    ensemble_map = {'VotingEnsemble': 'Voting', '投票集成': 'Voting', 'WeightedEnsemble': 'Weighted', '加权平均集成': 'Weighted', 'AdaptiveEnsemble': 'Adaptive', '自适应集成': 'Adaptive'}
    ensemble_type = ensemble_map.get(model_name)

    if ensemble_type == 'Voting':
        base_models, weights, datasets = model_obj.estimators, model_obj.weights, model_obj.datasets
        preds = {name: model.predict(data_dict[datasets.get(name, 'original')][target]) for name, model in base_models}
        return np.average([preds[name] for name, _ in base_models], axis=0, weights=weights)

    if ensemble_type == 'Weighted':
        # NOTE(review): assumes `weights` is aligned with `model_names` (or is a
        # name -> weight dict) and `model_datasets` maps name -> data variant key — confirm.
        names = list(model_obj.model_names)
        weights = model_obj.weights
        if isinstance(weights, dict):
            weights = [weights[name] for name in names]
        datasets = model_obj.model_datasets if isinstance(model_obj.model_datasets, dict) else {}
        preds = []
        for name in names:
            key = datasets.get(name, 'original')
            if not (isinstance(key, str) and key in data_dict):
                key = 'original'
            preds.append(model_obj.models_dict[name].predict(data_dict[key][target]))
        return np.average(preds, axis=0, weights=weights)

    if ensemble_type == 'Adaptive':
        available_models = model_obj.available_models
        data_type_map = model_obj.data_type_map
        # The meta classifier picks, per sample, the index of the base model to use
        model_choices = model_obj.meta_classifier.predict(data_dict['original'][target])
        predictions_dict = {
            name: model_obj.models_dict[name].predict(data_dict[data_type_map.get(name, 'original')][target])
            for name in available_models
        }
        return np.array([predictions_dict[available_models[choice]][i] for i, choice in enumerate(model_choices)])

    # Plain model (or an ensemble without dedicated handling)
    model_map = {'XGBoost': 'tree', 'LightGBM': 'tree', 'HistGradientBoosting': 'tree', 'RandomForest': 'tree_filled', 'GaussianProcess': 'linear'}
    return model_obj.predict(data_dict[model_map.get(model_name, 'original')][target])


# ====================== 3. Global settings and data loading ======================
print("\n" + "="*20, "全局设置和数据加载", "="*20)
plt.style.use('ggplot')
sns.set(style="whitegrid", font='SimHei')
plt.rcParams['axes.unicode_minus'] = False


def _load_pickle(folder, filename):
    """Load one pickle file and close the handle deterministically.

    NOTE: pickle executes code on load; only use this on files produced by
    the upstream steps of this pipeline.
    """
    with open(os.path.join(folder, filename), 'rb') as fh:
        return pickle.load(fh)


try:
    data_folder, model_folder, eval_folder = 'data_exports', '训练模型文件', '模型评估结果'
    # Training features in every preprocessing variant
    X_train_dict = {
        'original': _load_pickle(data_folder, 'X_train.pkl'),
        'linear': _load_pickle(data_folder, 'X_train_linear.pkl'),
        'tree': _load_pickle(data_folder, 'X_train_tree.pkl'),
        'tree_filled': _load_pickle(data_folder, 'X_train_tree_filled.pkl'),
    }
    y_train = _load_pickle(data_folder, 'y_train.pkl')
    # Exported best model of each of the three fixed targets
    best_models = {
        target: _load_pickle(os.path.join(eval_folder, '最佳模型'), f'{target}_最佳模型.pkl')
        for target in ['水接触角', '循环使用次数', '吸油能力']
    }
    feature_columns = _load_pickle(data_folder, 'feature_columns.pkl')
    target_columns = _load_pickle(data_folder, 'target_columns.pkl')
    print("✓ 所有必需的数据和最佳模型加载成功")
except Exception as e:
    print(f"!!! 加载数据失败: {e}. 请确保已运行上游脚本。")
    # Raise SystemExit directly: `exit()` is only injected by the `site`
    # module and would report success (status 0) on this failure path.
    raise SystemExit(1)

# Short display names for features and targets used in plots
feature_rename = {'类别1_无机纳米材料/金属氧化物': '无机材料', '类别2_有机高分子/聚合物': '有机高分子', '类别3_表面改性剂/硅烷类物质': '表面改性剂', '类别4_碳基材料': '碳基材料', '类别5_MOF/功能有机小分子/其他': '功能分子', '制备方法_编码': '制备方法', '基底材料_编码': '基底材料'}
short_names = {**feature_rename, **{'水接触角': '接触角', '循环使用次数': '循环次数', '吸油能力': '吸油能力'}}

# Output folders for the causal-analysis artefacts
for _out_dir in ('因果分析结果', '因果分析结果/因果图', '因果分析结果/敏感度分析', '因果分析结果/反事实分析'):
    os.makedirs(_out_dir, exist_ok=True)


# ====================== 4. 分析函数定义 ======================

def run_causal_discovery(data, target_name, feature_names):
    """Learn a causal graph over the columns of `data` with the PC algorithm and save it.

    Args:
        data: DataFrame whose columns are the graph nodes.
        target_name: target the graph is built for (used in title and file name).
        feature_names: accepted for interface compatibility; not used here.

    The figure goes to '因果分析结果/因果图/<target_name>_因果图.png'. Any
    exception is caught and reported on stdout.
    """
    print(f"  - 正在为 '{target_name}' 运行因果发现 (gcastle.PC)...")
    try:
        data_np = data.to_numpy()
        causal_model = PC()
        causal_model.learn(data_np)

        # gCastle's GraphDAG only renders a heat map of the matrix and accepts
        # neither `labels` nor a `.plot()` call, so the labelled graph is
        # drawn with networkx instead.
        adjacency = np.asarray(causal_model.causal_matrix)
        graph = nx.from_numpy_array(adjacency, create_using=nx.DiGraph)
        node_labels = {i: short_names.get(col, col) for i, col in enumerate(data.columns)}

        plt.figure(figsize=(12, 10))
        try:
            layout = nx.circular_layout(graph)
            nx.draw_networkx(
                graph, layout, labels=node_labels,
                node_color='#a0cbe8', node_size=2400, edge_color='gray',
                arrows=True, arrowsize=20, font_size=10,
                # Reuse the globally configured font so CJK labels render
                font_family=plt.rcParams['font.family'],
            )
            plt.axis('off')
            plt.title(f'为 "{short_names.get(target_name, target_name)}" 学习到的因果关系图 (gcastle.PC)', fontsize=16)
            plt.tight_layout()
            plt.savefig(f'因果分析结果/因果图/{target_name}_因果图.png', dpi=300)
        finally:
            # Release the figure even when drawing or saving fails
            plt.close()
        print(f"  ✓ 因果图已保存")
    except Exception as e:
        print(f"  !!! 因果发现失败: {e}")

def run_sensitivity_analysis(model, model_name, X_train_orig, target, feature_names):
    """Plot how the model's prediction for ``target`` reacts to each feature.

    A reference sample is built from the column medians of ``X_train_orig``.
    For every feature, only that feature is swept from its observed minimum
    to its maximum while the rest stay at the reference values.  Twenty
    evenly spaced points are evaluated first; every interval whose absolute
    slope exceeds the standard deviation of those predictions is refined with
    three extra interior points.  One PNG per feature is written to
    ``因果分析结果/敏感度分析``.

    Args:
        model: Fitted model, forwarded to ``predict_model_unified``.
        model_name: Model label, forwarded to ``predict_model_unified``.
        X_train_orig: DataFrame of training features.
        target: Target name (used for prediction routing, labels, file names).
        feature_names: Columns of ``X_train_orig`` to analyse.

    Predictions already computed on the initial grid are reused for the final
    curve; this assumes ``predict_model_unified`` is deterministic for a given
    input (TODO confirm for stochastic models).
    """
    print(f"  - 正在为 '{target}' 进行模型敏感度分析 (智能采样)...")

    # The reference sample does not depend on the feature, so build it once.
    X_ref = pd.DataFrame([X_train_orig.median()])

    # Analyse every requested feature.
    for feature in feature_names:
        train_vals = X_train_orig[feature].dropna()
        if train_vals.empty:
            continue

        f_min, f_max = train_vals.min(), train_vals.max()
        if f_max - f_min < 1e-6:
            f_max = f_min + 1  # avoid a zero-width sweep range

        def get_pred_at_value(value):
            """Predict for the reference sample with ``feature`` set to ``value``."""
            cf_sample = X_ref.copy()
            cf_sample[feature] = value
            data_dict = {
                'original': {target: cf_sample},
                'tree': {target: cf_sample.copy().replace(0, np.nan)},
                'tree_filled': {target: cf_sample.copy().fillna(0)},
                'linear': {target: cf_sample.copy()},
            }
            # The 'linear' view carries category columns scaled by 1/100.
            for col in [c for c in cf_sample.columns if c.startswith('类别')]:
                data_dict['linear'][target][col] /= 100.0
            return predict_model_unified(model, model_name, target, data_dict)[0]

        initial_points = np.linspace(f_min, f_max, 20)
        pred_cache = {v: get_pred_at_value(v) for v in initial_points}
        initial_preds = [(v, pred_cache[v]) for v in initial_points]

        refined_values = set(initial_points)
        std_dev = np.std([pred for _, pred in initial_preds])
        if std_dev > 1e-6:
            for (x1, y1), (x2, y2) in zip(initial_preds, initial_preds[1:]):
                gradient = abs(y2 - y1) / (x2 - x1) if x2 != x1 else 0
                if gradient > std_dev:
                    refined_values.update(np.linspace(x1, x2, 5))

        # Only the newly inserted points need a model call.
        final_preds = []
        for v in sorted(refined_values):
            if v not in pred_cache:
                pred_cache[v] = get_pred_at_value(v)
            final_preds.append((v, pred_cache[v]))

        fig = plt.figure(figsize=(10, 6))
        try:
            x_vals, y_vals = zip(*final_preds)
            plt.plot(x_vals, y_vals, '-o', color='#f28e2c', markersize=4, linewidth=2)

            # Highlight the sweep value with the highest predicted target.
            best_idx = np.argmax(y_vals)
            best_x, best_y = x_vals[best_idx], y_vals[best_idx]
            plt.scatter(best_x, best_y, color='red', s=120, zorder=10, label=f'最佳预测点\n({best_x:.2f}, {best_y:.2f})', ec='black')

            plt.title(f'"{short_names.get(feature, feature)}" 对 "{short_names.get(target, target)}" 的敏感度', fontsize=16)
            plt.xlabel(f'特征 "{short_names.get(feature, feature)}" 的值 (预处理后)', fontsize=12)
            plt.ylabel(f'模型预测的 "{short_names.get(target, target)}"', fontsize=12)
            plt.grid(True, linestyle='--')
            plt.legend()
            plt.tight_layout()
            plt.savefig(f'因果分析结果/敏感度分析/{target}_{short_names.get(feature, feature).replace("/", "_")}_敏感度分析.png', dpi=300)
        finally:
            # Release the figure even when plotting or saving fails.
            plt.close(fig)
    print(f"  ✓ 敏感度分析图已保存")

def run_counterfactual_analysis(model, model_name, X_train_orig, target, feature_names):
    """Plot predicted ``target`` while sweeping each of the first five features.

    The baseline sample is the column-wise median of ``X_train_orig``.  Each
    analysed feature is varied over 50 evenly spaced values between its
    observed minimum and maximum, and all resulting curves are drawn on one
    figure saved under ``因果分析结果/反事实分析``.

    Args:
        model: Fitted model, forwarded to ``predict_model_unified``.
        model_name: Model label, forwarded to ``predict_model_unified``.
        X_train_orig: DataFrame of training features.
        target: Target name.
        feature_names: Ordered feature list; only the first five are used.
    """
    print(f"  - 正在为 '{target}' 进行反事实分析 (保留原逻辑)...")
    baseline = pd.DataFrame([X_train_orig.median()])
    top_features = feature_names[:5]  # only the first five features are analysed
    sweeps = {}

    for feat in top_features:
        observed = X_train_orig[feat].dropna()
        if observed.empty:
            continue
        lo, hi = observed.min(), observed.max()
        if hi - lo < 1e-6:
            hi = lo + 1  # avoid a zero-width sweep range

        grid = np.linspace(lo, hi, 50)

        def predict_at(value):
            """Predict for the baseline sample with ``feat`` replaced by ``value``."""
            sample = baseline.copy()
            sample[feat] = value
            variants = {
                'original': {target: sample},
                'tree': {target: sample.copy().replace(0, np.nan)},
                'tree_filled': {target: sample.copy().fillna(0)},
                'linear': {target: sample.copy()},
            }
            linear_view = variants['linear'][target]
            # The 'linear' view carries category columns scaled by 1/100.
            for col in [c for c in sample.columns if c.startswith('类别')]:
                linear_view[col] /= 100.0
            return predict_model_unified(model, model_name, target, variants)[0]

        sweeps[feat] = [(v, predict_at(v)) for v in grid]

    # Visualisation: one line per analysed feature.
    plt.figure(figsize=(12, 8))
    for feat in top_features:
        if feat not in sweeps:
            continue
        xs, ys = zip(*sweeps[feat])
        plt.plot(xs, ys, '-', label=short_names.get(feat, feat), linewidth=2)

    plt.xlabel('特征值（预处理后）', fontsize=12)
    plt.ylabel(f'预测的{short_names.get(target, target)}', fontsize=12)
    plt.title(f'{target}的反事实分析', fontsize=16)
    plt.grid(True, linestyle='--')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f'因果分析结果/反事实分析/{target}_反事实分析.png', dpi=300)
    plt.close()
    print(f"  ✓ 反事实分析图已保存")

# ====================== 5. 主分析流程 ======================
def main():
    """Run causal discovery, sensitivity and counterfactual analysis per target.

    Iterates over the module-level ``target_columns``; targets without an
    entry in ``best_models`` are reported and skipped.
    """
    rule = "=" * 80
    print("\n" + rule + "\n启动因果机器学习分析流程\n" + rule)

    for target in target_columns:
        print(f"\n{'='*30} 分析目标: {target} {'='*30}")
        if target not in best_models:
            print(f"未找到 {target} 的最佳模型，跳过。")
            continue

        model = best_models[target]
        class_name = model.__class__.__name__
        if "Ensemble" in class_name:
            model_name = "最佳集成模型"
        else:
            model_name = class_name

        # Features for this target, plus the rows usable for causal discovery
        # (features joined with the target, incomplete rows dropped).
        X_train_target = X_train_dict['original'][target]
        y_train_target = y_train[target]
        data_for_discovery = X_train_target.join(y_train_target).dropna()

        # 1. Causal discovery
        run_causal_discovery(data_for_discovery, target, list(data_for_discovery.columns))

        # 2. Model sensitivity analysis
        run_sensitivity_analysis(model, model_name, X_train_target, target, feature_columns)

        # 3. Counterfactual analysis
        run_counterfactual_analysis(model, model_name, X_train_target, target, feature_columns)

        print(f"对 '{target}' 的分析完成。")

    print("\n" + rule + "\n所有分析流程已结束。\n" + rule)


# --- Script entry point ---
if __name__ == '__main__':
    main()

In [None]:
# ====================== Multi-method integrated optimization of material performance ======================
# NOTE(review): the rule above the title is 40 characters wide and the one
# below it is 80; the widths are kept exactly as they were.
print("=" * 40, "材料性能多方法集成优化", "=" * 80, sep="\n")

# 导入所需库
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from scipy.optimize import minimize, differential_evolution
from skopt import gp_minimize
from skopt.space import Integer, Real
from skopt.utils import use_named_args
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Lambda
from pymoo.core.problem import Problem
import itertools
from collections import defaultdict, Counter

# Silence noisy warning categories for the rest of the session
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Directory that receives the optimization artifacts
os.makedirs('optimization_results', exist_ok=True)

# Load the required data and models
print("加载模型和数据...")

# Feature table (required: a failure here is allowed to propagate)
feature_data = pd.read_excel('data_exports/material_features_numeric.xlsx')
print(f"成功加载特征数据: {feature_data.shape[0]} 行，{feature_data.shape[1]} 列")

# Pre-processed data is optional; fall back to the raw feature table without it
try:
    processed_data = pd.read_excel('data_exports/processed_data.xlsx')
    print("成功加载预处理后的特征数据")
except Exception:
    # Still best-effort for any read/parse error, but no longer a bare
    # `except:` so KeyboardInterrupt and SystemExit are not swallowed.
    processed_data = None
    print("未找到预处理数据，将使用原始特征数据")

# Best-model metadata per target, stored in the layout the optimization code
# reads: best_model[target]['tolerance_r2']['model'].
# NOTE(review): `sanitize_filename` is defined further down in this cell and
# `target_columns` is reassigned further down as well; on a fresh kernel this
# loop therefore relies on both names already existing from an earlier cell.
# A NameError raised by `sanitize_filename` would be caught by the handler
# below and reported as a load failure.
best_model = {}
for target in target_columns:
    try:
        # Metadata file written to the model-evaluation results directory
        target_best_model_path = f'模型评估结果/{sanitize_filename(target)}_best_models.pkl'
        if not os.path.exists(target_best_model_path):
            print(f"未找到 {target} 的最佳模型信息文件: {target_best_model_path}")
            continue

        with open(target_best_model_path, 'rb') as f:
            target_model_info = pickle.load(f)

        # Convert to the structure expected by the optimization code
        tolerance_entry = {'model': target_model_info['best_tol_r2_model']}
        best_model[target] = {'tolerance_r2': tolerance_entry}
        print(f"成功加载 {target} 的最佳模型信息")
    except Exception as e:
        print(f"加载 {target} 的最佳模型信息失败: {str(e)}")

# Per-target relative tolerance: loaded from the training artifacts when the
# file exists, otherwise the built-in defaults below are used.
if not os.path.exists('训练模型文件/target_tolerance_info.pkl'):
    # Default tolerance per target
    target_tolerance = {'水接触角': 0.2, '循环使用次数': 0.3, '吸油能力': 0.3}
    print("使用默认目标容忍度值")
else:
    with open('训练模型文件/target_tolerance_info.pkl', 'rb') as f:
        target_tolerance = pickle.load(f)
    print("加载目标容忍度信息成功")

# 定义误差容忍的评估函数 - 与模型训练部分一致
def tolerance_r2_score(y_true, y_pred, tolerance=0.15, target=None):
    """R² variant that ignores the part of each error inside a tolerance band.

    Each absolute error is reduced by ``tolerance * |y_true|`` (floored at
    zero) before the residual sum of squares is formed; the total sum of
    squares is the usual one around the mean of ``y_true``.

    Args:
        y_true: Observed values; flattened to 1-D.
        y_pred: Predicted values; flattened to 1-D.
        tolerance: Relative tolerance used when no target-specific value applies.
        target: Optional target name; when it is a key of the module-level
            ``target_tolerance`` mapping, that value replaces ``tolerance``.

    Returns:
        ``1 - rss / tss``, or ``0`` when ``y_true`` has zero variance.
    """
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()

    # A target-specific tolerance takes precedence over the argument.
    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    # Error left over after subtracting the allowed band.
    band = tolerance * np.abs(actual)
    excess = np.maximum(0, np.abs(actual - predicted) - band)

    total_ss = np.sum((actual - np.mean(actual)) ** 2)
    residual_ss = np.sum(excess ** 2)

    # Guard against division by zero for a constant y_true.
    return 0 if total_ss == 0 else 1 - (residual_ss / total_ss)

def prediction_within_tolerance(y_true, y_pred, tolerance=0.15, target=None):
    """Return the fraction of predictions within ``±tolerance * |y_true|``.

    Both inputs are flattened first, as in ``tolerance_r2_score``.  Without
    this, a column-vector ``y_true`` of shape (n, 1) compared with a 1-D
    ``y_pred`` would broadcast to an (n, n) matrix and yield a wrong ratio.

    Args:
        y_true: Observed values (any array-like; flattened to 1-D).
        y_pred: Predicted values (any array-like; flattened to 1-D).
        tolerance: Relative tolerance used when no target-specific value applies.
        target: Optional target name; when it is a key of the module-level
            ``target_tolerance`` mapping, that value replaces ``tolerance``.

    Returns:
        Fraction in [0, 1] as a float; ``0.0`` for empty input.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # A target-specific tolerance takes precedence over the argument.
    if target and target in target_tolerance:
        tolerance = target_tolerance[target]

    # The mean of an empty array is NaN (with a warning); report 0.0 instead.
    if y_true.size == 0:
        return 0.0

    # Allowed absolute deviation per sample
    tolerance_values = tolerance * np.abs(y_true)

    within_tolerance = np.abs(y_true - y_pred) <= tolerance_values
    return float(np.mean(within_tolerance))

# Column layout of the feature table: the last three columns are assumed to be
# the targets and everything before them the features.
# NOTE(review): `target_columns` is reassigned here although the model-loading
# loop earlier in this cell already iterated over it.
_all_columns = feature_data.columns.tolist()
target_columns = _all_columns[-3:]
feature_columns = _all_columns[:-3]

# Category columns are recognised by their name prefix; the two remaining
# encoded columns are referenced by name.
category_columns = [name for name in feature_columns if name.startswith('类别')]
method_column = '制备方法_编码'
base_material_column = '基底材料_编码'

print("目标变量:", target_columns)
print("特征变量总数:", len(feature_columns))
print("类别特征数:", len(category_columns))

# 进度条类 - 保持原有定义
class CustomProgressBar:
    """Minimal console progress bar with an estimated-remaining-time readout.

    The bar redraws itself in place using a carriage return.  Redraws
    triggered by ``update`` are throttled to one every 0.1 s, except that a
    finished bar is always redrawn.

    Args:
        total: Number of expected steps.  A value ``<= 0`` is rendered as an
            already complete bar instead of raising ``ZeroDivisionError``.
        desc: Text shown before the bar.
        bar_length: Width of the bar in characters.
    """

    def __init__(self, total, desc="进度", bar_length=50):
        self.total = total
        self.desc = desc
        self.n = 0
        self.bar_length = bar_length
        self.start_time = time.time()
        self.last_print_time = 0
        self._print_progress()

    def update(self, n=1):
        """Advance the counter by ``n`` steps and redraw if due."""
        self.n += n
        current_time = time.time()
        if current_time - self.last_print_time >= 0.1 or self.n >= self.total:
            self._print_progress()
            self.last_print_time = current_time

    def _print_progress(self):
        """Draw the bar, percentage, counter and remaining-time estimate."""
        if self.total > 0:
            percent = min(100, self.n * 100 / self.total)
            filled_length = int(self.bar_length * self.n // self.total)
            # Keep the bar inside its frame when n overshoots total (or is negative).
            filled_length = max(0, min(self.bar_length, filled_length))
        else:
            # Nothing to do: show a complete bar rather than dividing by zero.
            percent = 100
            filled_length = self.bar_length
        bar = '█' * filled_length + '-' * (self.bar_length - filled_length)

        elapsed_time = time.time() - self.start_time
        if self.n > 0:
            time_per_iter = elapsed_time / self.n
            # Never report a negative remaining time after an overshoot.
            remaining_iters = max(0, self.total - self.n)
            remaining_time = time_per_iter * remaining_iters
            time_str = f" - 预计剩余: {self._format_time(remaining_time)}"
        else:
            time_str = ""

        print(f'\r{self.desc}: |{bar}| {percent:.1f}% {self.n}/{self.total}{time_str}', end='', flush=True)
        if self.n >= self.total:
            print()

    def _format_time(self, seconds):
        """Format a duration in seconds as hours / minutes / seconds text."""
        m, s = divmod(int(seconds), 60)
        h, m = divmod(m, 60)
        if h > 0:
            return f"{h}时{m}分{s}秒"
        elif m > 0:
            return f"{m}分{s}秒"
        else:
            return f"{s}秒"

    def set_description(self, desc):
        """Replace the description text and redraw immediately."""
        self.desc = desc
        self._print_progress()

    def close(self):
        """Finish the bar: jump to 100% if needed and end the output line."""
        if self.n < self.total:
            self.n = self.total
            self._print_progress()
        print()

# ====================== 材料空间定义 ======================

# 提取材料空间中的唯一元素
def extract_material_space():
    """Collect the base materials, category names and methods present in the data.

    Reads the module-level ``feature_data``, ``category_columns``,
    ``base_material_column`` and ``method_column``.

    Returns:
        A pair ``(materials_dict, methods_list)``: ``materials_dict`` has the
        keys ``'base'`` (sorted integer codes) and ``'category'`` (sorted
        category names); ``methods_list`` holds the sorted integer method codes.
    """
    print("提取材料空间信息...")

    # Category name = the second '_'-separated piece of each category column name.
    category_names = {col.split('_')[1] for col in category_columns if '_' in col}

    # Distinct, non-missing base-material codes as integers.
    base_codes = {
        int(code)
        for code in feature_data[base_material_column].dropna().unique()
        if pd.notna(code)
    }

    # Distinct, non-missing preparation-method codes as integers.
    method_codes = {
        int(code)
        for code in feature_data[method_column].dropna().unique()
        if pd.notna(code)
    }

    materials_dict = {'base': sorted(base_codes), 'category': sorted(category_names)}
    methods_list = sorted(method_codes)

    print(f"材料空间: {len(materials_dict['base'])} 种基底材料, "
          f"{len(materials_dict['category'])} 种类别材料, "
          f"{len(methods_list)} 种制备方法")

    return materials_dict, methods_list

# Build the material space once for the optimizers below
unique_materials, unique_methods = extract_material_space()

# 预测函数与特征生成
def create_material_features(base_material, categories, prep_method, features_template=None):
    """Build a one-row feature DataFrame for a candidate material combination.

    Args:
        base_material: Encoded base-material value.
        categories: Iterable of ``(cat_id, cat_value)`` pairs.  ``cat_id`` may
            be the category's numeric id (matched against the ``类别<id>_``
            column prefix) or a category name (matched as ``_<name>`` inside
            the column name, the original rule).
        prep_method: Encoded preparation-method value.
        features_template: Optional list of output columns.  When omitted, the
            columns come from the saved feature file of the first target's
            best model if it exists, otherwise from ``feature_columns``.

    Returns:
        DataFrame with one row and exactly the template columns; every column
        not set explicitly is 0.0.
    """
    if features_template is not None:
        feature_cols = features_template
    else:
        feature_cols = feature_columns
        first_target = target_columns[0]
        if first_target in best_model:
            model_name = best_model[first_target]['tolerance_r2']['model']
            try:
                # Prefer the feature list saved alongside the trained model.
                feature_path = f'训练模型文件/{first_target}_{model_name}_features.pkl'
                if os.path.exists(feature_path):
                    with open(feature_path, 'rb') as f:
                        feature_info = pickle.load(f)
                    feature_cols = feature_info.get('features', feature_columns)
            except Exception as e:
                print(f"无法加载特征模板: {str(e)}")
                feature_cols = feature_columns

    # Start from an all-zero feature vector.
    features = {col: 0.0 for col in feature_cols}
    features[base_material_column] = float(base_material)
    features[method_column] = float(prep_method)

    for cat_id, cat_value in categories:
        # The optimizers pass integer ids 1-5.  With column names shaped like
        # '类别1_<name>' (as in the feature-renaming table used by the analysis
        # cell) the original `f"_{cat_id}" in col` test never matches those
        # ids, so every category feature stayed at 0.  The prefix test below
        # covers that case; the original substring test is kept for callers
        # that pass category names.
        # NOTE(review): confirm the feature table uses the '类别<id>_' naming.
        id_prefix = f"类别{cat_id}_"
        name_fragment = f"_{cat_id}"
        for col in category_columns:
            if col.startswith(id_prefix) or name_fragment in col:
                features[col] = float(cat_value)

    # Every template column is already a key of `features`, so selecting the
    # template columns both orders them and drops any extra keys set above.
    df = pd.DataFrame([features])
    return df[feature_cols]
def predict_performance(target, features):
    """Predict one target for a single candidate with that target's saved best model.

    The model is unpickled from ``模型评估结果/最佳模型/{target}_最佳模型.pkl`` on
    every call.  Only trusted, locally produced pickle files should be placed
    there, since unpickling can execute arbitrary code.

    Args:
        target: Target name; must be a key of the module-level ``best_model``.
        features: One-row DataFrame of encoded features.

    Returns:
        The first element of the model's prediction, or ``None`` when
        ``target`` has no entry in ``best_model``.
    """
    if target not in best_model:
        print(f"{target}不在best_model字典中")
        return None

    # Name of the best model as recorded in the metadata
    model_name = best_model[target]['tolerance_r2']['model']

    # Map the model name to the preprocessing family it needs
    if model_name in ("XGBoost", "LightGBM", "HistGradientBoosting"):
        model_type = 'tree'
    elif model_name == "RandomForest":
        model_type = 'tree_filled'
    elif model_name in ("LinearRegression", "Ridge", "Lasso", "ElasticNet", "HuberRegressor", "GaussianProcess"):
        model_type = 'linear'
    elif model_name in ("DeepNN",):
        model_type = 'nn'
    else:
        model_type = 'ensemble'

    # Load the best model from disk
    model_path = f'模型评估结果/最佳模型/{target}_最佳模型.pkl'
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    # Category columns present in the supplied features
    category_columns = [col for col in features.columns if col.startswith('类别')]

    # Apply the family-specific preprocessing
    features_processed = features.copy()
    if model_type == 'linear':
        # NOTE(review): category codes are divided by 1000 here, while the
        # sensitivity/counterfactual helpers earlier in this notebook divide
        # the 'linear' view by 100.0 — confirm which scale matches training.
        for col in category_columns:
            features_processed[col] = features_processed[col] / 1000.0
    elif model_type == 'tree':
        # Tree models: a code of 0 is turned into a missing value
        for col in category_columns:
            features_processed[col] = features_processed[col].replace(0, np.nan)

    # Column order handed to the model: categories, then method, then base material
    expected_features = category_columns + ['制备方法_编码', '基底材料_编码']
    for feat in expected_features:
        if feat not in features_processed.columns:
            features_processed[feat] = 0.0
    features_for_model = features_processed[expected_features]

    # Predict
    if isinstance(model, dict) and 'model' in model:
        # Saved as a bundle: estimator plus optional scaler
        if model.get('needs_scaling', False) and 'scaler' in model:
            X_scaled = model['scaler'].transform(features_for_model)
            pred = model['model'].predict(X_scaled)[0]
        else:
            pred = model['model'].predict(features_for_model)[0]
    elif 'xgboost' in str(type(model)):
        try:
            import xgboost as xgb
            with xgb.config_context(verbosity=0, predict_disable_shape_check=True):
                pred = model.predict(features_for_model)[0]
        except Exception:
            # Best-effort fallback to a plain predict; no longer a bare
            # `except:` so KeyboardInterrupt/SystemExit are not swallowed.
            pred = model.predict(features_for_model)[0]
    else:
        pred = model.predict(features_for_model)[0]

    return pred

# 文件名净化函数
def sanitize_filename(filename):
    """Return ``filename`` as text with invalid file-name characters replaced by ``_``.

    ``None`` yields ``"unknown"``; any other value is converted with ``str``
    before the characters ``/ \\ : * ? " < > |`` are replaced.
    """
    if filename is None:
        return "unknown"
    replacement_table = str.maketrans({ch: '_' for ch in '/\\:*?"<>|'})
    return str(filename).translate(replacement_table)

# 计算材料组合的综合评分
def calculate_overall_score(base_material, categories, prep_method):
    """Score a material combination as a weighted mean of normalised predictions.

    Args:
        base_material: Encoded base-material value.
        categories: Iterable of ``(cat_id, cat_value)`` pairs; pairs whose
            value is not positive are ignored.
        prep_method: Encoded preparation-method value.

    Returns:
        ``(score, predictions)``.  ``predictions`` maps each target with a
        usable model to its raw prediction.  The result is ``(0.0, {})`` when
        no category has a positive value or when no prediction is available.
    """
    # Keep only the categories that are actually present.
    categories_formatted = [
        (cat_id, cat_value) for cat_id, cat_value in categories if cat_value > 0
    ]
    if not categories_formatted:
        print("没有有效的类别材料")
        return 0.0, {}

    features = create_material_features(base_material, categories_formatted, prep_method)

    # Predict every target that has a best model.
    predictions = {}
    for target in target_columns:
        if target not in best_model:
            continue
        pred = predict_performance(target, features)
        if pred is not None:
            predictions[target] = pred

    if not predictions:
        print("没有有效的预测结果")
        return 0.0, {}

    # Relative importance per target; unknown targets weigh 1.0.
    weight_by_target = {'水接触角': 1.0, '循环使用次数': 1.2, '吸油能力': 1.3}
    # Divisor that maps each prediction onto [0, 1]; unknown targets use 100.
    scale_by_target = {'水接触角': 180.0, '循环使用次数': 50.0, '吸油能力': 50.0}

    weights = {}
    total_weight = 0.0
    for target in predictions:
        weights[target] = weight_by_target.get(target, 1.0)
        total_weight += weights[target]

    # Weighted sum of the clipped, normalised predictions (larger is better).
    score = 0.0
    for target, value in predictions.items():
        norm_score = min(1.0, max(0.0, value / scale_by_target.get(target, 100.0)))
        if total_weight > 0:
            score += (weights[target] / total_weight) * norm_score

    return score, predictions

# ====================== 材料编码/解码函数 ======================

def one_hot_encode_material(base_idx, method_idx, categories):
    """Encode a material combination as one flat numeric vector.

    The vector is the concatenation of a one-hot block for the base material
    (length ``len(unique_materials['base'])``), a one-hot block for the
    preparation method (length ``len(unique_methods)``) and five category
    slots holding the category values (not 0/1 flags).

    Args:
        base_idx: Position of the base material in ``unique_materials['base']``.
        method_idx: Position of the method in ``unique_methods``.
        categories: Iterable of ``(cat_id, cat_value)`` pairs; ids outside
            1..5 are ignored.
    """
    def _unit_vector(size, hot_index):
        vec = np.zeros(size)
        vec[hot_index] = 1
        return vec

    base_part = _unit_vector(len(unique_materials['base']), base_idx)
    method_part = _unit_vector(len(unique_methods), method_idx)

    # One slot per category id 1..5, filled with that category's value.
    category_part = np.zeros(5)
    for cat_id, cat_value in categories:
        if not (1 <= cat_id <= 5):
            continue
        category_part[cat_id - 1] = cat_value

    return np.concatenate([base_part, method_part, category_part])

In [None]:
# ======= 1. Bayesian-optimization material search =======
print("\n1. 执行贝叶斯优化...")

# Handle of the convergence figure; None until the figure has been created.
convergence_fig = None

try:
    # Optimizer budget, named once so the progress bar, gp_minimize and the
    # saved metadata cannot disagree.
    bayes_n_calls = 50
    bayes_n_random_starts = 20

    # Search space, declared once and shared by the named-argument decorator
    # and gp_minimize (it used to be written out twice).
    bayes_dimensions = [
        Integer(0, len(unique_materials['base']) - 1, name='base_idx'),
        Integer(0, len(unique_methods) - 1, name='method_idx'),
        Integer(1, 5, name='num_categories'),  # number of active category slots (1-5)
        Integer(1, 5, name='cat1_id'),
        Real(100, 699, name='cat1_val'),
        Integer(1, 5, name='cat2_id'),
        Real(100, 699, name='cat2_val'),
        Integer(1, 5, name='cat3_id'),
        Real(100, 699, name='cat3_val'),
        Integer(1, 5, name='cat4_id'),
        Real(100, 699, name='cat4_val'),
        Integer(1, 5, name='cat5_id'),
        Real(100, 699, name='cat5_val')
    ]

    def dedupe_categories(pairs):
        """Keep the first (cat_id, cat_val) pair seen for each category id."""
        seen_ids = set()
        kept = []
        for cat_id, cat_val in pairs:
            if cat_id not in seen_ids:
                seen_ids.add(cat_id)
                kept.append((cat_id, cat_val))
        return kept

    @use_named_args(dimensions=bayes_dimensions)
    def bayesian_objective(**params):
        """Return the negated overall score of the candidate described by ``params``."""
        # Translate indices into the actual encoded material / method
        base_material = unique_materials['base'][params['base_idx']]
        prep_method = unique_methods[params['method_idx']]

        # Only the first `num_categories` slots are active
        categories = [
            (params[f'cat{i}_id'], params[f'cat{i}_val'])
            for i in range(1, params['num_categories'] + 1)
        ]
        unique_categories = dedupe_categories(categories)

        # Defensive fallback: always evaluate at least one category
        if not unique_categories:
            unique_categories = [(params['cat1_id'], params['cat1_val'])]

        score, _ = calculate_overall_score(base_material, unique_categories, prep_method)

        # gp_minimize minimizes, so return the negated score
        return -score

    # Progress bar, advanced once per objective evaluation
    progress_bar = CustomProgressBar(total=bayes_n_calls, desc="贝叶斯优化")

    # Run the optimization
    result = gp_minimize(
        func=bayesian_objective,
        dimensions=bayes_dimensions,
        n_calls=bayes_n_calls,
        n_random_starts=bayes_n_random_starts,
        random_state=42,
        callback=lambda res: progress_bar.update(1)
    )

    progress_bar.close()

    # Decode the best parameter vector (same layout as bayes_dimensions)
    best_params = result.x
    base_idx = int(best_params[0])
    method_idx = int(best_params[1])
    num_categories = int(best_params[2])

    bayes_categories = [
        (int(best_params[3 + i*2]), best_params[4 + i*2])
        for i in range(num_categories)
    ]
    bayes_best_categories = dedupe_categories(bayes_categories)

    # Translate indices into the actual encoded material / method
    bayes_best_base = unique_materials['base'][base_idx]
    bayes_best_method = unique_methods[method_idx]

    # Re-evaluate the best candidate to obtain the per-target predictions
    bayes_score, bayes_predictions = calculate_overall_score(
        bayes_best_base, bayes_best_categories, bayes_best_method
    )

    # Resolve human-readable names when the encoding reference tables are available
    reference_error = None
    try:
        base_ref = pd.read_excel('data_exports/base_material_encoding_reference.xlsx')
        method_ref = pd.read_excel('data_exports/method_encoding_reference.xlsx')
        material_ref = pd.read_excel('data_exports/material_encoding_reference.xlsx')

        base_match = base_ref[base_ref['编码值'] == bayes_best_base]
        base_name = base_match['基底材料'].values[0] if len(base_match) > 0 else f"基底材料{bayes_best_base}"

        method_match = method_ref[method_ref['编码值'] == bayes_best_method]
        method_name = method_match['制备方法'].values[0] if len(method_match) > 0 else f"制备方法{bayes_best_method}"

        category_names = []
        for cat_id, cat_val in bayes_best_categories:
            cat_df = material_ref[material_ref['分类ID'] == cat_id]
            if not cat_df.empty:
                # Pick the material whose code is closest to the optimized
                # value.  The distance is kept as a separate Series instead of
                # being written into the filtered slice, which avoids pandas'
                # SettingWithCopy warning.
                closest = cat_df.loc[(cat_df['编码值'] - cat_val).abs().idxmin()]
                category_names.append(f"{closest['材料名称']} (类别{cat_id}, 编码{cat_val:.1f})")
            else:
                category_names.append(f"类别{cat_id}材料 (编码{cat_val:.1f})")

        base_line = f"  基底材料: {base_name} (编码: {bayes_best_base})"
        category_line = f"  类别材料: {', '.join(category_names)}"
        method_line = f"  制备方法: {method_name} (编码: {bayes_best_method})"
    except Exception as e:
        # Reference tables unavailable: report the raw codes instead
        reference_error = str(e)
        fallback_parts = [f'(类别{cat_id}, 值{cat_val:.1f})' for cat_id, cat_val in bayes_best_categories]
        base_line = f"  基底材料: {bayes_best_base}"
        category_line = f"  类别材料: {', '.join(fallback_parts)}"
        method_line = f"  制备方法: {bayes_best_method}"

    print(f"贝叶斯优化发现的最佳材料:")
    print(base_line)
    print(category_line)
    print(method_line)
    print(f"  综合评分: {bayes_score:.4f}")
    print("  预测性能:")
    for target, value in bayes_predictions.items():
        print(f"    {target}: {value:.2f}")
    if reference_error is not None:
        print(f"  注: 无法加载材料参照表: {reference_error}")

    # ---- Convergence data (shared by the plot and the CSV export) ----
    func_vals = list(result.func_vals)
    iterations = list(range(len(func_vals)))
    best_idx = result.func_vals.argmin()
    best_val = result.func_vals.min()
    min_idx, min_val = best_idx, best_val

    # Moving average of the objective, NaN-padded at the start so it lines up
    # with `iterations` in the exported table
    window_size = max(2, len(result.func_vals) // 10)
    if len(result.func_vals) > window_size:
        moving_avg = np.convolve(result.func_vals, np.ones(window_size)/window_size, mode='valid')
        moving_avg_full = [float('nan')] * (window_size - 1) + list(moving_avg)
    else:
        moving_avg = None
        moving_avg_full = [float('nan')] * len(result.func_vals)

    # Make sure the output directory exists before the first file is written
    os.makedirs('optimization_results', exist_ok=True)

    # ---- Convergence plot ----
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font with CJK glyphs
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

    convergence_fig = plt.figure(figsize=(12, 8))
    try:
        plt.plot(iterations, result.func_vals, 'o-', color='#1f77b4',
                 markersize=8, linewidth=2, label='目标函数值')
        plt.plot(iterations[min_idx], min_val,
                 'r*', markersize=20, label='全局最优解')

        # Moving average to show the trend
        if moving_avg is not None:
            plt.plot(range(window_size-1, len(result.func_vals)), moving_avg,
                     '--', color='#ff7f0e', linewidth=2, label=f'移动平均 (窗口={window_size})')

        # Title and axis labels
        plt.title("贝叶斯优化收敛过程", fontsize=18, fontweight='bold', pad=20)
        plt.xlabel("迭代次数", fontsize=14)
        plt.ylabel("目标函数值", fontsize=14)

        # Grid and axes styling
        plt.grid(True, linestyle='--', alpha=0.7)
        ax = plt.gca()
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        # Annotate the best point (the label shows the score, i.e. the negated objective)
        plt.annotate(f'最小值: {-min_val:.4f}',
                     xy=(min_idx, min_val),
                     xytext=(min_idx+2, min_val+0.05),
                     arrowprops=dict(facecolor='black', shrink=0.05, width=1.5),
                     fontsize=12)

        plt.legend(loc='upper right', fontsize=12, frameon=True, framealpha=0.9)

        # Summary box with the run parameters
        info_text = (f"贝叶斯优化信息:\n"
                     f"· 总迭代次数: {len(result.func_vals)}\n"
                     f"· 初始随机采样: {bayes_n_random_starts}\n"
                     f"· 最优评分: {-min_val:.4f}")

        plt.figtext(0.02, 0.02, info_text, fontsize=10,
                    bbox=dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.5'))

        plt.tight_layout()
        plt.savefig('optimization_results/bayesian_convergence.png', dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when plotting or saving fails
        plt.close(convergence_fig)

    # ---- Export the optimization data for later custom visualisation ----
    data = {
        '迭代次数': iterations,
        '目标函数值': func_vals,
        '移动平均': moving_avg_full,
        '是否最优点': [i == best_idx for i in iterations]
    }
    optimization_df = pd.DataFrame(data)
    optimization_df.to_csv('optimization_results/bayesian_optimization_data.csv', index=False)

    # Run metadata
    metadata = pd.DataFrame({
        '指标': ['总迭代次数', '随机采样数', '最优评分', '最优迭代序号', '移动平均窗口大小'],
        '数值': [len(result.func_vals), bayes_n_random_starts, -best_val, best_idx, window_size]
    })
    metadata.to_csv('optimization_results/bayesian_optimization_metadata.csv', index=False)

    # Details of the best solution
    best_solution = pd.DataFrame({
        '参数': ['基底材料', '制备方法', '综合评分'] +
            [f'类别{cat_id}材料值' for cat_id, _ in bayes_best_categories] +
            list(bayes_predictions.keys()),
        '数值': [bayes_best_base, bayes_best_method, bayes_score] +
            [cat_val for _, cat_val in bayes_best_categories] +
            list(bayes_predictions.values())
    })
    best_solution.to_csv('optimization_results/bayesian_best_solution.csv', index=False)

    print(f"优化数据已保存至: optimization_results/bayesian_optimization_data.csv")
    print(f"优化元数据已保存至: optimization_results/bayesian_optimization_metadata.csv")
    print(f"优化最优解已保存至: optimization_results/bayesian_best_solution.csv")

except Exception as e:
    print(f"贝叶斯优化执行失败: {str(e)}")
    # Reset the results so later steps can detect that this stage failed
    bayes_best_base = None
    bayes_best_categories = None
    bayes_best_method = None
    bayes_score = 0
    bayes_predictions = {}



In [None]:
# ======= Precision generative-AI optimization =======
print("\n2. 执行精准化生成式AI优化...")

# Report the model-loading state before doing any work
print("验证模型加载状态...")
print(f"目标变量: {target_columns}")
print(f"best_model中的键: {list(best_model.keys())}")

if not best_model:
    print("警告: best_model为空，检查模型文件路径...")
    # Show the expected metadata path of every target to help debugging
    for target in target_columns:
        target_sanitized = sanitize_filename(target)
        expected_path = f'模型评估结果/{target_sanitized}_best_models.pkl'
        print(f"  期待的路径: {expected_path}")
        print(f"  文件是否存在: {os.path.exists(expected_path)}")

    # Second attempt: probe both result folders, with sanitized and raw names
    print("尝试重新加载模型...")
    for target in target_columns:
        target_sanitized = sanitize_filename(target)
        possible_paths = [
            f'模型评估结果/{target_sanitized}_best_models.pkl',
            f'模型评估结果/{target}_best_models.pkl',
            f'训练模型文件/{target_sanitized}_best_models.pkl',
            f'训练模型文件/{target}_best_models.pkl'
        ]

        for path in possible_paths:
            if not os.path.exists(path):
                continue
            try:
                with open(path, 'rb') as f:
                    target_model_info = pickle.load(f)
                tolerance_entry = {'model': target_model_info['best_tol_r2_model']}
                best_model[target] = {'tolerance_r2': tolerance_entry}
                print(f"  ✓ 成功加载 {target} 的模型")
                break  # the first file that loads wins
            except Exception as e:
                print(f"  ❌ 加载 {path} 失败: {str(e)}")

# Final state of the model metadata
if not best_model:
    print("❌ 模型加载失败，无法进行精准预测")
    # Publish empty results; the following stage is guarded by `if best_model:`
    gen_best_base = None
    gen_best_categories = None
    gen_best_method = None
    gen_score = 0
    gen_predictions = {}
    print("生成式AI优化跳过")
else:
    print(f"✓ 模型加载成功，共 {len(best_model)} 个目标变量")
    for target, model_info in best_model.items():
        print(f"  {target}: {model_info['tolerance_r2']['model']}")

# 只有在模型成功加载后才继续
if best_model:
    print("\n开始收集高质量训练样本...")
    
    # 精准的材料组合收集
    valid_combinations = []
    material_scores = []
    material_encodings = []
    
    # 创建精准的编码函数
    def create_precise_encoding(base_material, categories, prep_method):
        """Encode one material combination as a fixed-length float32 vector.

        Layout of the 14 entries:
          * [0]      base material code, shifted by the smallest known code and
                     divided by the code span
          * [1]      preparation method code, scaled the same way
          * [2..11]  for category IDs 1..5: a 0/1 presence flag followed by the
                     scaled value of the first pair carrying that ID
          * [12]     number of pairs in ``categories`` divided by 5
          * [13]     number of distinct category IDs divided by 5

        Args:
            base_material: numeric base material code.
            categories: sequence of ``(category_id, value)`` pairs.
            prep_method: numeric preparation method code.

        Returns:
            ``np.ndarray`` of dtype float32.
        """
        base_codes = unique_materials['base']
        base_floor = min(base_codes)
        # The divisor is clamped to at least 1, so a zero span cannot divide by zero.
        base_span = max(1, max(base_codes) - base_floor)
        method_floor = min(unique_methods)
        method_span = max(1, max(unique_methods) - method_floor)

        vector = [
            (base_material - base_floor) / base_span,
            (prep_method - method_floor) / method_span,
        ]

        # First value recorded for each category ID; later duplicates are ignored.
        first_value = {}
        for cid, cval in categories:
            first_value.setdefault(cid, cval)

        for cat_id in range(1, 6):
            if cat_id not in first_value:
                vector.extend((0.0, 0.0))
                continue
            raw = first_value[cat_id]
            # IDs 1-4 are scaled as (v - 100) / 99, ID 5 as (v - 500) / 199.
            scaled = (raw - 100) / 99.0 if cat_id <= 4 else (raw - 500) / 199.0
            vector.extend((1.0, scaled))

        # Combination features: how many pairs were given, and how many distinct IDs.
        vector.append(len(categories) / 5.0)
        vector.append(len(first_value) / 5.0)

        return np.array(vector, dtype=np.float32)
    
    # Walk the feature table and keep only rows that can be scored.
    progress_bar = CustomProgressBar(total=len(feature_data), desc="收集高质量样本")

    collected_samples = 0
    for i, row in feature_data.iterrows():
        try:
            base_material = row[base_material_column]
            prep_method = row[method_column]

            # A row without base material or preparation method cannot be scored.
            if pd.isna(base_material) or pd.isna(prep_method):
                continue

            # Gather (category_id, value) pairs from the populated category
            # columns; the ID is parsed from the column-name prefix before '_'.
            categories = []
            for col in category_columns:
                if not (pd.notna(row[col]) and row[col] > 0):
                    continue
                if '_' not in col:
                    continue
                try:
                    category_id = int(col.split('_')[0].replace('类别', ''))
                except (ValueError, IndexError):
                    continue
                if 1 <= category_id <= 5:
                    categories.append((category_id, row[col]))

            # Quality gate: at least one and at most five category entries.
            if not categories or len(categories) > 5:
                continue

            # Keep the first value seen for every category ID.
            seen_categories = set()
            unique_categories = []
            for cat_id, cat_val in categories:
                if cat_id in seen_categories:
                    continue
                seen_categories.add(cat_id)
                unique_categories.append((cat_id, cat_val))

            # Both codes must belong to the known code lists.
            if (base_material not in unique_materials['base']
                    or prep_method not in unique_methods):
                continue

            score, predictions = calculate_overall_score(base_material, unique_categories, prep_method)

            # Quality gate: positive score backed by at least two predictions.
            if not (score > 0 and predictions and len(predictions) >= 2):
                continue

            valid_combinations.append((base_material, unique_categories, prep_method))
            material_scores.append(score)

            encoding = create_precise_encoding(base_material, unique_categories, prep_method)
            material_encodings.append(encoding)
            collected_samples += 1

            # Show details for the first few accepted samples only.
            if collected_samples <= 3:
                print(f"\n样本 {collected_samples}:")
                print(f"  基底: {base_material}, 方法: {prep_method}")
                print(f"  类别: {unique_categories}")
                print(f"  评分: {score:.4f}")
                print(f"  预测: {list(predictions.values())}")

        except Exception as e:
            # Errors are reported only while fewer than three samples were accepted.
            if collected_samples < 3:
                print(f"\n处理样本 {i} 时出错: {str(e)}")

        finally:
            # Exactly one progress tick per row, whatever the outcome.
            progress_bar.update(1)

    progress_bar.close()
    print(f"成功收集 {collected_samples} 个高质量训练样本")
    
    # Results default to "nothing found"; they are overwritten only when a
    # generated candidate is scored successfully. Setting them up front keeps
    # them defined on every path through this section.
    gen_best_base = None
    gen_best_categories = None
    gen_best_method = None
    gen_score = 0
    gen_predictions = {}

    if collected_samples >= 20:  # minimum sample count required to fit the mixture
        print("\n训练精准生成模型...")

        X_train = np.array(material_encodings)
        y_train = np.array(material_scores)

        print(f"训练数据形状: {X_train.shape}")
        print(f"特征维度: {X_train.shape[1]}")
        print(f"评分范围: [{y_train.min():.4f}, {y_train.max():.4f}]")

        # RobustScaler is used so that outlying encodings weigh less on the scaling.
        from sklearn.preprocessing import RobustScaler
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)

        from sklearn.mixture import GaussianMixture

        # Between 3 and 8 components, roughly one per eight samples.
        n_components = min(8, max(3, collected_samples // 8))
        print(f"使用 {n_components} 个高斯组件")

        gmm = GaussianMixture(
            n_components=n_components,
            covariance_type='full',
            max_iter=200,
            random_state=42,
            reg_covar=1e-6,
            tol=1e-4
        )

        gmm.fit(X_train_scaled)
        print(f"高斯混合模型训练完成，收敛: {gmm.converged_}")
        print(f"对数似然: {gmm.score(X_train_scaled):.4f}")

        print("生成高质量候选材料...")

        all_candidates = []

        # Strategy 1: draw samples from the fitted mixture.
        n_gmm_samples = min(300, collected_samples * 3)
        gmm_samples, _ = gmm.sample(n_gmm_samples)
        gmm_samples_original = scaler.inverse_transform(gmm_samples)

        # Strategy 2: 20 small Gaussian perturbations around each of the (up to)
        # ten best-scoring training samples. The generator is seeded so the
        # candidates are reproducible, like the seeded mixture above.
        variant_rng = np.random.default_rng(42)
        top_indices = np.argsort(y_train)[-min(10, len(y_train)):]
        for idx in top_indices:
            base_sample = X_train_scaled[idx]
            noise = variant_rng.normal(0, 0.1, (20, base_sample.shape[0]))
            # One batched inverse transform per top sample instead of one per variant.
            all_candidates.extend(scaler.inverse_transform(base_sample + noise))

        all_candidates.extend(gmm_samples_original)

        print(f"生成 {len(all_candidates)} 个候选材料组合")

        evaluated_materials = []
        decode_errors = 0

        # Spans are clamped to at least 1, exactly as create_precise_encoding
        # does, so decoding inverts the encoding even when a span is below 1.
        base_codes = unique_materials['base']
        base_floor = min(base_codes)
        base_range = max(1, max(base_codes) - base_floor)
        method_floor = min(unique_methods)
        method_range = max(1, max(unique_methods) - method_floor)

        progress_bar = CustomProgressBar(total=len(all_candidates), desc="精准评估候选材料")

        for sample in all_candidates:
            try:
                # Undo the normalisation, then snap to the nearest known code.
                base_guess = base_floor + sample[0] * base_range
                base_material = min(base_codes, key=lambda code: abs(code - base_guess))

                method_guess = method_floor + sample[1] * method_range
                prep_method = min(unique_methods, key=lambda code: abs(code - method_guess))

                # Category slots: (presence flag, scaled value) for IDs 1..5.
                categories = []
                for cat_id in range(1, 6):
                    exists_idx = 2 + (cat_id - 1) * 2
                    value_idx = exists_idx + 1
                    if value_idx >= len(sample):
                        continue
                    if sample[exists_idx] > 0.4:  # presence threshold
                        value_norm = sample[value_idx]
                        if cat_id <= 4:
                            cat_value = 100 + value_norm * 99
                        else:
                            cat_value = 500 + value_norm * 199
                        # Keep the decoded value inside [100, 699].
                        cat_value = max(100, min(699, cat_value))
                        categories.append((cat_id, cat_value))

                # Guarantee at least one category: fall back to the slot with
                # the largest presence flag and give it a fixed default value.
                if not categories:
                    best_cat_id = 1
                    best_prob = 0
                    for cat_id in range(1, 6):
                        exists_idx = 2 + (cat_id - 1) * 2
                        if exists_idx < len(sample) and sample[exists_idx] > best_prob:
                            best_prob = sample[exists_idx]
                            best_cat_id = cat_id

                    if best_cat_id <= 4:
                        default_val = 100 * best_cat_id + 50
                    else:
                        default_val = 600
                    categories.append((best_cat_id, default_val))

                # Each ID is visited once above, so the list is already
                # duplicate-free and needs no de-duplication pass.
                final_categories = categories

                score, predictions = calculate_overall_score(base_material, final_categories, prep_method)

                if score > 0 and predictions:
                    evaluated_materials.append({
                        'base_material': base_material,
                        'categories': final_categories,
                        'prep_method': prep_method,
                        'score': score,
                        'predictions': predictions
                    })

            except Exception as e:
                decode_errors += 1
                # Only the first three failures are printed to keep the log readable.
                if decode_errors <= 3:
                    print(f"\n解码错误: {str(e)}")

            progress_bar.update(1)

        progress_bar.close()

        print(f"成功评估 {len(evaluated_materials)} 个有效材料组合")
        print(f"解码错误: {decode_errors} 个")

        if evaluated_materials:
            # Highest score first; the head of the list is the reported optimum.
            evaluated_materials.sort(key=lambda x: x['score'], reverse=True)
            best_material = evaluated_materials[0]

            gen_best_base = best_material['base_material']
            gen_best_categories = best_material['categories']
            gen_best_method = best_material['prep_method']
            gen_score = best_material['score']
            gen_predictions = best_material['predictions']

            print(f"\n🎯 精准生成式AI发现的最佳材料组合:")
            print(f"  基底材料编码: {gen_best_base}")
            print(f"  制备方法编码: {gen_best_method}")
            print(f"  类别材料组合: {gen_best_categories}")
            print(f"  综合评分: {gen_score:.6f}")
            print(f"  详细性能预测:")
            for target, value in gen_predictions.items():
                print(f"    {target}: {value:.4f}")

            # Compare against the training samples (all of which scored > 0).
            training_scores = material_scores
            mean_training_score = np.mean(training_scores)
            max_training_score = np.max(training_scores)
            improvement = (gen_score - mean_training_score) / mean_training_score * 100
            print(f"\n📊 性能对比:")
            print(f"  训练数据平均评分: {mean_training_score:.6f}")
            print(f"  训练数据最高评分: {max_training_score:.6f}")
            print(f"  生成式AI评分: {gen_score:.6f}")
            print(f"  相对于平均值的提升: {improvement:.2f}%")

            if gen_score > max_training_score:
                print(f"  🎉 超越了所有训练样本！")
            else:
                rank = sum(1 for s in training_scores if s > gen_score) + 1
                print(f"  排名: 第 {rank}/{len(training_scores)} 位")

        else:
            print("❌ 未生成有效的材料组合")

    else:
        print(f"❌ 高质量样本不足 ({collected_samples} < 20)，无法训练精准模型")

print("\n精准化生成式AI优化完成！")

In [None]:
# ======= 3. 差分进化优化 =======
print("\n3. 执行差分进化优化...")

try:
    # 修改 OptimizationProgress 类，增加历史记录
    class OptimizationProgress:
        """Record and periodically print the progress of an optimiser run.

        Instances are meant to be fed once per iteration through ``update``.

        Attributes:
            history: value recorded at each ``update`` call.
            best_values: running minimum of ``history`` (same length).
            iteration: number of ``update`` calls so far.
        """

        def __init__(self, description="优化进度", objective=None):
            """Start the timer and announce the run.

            Args:
                description: label used as a prefix in every progress message.
                objective: optional callable mapping a parameter vector to its
                    objective value. When given, array-like payloads passed to
                    ``update`` are evaluated with it, so the history holds
                    objective values rather than the first parameter.
            """
            self.description = description
            self.objective = objective
            self.iteration = 0
            self.history = []
            self.best_values = []
            self.start_time = time.time()

            print(f"{self.description}开始...")

        def _extract_value(self, xk):
            """Turn a callback payload into a plain Python float."""
            # Result-like mapping that carries the objective value directly.
            if isinstance(xk, dict) and 'fun' in xk:
                return float(xk['fun'])
            if np.isscalar(xk):
                return float(xk)
            if hasattr(xk, '__len__') and len(xk) > 0:
                if self.objective is not None:
                    return float(self.objective(xk))
                # Legacy fallback kept for compatibility: without an objective
                # only the first element of the vector can be recorded.
                return float(xk[0])
            return 0.0

        def update(self, xk, convergence=None):
            """Record one iteration.

            Args:
                xk: mapping with a ``'fun'`` key, a scalar, or a parameter vector.
                convergence: accepted for callback compatibility; not used.
            """
            self.iteration += 1

            func_value = self._extract_value(xk)
            self.history.append(func_value)

            # best_values is non-increasing, so its last entry is the running
            # minimum; no need to rescan the whole list on every call.
            previous_best = self.best_values[-1] if self.best_values else func_value
            self.best_values.append(min(previous_best, func_value))

            # Print on the first iteration and then every tenth one.
            if self.iteration % 10 == 0 or self.iteration == 1:
                elapsed = time.time() - self.start_time
                best_value = float(self.best_values[-1])
                print(f"{self.description} - 迭代: {self.iteration}, 最优值: {best_value:.6f}, 用时: {elapsed:.2f}秒")

        def close(self):
            """Print the final summary of the run."""
            elapsed = time.time() - self.start_time
            if self.history:
                best_value = float(self.best_values[-1])
                print(f"{self.description}完成 - 共{self.iteration}次迭代, 最优值: {best_value:.6f}, 总用时: {elapsed:.2f}秒")
            else:
                print(f"{self.description}完成 - 无有效记录")

    # 定义差分进化目标函数
    def de_objective(x):
        """Objective for differential evolution: the negated overall score.

        Parameter layout: ``x[0]`` base-material index, ``x[1]`` preparation
        method index, followed by up to five ``(category ID, category value)``
        pairs.

        Returns:
            ``-score`` as a Python float, or ``0.0`` if anything raises.
        """
        try:
            params = x.tolist() if hasattr(x, 'tolist') else list(x)
            n_params = len(params)

            # Truncate to int and wrap so the indices always address a valid entry.
            base_idx = int(params[0]) % len(unique_materials['base'])
            method_idx = int(params[1]) % len(unique_methods)

            # Walk the (ID, value) pairs; the first positive value per ID wins.
            chosen = {}
            for offset in range(0, min(10, n_params - 2), 2):
                if offset + 3 >= n_params:
                    continue
                cat_id = max(1, min(5, int(params[offset + 2])))  # clamp ID to 1..5
                cat_val = float(params[offset + 3])
                if cat_val > 0 and cat_id not in chosen:
                    chosen[cat_id] = max(100, min(699, cat_val))  # clamp to 100..699
            unique_cats = list(chosen.items())

            # Guarantee at least one category by falling back to the first pair.
            if not unique_cats and n_params > 3:
                fallback_id = max(1, min(5, int(params[2])))
                fallback_val = max(100, min(699, float(params[3])))
                unique_cats.append((fallback_id, fallback_val))

            score, _ = calculate_overall_score(
                unique_materials['base'][base_idx],
                unique_cats,
                unique_methods[method_idx],
            )

            # Negated because the optimiser minimises.
            return -float(score)
        except Exception as e:
            return 0.0  # default value returned on any error
    
    # 定义参数范围
    bounds = [
        (0, len(unique_materials['base']) - 1),  # 基底材料索引
        (0, len(unique_methods) - 1),  # 制备方法索引
    ]
    
    # 添加类别材料参数的边界 (ID和值)
    for _ in range(5):  # 最多5种类别
        bounds.append((1, 5))  # 类别ID: 1-5
        bounds.append((100, 699))  # 类别材料编码值
    
    # 创建进度记录
    opt_progress = OptimizationProgress("差分进化优化")
    
    try:
        # 使用差分进化算法搜索最佳材料
        result = differential_evolution(
            de_objective,
            bounds,
            popsize=20,
            mutation=(0.5, 1.0),
            recombination=0.7,
            maxiter=200,
            tol=0.01,
            seed=42,
            updating='immediate',
            callback=opt_progress.update,
            workers=1,
            polish=True
        )
        
        opt_progress.close()
        
        # 解析最佳参数
        best_params = result.x.tolist()  # 确保转换为Python列表
        
        # 基底材料和制备方法
        base_idx = int(best_params[0]) % len(unique_materials['base'])
        method_idx = int(best_params[1]) % len(unique_methods)
        
        # 从参数中提取类别信息
        de_categories = []
        for i in range(0, min(10, len(best_params)-2), 2):
            if i+3 < len(best_params):
                cat_id = max(1, min(5, int(best_params[i+2])))
                cat_val = float(best_params[i+3])  # 确保是标量
                if cat_val > 0:
                    # 映射到有效范围
                    mapped_val = max(100, min(699, cat_val))
                    de_categories.append((cat_id, mapped_val))
        
        # 去除重复类别
        seen_cats = set()
        de_best_categories = []  # 修改变量名与后面使用一致
        for cat_id, cat_val in de_categories:
            if cat_id not in seen_cats:
                seen_cats.add(cat_id)
                de_best_categories.append((cat_id, cat_val))
        
        # 确保至少有一个类别
        if not de_best_categories and len(best_params) > 3:
            cat_id = max(1, min(5, int(best_params[2])))
            cat_val = max(100, min(699, float(best_params[3])))
            de_best_categories.append((cat_id, cat_val))
        
        # 转换为实际材料编码
        de_best_base = unique_materials['base'][base_idx]
        de_best_method = unique_methods[method_idx]
        
        # 计算性能
        de_score, de_predictions = calculate_overall_score(
            de_best_base, de_best_categories, de_best_method
        )
        
        # 获取材料名称 (如果可用)
        try:
            # 加载编码参照表
            base_ref = pd.read_excel('data_exports/base_material_encoding_reference.xlsx')
            method_ref = pd.read_excel('data_exports/method_encoding_reference.xlsx')
            material_ref = pd.read_excel('data_exports/material_encoding_reference.xlsx')
            
            # 获取基底材料名称
            base_name = base_ref[base_ref['编码值'] == de_best_base]['基底材料'].values[0] if len(base_ref[base_ref['编码值'] == de_best_base]) > 0 else f"基底材料{de_best_base}"
            
            # 获取制备方法名称
            method_name = method_ref[method_ref['编码值'] == de_best_method]['制备方法'].values[0] if len(method_ref[method_ref['编码值'] == de_best_method]) > 0 else f"制备方法{de_best_method}"
            
            # 获取类别材料名称
            category_names = []
            for cat_id, cat_val in de_best_categories:
                # 查找最接近的编码值
                cat_df = material_ref[material_ref['分类ID'] == cat_id]
                if not cat_df.empty:
                    # 找到最接近的编码值
                    cat_df['差值'] = abs(cat_df['编码值'] - cat_val)
                    closest = cat_df.loc[cat_df['差值'].idxmin()]
                    category_names.append(f"{closest['材料名称']} (类别{cat_id}, 编码{cat_val:.1f})")
                else:
                    category_names.append(f"类别{cat_id}材料 (编码{cat_val:.1f})")
            
            print(f"差分进化发现的最佳材料:")
            print(f"  基底材料: {base_name} (编码: {de_best_base})")
            print(f"  类别材料: {', '.join(category_names)}")
            print(f"  制备方法: {method_name} (编码: {de_best_method})")
            print(f"  综合评分: {de_score:.4f}")
            print("  预测性能:")
            for target, value in de_predictions.items():
                print(f"    {target}: {value:.2f}")
        except Exception as e:
            # 如果无法获取名称，则显示编码
            print(f"差分进化发现的最佳材料:")
            print(f"  基底材料: {de_best_base}")
            print(f"  类别材料: {', '.join([f'(类别{cat_id}, 值{cat_val:.1f})' for cat_id, cat_val in de_best_categories])}")
            print(f"  制备方法: {de_best_method}")
            print(f"  综合评分: {de_score:.4f}")
            print("  预测性能:")
            for target, value in de_predictions.items():
                print(f"    {target}: {value:.2f}")
            print(f"  注: 无法加载材料参照表: {str(e)}")
        
        # 添加差分进化可视化
        def plot_de_optimization(opt_progress):
            """Plot the recorded optimisation history and export it to disk.

            Files written into ``optimization_results/``:
              * ``differential_evolution_convergence.png``
              * ``differential_evolution_data.csv``
              * ``differential_evolution_metadata.csv``
              * ``differential_evolution_best_solution.csv`` — only when the
                global ``de_best_base`` exists and is not None.

            Args:
                opt_progress: object exposing ``history`` and ``best_values``
                    lists of equal length.
            """
            history = opt_progress.history
            if not history:
                print("无优化历史记录可供可视化")
                return

            best_values = opt_progress.best_values
            iterations = list(range(1, len(history) + 1))
            best_idx = int(np.argmin(history))
            best_value = history[best_idx]

            # Overall improvement relative to the first value; undefined (NaN)
            # when that value is zero.
            initial_value = history[0]
            if initial_value != 0:
                overall_improvement = (1 - best_value / initial_value) * 100
            else:
                overall_improvement = float('nan')

            # Improvement over the last 20% of iterations; stays NaN when there
            # are too few iterations, no change, or a zero reference value.
            improve_rate = float('nan')
            if len(history) > 20:
                last_segment = best_values[-int(len(best_values) * 0.2):]
                if (last_segment
                        and last_segment[0] != last_segment[-1]
                        and last_segment[0] != 0):
                    improve_rate = (last_segment[0] - last_segment[-1]) / last_segment[0] * 100

            os.makedirs('optimization_results', exist_ok=True)

            plt.rcParams['font.sans-serif'] = ['SimHei']
            plt.rcParams['axes.unicode_minus'] = False

            fig, ax = plt.subplots(figsize=(12, 6))
            try:
                # Per-iteration value and running best.
                ax.plot(iterations, history, 'o-', color='#4292c6',
                        markersize=6, linewidth=1.5, alpha=0.7, label='目标函数值')
                ax.plot(iterations, best_values, '-', color='#d62728',
                        linewidth=2.5, label='历史最优值')

                # Mark and annotate the global optimum.
                ax.plot(best_idx + 1, best_value, 'r*', markersize=15)
                ax.annotate(f'全局最优: {best_value:.4f}',
                            xy=(best_idx + 1, best_value),
                            xytext=(best_idx + 1 + 5, best_value - 0.02),
                            arrowprops=dict(facecolor='black', shrink=0.05, width=1),
                            fontsize=12)

                ax.set_title("差分进化优化收敛过程", fontsize=18, fontweight='bold', pad=20)
                ax.set_xlabel("迭代次数", fontsize=14)
                ax.set_ylabel("目标函数值", fontsize=14)

                ax.grid(True, linestyle='--', alpha=0.7)
                ax.spines['top'].set_visible(False)
                ax.spines['right'].set_visible(False)

                if not np.isnan(improve_rate):
                    ax.text(0.02, 0.1, f"最近阶段改进率: {improve_rate:.2f}%",
                            transform=ax.transAxes, fontsize=12,
                            bbox=dict(facecolor='white', alpha=0.8))

                info_text = (f"差分进化优化统计:\n"
                             f"· 总迭代次数: {len(history)}\n"
                             f"· 初始值: {history[0]:.4f}\n"
                             f"· 最终值: {history[-1]:.4f}\n"
                             f"· 全局最优: {best_value:.4f}\n"
                             f"· 总体改进: {overall_improvement:.1f}%")
                fig.text(0.02, 0.02, info_text, fontsize=10,
                         bbox=dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.5'))

                ax.legend(loc='upper right', fontsize=12)

                fig.tight_layout()
                fig.savefig('optimization_results/differential_evolution_convergence.png',
                            dpi=300, bbox_inches='tight')
            finally:
                # Always release the figure, even when drawing or saving fails.
                plt.close(fig)

            # Per-iteration data.
            optimization_data = pd.DataFrame({
                '迭代次数': iterations,
                '目标函数值': history,
                '历史最优值': best_values,
                '是否全局最优点': [i == best_idx + 1 for i in iterations]
            })
            optimization_data.to_csv('optimization_results/differential_evolution_data.csv', index=False)

            # Summary statistics.
            metadata = pd.DataFrame({
                '指标': [
                    '总迭代次数',
                    '初始值',
                    '最终值',
                    '全局最优值',
                    '全局最优迭代次数',
                    '总体改进率(%)',
                    '最近阶段改进率(%)'
                ],
                '数值': [
                    len(history),
                    history[0],
                    history[-1],
                    best_value,
                    best_idx + 1,
                    overall_improvement,
                    improve_rate
                ]
            })
            metadata.to_csv('optimization_results/differential_evolution_metadata.csv', index=False)

            # Best solution details, read from module-level results when present.
            has_best_solution = 'de_best_base' in globals() and de_best_base is not None
            if has_best_solution:
                best_solution = pd.DataFrame({
                    '参数': ['基底材料', '制备方法', '综合评分'] +
                        [f'类别{cat_id}材料值' for cat_id, _ in de_best_categories] +
                        list(de_predictions.keys()),
                    '数值': [de_best_base, de_best_method, de_score] +
                        [cat_val for _, cat_val in de_best_categories] +
                        list(de_predictions.values())
                })
                best_solution.to_csv('optimization_results/differential_evolution_best_solution.csv', index=False)

            print(f"差分进化优化数据已保存至: optimization_results/differential_evolution_data.csv")
            print(f"差分进化优化元数据已保存至: optimization_results/differential_evolution_metadata.csv")
            if has_best_solution:
                print(f"差分进化最优解已保存至: optimization_results/differential_evolution_best_solution.csv")
        
        # Plot and export the recorded differential-evolution history.
        plot_de_optimization(opt_progress)
            
    except Exception as e:
        # Failure while running or reporting the optimisation: print the
        # progress summary and publish empty results.
        opt_progress.close()
        print(f"差分进化优化执行失败: {str(e)}")
        de_best_base = None
        de_best_categories = None
        de_best_method = None
        de_score = 0
        de_predictions = {}
        
except Exception as e:
    # Failure while setting up the optimisation (definitions, bounds,
    # progress recorder): publish the same empty results.
    print(f"准备差分进化优化时出错: {str(e)}")
    de_best_base = None
    de_best_categories = None
    de_best_method = None
    de_score = 0
    de_predictions = {}



In [None]:
# ======= 4. Multi-objective optimisation =======
print("\n4. 执行多目标优化...")
# pymoo building blocks: the NSGA-II algorithm, its sampling / crossover /
# mutation operators, and the minimize driver.
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.operators.sampling.rnd import FloatRandomSampling
from pymoo.operators.crossover.sbx import SBX
from pymoo.operators.mutation.pm import PM
from pymoo.optimize import minimize

# Set unconditionally: a failed pymoo import above raises before this line is
# reached, so the flag is never False here.
pymoo_available = True
# Unpickled models keyed by file path; each entry stores (file mtime, model) so
# a file rewritten on disk is reloaded on the next call.
_BEST_MODEL_CACHE = {}


def _load_best_model(model_path):
    """Return the model pickled at *model_path*, reusing it while the file is unchanged.

    Raises:
        OSError: if the file does not exist or cannot be read.
    """
    mtime = os.path.getmtime(model_path)
    cached = _BEST_MODEL_CACHE.get(model_path)
    if cached is not None and cached[0] == mtime:
        return cached[1]
    # NOTE(security): unpickling executes arbitrary code; only load files
    # produced by this project.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    _BEST_MODEL_CACHE[model_path] = (mtime, model)
    return model


def predict_performance(target, features, model_type):
    """Predict one target for a material combination with its saved best model.

    Args:
        target: name of the target variable; must be a key of ``best_model``.
        features: DataFrame whose first row is the combination to predict.
        model_type: ``'linear'`` divides the category columns by 1000,
            ``'tree'`` replaces zeros in them with NaN, any other value leaves
            them unchanged.

    Returns:
        The first element of the model's prediction, or ``None`` when
        *target* has no entry in ``best_model``.

    Raises:
        OSError: if the model file for *target* cannot be read.
    """
    if target not in best_model:
        print(f"{target}不在best_model字典中")
        return None

    # Cached load: this function sits in the inner loop of the optimisers, so
    # unpickling the same file on every call would dominate the run time.
    model = _load_best_model(f'模型评估结果/最佳模型/{target}_最佳模型.pkl')

    # Category columns are recognised by their name prefix.
    category_columns = [col for col in features.columns if col.startswith('类别')]

    features_processed = features.copy()
    if model_type == 'linear':
        for col in category_columns:
            features_processed[col] = features_processed[col] / 1000.0
    elif model_type == 'tree':
        for col in category_columns:
            features_processed[col] = features_processed[col].replace(0, np.nan)

    # Fixed column order handed to the model; missing columns are filled with 0.
    expected_features = category_columns + ['制备方法_编码', '基底材料_编码']
    for feat in expected_features:
        if feat not in features_processed.columns:
            features_processed[feat] = 0.0
    features_for_model = features_processed[expected_features]

    # Saved either as a bundle dict ({'model', optional 'scaler',
    # 'needs_scaling'}) or as a bare estimator.
    if isinstance(model, dict) and 'model' in model:
        estimator = model['model']
        if model.get('needs_scaling', False) and 'scaler' in model:
            return estimator.predict(model['scaler'].transform(features_for_model))[0]
        return estimator.predict(features_for_model)[0]

    if 'xgboost' in str(type(model)):
        try:
            import xgboost as xgb
            # NOTE(review): predict_disable_shape_check does not look like an
            # XGBoost global-config key; if config_context rejects it, the
            # fallback below performs a plain predict — confirm.
            with xgb.config_context(verbosity=0, predict_disable_shape_check=True):
                return model.predict(features_for_model)[0]
        except Exception:
            return model.predict(features_for_model)[0]

    return model.predict(features_for_model)[0]
def calculate_overall_score(base_material, categories, prep_method):
    """Compute the weighted composite score of one material combination.

    Args:
        base_material: Base-material code, passed to
            ``create_material_features``.
        categories: Iterable of ``(category_id, category_value)`` pairs;
            pairs whose value is not positive are ignored.
        prep_method: Preparation-method code, passed to
            ``create_material_features``.

    Returns:
        ``(score, predictions)``. ``predictions`` maps each target that has a
        model in ``best_model`` and produced a prediction to its predicted
        value; ``score`` is the weight-normalised sum of the per-target
        scores, each clamped to [0, 1]. ``(0.0, {})`` is returned when no
        category is usable or no prediction could be made.
    """
    # Model names that get tree-style preprocessing; 'DeepNN' maps to 'nn'
    # and every other name (the linear family included) maps to 'linear'.
    tree_model_names = ('XGBoost', 'RandomForest', 'GradientBoosting', 'LightGBM', 'HistGradientBoosting')
    # Per-target weight (default 1.0) and normalisation divisor (default 100.0).
    target_weights = {'水接触角': 1.0, '循环使用次数': 1.2, '吸油能力': 1.3}
    target_scales = {'水接触角': 180.0, '循环使用次数': 50.0, '吸油能力': 50.0}

    # Only materials that are actually present (value > 0) take part.
    categories_formatted = [(cat_id, cat_value) for cat_id, cat_value in categories if cat_value > 0]
    if not categories_formatted:
        print("没有有效的类别材料")
        return 0.0, {}

    features = create_material_features(base_material, categories_formatted, prep_method)

    # One prediction per target that has a trained best model.
    predictions = {}
    for target in target_columns:
        if target not in best_model:
            continue
        model_name = best_model[target]['tolerance_r2']['model']
        if model_name in tree_model_names:
            model_type = 'tree'
        elif model_name in ('DeepNN',):
            model_type = 'nn'
        else:
            model_type = 'linear'
        pred = predict_performance(target, features, model_type)
        if pred is not None:
            predictions[target] = pred

    if not predictions:
        print("没有有效的预测结果")
        return 0.0, {}

    # Weights cover only the targets that were actually predicted.
    weights = {target: target_weights.get(target, 1.0) for target in predictions}
    total_weight = sum(weights.values(), 0.0)

    score = 0.0
    for target, value in predictions.items():
        # Larger is better for every metric; clamp the ratio into [0, 1].
        norm_score = min(1.0, max(0.0, value / target_scales.get(target, 100.0)))
        if total_weight > 0:
            score += (weights[target] / total_weight) * norm_score

    return score, predictions
if pymoo_available:
    # Multi-objective formulation of the material design search.
    class MaterialProblem(Problem):
        """pymoo problem that scores material combinations with the best models.

        A decision vector has 12 real-valued entries:
        ``[base_idx, method_idx, cat1_id, cat1_val, ..., cat5_id, cat5_val]``.
        Index and id entries are decoded by truncation (``int``), so their
        upper bounds are half-open: a value in ``[k, k + 1)`` selects option
        ``k``. There is one objective per entry of ``target_columns`` and no
        constraints.
        """

        def __init__(self):
            """Set up the variable bounds and the number of objectives."""
            n_var = 12  # base material, preparation method and 5 (id, value) pairs

            xl = np.zeros(n_var)
            xu = np.zeros(n_var)

            # The decoder truncates, so with an upper bound of ``n - 1`` the
            # last option would only be hit at exactly the bound. Use the
            # largest float below ``n`` instead, giving every option a
            # unit-width interval.
            xl[0] = 0
            xu[0] = np.nextafter(float(len(unique_materials['base'])), 0.0)

            xl[1] = 0
            xu[1] = np.nextafter(float(len(unique_methods)), 0.0)

            for i in range(5):  # 5 category slots
                # Category id 1..5 (same half-open reasoning as above).
                xl[2+i*2] = 1
                xu[2+i*2] = np.nextafter(6.0, 0.0)

                # Material code of the category, used as a real value.
                xl[3+i*2] = 100
                xu[3+i*2] = 699

            # One objective per target column, no constraints.
            n_obj = len(target_columns)
            n_constr = 0

            super().__init__(n_var=n_var, n_obj=n_obj, n_constr=n_constr, xl=xl, xu=xu)

        @staticmethod
        def _infer_model_type(model_name):
            """Map a model name to the preprocessing family used for prediction."""
            if model_name in ['XGBoost', 'RandomForest', 'GradientBoosting', 'LightGBM', 'HistGradientBoosting']:
                return 'tree'
            if model_name in ['DeepNN']:
                return 'nn'
            # The linear family and every unknown name use linear preprocessing.
            return 'linear'

        def _evaluate(self, x, out, *args, **kwargs):
            """Fill ``out["F"]`` with one row of objective values per row of ``x``.

            Every objective is minimised: the water contact angle is stored
            as ``180 - prediction`` and every other target as
            ``-prediction``. When a target has no model or no prediction, a
            fixed placeholder is used (90.0 for the contact angle, -10.0
            otherwise).
            """
            f = np.zeros((x.shape[0], self.n_obj))

            # The preprocessing family depends only on the target, so it is
            # resolved once per call instead of once per candidate.
            model_types = {
                target: self._infer_model_type(best_model[target]['tolerance_r2']['model'])
                for target in target_columns
                if target in best_model
            }

            for i in range(x.shape[0]):
                # Decode the two index variables (the modulo is a safety net).
                base_idx = int(x[i, 0]) % len(unique_materials['base'])
                method_idx = int(x[i, 1]) % len(unique_methods)

                # Decode the 5 category slots, keeping only the first slot of
                # every category id. Values are clamped to the valid range,
                # so at least one category always survives.
                seen_cats = set()
                unique_cats = []
                for j in range(5):
                    cat_id = max(1, min(5, int(x[i, 2+j*2])))
                    cat_val = max(100, min(699, x[i, 3+j*2]))
                    if cat_id not in seen_cats:
                        seen_cats.add(cat_id)
                        unique_cats.append((cat_id, cat_val))

                # Translate the indices into the actual codes.
                base_material = unique_materials['base'][base_idx]
                prep_method = unique_methods[method_idx]

                features = create_material_features(base_material, unique_cats, prep_method)

                for j, target in enumerate(target_columns):
                    pred = None
                    if target in model_types:
                        pred = predict_performance(target, features, model_types[target])

                    if pred is None:
                        # Placeholder objective value.
                        f[i, j] = 90.0 if target == '水接触角' else -10.0
                    elif target == '水接触角':
                        # Distance to the ideal angle of 180: smaller is better.
                        f[i, j] = 180.0 - pred
                    else:
                        # Larger is better, so minimise the negated value.
                        f[i, j] = -pred

            out["F"] = f
    
    # Instantiate the problem and solve it with NSGA-II.
    problem = MaterialProblem()

    # Progress bar sized to the number of generations requested below.
    print("运行多目标优化...")
    progress_bar = CustomProgressBar(total=50, desc="多目标优化")

    # Algorithm configuration.
    algorithm = NSGA2(
        pop_size=50,
        sampling=FloatRandomSampling(),
        crossover=SBX(prob=0.9, eta=15),
        mutation=PM(eta=20),
        eliminate_duplicates=True
    )

    def progress_callback(algorithm):
        """Advance the progress bar by one step; ``algorithm`` is not used."""
        progress_bar.update(1)
        # Return value kept from the original, whose comment says it lets the
        # algorithm continue.
        return False

    # Run the optimization for 50 generations with a fixed seed.
    res = minimize(
        problem,
        algorithm,
        ('n_gen', 50),
        seed=42,
        callback=progress_callback,
        verbose=False
    )

    progress_bar.close()

    # Objective values and decision vectors of the Pareto-optimal solutions.
    pareto_front = res.F
    pareto_solutions = res.X

    # Undo the sign flips applied for minimisation: every objective was
    # stored as -prediction, except the water contact angle, stored as
    # 180 - prediction. The angle column is located by name, so the
    # conversion stays correct wherever it sits in target_columns.
    pareto_front_original = -pareto_front.copy()
    for _col, _name in enumerate(target_columns):
        if _name == '水接触角':
            pareto_front_original[:, _col] = 180.0 - pareto_front[:, _col]
    
    # Decode every Pareto-optimal decision vector and score it.
    multi_solutions = []

    print(f"发现 {len(pareto_solutions)} 个帕累托最优解")

    for i, solution in enumerate(pareto_solutions):
        # Same decoding as MaterialProblem._evaluate: truncate, then wrap.
        base_idx = int(solution[0]) % len(unique_materials['base'])
        method_idx = int(solution[1]) % len(unique_methods)

        # Five (category id, material code) slots. Both parts are clamped to
        # their valid range, so every slot yields a usable pair and no
        # further emptiness checks are needed.
        categories = [
            (max(1, min(5, int(solution[2+j*2]))), max(100, min(699, solution[3+j*2])))
            for j in range(5)
        ]

        # Keep only the first slot of each category id.
        seen_cats = set()
        unique_cats = []
        for cat_id, cat_val in categories:
            if cat_id not in seen_cats:
                seen_cats.add(cat_id)
                unique_cats.append((cat_id, cat_val))

        # Translate the indices into the actual codes.
        mo_base = unique_materials['base'][base_idx]
        mo_method = unique_methods[method_idx]

        # Composite score and per-target predictions of this combination.
        mo_score, mo_predictions = calculate_overall_score(mo_base, unique_cats, mo_method)

        multi_solutions.append({
            '基底材料': mo_base,
            '类别材料': unique_cats,
            '制备方法': mo_method,
            '综合评分': mo_score,
            '预测性能': mo_predictions,
            # 1-based row of this solution in pareto_solutions / pareto_front
            # (the position in the result arrays, not a dominance rank).
            '帕累托等级': i + 1
        })

    # Best composite score first.
    multi_solutions.sort(key=lambda x: x['综合评分'], reverse=True)
    
    # Print the three best solutions found by the multi-objective search.
    print("多目标优化发现的最佳材料:")

    # Prefer readable names from the encoding reference workbooks; on any
    # failure fall back to printing the raw codes.
    try:
        base_ref = pd.read_excel('data_exports/base_material_encoding_reference.xlsx')
        method_ref = pd.read_excel('data_exports/method_encoding_reference.xlsx')
        material_ref = pd.read_excel('data_exports/material_encoding_reference.xlsx')

        for i, solution in enumerate(multi_solutions[:3]):
            # Base-material name (each reference table is filtered only once).
            base_rows = base_ref[base_ref['编码值'] == solution['基底材料']]
            base_name = base_rows['基底材料'].values[0] if len(base_rows) > 0 else f"基底材料{solution['基底材料']}"

            # Preparation-method name.
            method_rows = method_ref[method_ref['编码值'] == solution['制备方法']]
            method_name = method_rows['制备方法'].values[0] if len(method_rows) > 0 else f"制备方法{solution['制备方法']}"

            # Category materials: the optimizer works on real-valued codes, so
            # report the reference material whose code is closest.
            category_names = []
            for cat_id, cat_val in solution['类别材料']:
                cat_df = material_ref[material_ref['分类ID'] == cat_id]
                if not cat_df.empty:
                    # Compute the distance on the fly instead of writing a
                    # helper column into the filtered slice, which triggers
                    # pandas' SettingWithCopyWarning.
                    closest = cat_df.loc[(cat_df['编码值'] - cat_val).abs().idxmin()]
                    category_names.append(f"{closest['材料名称']} (类别{cat_id}, 编码{cat_val:.1f})")
                else:
                    category_names.append(f"类别{cat_id}材料 (编码{cat_val:.1f})")

            print(f"\n解 #{i+1}:")
            print(f"  基底材料: {base_name} (编码: {solution['基底材料']})")
            print(f"  类别材料: {', '.join(category_names)}")
            print(f"  制备方法: {method_name} (编码: {solution['制备方法']})")
            print(f"  综合评分: {solution['综合评分']:.4f}")
            print("  预测性能:")
            for target, value in solution['预测性能'].items():
                print(f"    {target}: {value:.2f}")
    except Exception as e:
        # Names could not be resolved: show the codes instead.
        for i, solution in enumerate(multi_solutions[:3]):
            print(f"\n解 #{i+1}:")
            print(f"  基底材料: {solution['基底材料']}")
            print(f"  类别材料: {', '.join([f'(类别{cat_id}, 值{cat_val:.1f})' for cat_id, cat_val in solution['类别材料']])}")
            print(f"  制备方法: {solution['制备方法']}")
            print(f"  综合评分: {solution['综合评分']:.4f}")
            print("  预测性能:")
            for target, value in solution['预测性能'].items():
                print(f"    {target}: {value:.2f}")
        print(f"  注: 无法加载材料参照表: {str(e)}")
    
    # Publish the top-ranked solution (multi_solutions is sorted by composite
    # score, best first) under the mo_* names; use empty placeholders when
    # the search produced nothing.
    if not multi_solutions:
        print("多目标优化未能找到有效的材料组合")
        mo_best_base = mo_best_categories = mo_best_method = None
        mo_score = 0
        mo_predictions = {}
    else:
        best_solution = multi_solutions[0]
        (mo_best_base, mo_best_categories, mo_best_method,
         mo_score, mo_predictions) = [
            best_solution[key]
            for key in ('基底材料', '类别材料', '制备方法', '综合评分', '预测性能')
        ]
        
    # Visualize the Pareto front and export the raw optimization data.
    #
    # multi_solutions has been re-sorted by composite score, so its position 0
    # is no longer row 0 of pareto_front. Each entry still carries its 1-based
    # original row under '帕累托等级'; use that to locate the best composite
    # solution in the front arrays.
    best_idx = multi_solutions[0]['帕累托等级'] - 1 if multi_solutions else None

    if pareto_front.shape[1] >= 2:
        try:
            # savefig does not create missing directories.
            os.makedirs('optimization_results', exist_ok=True)

            # 2-D projection onto the first two objectives.
            plt.figure(figsize=(10, 8))
            plt.rcParams['font.sans-serif'] = ['SimHei']  # font for the Chinese labels
            plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

            # Points are coloured by their row position in the front.
            sc = plt.scatter(
                pareto_front_original[:, 0],
                pareto_front_original[:, 1],
                c=range(len(pareto_front)),
                cmap='viridis',
                s=100, alpha=0.7
            )

            plt.title('材料性能帕累托前沿', fontsize=15)
            plt.xlabel(target_columns[0], fontsize=12)
            plt.ylabel(target_columns[1], fontsize=12)

            cbar = plt.colorbar(sc)
            cbar.set_label('解的排名', fontsize=12)

            # Highlight the solution with the best composite score.
            if best_idx is not None:
                plt.scatter(
                    pareto_front_original[best_idx, 0],
                    pareto_front_original[best_idx, 1],
                    s=200, alpha=1, color='red',
                    marker='*', label='最佳综合解'
                )

            plt.grid(True, alpha=0.3)
            plt.legend()
            plt.tight_layout()
            plt.savefig('optimization_results/pareto_front.png', dpi=300)
            plt.close()

            # With three or more objectives also draw a 3-D scatter plot.
            if pareto_front.shape[1] >= 3:
                # Kept from the original although the name is unused: older
                # Matplotlib releases need this import to register '3d'.
                from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

                fig = plt.figure(figsize=(12, 10))
                ax = fig.add_subplot(111, projection='3d')

                sc = ax.scatter(
                    pareto_front_original[:, 0],
                    pareto_front_original[:, 1],
                    pareto_front_original[:, 2],
                    c=range(len(pareto_front)),
                    cmap='viridis',
                    s=100, alpha=0.7
                )

                ax.set_title('材料性能三维帕累托前沿', fontsize=15)
                ax.set_xlabel(target_columns[0], fontsize=12)
                ax.set_ylabel(target_columns[1], fontsize=12)
                ax.set_zlabel(target_columns[2], fontsize=12)

                if best_idx is not None:
                    ax.scatter(
                        pareto_front_original[best_idx, 0],
                        pareto_front_original[best_idx, 1],
                        pareto_front_original[best_idx, 2],
                        s=200, alpha=1, color='red',
                        marker='*', label='最佳综合解'
                    )

                ax.legend()
                plt.tight_layout()
                plt.savefig('optimization_results/pareto_front_3d.png', dpi=300)
                plt.close()
        except Exception as e:
            print(f"绘制帕累托前沿时出错: {str(e)}")

        # Export the raw data. This is a step of its own: it must not depend
        # on there being a third objective or on the plots having succeeded.
        try:
            os.makedirs('optimization_results/raw_data', exist_ok=True)

            # 1. One row per point of the Pareto front, in original units.
            pareto_data = pd.DataFrame()
            for i in range(pareto_front_original.shape[1]):
                column_name = target_columns[i] if i < len(target_columns) else f"目标_{i+1}"
                pareto_data[column_name] = pareto_front_original[:, i]

            # 1-based row position within the front.
            pareto_data['解的排名'] = range(1, len(pareto_front_original) + 1)

            # Flag the row that belongs to the best composite solution.
            pareto_data['是最佳综合解'] = False
            if best_idx is not None:
                pareto_data.loc[best_idx, '是最佳综合解'] = True

            pareto_data.to_csv('optimization_results/raw_data/pareto_front_data.csv', index=False, encoding='utf-8-sig')
            print(f"已保存帕累托前沿数据点到 'optimization_results/raw_data/pareto_front_data.csv'")

            # 2. One row per decoded solution, ordered by composite score.
            solutions_data = pd.DataFrame()
            solutions_data['解的排名'] = range(1, len(multi_solutions) + 1)
            solutions_data['基底材料'] = [sol['基底材料'] for sol in multi_solutions]
            solutions_data['制备方法'] = [sol['制备方法'] for sol in multi_solutions]
            solutions_data['综合评分'] = [sol['综合评分'] for sol in multi_solutions]
            solutions_data['类别材料'] = [str(sol['类别材料']) for sol in multi_solutions]
            for target in target_columns:
                solutions_data[f'预测_{target}'] = [sol['预测性能'].get(target, float('nan')) for sol in multi_solutions]

            solutions_data.to_csv('optimization_results/raw_data/solutions_data.csv', index=False, encoding='utf-8-sig')
            print(f"已保存解决方案数据到 'optimization_results/raw_data/solutions_data.csv'")

            # 3. Excel workbook with both tables plus a wide, detailed sheet.
            with pd.ExcelWriter('optimization_results/raw_data/optimization_results.xlsx', engine='openpyxl') as writer:
                pareto_data.to_excel(writer, sheet_name='帕累托前沿数据', index=False)
                solutions_data.to_excel(writer, sheet_name='解决方案数据', index=False)

                # Collect the rows first and build the frame once, instead of
                # growing it with pd.concat inside the loop.
                detailed_rows = []
                for i, sol in enumerate(multi_solutions):
                    row = {
                        '解的排名': i + 1,
                        '基底材料': sol['基底材料'],
                        '制备方法': sol['制备方法'],
                        '综合评分': sol['综合评分']
                    }

                    # One id/value column pair per category present.
                    for cat_id, cat_val in sol['类别材料']:
                        row[f'类别{cat_id}_ID'] = cat_id
                        row[f'类别{cat_id}_值'] = cat_val

                    # One column per predicted target.
                    for target, value in sol['预测性能'].items():
                        row[f'预测_{target}'] = value

                    detailed_rows.append(row)

                detailed_solutions = pd.DataFrame(detailed_rows)
                detailed_solutions.to_excel(writer, sheet_name='详细解决方案', index=False)

            print(f"已保存完整优化结果到 'optimization_results/raw_data/optimization_results.xlsx'")

        except Exception as e:
            print(f"保存帕累托前沿原始数据时出错: {str(e)}")
else:
    # pymoo is not installed: report the skip and publish empty placeholders
    # under the mo_* names.
    print("已跳过多目标优化，没有可用的pymoo库")
    mo_best_base = mo_best_categories = mo_best_method = None
    mo_score, mo_predictions = 0, {}
    


In [None]:
# ======= 5. Merge the results of all optimizers =======
print("\n5. 集成所有优化结果...")

# Keys shared by every entry of all_solutions, in insertion order.
_solution_fields = ('优化方法', '基底材料', '类别材料', '制备方法', '综合评分', '预测性能')

all_solutions = []

# An optimizer contributes an entry only when it produced a base material.
if bayes_best_base is not None:
    all_solutions.append(dict(zip(_solution_fields, (
        '贝叶斯优化', bayes_best_base, bayes_best_categories,
        bayes_best_method, bayes_score, bayes_predictions,
    ))))

if gen_best_base is not None:
    all_solutions.append(dict(zip(_solution_fields, (
        '生成式AI', gen_best_base, gen_best_categories,
        gen_best_method, gen_score, gen_predictions,
    ))))

if de_best_base is not None:
    all_solutions.append(dict(zip(_solution_fields, (
        '差分进化', de_best_base, de_best_categories,
        de_best_method, de_score, de_predictions,
    ))))

if mo_best_base is not None:
    all_solutions.append(dict(zip(_solution_fields, (
        '多目标优化', mo_best_base, mo_best_categories,
        mo_best_method, mo_score, mo_predictions,
    ))))

# Highest composite score first; ties keep their insertion order.
all_solutions = sorted(all_solutions, key=lambda entry: entry['综合评分'], reverse=True)

# Print a comparison of the results of all optimizers.
print("\n所有优化方法的结果比较:")

if all_solutions:
    # Prefer readable names from the encoding reference workbooks; on any
    # failure fall back to printing the raw codes.
    try:
        base_ref = pd.read_excel('data_exports/base_material_encoding_reference.xlsx')
        method_ref = pd.read_excel('data_exports/method_encoding_reference.xlsx')
        material_ref = pd.read_excel('data_exports/material_encoding_reference.xlsx')

        for i, solution in enumerate(all_solutions):
            # Base-material name (each reference table is filtered only once).
            base_rows = base_ref[base_ref['编码值'] == solution['基底材料']]
            base_name = base_rows['基底材料'].values[0] if len(base_rows) > 0 else f"基底材料{solution['基底材料']}"

            # Preparation-method name.
            method_rows = method_ref[method_ref['编码值'] == solution['制备方法']]
            method_name = method_rows['制备方法'].values[0] if len(method_rows) > 0 else f"制备方法{solution['制备方法']}"

            # Category materials: report the reference material whose code is
            # closest to the (possibly fractional) optimized code.
            category_names = []
            for cat_id, cat_val in solution['类别材料']:
                cat_df = material_ref[material_ref['分类ID'] == cat_id]
                if not cat_df.empty:
                    # Compute the distance on the fly instead of writing a
                    # helper column into the filtered slice, which triggers
                    # pandas' SettingWithCopyWarning.
                    closest = cat_df.loc[(cat_df['编码值'] - cat_val).abs().idxmin()]
                    category_names.append(f"{closest['材料名称']} (类别{cat_id}, 编码{cat_val:.1f})")
                else:
                    category_names.append(f"类别{cat_id}材料 (编码{cat_val:.1f})")

            print(f"\n{i+1}. {solution['优化方法']}:")
            print(f"  基底材料: {base_name} (编码: {solution['基底材料']})")
            print(f"  类别材料: {', '.join(category_names)}")
            print(f"  制备方法: {method_name} (编码: {solution['制备方法']})")
            print(f"  综合评分: {solution['综合评分']:.4f}")
            print("  预测性能:")
            for target, value in solution['预测性能'].items():
                print(f"    {target}: {value:.2f}")
    except Exception as e:
        # Names could not be resolved: show the codes instead.
        for i, solution in enumerate(all_solutions):
            print(f"\n{i+1}. {solution['优化方法']}:")
            print(f"  基底材料: {solution['基底材料']}")
            print(f"  类别材料: {', '.join([f'(类别{cat_id}, 值{cat_val:.1f})' for cat_id, cat_val in solution['类别材料']])}")
            print(f"  制备方法: {solution['制备方法']}")
            print(f"  综合评分: {solution['综合评分']:.4f}")
            print("  预测性能:")
            for target, value in solution['预测性能'].items():
                print(f"    {target}: {value:.2f}")
        print(f"  注: 无法加载材料参照表: {str(e)}")
    
    # Keep the (up to) three best-scoring entries for the ensemble comparison.
    top_solutions = all_solutions[:3]

    # Radar chart of the normalised predictions; only drawn when there are at
    # least two entries to compare.
    if len(top_solutions) >= 2:
        categories = target_columns
        if categories:
            plt.figure(figsize=(12, 10))
            plt.rcParams['font.sans-serif'] = ['SimHei']  # font for the Chinese labels
            plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

            ax = plt.subplot(111, polar=True)

            # One spoke per target; repeat the first angle to close the polygon.
            N = len(categories)
            angles = [n / float(N) * 2 * np.pi for n in range(N)]
            angles += angles[:1]

            # Same divisors as calculate_overall_score (default 100).
            radar_scales = {'水接触角': 180.0, '循环使用次数': 50.0, '吸油能力': 50.0}

            for i, solution in enumerate(top_solutions):
                values = []
                for cat in categories:
                    if cat in solution['预测性能']:
                        val = solution['预测性能'][cat]
                        # Clamp into [0, 1] exactly like the composite score
                        # does, so negative or over-scale predictions cannot
                        # distort the polar plot.
                        norm_val = min(1.0, max(0.0, val / radar_scales.get(cat, 100.0)))
                        values.append(norm_val)
                    else:
                        # Targets without a prediction sit at the centre.
                        values.append(0)

                # Close the polygon.
                values += values[:1]

                ax.plot(angles, values, 'o-', linewidth=2, label=solution['优化方法'])
                ax.fill(angles, values, alpha=0.25)

            # Label every spoke with its target name.
            ax.set_thetagrids(np.degrees(angles[:-1]), categories)

            plt.title('不同优化方法的材料性能比较', size=15)
            plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
            plt.savefig('optimization_results/optimization_comparison.png', dpi=300)
            plt.close()
    
    # The best overall combination is the first entry of the sorted list.
    best_solution = all_solutions[0]

    # Print its details, with readable names when the encoding reference
    # workbooks can be loaded and with raw codes otherwise.
    try:
        base_ref = pd.read_excel('data_exports/base_material_encoding_reference.xlsx')
        method_ref = pd.read_excel('data_exports/method_encoding_reference.xlsx')
        material_ref = pd.read_excel('data_exports/material_encoding_reference.xlsx')

        # Base-material name (each reference table is filtered only once).
        base_rows = base_ref[base_ref['编码值'] == best_solution['基底材料']]
        base_name = base_rows['基底材料'].values[0] if len(base_rows) > 0 else f"基底材料{best_solution['基底材料']}"

        # Preparation-method name.
        method_rows = method_ref[method_ref['编码值'] == best_solution['制备方法']]
        method_name = method_rows['制备方法'].values[0] if len(method_rows) > 0 else f"制备方法{best_solution['制备方法']}"

        # Category materials: report the reference material whose code is
        # closest to the (possibly fractional) optimized code.
        category_names = []
        for cat_id, cat_val in best_solution['类别材料']:
            cat_df = material_ref[material_ref['分类ID'] == cat_id]
            if not cat_df.empty:
                # Compute the distance on the fly instead of writing a helper
                # column into the filtered slice, which triggers pandas'
                # SettingWithCopyWarning.
                closest = cat_df.loc[(cat_df['编码值'] - cat_val).abs().idxmin()]
                category_names.append(f"{closest['材料名称']} (类别{cat_id}, 编码{cat_val:.1f})")
            else:
                category_names.append(f"类别{cat_id}材料 (编码{cat_val:.1f})")

        print(f"\n集成优化框架推荐的最佳材料组合:")
        print(f"  优化方法: {best_solution['优化方法']}")
        print(f"  基底材料: {base_name} (编码: {best_solution['基底材料']})")
        print(f"  类别材料: {', '.join(category_names)}")
        print(f"  制备方法: {method_name} (编码: {best_solution['制备方法']})")
        print(f"  综合评分: {best_solution['综合评分']:.4f}")
        print("  预测性能:")
        for target, value in best_solution['预测性能'].items():
            print(f"    {target}: {value:.2f}")
    except Exception as e:
        # Names could not be resolved: show the codes instead.
        print(f"\n集成优化框架推荐的最佳材料组合:")
        print(f"  优化方法: {best_solution['优化方法']}")
        print(f"  基底材料: {best_solution['基底材料']}")
        print(f"  类别材料: {', '.join([f'(类别{cat_id}, 值{cat_val:.1f})' for cat_id, cat_val in best_solution['类别材料']])}")
        print(f"  制备方法: {best_solution['制备方法']}")
        print(f"  综合评分: {best_solution['综合评分']:.4f}")
        print("  预测性能:")
        for target, value in best_solution['预测性能'].items():
            print(f"    {target}: {value:.2f}")
        print(f"  注: 无法加载材料参照表: {str(e)}")
    
    # Grouped bar chart: one group per target, one bar per optimizer.
    plt.figure(figsize=(14, 8))
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font for the Chinese labels
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

    targets = target_columns
    methods = [entry['优化方法'] for entry in all_solutions]

    # Group centres, and the bar width that makes each group 0.8 wide in total.
    x = np.arange(len(targets))
    width = 0.8 / len(methods)

    for rank, (method, entry) in enumerate(zip(methods, all_solutions)):
        # Targets without a prediction are drawn as zero-height bars.
        performances = [entry['预测性能'].get(name, 0) for name in targets]
        # Shift this optimizer's bars to its slot inside every group.
        pos = x - 0.4 + (rank + 0.5) * width
        plt.bar(pos, performances, width, label=method)

    # Axis labels, title, tick names and legend.
    plt.xlabel('性能指标')
    plt.ylabel('预测值')
    plt.title('不同优化方法的材料性能对比')
    plt.xticks(x, targets)
    plt.legend()

    plt.tight_layout()
    plt.savefig('optimization_results/performance_comparison.png', dpi=300)
    plt.close()
    
    # Flatten every solution into one CSV row: the descriptive fields first,
    # then one column per predicted target.
    solutions_data = [
        {
            '优化方法': entry['优化方法'],
            '基底材料': entry['基底材料'],
            # Category codes are written with one decimal place.
            '类别材料': str([(cat_id, f"{cat_val:.1f}") for cat_id, cat_val in entry['类别材料']]),
            '制备方法': entry['制备方法'],
            '综合评分': entry['综合评分'],
            **entry['预测性能'],
        }
        for entry in all_solutions
    ]

    # utf-8-sig writes a BOM at the start of the file.
    solutions_df = pd.DataFrame(solutions_data)
    solutions_df.to_csv('optimization_results/optimized_materials.csv', index=False, encoding='utf-8-sig')
    print(f"\n优化结果已保存到: optimization_results/optimized_materials.csv")
    
else:
    # all_solutions is empty: there is nothing to compare, plot or save.
    print("没有任何优化方法找到有效的材料组合")

# Final status message, printed whether or not any solution was found.
print("\n材料性能多方法集成优化完成")