In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
import matplotlib
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, max_error
import optuna
import copy
import warnings
import json

# Silence library warnings (e.g. from pandas) to keep the notebook output readable.
# NOTE: this filter is global, so deprecation warnings from every library are hidden too.
warnings.filterwarnings('ignore')

# Prefer the interactive TkAgg backend. Selecting it is best-effort: if it
# cannot be activated, matplotlib's default backend is kept.
# `except Exception` (rather than a bare `except`) lets KeyboardInterrupt and
# SystemExit propagate instead of being silently swallowed.
try:
    matplotlib.use('TkAgg')
except Exception as backend_error:
    print(f"[Warning] Could not select the TkAgg backend, using the default one: {backend_error}")

# ==========================================
# 0. Global configuration
# ==========================================
GLOBAL_CONFIG = {
    # === [Source domain] Absolute paths of the source-domain CSV files ===
    'source_files': [
        r"C:\Users\CHX\xwechat_files\wxid_uomr32ya9e9k11_d15b\msg\file\2026-01\Filtered_Dataset_1_NCA_battery_features_wide.csv",
        r"C:\Users\CHX\xwechat_files\wxid_uomr32ya9e9k11_d15b\msg\file\2026-01\Filtered_Dataset_2_NCM_battery_features_wide.csv",
        r"C:\Users\CHX\xwechat_files\wxid_uomr32ya9e9k11_d15b\msg\file\2026-01\Filtered_Dataset_3_NCM_NCA_battery_features_wide_downsampled_120s.csv"
    ],
    # Alternative (unfiltered) source-file selection, kept for reference:
    # 'source_files': [
    #     r"C:\Users\CHX\xwechat_files\wxid_uomr32ya9e9k11_d15b\msg\file\2026-01\Dataset_2_NCM_battery_features_wide.csv",
    #     r"C:\Users\CHX\xwechat_files\wxid_uomr32ya9e9k11_d15b\msg\file\2026-01\Dataset_3_NCM_NCA_battery_features_wide_downsampled_120s.csv"
    # ],
    # === [Target domain] Parent folder that contains the target-domain CSV files ===
    'target_parent_folder': r"C:\Users\CHX\xwechat_files\wxid_uomr32ya9e9k11_d15b\msg\file\2026-01\降采样耐久性测试数据",
    
    # === [Output] Directory in which the per-trial results are saved ===
    'save_dir': r"C:\Users\CHX\xwechat_files\wxid_uomr32ya9e9k11_d15b\msg\file\2026-01\2409贝叶斯迁移结果\LFP降采样SOH", 
    
    # === [Target split] Batteries used for fine-tuning (train) and for testing ===
    # Only a keyword contained in the file name is needed, not the full path.
    'train_car_names': ['battery_1', 'battery_5', 'battery_8'], 
    'test_car_names':  ['battery_2', 'battery_4', 'battery_3', 'battery_6', 'battery_7'],
    
    # === [Feature columns] 14 voltage segments ===
    'features': [f'V_sec_{i}' for i in range(1, 15)], # V_sec_1 .. V_sec_14
    
    'label_col': 'SOH',       # name of the label column
    'time_col': 'start_time', # optional time column; the test loader falls back to the row index when it is missing
    'input_dim': 14,          # model input dimension (matches the 14 feature columns)
    'latent_dim': 6,          # width of the latent (bottleneck) layer
    
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    'n_trials': 80,       # number of Optuna trials
    'max_epochs': 1000,   # maximum number of fine-tuning epochs
    'patience': 200       # early-stopping patience, in epochs
}

# Make sure the output directory exists before any trial writes to it
os.makedirs(GLOBAL_CONFIG['save_dir'], exist_ok=True)
print(f"Using device: {GLOBAL_CONFIG['device']}")
print(f"Results will be saved to: {GLOBAL_CONFIG['save_dir']}")

# ==========================================
# 1. Utilities and data loading
# ==========================================

class EarlyStopping:
    """Track a validation loss and signal when it has stopped improving.

    Call the instance once per epoch with the current validation loss and the
    model. A deep copy of ``model.state_dict()`` is kept for the best loss seen
    so far in ``best_model_state``; ``early_stop`` becomes True after
    ``patience`` consecutive calls without an improvement.

    Args:
        patience: Number of consecutive non-improving calls tolerated before
            ``early_stop`` is set.
        min_delta: The loss must be at most ``best_loss - min_delta`` to count
            as an improvement.
    """

    def __init__(self, patience=200, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0              # consecutive calls without improvement
        self.best_loss = None         # lowest loss seen so far (None before the first call)
        self.early_stop = False
        self.best_model_state = None  # deep copy of the state_dict at best_loss

    def __call__(self, val_loss, model):
        """Record ``val_loss`` and update the snapshot, counter and stop flag."""
        if self.best_loss is None:
            # First call: always snapshot so best_model_state is never None
            # afterwards. A NaN first loss (NaN != NaN) is stored as +inf so
            # that any later finite loss can still replace it.
            self.best_loss = float('inf') if val_loss != val_loss else val_loss
            self.best_model_state = copy.deepcopy(model.state_dict())
        elif val_loss <= self.best_loss - self.min_delta:
            # Written as a positive "<=" test so that a NaN loss (every
            # comparison with NaN is False) is counted as "no improvement"
            # instead of overwriting the best snapshot with a diverged model.
            self.best_loss = val_loss
            self.best_model_state = copy.deepcopy(model.state_dict())
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

def load_source_data_csv(file_list, feature_cols, label_col):
    """Load and stack the source-domain CSV files.

    Paths that do not exist are skipped with a warning; files that fail to
    load are reported and skipped. Rows with a missing feature or label value
    are dropped.

    Args:
        file_list: Paths of the CSV files to read.
        feature_cols: List of feature column names.
        label_col: Name of the label column.

    Returns:
        Tuple ``(X, y)``: the stacked feature matrix and the stacked labels
        as a single-column 2-D array.

    Raises:
        ValueError: If no file could be loaded.
    """
    print(f"Loading Source Data from {len(file_list)} specific files...")

    per_file_tables = []
    for csv_path in file_list:
        if not os.path.exists(csv_path):
            print(f"[Warning] Source file not found: {csv_path}")
            continue
        try:
            frame = pd.read_csv(csv_path)
            # Keep only rows where every feature and the label are present.
            frame = frame.dropna(subset=feature_cols + [label_col])
            feature_block = frame[feature_cols].values
            # The label is used as stored; no rescaling is applied here.
            label_block = frame[label_col].values.reshape(-1, 1)
            per_file_tables.append(np.hstack((feature_block, label_block)))
            print(f"  Loaded {os.path.basename(csv_path)}: shape {feature_block.shape}")
        except Exception as e:
            print(f"  Error reading {os.path.basename(csv_path)}: {e}")

    if not per_file_tables:
        raise ValueError("No source data loaded.")

    stacked = np.vstack(per_file_tables)
    # The last column is the label; everything before it is a feature.
    return stacked[:, :-1], stacked[:, -1:]

def get_files_by_names(folder_path, target_names):
    """Return the sorted CSV paths in ``folder_path`` whose file name contains a keyword.

    A keyword that ends in a digit only matches when it is not immediately
    followed by another digit in the file name. Plain substring matching would
    let ``'battery_1'`` also select ``battery_10``, ``battery_11``, ...,
    which can silently mix files between the train and test splits.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.csv`` files.
        target_names: Iterable of file-name keywords.

    Returns:
        Sorted list of matching paths; each file appears at most once.
    """
    import re  # local import so the block does not depend on the file header

    keyword_patterns = []
    for target in target_names:
        pattern = re.escape(target)
        if target[-1:].isdecimal():
            # Do not let a trailing number match a longer number.
            pattern += r"(?!\d)"
        keyword_patterns.append(re.compile(pattern))

    selected_files = [
        path
        for path in glob.glob(os.path.join(folder_path, "*.csv"))
        if any(p.search(os.path.basename(path)) for p in keyword_patterns)
    ]
    return sorted(selected_files)

def load_target_train_stacked_csv(folder_path, target_names, feature_cols, label_col):
    """Load the target-domain training CSVs and stack them into one data set.

    Files are selected with ``get_files_by_names``. Rows with a missing
    feature or label value are dropped; files that fail to load are reported
    and skipped.

    Args:
        folder_path: Folder containing the target-domain CSV files.
        target_names: File-name keywords of the training batteries.
        feature_cols: List of feature column names.
        label_col: Name of the label column.

    Returns:
        Tuple ``(X, y)``: the stacked feature matrix and the stacked labels
        as a single-column 2-D array.

    Raises:
        ValueError: If no matching file could be loaded.
    """
    matched_paths = get_files_by_names(folder_path, target_names)
    print(f"Loading Target Train Data: Found {len(matched_paths)} files matching {target_names}")

    per_file_tables = []
    for csv_path in matched_paths:
        try:
            frame = pd.read_csv(csv_path)
            frame = frame.dropna(subset=feature_cols + [label_col])
            feature_block = frame[feature_cols].values
            label_block = frame[label_col].values.reshape(-1, 1)
            per_file_tables.append(np.hstack((feature_block, label_block)))
            print(f"  Loaded Train File: {os.path.basename(csv_path)}")
        except Exception as e:
            print(f"  Error reading {os.path.basename(csv_path)}: {e}")

    if not per_file_tables:
        raise ValueError(f"No target train data found for names: {target_names}")

    stacked = np.vstack(per_file_tables)
    # The last column is the label; everything before it is a feature.
    return stacked[:, :-1], stacked[:, -1:]

def load_target_test_individual_csv(folder_path, target_names, feature_cols, label_col):
    """Load the target-domain test CSVs, keeping each file separate for evaluation.

    Files are selected with ``get_files_by_names``. Rows with a missing
    feature or label value are dropped. The time axis is read from the
    filtered frame, so every time value belongs to the sample in the same
    row. Taking it before ``dropna`` and truncating afterwards shifts the
    time values whenever a dropped row is not at the very end.

    Args:
        folder_path: Folder containing the target-domain CSV files.
        target_names: File-name keywords of the test batteries.
        feature_cols: List of feature column names.
        label_col: Name of the label column.

    Returns:
        List with one dict per loaded file, with keys ``'X_raw'`` (features),
        ``'y_raw'`` (single-column labels), ``'times'`` (values of
        ``GLOBAL_CONFIG['time_col']``, or the original row index when that
        column is absent) and ``'name'`` (file name). Files that fail to load
        are reported and skipped, so the list may be empty.
    """
    files = get_files_by_names(folder_path, target_names)
    print(f"Loading Target Test Data: Found {len(files)} files matching {target_names}")

    cars_data = []
    for f in files:
        try:
            df = pd.read_csv(f)
            # Filter first; everything below reads from the same filtered frame.
            df = df.dropna(subset=feature_cols + [label_col])

            time_feat = GLOBAL_CONFIG['time_col']
            if time_feat in df.columns:
                times = df[time_feat].values
            else:
                # No time column: use the original row index of each kept row.
                times = df.index.values

            X_raw = df[feature_cols].values
            y_raw = df[label_col].values.reshape(-1, 1)

            filename = os.path.basename(f)
            cars_data.append({
                'X_raw': X_raw,
                'y_raw': y_raw,
                'times': times,
                'name': filename
            })
            print(f"  Loaded Test File: {filename}")
        except Exception as e:
            print(f"  Error reading test file {os.path.basename(f)}: {e}")

    return cars_data
def get_dataloader(X, y, batch_size, shuffle=True, drop_last=False):
    """Wrap features ``X`` and labels ``y`` in a float32 ``DataLoader``.

    Args:
        X: Feature array accepted by ``torch.FloatTensor``.
        y: Label array accepted by ``torch.FloatTensor``.
        batch_size: Number of samples per batch.
        shuffle: Whether the samples are reshuffled every epoch.
        drop_last: Whether a trailing incomplete batch is discarded.

    Returns:
        ``DataLoader`` over a ``TensorDataset`` of ``(X, y)`` pairs.
    """
    paired_samples = TensorDataset(torch.FloatTensor(X), torch.FloatTensor(y))
    loader_options = {
        'batch_size': batch_size,
        'shuffle': shuffle,
        'drop_last': drop_last,
    }
    return DataLoader(paired_samples, **loader_options)


# ==========================================
# 2. Model definition (input dimension 14)
# ==========================================
class SAE_FCN_Paper(nn.Module):
    """Two-stage auto-encoder with a fully connected regression head.

    The encoder maps ``input_dim -> 24 -> 12 -> 8 -> latent_dim``, the decoder
    mirrors it back to ``input_dim``, and the regression head maps the latent
    code to a single output.

    Args:
        input_dim: Number of input features.
        latent_dim: Size of the latent code.
    """

    def __init__(self, input_dim=14, latent_dim=6):
        super().__init__()

        # Encoder, stage 1: input_dim -> 24 -> 12
        self.enc1 = nn.Sequential(
            nn.Linear(input_dim, 24),
            nn.BatchNorm1d(24),
            nn.ReLU(),
            nn.Linear(24, 12),
            nn.Tanh(),
        )
        # Encoder, stage 2: 12 -> 8 -> latent_dim
        self.enc2 = nn.Sequential(
            nn.Linear(12, 8),
            nn.BatchNorm1d(8),
            nn.ReLU(),
            nn.Linear(8, latent_dim),
            nn.Tanh(),
        )

        # Decoder, stage 2: latent_dim -> 8 -> 12
        self.dec2 = nn.Sequential(
            nn.Linear(latent_dim, 8),
            nn.BatchNorm1d(8),
            nn.Sigmoid(),
            nn.Linear(8, 12),
        )
        # Decoder, stage 1: 12 -> 24 -> input_dim
        self.dec1 = nn.Sequential(
            nn.Linear(12, 24),
            nn.BatchNorm1d(24),
            nn.Sigmoid(),
            nn.Linear(24, input_dim),
        )

        # Regression head: latent_dim -> 16 -> 8 -> 1
        self.fcn = nn.Sequential(
            nn.Linear(latent_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
        )

    def forward(self, x):
        """Return ``(reconstruction, prediction, latent_code)`` for the batch ``x``."""
        latent_code = self.enc2(self.enc1(x))
        reconstruction = self.dec1(self.dec2(latent_code))
        prediction = self.fcn(latent_code)
        return reconstruction, prediction, latent_code

# ==========================================
# 3. Bayesian-optimisation objective
# ==========================================
def objective(trial, X_src, y_src, X_tgt_train, y_tgt_train, test_cars_data, scaler_tgt):
    """Run one Optuna trial: pre-train on the source domain, fine-tune on the target domain, evaluate.

    The trial samples two learning rates, the reconstruction/regression loss
    weight and the batch size, pre-trains ``SAE_FCN_Paper`` for 50 epochs on
    the source data, then fine-tunes the encoder and decoder (regression head
    frozen) on the target training data with early stopping. The best
    snapshot is saved together with an Excel sheet of predictions per test
    file and a text report, all under ``<save_dir>/Trial_<number>``.

    NOTE(review): the test batteries are used for early stopping and pruning
    and also provide the objective value, so the reported test metrics are
    optimistically biased. A separate validation split would remove this.

    Args:
        trial: The ``optuna`` trial used for sampling, reporting and pruning.
        X_src: Source-domain features, passed to the data loader as given
            (the caller in this file passes already-scaled values).
        y_src: Source-domain labels.
        X_tgt_train: Target-domain training features (already scaled by the caller).
        y_tgt_train: Target-domain training labels.
        test_cars_data: List of dicts with keys ``'X_raw'``, ``'y_raw'``,
            ``'times'`` and ``'name'``, one per test file.
        scaler_tgt: Fitted scaler whose ``transform`` is applied to the raw
            test features.

    Returns:
        Overall RMSE over all test files, computed on labels multiplied by 100.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial.
    """
    device = GLOBAL_CONFIG['device']

    # 3.1 Sample hyperparameters.
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform and samples from the same distribution.
    lr_pre = trial.suggest_float('lr_pretrain', 1e-4, 1e-2, log=True)
    lr_fine = trial.suggest_float('lr_finetune', 1e-5, 1e-3, log=True)
    lambda_w = trial.suggest_float('lambda_weight', 0.1, 0.9)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])

    # 3.2 Data preparation.
    # drop_last=True discards a trailing incomplete batch (presumably to keep
    # BatchNorm1d away from very small batches in training mode).
    loader_src = get_dataloader(X_src, y_src, batch_size, shuffle=True, drop_last=True)
    loader_tgt_tr = get_dataloader(X_tgt_train, y_tgt_train, batch_size, shuffle=True, drop_last=True)

    # Validation set: all test files merged; it drives early stopping and pruning.
    X_test_all_raw = np.vstack([c['X_raw'] for c in test_cars_data])
    y_test_all_raw = np.vstack([c['y_raw'] for c in test_cars_data])
    X_test_all_norm = scaler_tgt.transform(X_test_all_raw)

    tensor_test_x = torch.FloatTensor(X_test_all_norm).to(device)
    tensor_test_y = torch.FloatTensor(y_test_all_raw).to(device)
    loader_test_val = DataLoader(TensorDataset(tensor_test_x, tensor_test_y), batch_size=batch_size, shuffle=False)

    # 3.3 Model.
    model = SAE_FCN_Paper(input_dim=GLOBAL_CONFIG['input_dim'], latent_dim=GLOBAL_CONFIG['latent_dim']).to(device)
    loss_fn = nn.MSELoss()

    # 3.4 Source-domain pre-training (fixed 50 epochs, all parameters trainable).
    optimizer_pre = optim.Adam(model.parameters(), lr=lr_pre)
    model.train()
    for _ in range(50):
        for bx, by in loader_src:
            bx, by = bx.to(device), by.to(device)
            optimizer_pre.zero_grad()
            xr, sp, _ = model(bx)
            # Weighted sum of reconstruction loss and regression loss.
            loss = lambda_w * loss_fn(xr, bx) + (1-lambda_w) * loss_fn(sp, by)
            loss.backward()
            optimizer_pre.step()

    # 3.5 Target-domain fine-tuning with early stopping; the regression head is frozen.
    for param in model.fcn.parameters():
        param.requires_grad = False

    optimizer_fine = optim.Adam(list(model.enc1.parameters()) + list(model.enc2.parameters()) +
                                list(model.dec1.parameters()) + list(model.dec2.parameters()), lr=lr_fine)

    es = EarlyStopping(patience=GLOBAL_CONFIG['patience'])

    for epoch in range(GLOBAL_CONFIG['max_epochs']):
        # Train
        model.train()
        for bx, by in loader_tgt_tr:
            bx, by = bx.to(device), by.to(device)
            optimizer_fine.zero_grad()
            xr, sp, _ = model(bx)
            loss = lambda_w * loss_fn(xr, bx) + (1-lambda_w) * loss_fn(sp, by)
            loss.backward()
            optimizer_fine.step()

        # Validation (the tensors were moved to the device when the loader was built).
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for bx, by in loader_test_val:
                xr, sp, _ = model(bx)
                batch_loss = lambda_w * loss_fn(xr, bx) + (1-lambda_w) * loss_fn(sp, by)
                val_loss += batch_loss.item()
        val_loss /= len(loader_test_val)

        es(val_loss, model)
        if es.early_stop:
            break

        trial.report(val_loss, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # =========================================================
    # 3.6 Evaluation and saving
    # =========================================================

    # Restore the parameters of the best validation epoch.
    model.load_state_dict(es.best_model_state)
    model.eval()

    # --- 1. Sub-folder for this trial ---
    trial_subfolder = os.path.join(GLOBAL_CONFIG['save_dir'], f"Trial_{trial.number}")
    os.makedirs(trial_subfolder, exist_ok=True)

    # --- 2. Save the weights and the full (pickled) model ---
    torch.save(model.state_dict(), os.path.join(trial_subfolder, "model_weights.pth"))
    torch.save(model, os.path.join(trial_subfolder, "full_model.pth"))

    excel_path = os.path.join(trial_subfolder, "Predictions.xlsx")

    # Collected over all test files to compute the overall metrics.
    all_trues_list, all_preds_list = [], []

    # Lines of the per-file metrics report written to the TXT file.
    metrics_report_lines = []
    metrics_report_lines.append(f"=== Trial {trial.number} Detailed Report ===\n")
    metrics_report_lines.append(f"Hyperparameters: {json.dumps(trial.params, indent=4)}\n")
    metrics_report_lines.append("-" * 50 + "\n")
    metrics_report_lines.append(f"{'Car Name':<20} | {'RMSE(%)':<10} | {'MAE(%)':<10} | {'R2':<10} | {'MaxError(%)':<10}\n")
    metrics_report_lines.append("-" * 80 + "\n")

    # The context manager closes (and saves) the workbook even if a file fails
    # part-way through, so the file handle is not leaked.
    with pd.ExcelWriter(excel_path, engine='openpyxl') as writer, torch.no_grad():
        for car_info in test_cars_data:
            # Predict
            x_norm = scaler_tgt.transform(car_info['X_raw'])
            t_x = torch.FloatTensor(x_norm).to(device)
            _, pred_01, _ = model(t_x)

            # Labels are treated as fractions; multiply by 100 so that the
            # metrics are reported in percent.
            pred_01 = pred_01.cpu().numpy().flatten()
            true_01 = car_info['y_raw'].flatten()
            pred_pct = pred_01 * 100
            true_pct = true_01 * 100

            # Per-file metrics
            m_rmse = np.sqrt(mean_squared_error(true_pct, pred_pct))
            m_mae = mean_absolute_error(true_pct, pred_pct)
            m_r2 = r2_score(true_pct, pred_pct)
            m_max = max_error(true_pct, pred_pct)

            car_name_short = car_info['name'].replace('.csv', '')
            metrics_report_lines.append(f"{car_name_short[:19]:<20} | {m_rmse:<10.4f} | {m_mae:<10.4f} | {m_r2:<10.4f} | {m_max:<10.4f}\n")

            all_trues_list.extend(true_pct)
            all_preds_list.extend(pred_pct)

            # --- 3. One Excel sheet per test file ---
            # Sheet names are cut to 30 characters to stay within Excel's name-length limit.
            pd.DataFrame({
                'start_time': car_info['times'],
                'True_SOH': true_01,            # raw fraction
                'Predicted_SOH': pred_01,       # raw fraction
                'True_SOH_Pct': true_pct,       # percent, for convenience
                'Predicted_SOH_Pct': pred_pct
            }).to_excel(writer, sheet_name=car_name_short[:30], index=False)

    # Overall metrics over all test files combined.
    all_trues = np.array(all_trues_list)
    all_preds = np.array(all_preds_list)

    ov_rmse = np.sqrt(mean_squared_error(all_trues, all_preds))
    ov_mae = mean_absolute_error(all_trues, all_preds)
    ov_r2 = r2_score(all_trues, all_preds)
    ov_max = max_error(all_trues, all_preds)

    # Report footer
    metrics_report_lines.append("-" * 80 + "\n")
    metrics_report_lines.append(f"{'OVERALL (Combined)':<20} | {ov_rmse:<10.4f} | {ov_mae:<10.4f} | {ov_r2:<10.4f} | {ov_max:<10.4f}\n")

    # --- 4. Save the TXT report ---
    txt_path = os.path.join(trial_subfolder, "hyperparameters_metrics.txt")
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.writelines(metrics_report_lines)

    print(f"[Trial {trial.number}] RMSE: {ov_rmse:.4f}% | Saved to folder: {os.path.basename(trial_subfolder)}")
    return ov_rmse

# ==========================================
# 4. Main workflow
# ==========================================
if __name__ == '__main__':
    # 4.1 Load the source domain
    print(">>> 1. Loading Source Domain (Specified CSVs)...")
    X_src, y_src = load_source_data_csv(
        GLOBAL_CONFIG['source_files'], 
        GLOBAL_CONFIG['features'], 
        GLOBAL_CONFIG['label_col']
    )
    # Normalisation: MinMaxScaler is used for both the source and the target domain
    scaler_src = MinMaxScaler()
    X_src_norm = scaler_src.fit_transform(X_src)
    
    # 4.2 Load the target-domain training set (fine-tuning batteries)
    print("\n>>> 2. Loading Target Train Set (Fine-tuning Cars)...")
    X_tgt_tr, y_tgt_tr = load_target_train_stacked_csv(
        GLOBAL_CONFIG['target_parent_folder'], 
        GLOBAL_CONFIG['train_car_names'], 
        GLOBAL_CONFIG['features'],
        GLOBAL_CONFIG['label_col']
    )
    # The target-domain scaler is fit on the training set only
    scaler_tgt = MinMaxScaler()
    X_tgt_tr_norm = scaler_tgt.fit_transform(X_tgt_tr)
    
    # 4.3 Load the target-domain test set (test batteries, kept per file)
    print("\n>>> 3. Loading Target Test Set (Testing Cars)...")
    test_cars_data = load_target_test_individual_csv(
        GLOBAL_CONFIG['target_parent_folder'], 
        GLOBAL_CONFIG['test_car_names'],
        GLOBAL_CONFIG['features'],
        GLOBAL_CONFIG['label_col']
    )
    
    # 4.4 Run the Optuna hyperparameter search
    print(f"\n>>> 4. Starting Optuna ({GLOBAL_CONFIG['n_trials']} trials)...")
    # Only show Optuna messages at WARNING level or above
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction="minimize")
    
    # Bind the data sets so that Optuna can call the objective with the trial only
    func = lambda trial: objective(
        trial, 
        X_src_norm, y_src, 
        X_tgt_tr_norm, y_tgt_tr,
        test_cars_data,
        scaler_tgt
    )
    
    study.optimize(func, n_trials=GLOBAL_CONFIG['n_trials'])
    
    print("\n" + "="*50)
    print("OPTIMIZATION FINISHED")
    print(f"Best Trial: {study.best_trial.number}")
    print(f"Best RMSE : {study.best_value:.4f}%")
    print(f"All trial results saved to: {GLOBAL_CONFIG['save_dir']}")
    print("="*50)
    
    # 4.5 Plot the best trial: predicted vs. true values read back from its Excel file
    try:
        best_folder = os.path.join(GLOBAL_CONFIG['save_dir'], f"Trial_{study.best_trial.number}")
        best_file = os.path.join(best_folder, "Predictions.xlsx")
        
        print(f"Plotting results from: {best_file}")
        
        # Pool the percentage columns of every sheet (one sheet per test file)
        all_true, all_pred = [], []
        xl = pd.ExcelFile(best_file)
        for sheet in xl.sheet_names:
            df = xl.parse(sheet)
            all_true.extend(df['True_SOH_Pct'].values)
            all_pred.extend(df['Predicted_SOH_Pct'].values)
        
        plt.figure(figsize=(8, 8))
        plt.scatter(all_true, all_pred, alpha=0.5, label='Datapoints')
        # Common axis range, used to draw the ideal y = x line
        mi, ma = min(min(all_true), min(all_pred)), max(max(all_true), max(all_pred))
        plt.plot([mi, ma], [mi, ma], 'r--', label='Ideal')
        plt.title(f"Best Transfer Result (Trial {study.best_trial.number})\nRMSE: {study.best_value:.4f}%")
        plt.xlabel("True SOH (%)"); plt.ylabel("Predicted SOH (%)")
        plt.legend(); plt.grid(True)
        plt.show()
    except Exception as e:
        # Plotting is best-effort: a failure is reported without aborting the run
        print(f"Plotting failed: {e}")


Using device: cuda
Results will be saved to: C:\Users\CHX\xwechat_files\wxid_uomr32ya9e9k11_d15b\msg\file\2026-01\2409贝叶斯迁移结果\LFP降采样SOH
>>> 1. Loading Source Domain (Specified CSVs)...
Loading Source Data from 3 specific files...
  Loaded Filtered_Dataset_1_NCA_battery_features_wide.csv: shape (13944, 14)
  Loaded Filtered_Dataset_2_NCM_battery_features_wide.csv: shape (17470, 14)
  Loaded Filtered_Dataset_3_NCM_NCA_battery_features_wide_downsampled_120s.csv: shape (4118, 14)

>>> 2. Loading Target Train Set (Fine-tuning Cars)...
Loading Target Train Data: Found 3 files matching ['battery_1', 'battery_5', 'battery_8']
  Loaded Train File: Resampled_battery_1_merged_down_120s_14pts.csv
  Loaded Train File: Resampled_battery_5_merged_down_120s_14pts.csv
  Loaded Train File: Resampled_battery_8_merged_down_120s_14pts.csv

>>> 3. Loading Target Test Set (Testing Cars)...
Loading Target Test Data: Found 5 files matching ['battery_2', 'battery_4', 'battery_3', 'battery_6', 'battery_7']
  Loa

[W 2026-01-04 12:25:01,752] Trial 7 failed with parameters: {'lr_pretrain': 0.0006774174796239654, 'lr_finetune': 0.0005616579343112419, 'lambda_weight': 0.29242765516310987, 'batch_size': 128} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "D:\Anaconda\envs\CHX\lib\site-packages\optuna\study\_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\CHX\AppData\Local\Temp\ipykernel_38028\530030776.py", line 438, in <lambda>
    func = lambda trial: objective(
  File "C:\Users\CHX\AppData\Local\Temp\ipykernel_38028\530030776.py", line 292, in objective
    for bx, by in loader_test_val:
  File "D:\Anaconda\envs\CHX\lib\site-packages\torch\utils\data\dataloader.py", line 733, in __next__
    data = self._next_data()
  File "D:\Anaconda\envs\CHX\lib\site-packages\torch\utils\data\dataloader.py", line 789, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "D:\Anaconda

KeyboardInterrupt: 