In [2]:
"""
CQL连续动作空间模型评估脚本

评估方法：
1. FQE (Fitted Q Evaluation) - 策略价值评估
2. 与行为策略对比
3. Episode级别分析
4. 可视化

运行方式：
    python evaluate_cql_continuous.py
"""

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List
import warnings
warnings.filterwarnings('ignore')

# Configure fonts so Chinese (CJK) text can be rendered in matplotlib figures.
# The list is a priority-ordered set of sans-serif families.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph,
# which some CJK fonts do not provide.
plt.rcParams['axes.unicode_minus'] = False

# Model definitions (must stay consistent with the training script)
def create_mlp(input_dim, output_dim, hidden_sizes=(256, 256), activation=nn.ReLU):
    """Build a fully connected network as an ``nn.Sequential``.

    Every entry of ``hidden_sizes`` contributes one ``nn.Linear`` followed by
    a fresh ``activation()`` instance. A final ``nn.Linear`` maps the last
    hidden width (or ``input_dim`` when there are no hidden layers) to
    ``output_dim`` with no activation after it.
    """
    # Consecutive pairs of this list are the (in, out) sizes of hidden layers.
    widths = [input_dim, *hidden_sizes]

    modules = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        modules.append(nn.Linear(fan_in, fan_out))
        modules.append(activation())

    # Output projection, deliberately left without a non-linearity.
    modules.append(nn.Linear(widths[-1], output_dim))
    return nn.Sequential(*modules)

class GaussianPolicy(nn.Module):
    """Diagonal Gaussian policy whose samples are squashed through tanh.

    A single MLP emits ``2 * action_dim`` values per state: the first half is
    the mean, the second half a raw log-std that is mapped into
    ``[log_std_min, log_std_max]``.
    """

    def __init__(self, state_dim, action_dim, hidden_sizes=(256, 256)):
        super().__init__()
        self.net = create_mlp(state_dim, 2 * action_dim, hidden_sizes)
        self.log_std_min = -5
        self.log_std_max = 2

    def forward(self, state):
        """Return ``(mean, std)`` of the pre-tanh Gaussian for ``state``."""
        mean, raw_log_std = self.net(state).chunk(2, dim=-1)
        # tanh bounds the raw value to [-1, 1]; rescale that to the allowed
        # log-std interval before exponentiating.
        squashed = torch.tanh(raw_log_std)
        span = self.log_std_max - self.log_std_min
        log_std = self.log_std_min + 0.5 * (squashed + 1) * span
        return mean, log_std.exp()

    def sample(self, state):
        """Draw a reparameterized action.

        Returns:
            ``(action, log_prob, tanh(mean))`` where ``action`` is the
            tanh-squashed sample and ``log_prob`` is summed over the last
            dimension (kept as a size-1 axis).
        """
        mean, std = self(state)
        gaussian = torch.distributions.Normal(mean, std)
        pre_tanh = gaussian.rsample()
        action = torch.tanh(pre_tanh)
        # Change-of-variables term for the tanh squashing; the epsilon keeps
        # the log finite when |action| approaches 1.
        squash_correction = torch.log(1 - action.pow(2) + 1e-6)
        log_prob = gaussian.log_prob(pre_tanh) - squash_correction
        log_prob = log_prob.sum(dim=-1, keepdim=True)
        return action, log_prob, torch.tanh(mean)

class QNetwork(nn.Module):
    """Critic that maps a (state, action) pair to a single scalar value."""

    def __init__(self, state_dim, action_dim, hidden_sizes=(256, 256)):
        super().__init__()
        joint_dim = state_dim + action_dim
        self.net = create_mlp(joint_dim, 1, hidden_sizes)

    def forward(self, state, action):
        """Concatenate state and action on the last axis and score the pair."""
        return self.net(torch.cat((state, action), dim=-1))

class CQLAgent(nn.Module):
    """Container for twin critics, their target copies and the policy."""

    def __init__(self, state_dim, action_dim, hidden_sizes=(256, 256)):
        super().__init__()
        # Creation order (q1, q2, q1_target, q2_target, policy) is preserved
        # so parameter initialization and state_dict layout stay the same.
        for critic_name in ("q1", "q2", "q1_target", "q2_target"):
            setattr(self, critic_name, QNetwork(state_dim, action_dim, hidden_sizes))
        self.policy = GaussianPolicy(state_dim, action_dim, hidden_sizes)

        # Targets start as exact copies of their online critics.
        for online, target in self._critic_pairs():
            target.load_state_dict(online.state_dict())

    def _critic_pairs(self):
        """Return the (online, target) critic pairs in update order."""
        return ((self.q1, self.q1_target), (self.q2, self.q2_target))

    @torch.no_grad()
    def soft_update(self, tau=0.005):
        """Polyak-average the online critics into the targets in place.

        Each target parameter becomes ``(1 - tau) * target + tau * online``.
        """
        keep = 1 - tau
        for online, target in self._critic_pairs():
            for target_param, online_param in zip(target.parameters(), online.parameters()):
                target_param.data.mul_(keep).add_(tau * online_param.data)

# Dataset column definitions
# Columns that make up the state vector; they are selected together from the
# dataframe and passed through the state scaler.
STATE_COLS = [
    "vanco_level(ug/mL)",
    "creatinine(mg/dL)",
    "wbc(K/uL)",
    "bun(mg/dL)",
    "temperature",
    "sbp",
    "heart_rate"
]
# Column holding the logged action in its raw (unscaled) value.
ACTION_COL = "totalamount_mg"
# Column holding the per-step reward.
REWARD_COL = "step_reward"
# Episode identifier and time-step columns (the data is sorted by these two).
TIME_COLS = ["stay_id", "step_4hr"]

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ========== 1. Load the model ==========
print("=" * 80)
print("CQL连续动作空间模型评估")
print("=" * 80)

print("\n1. 加载模型...")
# SECURITY: weights_only=False makes torch.load unpickle arbitrary Python
# objects, which can execute code. Only load checkpoints from a trusted source.
# (Presumably required because the checkpoint also stores scaler objects.)
checkpoint = torch.load('cql_final_model.pt', map_location=DEVICE, weights_only=False)
# Preprocessing objects and hyper-parameters saved alongside the weights.
state_scaler = checkpoint['state_scaler']
action_scaler = checkpoint['action_scaler']
config = checkpoint['config']
# Reward normalization parameters; rewards are mapped as (r - r_min) / r_range.
r_min = checkpoint['r_min']
r_range = checkpoint['r_range']
state_dim = checkpoint['state_dim']
action_dim = checkpoint['action_dim']

# Rebuild the agent and restore its weights.
# NOTE(review): the agent is built with the default hidden_sizes; this must
# match the architecture used in training or load_state_dict will fail.
agent = CQLAgent(state_dim, action_dim).to(DEVICE)
agent.load_state_dict(checkpoint['agent'])
agent.eval()

print(f"✅ 模型已加载")
print(f"   状态维度: {state_dim}")
print(f"   动作维度: {action_dim}")
print(f"   配置: alpha={config['alpha']}, gamma={config['gamma']}")

# ========== 2. Load and preprocess the data ==========
print("\n2. 加载数据...")
df = pd.read_csv("ready_data.csv")
# Sort by episode id, then time step, so every stay is contiguous and in
# chronological order; later per-episode logic relies on this ordering.
df = df.sort_values(TIME_COLS).reset_index(drop=True)
# Impute missing state features with the per-column median of this dataset.
df[STATE_COLS] = df[STATE_COLS].fillna(df[STATE_COLS].median())

# Standardize the states with the scaler stored in the checkpoint.
full_states = state_scaler.transform(df[STATE_COLS].values)
full_rewards = df[REWARD_COL].values.astype(np.float32)

# Normalize rewards with the parameters saved at training time.
full_rewards = (full_rewards - r_min) / r_range

# Terminal flags: the last row of every stay is marked done. A single
# duplicated(keep='last') pass replaces the per-stay boolean masking, which
# scanned the whole frame once per episode. Rows with a missing stay_id are
# never flagged, as before.
is_last_step = ~df['stay_id'].duplicated(keep='last') & df['stay_id'].notna()
full_dones = is_last_step.to_numpy().astype(np.float32)

print("✅ 数据已加载")
print(f"   总记录数: {len(df)}")
print(f"   总episode数: {df['stay_id'].nunique()}")
print(f"   奖励范围（归一化）: [{full_rewards.min():.4f}, {full_rewards.max():.4f}]")

# ========== 3. FQE evaluation functions ==========
def evaluate_policy_value_fqe(agent, states, rewards, dones, gamma=0.99, n_samples=10):
    """
    Estimate the value of the agent's policy from the agent's own critics.

    For a continuous action space the state value is approximated by a
    Monte Carlo average:
        V^pi(s) ~= (1/N) * sum_i Q(s, a_i),  a_i ~ pi(.|s)
    where Q is the mean of ``agent.q1`` and ``agent.q2``.

    NOTE: despite the "fqe" in the name, no separate Q-function is fitted
    here; the values are read directly from the critics stored on ``agent``.

    Args:
        agent: module exposing ``policy.sample(states)`` (whose first return
            value is the action) and callables ``q1(states, actions)`` and
            ``q2(states, actions)``.
        states: per-step state features, one row per step, in the same order
            as ``rewards`` and ``dones``.
        rewards: per-step rewards.
        dones: per-step flags; a value of 1 closes an episode. The final row
            always closes an episode even if its flag is not set.
        gamma: discount factor applied to the logged rewards.
        n_samples: number of policy actions sampled per state; must be >= 1.

    Returns:
        dict with the mean/std of the estimated value over all states, the
        estimated value at each episode's first state, each episode's
        discounted logged return and length, their means, and the number of
        episodes.

    Raises:
        ValueError: if ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        # With zero samples the average below would silently become NaN.
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")

    agent.eval()
    # Put the inputs on the agent's own device instead of relying on a
    # module-level global, so the function also works for an agent that
    # lives somewhere else.
    try:
        device = next(agent.parameters()).device
    except StopIteration:
        device = torch.device("cpu")
    states_tensor = torch.FloatTensor(states).to(device)

    with torch.no_grad():
        # Monte Carlo estimate: average the twin-critic value over several
        # sampled policy actions.
        policy_values_list = []

        for _ in range(n_samples):
            policy_actions, _, _ = agent.policy.sample(states_tensor)
            q1_vals = agent.q1(states_tensor, policy_actions)
            q2_vals = agent.q2(states_tensor, policy_actions)
            q_vals = (q1_vals + q2_vals) / 2
            policy_values_list.append(q_vals.cpu().numpy())

        # Per-state value estimate.
        policy_values = np.mean(policy_values_list, axis=0).flatten()

    # Walk the trajectory once, closing an episode at every done flag.
    episode_initial_values = []
    episode_actual_returns = []
    episode_lengths = []

    current_return = 0.0
    current_discount = 1.0
    episode_start_idx = 0
    last_idx = len(dones) - 1

    for i, done in enumerate(dones):
        current_return += current_discount * rewards[i]
        current_discount *= gamma

        if done == 1 or i == last_idx:
            # Skip episodes whose first step has no value estimate (only
            # possible when fewer states than dones were supplied).
            if episode_start_idx < len(policy_values):
                episode_initial_values.append(policy_values[episode_start_idx])
                episode_actual_returns.append(current_return)
                episode_lengths.append(i - episode_start_idx + 1)

            current_return = 0.0
            current_discount = 1.0
            episode_start_idx = i + 1

    return {
        'policy_value_mean': np.mean(policy_values),
        'policy_value_std': np.std(policy_values),
        'episode_initial_values': episode_initial_values,
        'episode_actual_returns': episode_actual_returns,
        'episode_lengths': episode_lengths,
        'mean_episode_value': np.mean(episode_initial_values) if episode_initial_values else 0.0,
        'mean_episode_return': np.mean(episode_actual_returns) if episode_actual_returns else 0.0,
        'num_episodes': len(episode_initial_values),
    }

def evaluate_behavior_policy_value(rewards, dones, gamma=0.99):
    """Compute the discounted return of every logged (behavior) episode.

    An episode ends at each step whose ``dones`` entry equals 1, and at the
    final step regardless of its flag. Returns a dict with the mean and
    standard deviation of the per-episode returns (0.0 when there are no
    episodes), the list of returns, and the episode count.
    """
    final_step = len(rewards) - 1
    returns_per_episode = []
    running_return, discount = 0.0, 1.0

    for step, reward in enumerate(rewards):
        running_return += discount * reward
        discount *= gamma

        episode_over = dones[step] == 1 or step == final_step
        if not episode_over:
            continue

        # Close the episode and reset the accumulators for the next one.
        returns_per_episode.append(running_return)
        running_return, discount = 0.0, 1.0

    if returns_per_episode:
        mean_return = np.mean(returns_per_episode)
        std_return = np.std(returns_per_episode)
    else:
        mean_return = std_return = 0.0

    return {
        'mean_return': mean_return,
        'std_return': std_return,
        'episode_returns': returns_per_episode,
        'num_episodes': len(returns_per_episode),
    }

# ========== 4. Run the evaluation ==========
print("\n3. 执行FQE评估...")
# Estimate the learned policy's value on every logged state, using the
# discount factor stored in the training config.
fqe_results = evaluate_policy_value_fqe(
    agent, full_states, full_rewards, full_dones,
    gamma=config['gamma'], n_samples=10
)

print(f"\n【CQL策略价值（FQE评估）】")
print(f"  策略期望价值: {fqe_results['mean_episode_value']:.4f} ± {np.std(fqe_results['episode_initial_values']):.4f}")
print(f"  平均状态价值: {fqe_results['policy_value_mean']:.4f} ± {fqe_results['policy_value_std']:.4f}")
print(f"  评估episode数: {fqe_results['num_episodes']}")

print("\n4. 评估行为策略...")
# Discounted returns actually observed in the logged data.
behavior_results = evaluate_behavior_policy_value(full_rewards, full_dones, gamma=config['gamma'])

print(f"\n【行为策略价值（实际数据）】")
print(f"  实际平均回报: {behavior_results['mean_return']:.4f} ± {behavior_results['std_return']:.4f}")
print(f"  评估episode数: {behavior_results['num_episodes']}")

# ========== 5. Policy improvement analysis ==========
print("\n5. 策略改进分析...")
# NOTE(review): this subtracts a discounted sum of normalized logged rewards
# from a critic output. The difference is only meaningful if the critics were
# trained on the same normalized reward scale - confirm against the training
# script before interpreting the percentage.
improvement = fqe_results['mean_episode_value'] - behavior_results['mean_return']
# Guard against division by zero when the behavior return is exactly 0.
relative_improvement = (improvement / abs(behavior_results['mean_return'])) * 100 if behavior_results['mean_return'] != 0 else 0.0

print(f"\n【策略改进分析】")
print(f"  绝对改进: {improvement:.4f}")
print(f"  相对改进: {relative_improvement:.2f}%")
if improvement > 0:
    print(f"  解释: CQL策略比行为策略好 {relative_improvement:.2f}%")
elif improvement < 0:
    print(f"  解释: CQL策略比行为策略保守 {abs(relative_improvement):.2f}%（医疗中保守是优点）")
else:
    print(f"  解释: CQL策略与行为策略相当")

# ========== 6. Episode-level detailed analysis ==========
print("\n6. Episode级别详细分析...")
agent.eval()
episode_detailed_results = []

# Number of policy samples averaged per state (Monte Carlo estimate).
N_POLICY_SAMPLES = 10

# Collect the row positions of every stay in a single grouping pass instead of
# building a full-length boolean mask twice per stay. sort=False keeps stays
# in order of first appearance; rows with a missing stay_id are dropped by
# groupby, matching the previous behavior of skipping empty selections.
row_positions = pd.Series(np.arange(len(df)))
stay_groups = row_positions.groupby(df['stay_id'].to_numpy(), sort=False)
all_data_actions = df[ACTION_COL].to_numpy()


def _episode_mean(key):
    """Average one per-episode metric over all analysed episodes."""
    return np.mean([r[key] for r in episode_detailed_results])


with torch.no_grad():
    for stay_id, positions in stay_groups:
        # Positions follow the dataframe's row order, which is chronological
        # within a stay because the frame was sorted when it was loaded.
        stay_indices = positions.to_numpy()
        stay_states = torch.FloatTensor(full_states[stay_indices]).to(DEVICE)
        stay_rewards = full_rewards[stay_indices]

        # Policy actions and Q-values, averaged over several samples.
        policy_values_list = []
        policy_actions_list = []

        for _ in range(N_POLICY_SAMPLES):
            policy_actions, _, _ = agent.policy.sample(stay_states)
            q1_vals = agent.q1(stay_states, policy_actions)
            q2_vals = agent.q2(stay_states, policy_actions)
            q_vals = (q1_vals + q2_vals) / 2
            policy_values_list.append(q_vals.cpu().numpy())
            policy_actions_list.append(policy_actions.cpu().numpy())

        policy_values = np.mean(policy_values_list, axis=0).flatten()
        # NOTE(review): flattening assumes a one-dimensional action so that
        # there is exactly one value per step - confirm if action_dim changes.
        policy_actions_norm = np.mean(policy_actions_list, axis=0).flatten()

        # Estimated value at the first state of the episode.
        initial_value = policy_values[0]

        # Logged return, undiscounted and discounted.
        actual_return = stay_rewards.sum()
        discounted_return = sum(r * (config['gamma'] ** i) for i, r in enumerate(stay_rewards))

        # Logged actions (raw values), indexed by the same row positions as
        # the states so both are guaranteed to be aligned step by step.
        data_actions_raw = all_data_actions[stay_indices]

        # Policy actions mapped back to raw values with the training scaler.
        policy_actions_raw = action_scaler.inverse_transform(policy_actions_norm.reshape(-1, 1)).flatten()
        policy_actions_raw = np.maximum(0, policy_actions_raw)  # keep the amount non-negative

        # Mean absolute difference between policy and logged actions.
        action_mae = np.mean(np.abs(policy_actions_raw - data_actions_raw))

        episode_detailed_results.append({
            'stay_id': stay_id,
            'length': len(stay_indices),
            'initial_value': initial_value,
            'actual_return': actual_return,
            'discounted_return': discounted_return,
            'action_mae': action_mae,
            'mean_policy_action': np.mean(policy_actions_raw),
            'mean_data_action': np.mean(data_actions_raw),
        })

# Summary statistics
print("\n【Episode级别统计】")
print(f"  评估episode数: {len(episode_detailed_results)}")
print(f"  平均episode长度: {_episode_mean('length'):.1f} 步")
print(f"  平均初始价值: {_episode_mean('initial_value'):.4f}")
print(f"  平均实际回报: {_episode_mean('actual_return'):.4f}")
print(f"  平均折扣回报: {_episode_mean('discounted_return'):.4f}")
print(f"  平均动作MAE: {_episode_mean('action_mae'):.2f} mg")
print(f"  平均策略动作: {_episode_mean('mean_policy_action'):.2f} mg")
print(f"  平均数据动作: {_episode_mean('mean_data_action'):.2f} mg")


# ========== 8. Final summary ==========
print("\n" + "=" * 80)
print("评估总结")
print("=" * 80)
print("\n【策略价值（FQE评估）】")
print(f"  CQL策略期望价值: {fqe_results['mean_episode_value']:.4f}")
print(f"  行为策略实际回报: {behavior_results['mean_return']:.4f}")
print("\n【策略改进】")
print(f"  {relative_improvement:.2f}%")
if improvement > 0:
    print(f"  CQL策略比行为策略好 {relative_improvement:.2f}%")
elif improvement < 0:
    print(f"  CQL策略比行为策略保守 {abs(relative_improvement):.2f}%（医疗中保守是优点）")
else:
    # Report the tie case too, as the improvement analysis section does,
    # instead of printing nothing.
    print("  CQL策略与行为策略相当")
print("\n【关键发现】")
print("  • 策略价值是评估RL策略的核心指标")
print(f"  • 平均动作MAE: {np.mean([r['action_mae'] for r in episode_detailed_results]):.2f} mg")
print("  • 在医疗场景中，保守的策略（价值略低）可能是优点")
print("=" * 80)

print("\n✅ 评估完成！")





CQL连续动作空间模型评估

1. 加载模型...
✅ 模型已加载
   状态维度: 7
   动作维度: 1
   配置: alpha=0.01, gamma=0.99

2. 加载数据...
✅ 数据已加载
   总记录数: 2113
   总episode数: 58
   奖励范围（归一化）: [0.0000, 1.0000]

3. 执行FQE评估...

【CQL策略价值（FQE评估）】
  策略期望价值: 63.6100 ± 4.8356
  平均状态价值: 61.2058 ± 29.7328
  评估episode数: 58

4. 评估行为策略...

【行为策略价值（实际数据）】
  实际平均回报: 17.0907 ± 10.1630
  评估episode数: 58

5. 策略改进分析...

【策略改进分析】
  绝对改进: 46.5193
  相对改进: 272.19%
  解释: CQL策略比行为策略好 272.19%

6. Episode级别详细分析...

【Episode级别统计】
  评估episode数: 58
  平均episode长度: 36.4 步
  平均初始价值: 63.6250
  平均实际回报: 22.0597
  平均折扣回报: 17.0907
  平均动作MAE: 162.56 mg
  平均策略动作: 69.78 mg
  平均数据动作: 116.11 mg

评估总结

【策略价值（FQE评估）】
  CQL策略期望价值: 63.6100
  行为策略实际回报: 17.0907

【策略改进】
  272.19%
  CQL策略比行为策略好 272.19%

【关键发现】
  • 策略价值是评估RL策略的核心指标
  • 平均动作MAE: 162.56 mg
  • 在医疗场景中，保守的策略（价值略低）可能是优点

✅ 评估完成！
