# 训练专家智能体

In [1]:
from stable_baselines3 import SAC
from stable_baselines3.sac import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
import gym
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch as T
import torch.nn as nn
import random

In [2]:
# Create the environment.
# `env` is a module-level global that later cells (random smoke test, expert
# rollouts, behaviour-cloning evaluation) read directly.
env_name = "BipedalWalker-v3"
env = gym.make(env_name)
# Print the observation- and action-space shapes as a quick sanity check.
print(env.observation_space.shape, env.action_space.shape)

(24,) (4,)


In [3]:
# Smoke-test the environment: play a few episodes with uniformly sampled
# actions and print the undiscounted return of each one.
TEST_EPISODE_LENGTH = 6  # number of random episodes to play
for i in range(TEST_EPISODE_LENGTH):
    env.reset()
    episode_reward = 0
    done = False
    # Keep stepping with random actions until the environment signals the end.
    while not done:
        _, reward, done, _ = env.step(env.action_space.sample())
        episode_reward += reward
    print(f"Episode {i+1} reward: {episode_reward}")


Episode 1 reward: -103.30651400911435
Episode 2 reward: -106.41460101633675
Episode 3 reward: -119.01006671276005
Episode 4 reward: -99.51496625272767
Episode 5 reward: -120.79805165005507
Episode 6 reward: -85.30791510040703


In [4]:
# Expert agent model.
# The commented-out lines below are the one-off recipe that was used to train,
# evaluate and save the SAC expert; the live statement at the end only reloads
# the saved archive.
# expert = SAC(policy=MlpPolicy, env=env, verbose=1, policy_kwargs={"net_arch": [128, 128]})
# print(expert.__dict__)
# https://www.gymlibrary.dev/environments/box2d/bipedal_walker/
# Good performance: reward > 300
# expert.learn(100000, log_interval=10)
# reward, _ = evaluate_policy(expert, env, 10, render=False, warn=False)
# print(reward)
# expert.save('sac-expert-pedelwalker.zip')
# NOTE(review): the recorded output of this cell mentions a cloudpickle
# `_make_function` attribute error. Presumably the archive was saved with a
# different cloudpickle/Python version than the one loading it -- confirm, and
# consider re-saving the expert in the current environment or passing
# `custom_objects` to `SAC.load`.
expert = SAC.load('./sac-expert-pedelwalker.zip')


Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from '/home/cxw/pyenvs/robot/lib/python3.8/site-packages/cloudpickle/cloudpickle.py'>


# 生成专家轨迹

In [21]:
def sample_expert_data(n_episode):
    """Collect (state, action) pairs by rolling out the expert.

    Uses the module-level ``env`` and ``expert`` objects. Every state the
    expert is queried on is stored together with the action it returned.

    Args:
        n_episode: number of complete episodes to roll out.

    Returns:
        Tuple ``(states, actions)`` of NumPy arrays with one row per
        environment step, concatenated over all episodes.
    """
    visited_states, chosen_actions = [], []
    for _ in range(n_episode):
        obs = env.reset()
        finished = False
        while not finished:
            # `predict` returns a tuple; only its first element (the action) is used.
            act = expert.predict(obs)[0]
            visited_states.append(obs)
            chosen_actions.append(act)
            obs, _, finished, _ = env.step(act)
    return np.array(visited_states), np.array(chosen_actions)


# Seed the environment, PyTorch and Python's `random` module before sampling
# so the expert rollouts are repeatable.
env.seed(0)
T.manual_seed(0)
random.seed(0)
n_episode = 50  # number of expert episodes to record
# expert_s / expert_a: per-step states and actions returned by sample_expert_data.
expert_s, expert_a = sample_expert_data(n_episode)

# Optional sub-sampling of the expert data (disabled).
# NOTE(review): the original comment said "sample 30 data points" while
# n_samples is 250000 -- confirm the intended size before re-enabling.
# n_samples = 250000
# random_index = random.sample(range(expert_s.shape[0]), n_samples)
# expert_s = expert_s[random_index]
# expert_a = expert_a[random_index]

# 行为克隆

In [23]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = T.device("cuda" if T.cuda.is_available() else "cpu")


class PolicyNet(nn.Module):
    """Two-hidden-layer MLP that maps a state vector to an action vector.

    Layout: Linear -> LeakyReLU -> Linear -> LeakyReLU -> Linear, with no
    activation on the output layer.

    Args:
        state_dim: number of input features.
        hidden_dim: width of both hidden layers.
        action_dim: number of output features.
    """

    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        # Layers are created in input-to-output order and stored under
        # `self.net`, so parameter names are `net.0.*`, `net.2.*`, `net.4.*`.
        layers = [
            nn.Linear(state_dim, hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim, action_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        out = self.net(x)
        return out


In [35]:
class BehaviorClone:
    """Behaviour-cloning agent: regresses expert actions from states with MSE.

    Relies on the module-level ``PolicyNet`` class and ``device`` object.

    Args:
        state_dim: number of features in a state vector.
        hidden_dim: hidden-layer width passed to ``PolicyNet``.
        action_dim: number of components in an action vector.
        lr: learning rate of the Adam optimizer.
    """

    def __init__(self, state_dim, hidden_dim, action_dim, lr):
        self.policy = PolicyNet(state_dim, hidden_dim, action_dim).to(device)
        self.optimizer = T.optim.Adam(self.policy.parameters(), lr=lr)
        self.criterion = T.nn.MSELoss()

    def learn(self, states, actions):
        """Run one gradient step on a batch of expert data.

        Args:
            states: batch of expert states (array-like, one row per sample).
            actions: batch of expert actions aligned with ``states``.

        Returns:
            The MSE loss of this batch, measured before the update, as a
            Python float (handy for logging; callers may ignore it).
        """
        states = T.as_tensor(states, dtype=T.float, device=device)
        actions = T.as_tensor(actions, dtype=T.float, device=device)
        # MSELoss expects (prediction, target); the value is symmetric, but
        # this order follows the documented convention.
        bc_loss = self.criterion(self.policy(states), actions)

        self.optimizer.zero_grad()
        bc_loss.backward()
        self.optimizer.step()
        return bc_loss.item()

    def take_action(self, state):
        """Return the policy's action for a single state as a NumPy array.

        Args:
            state: one state vector (array-like of length ``state_dim``).

        Returns:
            ``numpy.ndarray`` holding the policy output for that state.
        """
        # Convert through NumPy first: building a tensor from a Python list
        # that wraps an ndarray (`T.tensor([state])`) takes PyTorch's slow
        # element-wise path and emits a warning on every call.
        state = T.as_tensor(np.asarray(state, dtype=np.float32), device=device).unsqueeze(0)
        # Pure inference: no autograd graph is needed here.
        with T.no_grad():
            action = self.policy(state)
        return action.cpu().numpy()[0]

In [36]:
def test_agent(agent, env, n_episode):
    return_list = []
    for _ in range(n_episode):
        episode_return = 0
        state = env.reset()
        done = False
        while not done:
            action = agent.take_action(state)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            episode_return += reward
        return_list.append(episode_return)
    return np.mean(return_list)


# Behaviour-cloning training run.
# Seed the environment, PyTorch and NumPy; NumPy drives the minibatch
# sampling below.
env.seed(0)
T.manual_seed(0)
np.random.seed(0)
num_episodes = 250  # not used in this cell; kept in case other cells read it
hidden_dim = 128
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

lr = 1e-3
bc_agent = BehaviorClone(state_dim, hidden_dim, action_dim, lr)
n_iterations = 1000
batch_size = 64
# Evaluating means playing full episodes, which costs far more than one
# gradient step. Doing it after every step made the loop impractically slow
# (the recorded run was interrupted with the bar still at 0/1000), so the
# policy is evaluated once every `eval_interval` updates instead.
eval_interval = 10
eval_episodes = 5
test_returns = []  # one mean return per evaluation, in chronological order

with tqdm(total=n_iterations, desc="进度条") as pbar:
    for i in range(n_iterations):
        # Draw a minibatch of expert transitions uniformly, with replacement.
        sample_indices = np.random.randint(low=0,
                                           high=expert_s.shape[0],
                                           size=batch_size)
        bc_agent.learn(expert_s[sample_indices], expert_a[sample_indices])
        if (i + 1) % eval_interval == 0:
            test_returns.append(test_agent(bc_agent, env, eval_episodes))
            # Show a running average over the most recent evaluations.
            pbar.set_postfix({'return': '%.3f' % np.mean(test_returns[-10:])})
        pbar.update(1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm(total=n_iterations, desc="进度条") as pbar:


进度条:   0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 