In [1]:
!nvidia-smi

Sun Feb 27 14:48:15 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# System packages: OpenGL / GLEW / OSMesa development libraries plus
# software-properties-common.
# NOTE(review): presumably the native libraries mujoco_py needs to build and
# render headlessly -- confirm against the mujoco_py install guide.
!apt-get install -y \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common

# NOTE(review): patchelf is presumably required by the mujoco_py build -- confirm.
!apt-get install -y patchelf
# Python packages used by the cells below. Versions are not pinned.
# NOTE(review): free-mujoco-py presumably provides the `mujoco_py` module
# imported later -- confirm.
!pip install gym
!pip install free-mujoco-py
!pip install tensorboardX

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libglew-dev is already the newest version (2.0.0-5).
libgl1-mesa-dev is already the newest version (20.0.8-0ubuntu1~18.04.1).
libgl1-mesa-glx is already the newest version (20.0.8-0ubuntu1~18.04.1).
libosmesa6-dev is already the newest version (20.0.8-0ubuntu1~18.04.1).
software-properties-common is already the newest version (0.96.24.32.18).
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
patchelf is already the newest version (0.9-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-470
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [3]:
# Mount Google Drive (Colab only) so the project directory becomes reachable.
from google.colab import drive

drive.mount('/content/drive')
# Change the current working directory to the project directory; the relative
# paths used later ("runs", "saved/model") are resolved against it.
import os

os.chdir("/content/drive/MyDrive/projects/PPO_mujoco")
!pwd

Mounted at /content/drive
/content/drive/MyDrive/projects/PPO_mujoco


In [5]:
import mujoco_py
import gym  # gym must to be imported behind mujoco_py
import os
import math
import datetime
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

from collections import deque
from tensorboardX.writer import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [6]:
# Not sure exactly what this module does, but it looks very useful, so keeping it for now.
# from https://github.com/joschu/modular_rl
# http://www.johndcook.com/blog/standard_deviation/
class RunningStat(object):
    """Streaming accumulator for the element-wise mean and variance of samples.

    Samples of a fixed ``shape`` are fed in one at a time through ``push``;
    the statistics are updated incrementally (Welford-style recurrence), so no
    sample history is stored.
    """

    def __init__(self, shape):
        self._n = 0                  # number of samples pushed so far
        self._M = np.zeros(shape)    # running mean
        self._S = np.zeros(shape)    # running sum of squared deviations

    def push(self, x):
        """Fold one sample (array-like with exactly the configured shape) in."""
        sample = np.asarray(x)
        assert sample.shape == self._M.shape
        self._n += 1
        if self._n == 1:
            # The first sample simply becomes the mean; _S stays at zero.
            self._M[...] = sample
            return
        delta = sample - self._M
        new_mean = self._M + delta / self._n
        # _S uses the deviation from both the previous and the updated mean.
        self._S[...] = self._S + delta * (sample - new_mean)
        self._M[...] = new_mean

    @property
    def n(self):
        """Number of samples seen."""
        return self._n

    @property
    def mean(self):
        """Current running mean (the internal array, not a copy)."""
        return self._M

    @property
    def var(self):
        """Sample variance; falls back to ``mean ** 2`` with fewer than two samples."""
        if self._n > 1:
            return self._S / (self._n - 1)
        return np.square(self._M)

    @property
    def std(self):
        """Square root of ``var``."""
        return np.sqrt(self.var)

    @property
    def shape(self):
        """Shape of the tracked samples."""
        return self._M.shape


class ZFilter:
    """
    Normalizes inputs as y = (x - mean) / std, where mean and std are running
    estimates kept in a RunningStat; the result is optionally clipped to
    [-clip, clip].
    """

    def __init__(self, shape, demean=True, destd=True, clip=10.0):
        self.rs = RunningStat(shape)
        self.clip = clip
        self.destd = destd
        self.demean = demean

    def __call__(self, x, update=True):
        """Return the normalized ``x``; with ``update`` the statistics absorb ``x`` first."""
        if update:
            self.rs.push(x)
        out = x - self.rs.mean if self.demean else x
        if self.destd:
            # The small constant guards against division by a zero std.
            out = out / (self.rs.std + 1e-8)
        # A falsy clip (0 / None) disables clipping.
        return np.clip(out, -self.clip, self.clip) if self.clip else out

    def output_shape(self, input_space):
        """The filter is shape-preserving: report the input space's shape."""
        return input_space.shape

In [7]:
# network 
class Actor(nn.Module):
    """Gaussian policy network.

    Two tanh hidden layers feed a linear head that outputs the action mean.
    The log standard deviation is fixed at zero (std = 1) and is not learned.

    Args:
        input_dim: size of the (flattened) state vector.
        action_dim: number of action dimensions.
        hidden_dim: width of both hidden layers.
    """

    def __init__(self, input_dim, action_dim, hidden_dim):
        # Initialise nn.Module first so attribute registration is fully set up
        # before anything is assigned on the instance.
        super(Actor, self).__init__()
        self.input_dim = input_dim
        self.action_dim = action_dim
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
        # Shrink the output layer so the initial action means are close to zero.
        self.fc3.weight.data.mul_(0.1)
        self.fc3.bias.data.mul_(0.0)

    def forward(self, x):
        """Return ``(mu, std, logstd)``; all three have the shape of ``mu``."""
        # torch.tanh replaces the deprecated F.tanh (same result).
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        mu = self.fc3(x)
        logstd = torch.zeros_like(mu)
        std = torch.exp(logstd)
        return mu, std, logstd


class Critic(nn.Module):
    """State-value network: two tanh hidden layers and a scalar linear head.

    Args:
        input_dim: size of the (flattened) state vector.
        hidden_dim: width of both hidden layers.
    """

    def __init__(self, input_dim, hidden_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)
        # Shrink the output layer so the initial value estimates are close to zero.
        self.fc3.weight.data.mul_(0.1)
        self.fc3.bias.data.mul_(0.0)

    def forward(self, x):
        """Return the value estimate with a trailing dimension of size 1."""
        # torch.tanh replaces the deprecated F.tanh (same result).
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        v = self.fc3(x)
        return v


In [8]:
def get_action(mu, std):
    """Sample an action from N(mu, std**2) and return it as a NumPy array.

    Args:
        mu: tensor of action means.
        std: tensor of per-element standard deviations (same shape as ``mu``).

    Returns:
        ``numpy.ndarray`` with the shape of ``mu``.
    """
    action = torch.normal(mu, std)
    # detach() + cpu() so the conversion also works for tensors that live on a
    # CUDA device or are attached to an autograd graph.
    return action.detach().cpu().numpy()


def log_density(x, mu, std, logstd):
    """Log-probability of ``x`` under a diagonal Gaussian, summed over dim 1.

    ``std`` and ``logstd`` describe the same distribution and are both passed
    in by the caller. The result keeps dim 1 with size 1.
    """
    variance = std.pow(2)
    quadratic = (x - mu).pow(2) / (2 * variance)
    per_dim = -quadratic - 0.5 * math.log(2 * math.pi) - logstd
    return per_dim.sum(1, keepdim=True)


def flat_grad(grads):
    """Flatten every tensor in ``grads`` and concatenate them into one 1-D tensor."""
    return torch.cat([g.view(-1) for g in grads])


def flat_hessian(hessians):
    """Concatenate the flattened tensors in ``hessians`` into one 1-D tensor.

    Each tensor is made contiguous before flattening; the result is returned
    via ``.data``, i.e. without autograd history.
    """
    pieces = [h.contiguous().view(-1) for h in hessians]
    return torch.cat(pieces).data


def flat_params(model):
    """Return all parameters of ``model`` as one flat tensor (no autograd history)."""
    return torch.cat([p.data.view(-1) for p in model.parameters()])


def update_model(model, new_params):
    """Overwrite the parameters of ``model`` in place from a flat vector.

    ``new_params`` is consumed in ``model.parameters()`` order; each slice is
    reshaped to the parameter's size before being copied in.
    """
    offset = 0
    for param in model.parameters():
        count = param.numel()
        chunk = new_params[offset: offset + count].view(param.size())
        param.data.copy_(chunk)
        offset += count


def kl_divergence(new_actor, old_actor, states):
    """KL divergence D(pi_old || pi_new) between two Gaussian policies.

    Both actors are evaluated on ``states`` and must return
    ``(mu, std, logstd)``. The old policy's outputs are detached, so gradients
    flow only through ``new_actor``. The per-dimension terms are summed over
    dim 1 (kept with size 1).
    """
    new_mu, new_std, new_logstd = new_actor(torch.Tensor(states))
    old_mu, old_std, old_logstd = (
        out.detach() for out in old_actor(torch.Tensor(states))
    )

    # KL is not symmetric: the old policy is the reference distribution here.
    numerator = old_std.pow(2) + (old_mu - new_mu).pow(2)
    kl = old_logstd - new_logstd + numerator / (2.0 * new_std.pow(2)) - 0.5

    return kl.sum(1, keepdim=True)

In [66]:
class PPO:
    """PPO agent with a clipped surrogate objective and GAE advantages.

    Holds an ``Actor`` / ``Critic`` pair with one Adam optimizer each, plus the
    observation filter used to turn raw observations into network inputs.

    Args:
        device: torch device the networks and training tensors live on.
        input_dim, action_dim, hidden_dim: network sizes.
        actor_lr, critic_lr: Adam learning rates.
        l2_rate: weight decay applied to the critic optimizer only.
        gamma: discount factor.
        lamda: GAE lambda.
        batch_size: minibatch size used by ``train_model``.
        clip_param: PPO ratio clipping range (ratio is clamped to 1 +/- clip_param).
        running_state: callable applied to each raw observation by ``get_state``.
    """

    # Added to the advantage std so a constant-advantage batch normalizes to
    # zeros instead of NaN.
    _ADV_EPS = 1e-8

    def __init__(self, device, input_dim, action_dim, hidden_dim, actor_lr, critic_lr, l2_rate, gamma, lamda, batch_size, clip_param, running_state):
        self.device = device
        self.actor = Actor(input_dim, action_dim, hidden_dim).to(device)
        self.critic = Critic(input_dim, hidden_dim).to(device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=l2_rate)
        self.gamma = gamma
        self.lamda = lamda
        self.batch_size = batch_size
        self.clip_param = clip_param
        self.running_state = running_state

    def get_state(self, observation):
        """Filter a raw observation and return it as a tensor with a leading batch dim."""
        # Use the filter owned by this agent rather than a global of the same name.
        state = self.running_state(observation)
        state = torch.Tensor(state).unsqueeze(0)
        return state

    def select_action(self, state):
        """Sample an action for a single (batched, size-1) state; returns a NumPy array."""
        # No gradients are needed while collecting rollouts.
        with torch.no_grad():
            mu, std, _ = self.actor(state.to(self.device))
            action = torch.normal(mu, std)
        return action.cpu().numpy()[0]

    def log_density(self, x, mu, std, logstd):
        """Gaussian log-probability of ``x``, summed over dim 1 (kept with size 1)."""
        var = std.pow(2)
        logdensity = -(x - mu).pow(2) / (2 * var) \
                      - 0.5 * math.log(2 * math.pi) - logstd

        return logdensity.sum(1, keepdim=True)

    def train_model(self, memory, epochs=10):
        """Run PPO updates on one batch of collected transitions.

        Args:
            memory: sequence of ``[state, action, reward, mask]`` entries in
                time order; ``mask`` is 0 on the last step of an episode and
                1 otherwise.
            epochs: number of passes over the batch (default 10).

        Only full minibatches are used: with fewer than ``batch_size``
        transitions no update is performed. Empty memory is a no-op.
        """
        if len(memory) == 0:
            return

        # Unpack column-wise instead of going through a ragged object array,
        # which recent NumPy versions refuse to build.
        state_col, action_col, reward_col, mask_col = zip(*memory)
        states = torch.cat(
            [torch.atleast_2d(torch.as_tensor(s, dtype=torch.float32)) for s in state_col]
        ).to(self.device)
        actions = torch.stack(
            [torch.as_tensor(a, dtype=torch.float32) for a in action_col]
        ).to(self.device)
        rewards = torch.as_tensor(reward_col, dtype=torch.float32).to(self.device)
        masks = torch.as_tensor(mask_col, dtype=torch.float32).to(self.device)

        # ----------------------------
        # step 1: get returns and GAEs and log probability of old policy
        # These are fixed targets, so no autograd graph is built for them.
        with torch.no_grad():
            values = self.critic(states)
            returns, advants = self.get_gae(rewards, masks, values)
            mu, std, logstd = self.actor(states)
            old_policy = self.log_density(actions, mu, std, logstd)
        returns = returns.unsqueeze(1)
        advants = advants.unsqueeze(1)

        criterion = torch.nn.MSELoss()
        n = len(states)
        arr = np.arange(n)
        # ----------------------------
        # step 2: get value loss and actor loss and update actor & critic
        for _ in range(epochs):
            np.random.shuffle(arr)
            for i in range(n // self.batch_size):
                batch_index = torch.tensor(
                    arr[self.batch_size * i: self.batch_size * (i + 1)],
                    dtype=torch.long, device=self.device)
                states_samples = states[batch_index]
                returns_samples = returns[batch_index]
                advants_samples = advants[batch_index]
                actions_samples = actions[batch_index]
                old_policy_samples = old_policy[batch_index]
                loss, ratio = self.surrogate_loss(advants_samples, states_samples, old_policy_samples, actions_samples)

                values = self.critic(states_samples)
                critic_loss = criterion(values, returns_samples)
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

                # Pessimistic (min) of the unclipped and clipped objectives.
                clipped_ratio = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param)
                clipped_loss = clipped_ratio * advants_samples
                actor_loss = -torch.min(loss, clipped_loss).mean()
                self.actor_optim.zero_grad()
                actor_loss.backward()
                self.actor_optim.step()

    def get_gae(self, rewards, masks, values):
        """Compute discounted returns and normalized GAE advantages.

        Args:
            rewards: 1-D tensor of rewards in time order.
            masks: 1-D tensor, 0 where an episode ended at that step, else 1.
            values: critic estimates for the same steps (any shape that
                flattens to one value per step).

        Returns:
            ``(returns, advants)``, both shaped like ``rewards``; ``advants``
            is standardized to zero mean / unit std.
        """
        # Treat the value estimates as constants, one scalar per step.
        values = values.detach().reshape(-1)
        returns = torch.zeros_like(rewards).to(self.device)
        advants = torch.zeros_like(rewards).to(self.device)

        running_returns = 0
        previous_value = 0
        running_advants = 0

        # Walk backwards; a zero mask cuts the bootstrap across episode ends.
        for t in reversed(range(0, len(rewards))):
            running_returns = rewards[t] + self.gamma * running_returns * masks[t]
            running_tderror = rewards[t] + self.gamma * previous_value * masks[t] - values[t]
            running_advants = running_tderror + self.gamma * self.lamda * running_advants * masks[t]

            returns[t] = running_returns
            previous_value = values[t]
            advants[t] = running_advants

        advants = (advants - advants.mean()) / (advants.std() + self._ADV_EPS)
        return returns, advants

    def surrogate_loss(self, advants, states, old_policy, actions):
        """Return ``(ratio * advants, ratio)`` for the current policy.

        ``ratio`` is exp(new log-prob - old log-prob) of ``actions``.
        """
        mu, std, logstd = self.actor(states)
        new_policy = self.log_density(actions, mu, std, logstd)

        ratio = torch.exp(new_policy - old_policy)
        surrogate = ratio * advants
        return surrogate, ratio

In [49]:
class Config:
    """Hyper-parameters and run settings for the PPO experiment.

    All values are class attributes. Note that the class body has side effects
    when it is executed: it selects the torch device and creates the log and
    checkpoint directories relative to the current working directory.
    """

    # common config
    # NOTE(review): the original comment said "0 = no random seed"; nothing in
    # this class implements that convention -- confirm against the caller.
    random_seed = 500
    # Timestamp of class creation, used to name per-run artefacts.
    train_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    # cuda config: default to CPU, switch to the first GPU when available
    device = torch.device('cpu')
    device_name = "cpu"
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
        torch.cuda.empty_cache()
        device_name = torch.cuda.get_device_name(device)

    # network config
    hidden_dim = 64
    critic_lr = 0.0003
    actor_lr = 0.0003
    # env config
    env_name = "Hopper-v2"

    # ppo config
    max_step = 10000000
    gamma = 0.99
    lamda = 0.98
    batch_size = 64
    l2_rate = 0.001
    max_kl = 0.01
    clip_param = 0.2
    update_interval = 2000
    update_episode = 50

    # log config: makedirs(exist_ok=True) creates missing parents and is safe
    # to re-run, replacing the separate exists()/mkdir() checks.
    os.makedirs("runs/" + env_name, exist_ok=True)
    os.makedirs("saved/model", exist_ok=True)


In [69]:
config = Config()

# Schedule constants (previously inline magic numbers).
NUM_ITERATIONS = 15000            # collect-then-update cycles
MIN_STEPS_PER_ITERATION = 2048    # environment steps gathered before each update
MAX_EPISODE_STEPS = 10000         # hard cap on the length of a single episode

env = gym.make(config.env_name)
env.seed(config.random_seed)
torch.manual_seed(config.random_seed)
# Minibatch shuffling in PPO.train_model uses NumPy's global RNG, so seed it too.
np.random.seed(config.random_seed)
print("==========" * 5)
print("Train env: {}, device: {}".format(config.env_name, config.device_name))
print("==========" * 5)
input_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# Running observation normalizer, clipped to [-5, 5].
running_state = ZFilter(env.observation_space.shape, clip=5)

reward_writer = SummaryWriter('runs/' + config.env_name + "/train_reward_" + config.train_time)
ppo_agent = PPO(config.device, input_dim, action_dim, config.hidden_dim,
                config.actor_lr, config.critic_lr, config.l2_rate, config.gamma, config.lamda, config.batch_size, config.clip_param,
                running_state)

# Names below are not used by the active loop; they are retained because the
# commented-out legacy loop after this block refers to them.
episode = 0
step = 0
all_rewards = []
episode_reward = 0
memory = deque()
start_time = datetime.datetime.now().replace(microsecond=0)
observation = env.reset()

episodes = 0    # episodes started so far, across all iterations
all_steps = 0   # environment steps taken so far, across all iterations

try:
    for iteration in range(NUM_ITERATIONS):
        # ---- rollout phase: collect at least MIN_STEPS_PER_ITERATION steps ----
        ppo_agent.actor.eval()
        ppo_agent.critic.eval()
        memory = deque()

        steps = 0
        scores = []
        while steps < MIN_STEPS_PER_ITERATION:
            episodes += 1
            state = ppo_agent.get_state(env.reset())
            score = 0
            for _ in range(MAX_EPISODE_STEPS):
                steps += 1
                action = ppo_agent.select_action(state)
                next_state, reward, done, _info = env.step(action)
                next_state = ppo_agent.get_state(next_state)

                # mask = 0 stops return/advantage bootstrapping across the episode end.
                mask = 0 if done else 1
                memory.append([state, action, reward, mask])

                score += reward
                state = next_state

                if done:
                    break
            scores.append(score)

        all_steps += steps
        print("steps: {}, all step: {}".format(steps, all_steps))
        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))
        # The writer was previously created but never written to.
        reward_writer.add_scalar("train/score_avg", score_avg, all_steps)

        # ---- update phase ----
        ppo_agent.actor.train()
        ppo_agent.critic.train()
        ppo_agent.train_model(memory)
finally:
    # Persist logged scalars even when training is interrupted from the notebook.
    reward_writer.flush()

# ppo_agent.actor.eval(), ppo_agent.critic.eval()
# while step < config.max_step:
#     state = ppo_agent.get_state(observation)
#     action = ppo_agent.select_action(state)
#     next_state, reward, done, _ = env.step(action)
#     step += 1
#     episode_reward += reward
#     if done:
#         next_state = env.reset()
#         all_rewards.append(episode_reward)
#         episode += 1
#         episode_reward = 0
#         mask = 0
#     else:
#         mask = 1
#     memory.append([state, action, reward, mask])
#     state = next_state
#     if done and episode % config.update_episode == 0:
#         ppo_agent.actor.train(), ppo_agent.critic.train()
#         epoch_time = datetime.datetime.now().replace(microsecond=0)
#         print("step: {}, reward: {:.2f}, episode:{}, train time: {}".format(
#             step, np.mean(all_rewards[-5:]), episode, epoch_time - start_time))
#         ppo_agent.train_model(memory)  # update network
#         memory = deque()  # reset memory
#         torch.save(ppo_agent.actor.state_dict(), "saved/model/" + config.env_name + "_actor_" + config.train_time)
#         torch.save(ppo_agent.actor.state_dict(), "saved/model" + config.env_name + "_critic_" + config.train_time)
#         ppo_agent.actor.eval(), ppo_agent.critic.eval()

Train env: Hopper-v2, device: Tesla T4




steps: 2059, all step: 2059
107 episode score is 16.76




steps: 2091, all step: 4150
179 episode score is 31.02
steps: 2074, all step: 6224
235 episode score is 51.24
steps: 2049, all step: 8273
270 episode score is 96.02
steps: 2085, all step: 10358
300 episode score is 125.71
steps: 2134, all step: 12492
328 episode score is 142.48
steps: 2095, all step: 14587
353 episode score is 176.43
steps: 2050, all step: 16637
375 episode score is 198.98
steps: 2098, all step: 18735
397 episode score is 205.68
steps: 2059, all step: 20794
421 episode score is 187.12
steps: 2081, all step: 22875
443 episode score is 208.46
steps: 2114, all step: 24989
465 episode score is 216.34
steps: 2120, all step: 27109
487 episode score is 217.67
steps: 2109, all step: 29218
509 episode score is 210.09
steps: 2052, all step: 31270
530 episode score is 219.21
steps: 2085, all step: 33355
552 episode score is 211.29
steps: 2123, all step: 35478
574 episode score is 215.59
steps: 2122, all step: 37600
596 episode score is 220.24
steps: 2136, all step: 39736
618 epis

KeyboardInterrupt: ignored