代码实现效果不佳，不如TensorFlow实现结果稳定。当超参数一致时，Torch实现收敛效果。

该代码只适合于MountainCar环境，在CartPole环境中无法收敛。

思想是提供了一个D(s,a)->s'，s'-s_t得到的均方差作为奖励的一部分，同时对该D网络进行训练最小化，当s与a的组合对未出现过，则值更大，鼓励探索

运行收敛需要多次运行得到想要的结果，否则使用TensorFlow？

# 导入函数包

In [211]:
import numpy as np
import torch
from torch import nn
import gym
torch.set_default_dtype(torch.float64)

# 经验池

In [None]:
class memory:
    def __init__(self, env, memory_length=10000, memory_minibatch=128):
        self.env = env
        self.memory_length = memory_length
        self.memory_minibatch = memory_minibatch

        self.state_size = self.env.observation_space.shape[0]
        try:
            self.action_size = self.env.action_space.shape[0]
        except:
            self.action_size = 1

        self.memory_width = self.state_size * 2 + self.action_size + 1
        self.memory = np.zeros((self.memory_length, self.memory_width))

        self.index = 0
        self.max_index = 0

    def store(self, state, action, reward, next_state):
        transacton = np.hstack((state, action, reward, next_state))
        self.memory[self.index, :] = transacton

        self.index += 1
        if self.index % self.memory_length == 0:
            self.index = 0
        if self.max_index < self.memory_length:
            self.max_index += 1

    def sample(self):
        choice_random = np.random.choice(self.max_index, self.memory_minibatch)
        choice_data = self.memory[choice_random, :]
        state = choice_data[:, 0:self.state_size]
        action = choice_data[:,
                             self.state_size:self.state_size + self.action_size]
        reward = choice_data[:, self.state_size + self.action_size:
                             self.state_size + self.action_size + 1]
        next_state = choice_data[:, self.state_size + self.action_size + 1:]

        reward = np.squeeze(reward)
        action = np.squeeze(action)
        
        return state, action, reward, next_state


# 好奇心网络

In [None]:
class curiosity_net(nn.Module):
    def __init__(self, state_dim, hidden_dim=32):
        super().__init__()
        self.state_dim = state_dim
        self.action_dim = 1
        self.hidden_dim = hidden_dim

        self.linear1 = nn.Linear(self.state_dim, self.hidden_dim)
        self.linear2 = nn.Linear(self.action_dim, self.hidden_dim)
        self.activate = nn.ReLU()
        self.linear3 = nn.Linear(self.hidden_dim, self.state_dim)

    def forward(self, s, a):
        x1 = self.linear1(s)
        x2 = self.linear2(a)
        x = x1 + x2
        x = self.activate(x)
        x = self.linear3(x)
        return x



# Q网络

In [None]:
class network:
    def __init__(self, env, hidden_dimension=128, learning_rate=1e-3):
        self.env = env
        self.hidden_dimension = hidden_dimension
        self.learning_rate = learning_rate

        self.input_dimension = self.env.observation_space.shape[0]
        self.output_dimension = self.env.action_space.n

        self.model = self.__create_network()
        self.model_curiosity = curiosity_net(self.input_dimension,
                            self.output_dimension)

        self.loss = torch.nn.MSELoss()
        self.optimizer = torch.optim.RMSprop(self.model.parameters(),
                            lr=self.learning_rate)
        self.optimizer_curiosity = torch.optim.RMSprop(self.model_curiosity.parameters(),
                                lr=0.01)

    @staticmethod
    def replace(network_from, network_to):
        network_to.load_state_dict(network_from.state_dict())

    @staticmethod
    def optimizer(predict_object, predict_value, target_value):
        loss = predict_object.loss(predict_value, target_value)
        predict_object.optimizer.zero_grad()
        loss.backward()
        predict_object.optimizer.step()
        return loss.item()


    class q_network_model(nn.Module):
        def __init__(self, in_dim, out_dim, hidden_dim):
            super().__init__()
            self.in_dim = in_dim
            self.out_dim = out_dim
            self.hidden_dim = hidden_dim

            self.linear1 = nn.Linear(self.in_dim, self.hidden_dim)
            self.linear2 = nn.Linear(self.hidden_dim, self.out_dim)
            self.activate = nn.ReLU()

        def forward(self, s):
            x = self.linear1(s)
            x = self.activate(x)
            x = self.linear2(x)
            return x

    def __create_network(self):
        return self.q_network_model(self.input_dimension,
                      self.output_dimension,
                      self.hidden_dimension)



# 智能体

In [None]:
class agent_q:
    def __init__(self, env, gamma=0.9):
        self.env = env
        self.gamma = gamma

        self.epislon_method = self.epislon_method_1()

        self.q_network = network(self.env)
        self.q_network_target = network(self.env)
        self.memory = memory(self.env)

        self.epislon_learn_step = 0
        pass

    def output_action(self, state):
        state = torch.from_numpy(state)
        action_value = self.q_network.model(state)
        action_value = np.array(action_value.tolist())

        random_number = np.random.random()
        if random_number > self.epislon_method.epislon_init:
            action = self.env.action_space.sample()
        else:
            action = np.argmax(action_value)
            action = np.squeeze(action)
        return action, action_value

    def sample_postprocess(self):
        state, action, reward, next_state = self.memory.sample()
        state = torch.from_numpy(state)
        next_state = torch.from_numpy(next_state)
        action = np.squeeze(action)
        reward = np.squeeze(reward)
        action = action.astype(np.int32)
        return state, action, reward, next_state

    def learn(self):
        self.epislon_learn_step += 1

        if self.epislon_learn_step % 300 == 0:
            network.replace(self.q_network.model,
                    self.q_network_target.model)
            
        state, action, reward, next_state = self.sample_postprocess()
        next_state_fit = self.q_network.model_curiosity(
            state, torch.Tensor(action[np.newaxis].T))
        loss = nn.functional.mse_loss(next_state_fit, next_state)
        loss_value = torch.sum(torch.pow(next_state_fit - next_state, 2), axis=1).detach().numpy()

        if self.epislon_learn_step % 1000 == 0:
            self.q_network.optimizer_curiosity.zero_grad()
            loss.backward()
            # print('loss item:', loss.item())
            self.q_network.optimizer_curiosity.step()

        reward += loss_value
        # print(reward)

        target_value_max, target_action_max = torch.max(
            self.q_network_target.model(next_state), axis=1)
        target_value = reward + self.gamma * \
            np.array(target_value_max.tolist())

        predict_value_all = self.q_network.model(state)

        replace_index = np.arange(self.memory.memory_minibatch, dtype=np.int32)
        target_value_all = np.array(predict_value_all.tolist())
        target_value_all[replace_index, action] = target_value

        self.epislon_method.update()

        return network.optimizer(self.q_network,
                    predict_value_all,
                    torch.from_numpy(target_value_all))

    class epislon_method_1:
        def __init__(self):
            self.epislon_init = 0.01
            self.epislon_increment = 1.001
            self.epislon_max = 0.90

        def update(self):
            if self.epislon_init < self.epislon_max:
                self.epislon_init *= self.epislon_increment

    class epislon_method_2:
        def __init__(self):
            self.epislon_init = 0.95

        def update(self):
            pass


# 交互部分

In [None]:
class interactive:
    def __init__(self, env, epoch_max=1000, epoch_replace=1):
        self.env = env
        self.epoch_max = epoch_max
        self.epoch_replace = epoch_replace

        self.env = self.env.unwrapped
        self.agent = agent_q(self.env)

    def start_execute(self):
        self.epoch_index = 0
        self.loss_value = 0
        for i in range(self.epoch_max):
            self.epoch_index += 1
            state = self.env.reset()
            self.epoch_step = 0
            while True:
                self.epoch_step += 1
                action, _ = self.agent.output_action(state)
                next_state, reward, done, info = self.env.step(action)

                self.agent.memory.store(state, action, reward, next_state)
                state = next_state

                self.loss_value = self.agent.learn()

                if done:
                    break
            self.statistic()

    def statistic(self):
        if not self.epoch_index > 1:
            self.epoch_step_list = []
            self.loss_value_list = []
        print('epoch %-5s, length %-5s, loss_value %5f, epislon %5f' %
                (self.epoch_index, self.epoch_step, self.loss_value,
                self.agent.epislon_method.epislon_init))
        self.epoch_step_list.append(self.epoch_step)
        self.loss_value_list.append(self.loss_value)



# 代码执行

In [None]:
if __name__ == '__main__':
    env_name = 'MountainCar-v0'
    env = gym.make(env_name)
    dqn_evoluate = interactive(env, epoch_max=50)
    dqn_evoluate.start_execute()

# 随机网络蒸馏

## 编码网络

In [212]:
class encoder_net(nn.Module):
    def __init__(self, state_dim, encoder_dim=1000):
        super().__init__()
        self.linear1 = nn.Linear(state_dim, encoder_dim)

    def forward(self, s):
        x = self.linear1(s)
        return x

## 预训练网络

In [213]:
class pre_train_net(nn.Module):
    def __init__(self, state_dim, hidden_dim=128, encoder_dim=1000):
        super().__init__()
        self.linear1 = nn.Linear(state_dim, hidden_dim)
        self.activate = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, encoder_dim)

    def forward(self, s):
        x = self.activate(self.linear1(s))
        x = self.linear2(x)
        return x

## Q网络

In [218]:
class network:
    def __init__(self, env, hidden_dimension=128, learning_rate=1e-3):
        self.env = env
        self.hidden_dimension = hidden_dimension
        self.learning_rate = learning_rate

        self.input_dimension = self.env.observation_space.shape[0]
        self.output_dimension = self.env.action_space.n

        self.model = self.__create_network()
        self.encoder_net = encoder_net(self.input_dimension)
        self.model_pre_train = pre_train_net(self.input_dimension)

        self.loss = torch.nn.MSELoss()
        self.optimizer = torch.optim.RMSprop(self.model.parameters(),
                            lr=self.learning_rate)
        self.optimizer_pre_train = torch.optim.RMSprop(self.model_pre_train.parameters(),
                                lr=0.01)

    @staticmethod
    def replace(network_from, network_to):
        network_to.load_state_dict(network_from.state_dict())

    @staticmethod
    def optimizer(predict_object, predict_value, target_value):
        loss = predict_object.loss(predict_value, target_value)
        predict_object.optimizer.zero_grad()
        loss.backward()
        predict_object.optimizer.step()
        return loss.item()


    class q_network_model(nn.Module):
        def __init__(self, in_dim, out_dim, hidden_dim):
            super().__init__()
            self.in_dim = in_dim
            self.out_dim = out_dim
            self.hidden_dim = hidden_dim

            self.linear1 = nn.Linear(self.in_dim, self.hidden_dim)
            self.linear2 = nn.Linear(self.hidden_dim, self.out_dim)
            self.activate = nn.ReLU()

        def forward(self, s):
            x = self.linear1(s)
            x = self.activate(x)
            x = self.linear2(x)
            return x

    def __create_network(self):
        return self.q_network_model(self.input_dimension,
                      self.output_dimension,
                      self.hidden_dimension)



## 智能体

In [220]:
class agent_q:
    def __init__(self, env, gamma=0.9):
        self.env = env
        self.gamma = gamma

        self.epislon_method = self.epislon_method_1()

        self.q_network = network(self.env)
        self.q_network_target = network(self.env)
        self.memory = memory(self.env)

        self.epislon_learn_step = 0
        pass

    def output_action(self, state):
        state = torch.from_numpy(state)
        action_value = self.q_network.model(state)
        action_value = np.array(action_value.tolist())

        random_number = np.random.random()
        if random_number > self.epislon_method.epislon_init:
            action = self.env.action_space.sample()
        else:
            action = np.argmax(action_value)
            action = np.squeeze(action)
        return action, action_value

    def sample_postprocess(self):
        state, action, reward, next_state = self.memory.sample()
        state = torch.from_numpy(state)
        next_state = torch.from_numpy(next_state)
        action = np.squeeze(action)
        reward = np.squeeze(reward)
        action = action.astype(np.int32)
        return state, action, reward, next_state

    def learn(self):
        self.epislon_learn_step += 1

        if self.epislon_learn_step % 300 == 0:
            network.replace(self.q_network.model,
                    self.q_network_target.model)
            
        state, action, reward, next_state = self.sample_postprocess()
        next_state_encoder = self.q_network.encoder_net(next_state)
        next_state_pre_train = self.q_network.model_pre_train(next_state)

        loss = nn.functional.mse_loss(next_state_pre_train, next_state_encoder)
        loss_value = torch.sum(torch.pow(next_state_pre_train - next_state_encoder, 2),
                               axis=1).detach().numpy()

        if self.epislon_learn_step % 1000 == 0:
            self.q_network.optimizer_pre_train.zero_grad()
            loss.backward()
            # print('loss item:', loss.item())
            self.q_network.optimizer_pre_train.step()

        reward += loss_value
        # print(reward)

        target_value_max, target_action_max = torch.max(
            self.q_network_target.model(next_state), axis=1)
        target_value = reward + self.gamma * \
            np.array(target_value_max.tolist())

        predict_value_all = self.q_network.model(state)

        replace_index = np.arange(self.memory.memory_minibatch, dtype=np.int32)
        target_value_all = np.array(predict_value_all.tolist())
        target_value_all[replace_index, action] = target_value

        self.epislon_method.update()

        return network.optimizer(self.q_network,
                    predict_value_all,
                    torch.from_numpy(target_value_all))

    class epislon_method_1:
        def __init__(self):
            self.epislon_init = 0.01
            self.epislon_increment = 1.001
            self.epislon_max = 0.90

        def update(self):
            if self.epislon_init < self.epislon_max:
                self.epislon_init *= self.epislon_increment

    class epislon_method_2:
        def __init__(self):
            self.epislon_init = 0.95

        def update(self):
            pass


## 交互部分

In [216]:
class interactive:
    def __init__(self, env, epoch_max=1000, epoch_replace=1):
        self.env = env
        self.epoch_max = epoch_max
        self.epoch_replace = epoch_replace

        self.env = self.env.unwrapped
        self.agent = agent_q(self.env)

    def start_execute(self):
        self.epoch_index = 0
        self.loss_value = 0
        for i in range(self.epoch_max):
            self.epoch_index += 1
            state = self.env.reset()
            self.epoch_step = 0
            while True:
                self.epoch_step += 1
                action, _ = self.agent.output_action(state)
                next_state, reward, done, info = self.env.step(action)

                self.agent.memory.store(state, action, reward, next_state)
                state = next_state

                self.loss_value = self.agent.learn()

                if done:
                    break
            self.statistic()

    def statistic(self):
        if not self.epoch_index > 1:
            self.epoch_step_list = []
            self.loss_value_list = []
        print('epoch %-5s, length %-5s, loss_value %5f, epislon %5f' %
                (self.epoch_index, self.epoch_step, self.loss_value,
                self.agent.epislon_method.epislon_init))
        self.epoch_step_list.append(self.epoch_step)
        self.loss_value_list.append(self.loss_value)



## 代码执行

In [221]:
if __name__ == '__main__':
    env_name = 'MountainCar-v0'
    env = gym.make(env_name)
    dqn_evoluate = interactive(env, epoch_max=50)
    dqn_evoluate.start_execute()

epoch 1    , length 6990 , loss_value 8.370674, epislon 0.900847
epoch 2    , length 8051 , loss_value 0.130469, epislon 0.900847
epoch 3    , length 477  , loss_value 0.123358, epislon 0.900847
epoch 4    , length 318  , loss_value 0.357393, epislon 0.900847
epoch 5    , length 416  , loss_value 0.086545, epislon 0.900847
epoch 6    , length 2489 , loss_value 0.117268, epislon 0.900847
epoch 7    , length 178  , loss_value 0.047599, epislon 0.900847
epoch 8    , length 180  , loss_value 0.039014, epislon 0.900847
epoch 9    , length 327  , loss_value 0.046421, epislon 0.900847
epoch 10   , length 183  , loss_value 0.057766, epislon 0.900847
epoch 11   , length 180  , loss_value 0.024246, epislon 0.900847
epoch 12   , length 334  , loss_value 0.080988, epislon 0.900847
epoch 13   , length 193  , loss_value 0.026834, epislon 0.900847
epoch 14   , length 199  , loss_value 0.040778, epislon 0.900847
epoch 15   , length 167  , loss_value 0.046859, epislon 0.900847
epoch 16   , length 147  

# 工具参考

## TensorFlow实现

In [None]:
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 
import gym
import matplotlib.pyplot as plt
from gym import wrappers
tf.reset_default_graph()

class CuriosityNet:
    def __init__(
            self,
            n_a,
            n_s,
            lr=0.01,
            gamma=0.98,
            epsilon=0.95,
            replace_target_iter=300,
            memory_size=10000,
            batch_size=128,
            output_graph=False,
    ):
        self.n_a = n_a
        self.n_s = n_s
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size

        # total learning step
        self.learn_step_counter = 0
        self.memory_counter = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_s * 2 + 2))
        self.tfs, self.tfa, self.tfr, self.tfs_, self.dyn_train, self.dqn_train, self.q, self.int_r = \
            self._build_nets()

        t_params = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        e_params = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')

        with tf.variable_scope('hard_replacement'):
            self.target_replace_op = [
                tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

    def _build_nets(self):
        tfs = tf.placeholder(
            tf.float32, [None, self.n_s], name="s")    # input State
        # input Action
        tfa = tf.placeholder(tf.int32, [None, ], name="a")
        # extrinsic reward
        tfr = tf.placeholder(tf.float32, [None, ], name="ext_r")
        tfs_ = tf.placeholder(
            tf.float32, [None, self.n_s], name="s_")  # input Next State

        # dynamics net
        dyn_s_, curiosity, dyn_train = self._build_dynamics_net(tfs, tfa, tfs_)

        # normal RL model
        total_reward = tf.add(curiosity, tfr, name="total_r")
        q, dqn_loss, dqn_train = self._build_dqn(tfs, tfa, total_reward, tfs_)
        return tfs, tfa, tfr, tfs_, dyn_train, dqn_train, q, curiosity

    def _build_dynamics_net(self, s, a, s_):
        with tf.variable_scope("dyn_net"):
            float_a = tf.expand_dims(
                tf.cast(a, dtype=tf.float32, name="float_a"), axis=1, name="2d_a")
            sa = tf.concat((s, float_a), axis=1, name="sa")
            encoded_s_ = s_                # here we use s_ as the encoded s_

            dyn_l = tf.layers.dense(sa, 32, activation=tf.nn.relu)
            dyn_s_ = tf.layers.dense(dyn_l, self.n_s)  # predicted s_
        with tf.name_scope("int_r"):
            squared_diff = tf.reduce_sum(
                tf.square(encoded_s_ - dyn_s_), axis=1)  # intrinsic reward

        # It is better to reduce the learning rate in order to stay curious
        train_op = tf.train.RMSPropOptimizer(
            self.lr, name="dyn_opt").minimize(tf.reduce_mean(squared_diff))
        return dyn_s_, squared_diff, train_op

    def _build_dqn(self, s, a, r, s_):
        with tf.variable_scope('eval_net'):
            e1 = tf.layers.dense(s, 128, tf.nn.relu)
            q = tf.layers.dense(e1, self.n_a, name="q")
        with tf.variable_scope('target_net'):
            t1 = tf.layers.dense(s_, 128, tf.nn.relu)
            q_ = tf.layers.dense(t1, self.n_a, name="q_")

        with tf.variable_scope('q_target'):
            q_target = r + self.gamma * \
                tf.reduce_max(q_, axis=1, name="Qmax_s_")

        with tf.variable_scope('q_wrt_a'):
            a_indices = tf.stack(
                [tf.range(tf.shape(a)[0], dtype=tf.int32), a], axis=1)
            q_wrt_a = tf.gather_nd(params=q, indices=a_indices)

        loss = tf.losses.mean_squared_error(
            labels=q_target, predictions=q_wrt_a)   # TD error
        train_op = tf.train.RMSPropOptimizer(self.lr, name="dqn_opt").minimize(
            loss, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "eval_net"))
        return q, loss, train_op

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        # to have batch dimension when feed into tf placeholder
        s = observation[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            # forward feed the observation and get q value for every actions
            actions_value = self.sess.run(self.q, feed_dict={self.tfs: s})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_a)
        return action

    def learn(self):
        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.target_replace_op)

        # sample batch memory from all memory
        top = self.memory_size if self.memory_counter > self.memory_size else self.memory_counter
        sample_index = np.random.choice(top, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        bs, ba, br, bs_ = batch_memory[:, :self.n_s], batch_memory[:, self.n_s], \
            batch_memory[:, self.n_s + 1], batch_memory[:, -self.n_s:]
        self.sess.run(self.dqn_train, feed_dict={
                      self.tfs: bs, self.tfa: ba, self.tfr: br, self.tfs_: bs_})
        if self.learn_step_counter % 1000 == 0:     # delay training in order to stay curious
            self.sess.run(self.dyn_train, feed_dict={
                          self.tfs: bs, self.tfa: ba, self.tfs_: bs_})
        self.learn_step_counter += 1


env = gym.make('MountainCar-v0')
env = wrappers.Monitor(env, 'performance-1', force=True)
env._max_episode_steps = 50000
env = env.unwrapped

dqn = CuriosityNet(n_a=3, n_s=2, lr=0.01, output_graph=False)
ep_steps = []
for epi in range(50):
    s = env.reset()
    steps = 0
    while True:
        # env.render()
        a = dqn.choose_action(s)
        s_, r, done, info = env.step(a)
        dqn.store_transition(s, a, r, s_)
        dqn.learn()
        if done:
            print('Epi: ', epi, "| steps: ", steps)
            ep_steps.append(steps)
            break
        s = s_
        steps += 1

plt.plot(ep_steps)
plt.ylabel("steps")
plt.xlabel("episode")
plt.show()
