In [12]:
!pip install gym

Defaulting to user installation because normal site-packages is not writeable
Collecting gym
  Downloading gym-0.17.2.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 81 kB/s eta 0:00:016
Collecting pyglet<=1.5.0,>=1.4.0
  Downloading pyglet-1.5.0-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 30 kB/s eta 0:00:01
[?25hCollecting cloudpickle<1.4.0,>=1.2.0
  Downloading cloudpickle-1.3.0-py2.py3-none-any.whl (26 kB)
Building wheels for collected packages: gym
  Building wheel for gym (setup.py) ... [?25ldone
[?25h  Created wheel for gym: filename=gym-0.17.2-py3-none-any.whl size=1650891 sha256=c9088b2f09cfe188f24bc1a7761f7d5a3318e87329fb05ca7d8c2e86a44324e8
  Stored in directory: /home/yuhailin/.cache/pip/wheels/18/e1/58/89a2aa24e6c2cc800204fc02010612afdf200926c4d6bfe315
Successfully built gym
Installing collected packages: pyglet, cloudpickle, gym
Successfully installed cloudpickle-1.3.0 gym-0.17.2 pyglet-1.5.0


In [30]:
import  gym,os
import  numpy as np
import  matplotlib
from matplotlib import pyplot as plt
# Plot defaults, applied in a single batch (same keys, values and order).
matplotlib.rcParams.update({
    'font.size': 18,
    'figure.titlesize': 18,
    'figure.figsize': [9, 7],
    'font.family': ['KaiTi'],
    'axes.unicode_minus': False,
})

import 	tensorflow as tf
from    tensorflow import keras
from    tensorflow.keras import layers,optimizers,losses
from    PIL import Image

In [31]:
env = gym.make('CartPole-v1')  # Create the game environment

In [32]:
learning_rate = 0.0002  # step size handed to the policy network's Adam optimizer
gamma         = 0.99    # discount factor used when accumulating returns in train_net

In [33]:
class Policy(keras.Model):
    """Policy network that produces a probability distribution over actions.

    The model is a two-layer fully connected network: a 128-unit hidden layer
    with ReLU followed by a 2-unit output layer with softmax (one unit per
    action, left and right). Besides the layers, the instance owns:

    * ``data`` - a list buffering the ``(reward, log_prob)`` pairs of the
      trajectory currently being collected;
    * ``optimizer`` - the Adam optimizer used by ``train_net`` to update the
      network's own weights.
    """

    def __init__(self):
        super(Policy, self).__init__()
        self.data = []  # trajectory buffer of (r, log_prob) tuples
        # Input is the state, output has one unit per action (left / right).
        self.fc1 = layers.Dense(128, kernel_initializer='he_normal')
        self.fc2 = layers.Dense(2, kernel_initializer='he_normal')
        # Network optimizer. Use the `learning_rate` keyword: `lr` is only a
        # deprecated alias and is rejected by newer Keras releases.
        self.optimizer = optimizers.Adam(learning_rate=learning_rate)

    def call(self, inputs, training=None):
        """Return action probabilities for a batch of states.

        Args:
            inputs: batch of state vectors; the softmax is taken over axis 1,
                so a 2-D ``[batch, features]`` tensor is expected.
            training: accepted for Keras API compatibility; not used.

        Returns:
            Tensor of action probabilities (softmax over the 2 output units).
        """
        x = tf.nn.relu(self.fc1(inputs))
        x = tf.nn.softmax(self.fc2(x), axis=1)
        return x

    def put_data(self, item):
        """Append one ``(r, log_P(a|s))`` record to the trajectory buffer."""
        self.data.append(item)

    def train_net(self, tape):
        """Compute gradients and update the policy network parameters.

        Walks the buffered trajectory backwards, accumulating the discounted
        return ``R = r + gamma * R``, and for every time step applies one
        optimizer update with the loss ``-log_prob * R``. The buffer is
        cleared afterwards.

        Args:
            tape: the ``tf.GradientTape`` that recorded the forward passes
                which produced the buffered ``log_prob`` tensors. Because
                ``gradient`` is called once per time step the tape has to be
                persistent, and because ``stop_recording()`` is entered here
                the method is meant to be called while the tape is still
                recording (i.e. inside its ``with`` block).
        """
        R = 0  # the return starts at 0 past the end of the episode
        for r, log_prob in self.data[::-1]:
            R = r + gamma * R  # discounted return at this time step
            # One gradient computation (and update) per time step.
            loss = -log_prob * R
            with tape.stop_recording():
                # Keep the backward pass and the update off the tape.
                grads = tape.gradient(loss, self.trainable_variables)
                self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        self.data = []  # clear the trajectory


In [38]:
def main(num_episodes=300, print_interval=20):
    """Train a ``Policy`` network on the global ``env`` and plot the scores.

    For every episode the agent samples actions from the current policy,
    buffers ``(reward, log_prob)`` pairs in the network, and calls
    ``Policy.train_net`` once the episode ends. The mean episode score is
    printed and stored after every ``print_interval`` completed episodes, and
    the stored means are plotted at the end.

    Args:
        num_episodes: number of episodes to run (default 300).
        print_interval: number of episodes averaged per report (default 20).
    """
    pi = Policy()  # create the policy network
    # Dummy forward pass so the weights exist before summary() is called.
    pi(tf.random.normal((4, 4)))
    pi.summary()
    score = 0.0  # reward accumulated since the last report
    returns = []  # one mean score per completed reporting window

    for n_epi in range(num_episodes):
        s = env.reset()  # initial state
        # Persistent tape: train_net calls tape.gradient once per time step.
        with tf.GradientTape(persistent=True) as tape:
            for _ in range(501):
                # Feed the state vector to obtain the policy.
                s = tf.constant(s, dtype=tf.float32)
                s = tf.expand_dims(s, axis=0)
                prob = pi(s)  # action distribution: [1, 2]
                # Sample one action from the categorical distribution, shape: [1]
                a = tf.random.categorical(tf.math.log(prob), 1)[0]
                a = int(a)  # tensor -> Python int
                s_prime, r, done, info = env.step(a)
                # Record the reward and the log-probability of the chosen action.
                pi.put_data((r, tf.math.log(prob[0][a])))
                s = s_prime  # advance the state
                score += r  # accumulate reward

                if done:  # current episode finished
                    break
            # Train once per episode, while the tape is still recording
            # (train_net uses tape.stop_recording()).
            pi.train_net(tape)
        del tape  # release the resources held by the persistent tape

        # Report after every `print_interval` *completed* episodes so that each
        # average covers exactly `print_interval` episodes (the previous check
        # on n_epi folded 21 episodes into the first window and never reported
        # the last one).
        episodes_done = n_epi + 1
        if episodes_done % print_interval == 0:
            avg_score = score / print_interval
            returns.append(avg_score)
            print(f"# of episode :{episodes_done}, avg score : {avg_score}")
            score = 0.0
    env.close()  # close the environment

    # x position of each point = number of episodes completed at that report.
    episodes_axis = (np.arange(len(returns)) + 1) * print_interval
    plt.plot(episodes_axis, returns)
    plt.plot(episodes_axis, returns, 's')
    plt.xlabel('epochs')
    plt.ylabel('total reward')
    plt.show()

In [None]:
# Run training; prints the averaged scores and shows the reward plot.
main()

Model: "policy_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              multiple                  640       
_________________________________________________________________
dense_5 (Dense)              multiple                  258       
Total params: 898
Trainable params: 898
Non-trainable params: 0
_________________________________________________________________
# of episode :20, avg score : 15.0
# of episode :40, avg score : 16.1
# of episode :60, avg score : 18.95
# of episode :80, avg score : 32.85
# of episode :100, avg score : 34.15
# of episode :120, avg score : 36.95
# of episode :140, avg score : 60.7
# of episode :160, avg score : 65.6
# of episode :180, avg score : 84.8
# of episode :200, avg score : 118.1
# of episode :220, avg score : 136.2
# of episode :240, avg score : 201.05
# of episode :260, avg score : 327.95
