In [3]:
import numpy as np
import matplotlib.pyplot as plt
import moviepy.editor as mpy
import skimage.transform
from IPython.display import Image, display

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls
# Configure GPU visibility and memory growth before any GPU work is done.
gpus = tf.config.list_physical_devices("GPU") 
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU (index 0)
        tf.config.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)


        


Imageio: 'ffmpeg-linux64-v3.3.1' was not found on your computer; downloading it now.
Try 1. Download from https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg-linux64-v3.3.1 (43.8 MB)
Downloading: 8192/45929032 bytes (0.0%)2367488/45929032 bytes (5.2%)5824512/45929032 bytes (12.7%)9052160/45929032 bytes (19.7%)12484608/45929032 bytes (27.2%)15966208/45929032 bytes (34.8%)19546112/45929032 bytes (42.6%)23060480/45929032 bytes (50.2%)26615808/45929032 bytes (57.9%)30203904/45929032 bytes (65.8%)33792000/45929032 bytes (73.6%)37322752/45929032 bytes (81.3%)40796160/45929032 bytes (88.8%)

因為本機的 CUDA 版本太舊，所以改用 Colab 執行。
這邊安裝了一些必要的 package。

In [4]:
# Install pygame, gym, and two packages straight from GitHub
# (PyGame-Learning-Environment and gym-ple).
# NOTE(review): none of these installs is version-pinned, so a re-run may pull different versions.
!pip install pygame
!pip install gym
!pip install git+https://github.com/GrupoTuring/PyGame-Learning-Environment
!pip install git+https://github.com/lusob/gym-ple

Collecting pygame
  Downloading pygame-2.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.8 MB)
[K     |████████████████████████████████| 21.8 MB 1.6 MB/s 
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.1.2
Collecting git+https://github.com/GrupoTuring/PyGame-Learning-Environment
  Cloning https://github.com/GrupoTuring/PyGame-Learning-Environment to /tmp/pip-req-build-nftc7mjy
  Running command git clone -q https://github.com/GrupoTuring/PyGame-Learning-Environment /tmp/pip-req-build-nftc7mjy
Building wheels for collected packages: ple
  Building wheel for ple (setup.py) ... [?25l[?25hdone
  Created wheel for ple: filename=ple-0.0.2-py3-none-any.whl size=723174 sha256=64800b4c331475162ac81bdbafa4adc4b53a29db834eafc2753f8d53c046b851
  Stored in directory: /tmp/pip-ephem-wheel-cache-r6j3bam_/wheels/1c/65/8c/4644500b42643db570312f5fc3e90e8c7848a17136cf202f16
Successfully built ple
Installing collected packages: ple
Successfully installed p

In [5]:
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # use SDL's dummy video driver so no game window pops up

# NOTE(review): this rebinds `Image`, which the first cell imported from IPython.display.
from PIL import Image    
from ple import PLE
from ple.games.flappybird import FlappyBird

# Training environment.
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

# Separate environment instance used only by test_reward for evaluation.
test_game = FlappyBird()
test_env = PLE(test_game, fps=30, display_screen=False)
test_env.reset_game()

pygame 2.1.2 (SDL 2.0.16, Python 3.7.12)
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [6]:
# Hyperparameters shared by the model, the rollout loop and the PPO update.
hparas = {
    'image_size': 84,  # side length of the square frame produced by preprocess_screen
    'num_stack': 4,  # NOTE(review): not read by the visible code; frames_to_state hard-codes 4 frames
    'action_dim': len(env.getActionSet()),  # one actor output per entry in the environment's action set
    'hidden_size': 256,  # width of the Dense embedding layer in ActorCriticNetwork
    'lr': 0.0001,  # Adam learning rate
    'gamma': 0.99,  # discount factor passed to compute_gae
    'lambda': 0.95,  # GAE lambda passed to compute_gae
    'clip_val': 0.2,  # PPO ratio is clipped to [1 - clip_val, 1 + clip_val]
    'ppo_epochs': 8,  # passes over each rollout in Agent.ppo_update
    'test_epochs': 1,  # number of evaluation games averaged per test
    'num_steps': 512,  # environment steps collected per training iteration
    'mini_batch_size': 64,  # samples per gradient step in Agent.ppo_update
    'target_reward': 200,  # NOTE(review): not read by the visible code; early stopping uses early_stop_reward
    'max_episode': 30000,  # upper bound on training iterations
}

In [7]:
def make_anim(images, fps=60, true_image=False):
    """Build a moviepy ``VideoClip`` that plays ``images`` at ``fps`` frames per second.

    Args:
        images: Non-empty sequence of frames (arrays supporting ``astype``).
        fps: Playback rate; the clip lasts ``len(images) / fps`` seconds.
        true_image: If True, frames are cast to ``uint8`` unchanged. Otherwise
            each frame ``x`` is mapped through ``(x + 1) / 2 * 255`` before
            the cast.

    Returns:
        The ``mpy.VideoClip`` with its ``fps`` attribute set.

    Raises:
        ValueError: If ``images`` is empty (the clip would have zero duration
            and no frame to show).
    """
    if len(images) == 0:
        raise ValueError("make_anim needs at least one frame")

    duration = len(images) / fps

    def make_frame(t):
        # Map the timestamp to a frame index; an index past the end
        # (t at or beyond `duration`) falls back to the last frame.
        # Only IndexError is handled so unrelated errors are not hidden.
        try:
            x = images[int(len(images) / duration * t)]
        except IndexError:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps

    return clip


In [8]:
def preprocess_screen(screen):
    """Turn a raw game screen into a float32 network input frame.

    The screen is rotated by -90 degrees (with the canvas resized to fit),
    cut down to its first 400 rows, and resized to
    ``(image_size, image_size, 1)`` where ``image_size`` comes from ``hparas``.
    """
    upright = skimage.transform.rotate(screen, -90, resize=True)
    cropped = upright[:400, :]
    side = hparas['image_size']
    resized = skimage.transform.resize(cropped, [side, side, 1])
    return resized.astype(np.float32)

def frames_to_state(input_frames):
    """Stack four frames along the last axis to form one state.

    With four or more frames the last four are used. With fewer, frames are
    repeated to fill four slots:
      1 frame  -> f0, f0, f0, f0
      2 frames -> f0, f0, f1, f1
      3 frames -> f0, f1, f2, f2
    """
    # Frame indices to stack while fewer than four frames are available.
    warmup_layout = {
        1: (0, 0, 0, 0),
        2: (0, 0, 1, 1),
        3: (0, 1, 2, 2),
    }

    layout = warmup_layout.get(len(input_frames))
    if layout is None:
        stacked = input_frames[-4:]
    else:
        stacked = [input_frames[i] for i in layout]

    return np.concatenate(stacked, axis=-1)

可以發現 sample code 的 model 其實非常小：actor 與 critic 的 network 都只有一層 Dense layer，主要的 feature extractor 則是一個小型的 CNN。

In [9]:
class ActorCriticNetwork(tf.keras.Model):
    """Actor-critic model: one shared CNN trunk feeding two single-layer heads."""

    def __init__(self, hparas):
        """Create the trunk and both heads.

        Args:
            hparas: Dict providing 'hidden_size' (embedding width) and
                'action_dim' (number of actor outputs).
        """
        super().__init__()
        keras_layers = tf.keras.layers

        # Three convolution stages, then a flattened dense embedding.
        trunk = [
            keras_layers.Conv2D(filters=32, kernel_size=8, strides=4),
            keras_layers.ReLU(),
            keras_layers.Conv2D(filters=64, kernel_size=4, strides=2),
            keras_layers.ReLU(),
            keras_layers.Conv2D(filters=64, kernel_size=3, strides=1),
            keras_layers.ReLU(),
            keras_layers.Flatten(),
            keras_layers.Dense(hparas['hidden_size']),
            keras_layers.ReLU(),
        ]
        self.feature_extractor = tf.keras.Sequential(trunk)

        # Policy head: softmax over actions, so it outputs probabilities.
        self.actor = keras_layers.Dense(hparas['action_dim'], activation='softmax')
        # Value head: a single linear unit.
        self.critic = keras_layers.Dense(1, activation=None)

    def call(self, input):
        """Return ``(action_probabilities, state_value)`` for a batch of states."""
        features = self.feature_extractor(input)
        action_probs = self.actor(features)
        state_value = self.critic(features)
        return action_probs, state_value

In [10]:
class Agent():
    """Bundles the actor-critic network with its optimizer and the PPO update."""

    def __init__(self, hparas):
        """Read 'gamma', 'lr' and 'clip_val' from ``hparas`` and build the network."""
        self.gamma = hparas['gamma']
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=hparas['lr'])
        self.actor_critic = ActorCriticNetwork(hparas)
        self.clip_pram = hparas['clip_val']

    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        """Yield ``batch_size // mini_batch_size`` random mini-batches of the rollout.

        Indices are drawn with ``np.random.randint``, i.e. independently and
        with replacement, so a sample can appear more than once per pass.
        Each yielded item is the tuple
        ``(states, actions, log_probs, returns, advantage)`` gathered at the
        same indices.
        """
        batch_size = states.shape[0]
        rollout = (states, actions, log_probs, returns, advantage)
        for _ in range(batch_size // mini_batch_size):
            picks = np.random.randint(0, batch_size, mini_batch_size)
            picks = tf.convert_to_tensor(picks, dtype=tf.int32)
            yield tuple(tf.gather(field, picks) for field in rollout)

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, discount_rewards, advantages):
        """Run ``ppo_epochs`` passes of clipped-surrogate PPO updates.

        The actor loss is the negated mean of the clipped surrogate minus
        0.1 times the mean policy entropy; the critic loss is half the mean
        squared error between ``discount_rewards`` and the predicted value.
        Both are optimised together with the single shared optimizer.

        Returns:
            ``(total_actor_loss, total_critic_loss)`` summed over every
            mini-batch step.
        """
        actor_loss_sum = 0
        critic_loss_sum = 0
        clip_low = 1.0 - self.clip_pram
        clip_high = 1.0 + self.clip_pram
        model = self.actor_critic

        for _ in range(ppo_epochs):
            batches = self.ppo_iter(mini_batch_size, states, actions, log_probs, discount_rewards, advantages)
            for state, action, old_log_probs, target, advantage in batches:
                # Give the targets a trailing axis so they line up with the critic output.
                target = tf.expand_dims(target, axis=-1)

                with tf.GradientTape() as tape:
                    probs, value = model(state, training=True)
                    policy = tfp.distributions.Categorical(probs=probs, dtype=tf.float32)
                    entropy = tf.reduce_mean(policy.entropy())

                    # Probability ratio between the current and the rollout-time policy.
                    ratio = tf.exp(policy.log_prob(action) - old_log_probs)
                    unclipped = ratio * advantage
                    clipped = tf.clip_by_value(ratio, clip_low, clip_high) * advantage

                    actor_loss = -tf.reduce_mean(tf.minimum(unclipped, clipped)) - 0.1 * entropy
                    critic_loss = 0.5 * tf.reduce_mean(kls.mean_squared_error(target, value))

                    total_loss = actor_loss + critic_loss

                # One optimizer step over the whole network.
                variables = model.trainable_variables
                grads = tape.gradient(total_loss, variables)
                self.optimizer.apply_gradients(zip(grads, variables))

                actor_loss_sum += actor_loss
                critic_loss_sum += critic_loss

        return actor_loss_sum, critic_loss_sum


In [11]:
def compute_gae(rewards, masks, values, gamma, LAMBDA):
    """Compute GAE-based return targets for one rollout.

    Args:
        rewards: Per-step rewards, length T.
        masks: Per-step continuation flags, length T (0 where the episode
            ended at that step, 1 otherwise).
        values: Value estimates, length T + 1; the extra last entry is the
            bootstrap value for the state after the final step.
        gamma: Discount factor.
        LAMBDA: GAE smoothing factor.

    Returns:
        A list of length T whose entry t is ``advantage_t + values[t]``.
    """
    horizon = len(rewards)
    returns = [None] * horizon
    decay = gamma * LAMBDA
    gae = 0

    # Walk backwards so each step can reuse the advantage of the step after it.
    for t in range(horizon - 1, -1, -1):
        bootstrap = gamma * values[t + 1] * masks[t]
        delta = rewards[t] + bootstrap - values[t]
        gae = delta + decay * masks[t] * gae
        returns[t] = gae + values[t]

    return returns

In [12]:
def test_reward(test_env, agent):
    """Play one greedy evaluation game and return its total reward.

    The action with the highest actor probability is taken at every step
    (no sampling) until ``test_env.game_over()`` is true.

    Args:
        test_env: Environment exposing ``reset_game``, ``game_over``, ``act``,
            ``getActionSet`` and ``getScreenGrayscale``.
        agent: Object whose ``actor_critic`` maps a batched state to
            ``(action_probabilities, value)``.

    Returns:
        Sum of the rewards returned by ``test_env.act`` over the game.
    """
    # frames_to_state only reads the last four frames once four exist, so
    # older frames are dropped instead of piling up for the whole game.
    max_kept_frames = 4

    total_reward = 0
    # Reset the environment
    test_env.reset_game()
    input_frames = [preprocess_screen(test_env.getScreenGrayscale())]

    while not test_env.game_over():

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, _ = agent.actor_critic(state)

        action = np.argmax(prob[0].numpy())
        reward = test_env.act(test_env.getActionSet()[action])
        total_reward += reward

        input_frames.append(preprocess_screen(test_env.getScreenGrayscale()))
        # No-op while at most four frames are held, so the warm-up padding
        # in frames_to_state behaves exactly as before.
        del input_frames[:-max_kept_frames]

    return total_reward

In [16]:
# Build the agent and the run-control settings read by the training cells.
agent = Agent(hparas)
max_episode = hparas['max_episode']
test_per_n_episode = 10  # NOTE(review): not read by the visible training cells, which test every 50 or 100 iterations
force_save_per_n_episode = 1000  # NOTE(review): not read by the visible code
early_stop_reward = 10  # training stops once best_reward reaches this value

start_s = 0  # NOTE(review): not read by the visible code
best_reward = -5.0  # best average test reward seen so far

# Track both the network and the optimizer so training can be resumed.
checkpoint = tf.train.Checkpoint(
    actor_critic = agent.actor_critic,
    optimizer = agent.optimizer,
)

# Resume from a previously saved checkpoint; Google Drive must already be mounted.
# NOTE(review): this path is under MyDrive/Lab17 while some training cells save
# under MyDrive/ML/Lab17 - confirm which directory is the current one.
checkpoint.restore('/content/drive/MyDrive/Lab17/save/checkpoints/ckpt-41')


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fac19bb5b90>

In [None]:
# PPO training loop: collect a fixed-length rollout, update the agent, and
# periodically evaluate, save and render a demo video.
ep_reward = []
total_avgr = []
early_stop = False
avg_rewards_list = []

env.reset_game()

for s in range(0, max_episode):
    if early_stop:
        break

    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Collect num_steps transitions; a game that ends mid-rollout is restarted in place.
    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        # `action` holds exactly one sample; index it explicitly rather than
        # calling int() on a 1-element array.
        reward = env.act(env.getActionSet()[int(action.numpy()[0])])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Bootstrap from the state reached AFTER the final step. The previous code
    # re-evaluated the last pre-action state, so the last value target used
    # V(s_{T-1}) where V(s_T) belongs. If the final step ended the game its
    # mask is 0 and this value is ignored by compute_gae.
    next_state = tf.expand_dims(frames_to_state(input_frames), axis=0)
    _, next_value = agent.actor_critic(next_state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    # values has one extra (bootstrap) entry, hence the [:-1].
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    if s % 100 == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        if avg_reward > best_reward:
            best_reward = avg_reward

        # Save exactly once per evaluation. Previously a new best triggered the
        # same model save and a second checkpoint immediately before this one.
        agent.actor_critic.save('/content/drive/MyDrive/ML/Lab17/save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        checkpoint.save(file_prefix = '/content/drive/MyDrive/ML/Lab17/save/checkpoints/ckpt')
        clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("/content/drive/MyDrive/ML/Lab17/{}_demo-{}.webm".format('Lab17', s), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

    if best_reward >= early_stop_reward:
        early_stop = True

[Episode 0]  Actor loss: 9.92662, Critic loss: 1.81462
Test average reward is -5.0, Current best average reward is -5.0

INFO:tensorflow:Assets written to: /content/drive/MyDrive/ML/Lab17/save/Actor/model_actor_0_-5.0/assets
[MoviePy] >>>> Building video /content/drive/MyDrive/ML/Lab17/Lab17_demo-0.webm
[MoviePy] Writing video /content/drive/MyDrive/ML/Lab17/Lab17_demo-0.webm


100%|█████████▉| 513/514 [00:26<00:00, 19.37it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: /content/drive/MyDrive/ML/Lab17/Lab17_demo-0.webm 




100%|█████████▉| 513/514 [00:02<00:00, 216.57it/s]


[Episode 1]  Actor loss: 4.64537, Critic loss: 1.85919
[Episode 2]  Actor loss: -3.33689, Critic loss: 2.23525
[Episode 3]  Actor loss: 8.76013, Critic loss: 1.49893
[Episode 4]  Actor loss: -4.23454, Critic loss: 1.29827
[Episode 5]  Actor loss: -8.34846, Critic loss: 1.21915
[Episode 6]  Actor loss: -5.63089, Critic loss: 0.74755
[Episode 7]  Actor loss: -11.61086, Critic loss: 1.17963
[Episode 8]  Actor loss: -2.76820, Critic loss: 0.65783
[Episode 9]  Actor loss: -3.57708, Critic loss: 0.65928
[Episode 10]  Actor loss: -1.67447, Critic loss: 1.25803
[Episode 11]  Actor loss: -3.99446, Critic loss: 1.16007
[Episode 12]  Actor loss: -9.31354, Critic loss: 1.62471
[Episode 13]  Actor loss: -1.83453, Critic loss: 0.90047
[Episode 14]  Actor loss: -9.39906, Critic loss: 1.18731
[Episode 15]  Actor loss: -2.03255, Critic loss: 1.18917
[Episode 16]  Actor loss: -13.85658, Critic loss: 1.38044
[Episode 17]  Actor loss: -11.34039, Critic loss: 1.90935
[Episode 18]  Actor loss: -1.87659, Cri

100%|█████████▉| 513/514 [00:12<00:00, 40.96it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: /content/drive/MyDrive/ML/Lab17/Lab17_demo-100.webm 




100%|█████████▉| 513/514 [00:02<00:00, 202.72it/s]


[Episode 101]  Actor loss: -8.32007, Critic loss: 2.29648
[Episode 102]  Actor loss: -17.76615, Critic loss: 1.43079
[Episode 103]  Actor loss: -10.60945, Critic loss: 2.48282
[Episode 104]  Actor loss: -3.07548, Critic loss: 2.28201
[Episode 105]  Actor loss: -1.01718, Critic loss: 2.79084
[Episode 106]  Actor loss: -4.68008, Critic loss: 1.91413
[Episode 107]  Actor loss: -7.07602, Critic loss: 2.26278
[Episode 108]  Actor loss: -4.58854, Critic loss: 3.69778


這次的lab跑了非常久，但由於使用colab所以無法跟之前Sarsa比較。

In [15]:
# Mount Google Drive; the checkpoint, model and video paths used in this
# notebook all live under /content/drive.
# NOTE(review): the execution counts show this cell ran before the checkpoint
# restore cell even though it appears later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# PPO training loop resumed at iteration 350, evaluating every 50 iterations.
# NOTE(review): this cell relies on `early_stop`, `avg_rewards_list` and
# `best_reward` left over from earlier cells; it fails on a fresh kernel
# unless those cells have been run first.
for s in range(350, max_episode):
    if early_stop:
        break

    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Collect num_steps transitions; a game that ends mid-rollout is restarted in place.
    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        # `action` holds exactly one sample; index it explicitly rather than
        # calling int() on a 1-element array.
        reward = env.act(env.getActionSet()[int(action.numpy()[0])])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Bootstrap from the state reached AFTER the final step. The previous code
    # re-evaluated the last pre-action state, so the last value target used
    # V(s_{T-1}) where V(s_T) belongs. If the final step ended the game its
    # mask is 0 and this value is ignored by compute_gae.
    next_state = tf.expand_dims(frames_to_state(input_frames), axis=0)
    _, next_value = agent.actor_critic(next_state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    # values has one extra (bootstrap) entry, hence the [:-1].
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    if s % 50 == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        if avg_reward > best_reward:
            best_reward = avg_reward

        # Save exactly once per evaluation. Previously a new best triggered the
        # same model save and a second checkpoint immediately before this one.
        agent.actor_critic.save('/content/drive/MyDrive/ML/Lab17/save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        checkpoint.save(file_prefix = '/content/drive/MyDrive/ML/Lab17/save/checkpoints/ckpt')
        clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("/content/drive/MyDrive/ML/Lab17/{}_demo-{}.webm".format('Lab17', s), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

    if best_reward >= early_stop_reward:
        early_stop = True

[Episode 350]  Actor loss: -4.78898, Critic loss: 0.22233
Test average reward is -5.0, Current best average reward is -4.0

INFO:tensorflow:Assets written to: /content/drive/MyDrive/ML/Lab17/save/Actor/model_actor_350_-5.0/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/ML/Lab17/save/Actor/model_actor_350_-5.0/assets


[MoviePy] >>>> Building video /content/drive/MyDrive/ML/Lab17/Lab17_demo-350.webm
[MoviePy] Writing video /content/drive/MyDrive/ML/Lab17/Lab17_demo-350.webm


100%|█████████▉| 513/514 [00:10<00:00, 46.95it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: /content/drive/MyDrive/ML/Lab17/Lab17_demo-350.webm 




100%|█████████▉| 513/514 [00:02<00:00, 219.50it/s]


[Episode 351]  Actor loss: -0.87693, Critic loss: 0.32554
[Episode 352]  Actor loss: -2.92151, Critic loss: 0.29948
[Episode 353]  Actor loss: -6.32510, Critic loss: 0.68057
[Episode 354]  Actor loss: -1.66025, Critic loss: 0.32836
[Episode 355]  Actor loss: -3.18148, Critic loss: 0.31571
[Episode 356]  Actor loss: -4.83701, Critic loss: 0.29166
[Episode 357]  Actor loss: -6.00509, Critic loss: 0.33636
[Episode 358]  Actor loss: -7.47656, Critic loss: 0.58264
[Episode 359]  Actor loss: -5.60705, Critic loss: 0.40880
[Episode 360]  Actor loss: -4.09983, Critic loss: 0.28298
[Episode 361]  Actor loss: -5.78716, Critic loss: 0.31380


In [None]:
# PPO training loop: collect a fixed-length rollout, update the agent, and
# periodically evaluate, save and render a demo video.
# NOTE(review): this cell repeats an earlier training cell in this notebook;
# consider keeping a single copy.
ep_reward = []
total_avgr = []
early_stop = False
avg_rewards_list = []

env.reset_game()

for s in range(0, max_episode):
    if early_stop:
        break

    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Collect num_steps transitions; a game that ends mid-rollout is restarted in place.
    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        # `action` holds exactly one sample; index it explicitly rather than
        # calling int() on a 1-element array.
        reward = env.act(env.getActionSet()[int(action.numpy()[0])])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Bootstrap from the state reached AFTER the final step. The previous code
    # re-evaluated the last pre-action state, so the last value target used
    # V(s_{T-1}) where V(s_T) belongs. If the final step ended the game its
    # mask is 0 and this value is ignored by compute_gae.
    next_state = tf.expand_dims(frames_to_state(input_frames), axis=0)
    _, next_value = agent.actor_critic(next_state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    # values has one extra (bootstrap) entry, hence the [:-1].
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    if s % 100 == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        if avg_reward > best_reward:
            best_reward = avg_reward

        # Save exactly once per evaluation. Previously a new best triggered the
        # same model save and a second checkpoint immediately before this one.
        agent.actor_critic.save('/content/drive/MyDrive/ML/Lab17/save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        checkpoint.save(file_prefix = '/content/drive/MyDrive/ML/Lab17/save/checkpoints/ckpt')
        clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("/content/drive/MyDrive/ML/Lab17/{}_demo-{}.webm".format('Lab17', s), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

    if best_reward >= early_stop_reward:
        early_stop = True

In [None]:
# PPO training loop: collect a fixed-length rollout, update the agent, and
# periodically evaluate, save and render a demo video.
# This copy writes its outputs under /content/drive/MyDrive/Lab17.
ep_reward = []
total_avgr = []
early_stop = False
avg_rewards_list = []

env.reset_game()

for s in range(0, max_episode):
    if early_stop:
        break

    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Collect num_steps transitions; a game that ends mid-rollout is restarted in place.
    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        # `action` holds exactly one sample; index it explicitly rather than
        # calling int() on a 1-element array.
        reward = env.act(env.getActionSet()[int(action.numpy()[0])])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Bootstrap from the state reached AFTER the final step. The previous code
    # re-evaluated the last pre-action state, so the last value target used
    # V(s_{T-1}) where V(s_T) belongs. If the final step ended the game its
    # mask is 0 and this value is ignored by compute_gae.
    next_state = tf.expand_dims(frames_to_state(input_frames), axis=0)
    _, next_value = agent.actor_critic(next_state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    # values has one extra (bootstrap) entry, hence the [:-1].
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    if s % 100 == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        if avg_reward > best_reward:
            best_reward = avg_reward

        # Save exactly once per evaluation. Previously a new best triggered the
        # same model save and a second checkpoint immediately before this one.
        agent.actor_critic.save('/content/drive/MyDrive/Lab17/save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        checkpoint.save(file_prefix = '/content/drive/MyDrive/Lab17/save/checkpoints/ckpt')
        clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("/content/drive/MyDrive/Lab17/{}_demo-{}.webm".format('Lab17', s), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

    if best_reward >= early_stop_reward:
        early_stop = True

[Episode 0]  Actor loss: -6.65580, Critic loss: 2.58232
Test average reward is -5.0, Current best average reward is -5.0

INFO:tensorflow:Assets written to: /content/drive/MyDrive/Lab17/save/Actor/model_actor_0_-5.0/assets
[MoviePy] >>>> Building video /content/drive/MyDrive/Lab17/Lab17_demo-0.webm
[MoviePy] Writing video /content/drive/MyDrive/Lab17/Lab17_demo-0.webm


100%|█████████▉| 513/514 [00:13<00:00, 37.51it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: /content/drive/MyDrive/Lab17/Lab17_demo-0.webm 



100%|█████████▉| 513/514 [00:02<00:00, 210.71it/s]


[Episode 1]  Actor loss: -13.24397, Critic loss: 1.19053
[Episode 2]  Actor loss: -16.05931, Critic loss: 2.73037
[Episode 3]  Actor loss: -5.41935, Critic loss: 2.28920
[Episode 4]  Actor loss: -2.15067, Critic loss: 2.28578
[Episode 5]  Actor loss: -8.61098, Critic loss: 1.93225
[Episode 6]  Actor loss: -9.66016, Critic loss: 1.89916
[Episode 7]  Actor loss: -5.98418, Critic loss: 0.85674
[Episode 8]  Actor loss: -19.78886, Critic loss: 2.34340
[Episode 9]  Actor loss: -13.73773, Critic loss: 2.48574
[Episode 10]  Actor loss: -12.04491, Critic loss: 3.21916
[Episode 11]  Actor loss: 1.77783, Critic loss: 4.28993
[Episode 12]  Actor loss: 2.19222, Critic loss: 4.29365
[Episode 13]  Actor loss: -9.99139, Critic loss: 4.01409
[Episode 14]  Actor loss: -14.10838, Critic loss: 2.95867
[Episode 15]  Actor loss: -10.40547, Critic loss: 2.54685
[Episode 16]  Actor loss: -19.01404, Critic loss: 3.01433
[Episode 17]  Actor loss: -0.02440, Critic loss: 2.30793
[Episode 18]  Actor loss: -7.47175

100%|█████████▉| 513/514 [00:12<00:00, 41.93it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: /content/drive/MyDrive/Lab17/Lab17_demo-100.webm 



100%|█████████▉| 513/514 [00:02<00:00, 205.46it/s]


[Episode 101]  Actor loss: -2.69717, Critic loss: 3.64991
[Episode 102]  Actor loss: -14.83696, Critic loss: 2.60648
[Episode 103]  Actor loss: -7.68761, Critic loss: 2.43200
[Episode 104]  Actor loss: 1.85902, Critic loss: 1.14810
[Episode 105]  Actor loss: -3.99432, Critic loss: 2.49796
[Episode 106]  Actor loss: 1.35563, Critic loss: 2.76304
[Episode 107]  Actor loss: -12.74383, Critic loss: 1.96540
[Episode 108]  Actor loss: -8.01345, Critic loss: 2.25369
[Episode 109]  Actor loss: -4.85032, Critic loss: 2.91194
[Episode 110]  Actor loss: -9.73932, Critic loss: 1.62696
[Episode 111]  Actor loss: 3.74747, Critic loss: 3.44101
[Episode 112]  Actor loss: -2.32663, Critic loss: 1.83202
[Episode 113]  Actor loss: -11.72855, Critic loss: 2.62341
[Episode 114]  Actor loss: -4.62569, Critic loss: 1.21728
[Episode 115]  Actor loss: -5.74698, Critic loss: 2.92089
[Episode 116]  Actor loss: -12.09376, Critic loss: 1.74460
[Episode 117]  Actor loss: -33.66854, Critic loss: 3.98351
[Episode 118

100%|█████████▉| 513/514 [00:13<00:00, 38.28it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: /content/drive/MyDrive/Lab17/Lab17_demo-200.webm 




100%|█████████▉| 513/514 [00:02<00:00, 191.78it/s]


[Episode 201]  Actor loss: -12.14003, Critic loss: 4.59398
[Episode 202]  Actor loss: -9.07983, Critic loss: 3.18548
[Episode 203]  Actor loss: 0.11354, Critic loss: 3.13080
[Episode 204]  Actor loss: -4.89624, Critic loss: 4.23171
[Episode 205]  Actor loss: -4.87605, Critic loss: 3.39980
[Episode 206]  Actor loss: -4.02242, Critic loss: 4.32125
[Episode 207]  Actor loss: -2.51458, Critic loss: 2.59848
[Episode 208]  Actor loss: -1.46873, Critic loss: 2.25789
[Episode 209]  Actor loss: -8.50643, Critic loss: 2.24433
[Episode 210]  Actor loss: -4.93523, Critic loss: 1.36012
[Episode 211]  Actor loss: -10.21931, Critic loss: 6.90638
[Episode 212]  Actor loss: -3.02215, Critic loss: 2.45570
[Episode 213]  Actor loss: -25.04956, Critic loss: 4.63085
[Episode 214]  Actor loss: -9.90611, Critic loss: 5.65313
[Episode 215]  Actor loss: -0.66873, Critic loss: 3.82373
[Episode 216]  Actor loss: -7.40840, Critic loss: 4.42035
[Episode 217]  Actor loss: -6.61853, Critic loss: 3.58763
[Episode 218

100%|█████████▉| 513/514 [00:11<00:00, 43.51it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: /content/drive/MyDrive/Lab17/Lab17_demo-300.webm 



100%|█████████▉| 513/514 [00:02<00:00, 185.88it/s]


[Episode 301]  Actor loss: -3.68878, Critic loss: 4.06080
[Episode 302]  Actor loss: 8.39528, Critic loss: 2.43897
[Episode 303]  Actor loss: -5.73728, Critic loss: 2.91342
[Episode 304]  Actor loss: 3.96540, Critic loss: 4.13274
[Episode 305]  Actor loss: -5.60055, Critic loss: 2.70606
[Episode 306]  Actor loss: -7.07031, Critic loss: 2.44246
[Episode 307]  Actor loss: 11.96911, Critic loss: 2.27605
[Episode 308]  Actor loss: -14.32069, Critic loss: 2.98387
[Episode 309]  Actor loss: -8.54458, Critic loss: 3.06756
[Episode 310]  Actor loss: -2.23395, Critic loss: 2.34692
[Episode 311]  Actor loss: 3.59394, Critic loss: 3.26059
[Episode 312]  Actor loss: -13.31017, Critic loss: 6.39555
[Episode 313]  Actor loss: 9.75885, Critic loss: 6.28976
[Episode 314]  Actor loss: -0.82579, Critic loss: 3.75425
[Episode 315]  Actor loss: -19.52435, Critic loss: 3.54716
[Episode 316]  Actor loss: -1.94300, Critic loss: 1.21377
[Episode 317]  Actor loss: -1.60733, Critic loss: 2.69218
[Episode 318]  

100%|█████████▉| 513/514 [00:12<00:00, 40.26it/s]

[MoviePy] Done.
[MoviePy] >>>> Video ready: /content/drive/MyDrive/Lab17/Lab17_demo-400.webm 




100%|█████████▉| 513/514 [00:02<00:00, 172.89it/s]


[Episode 401]  Actor loss: -18.02440, Critic loss: 2.02998
[Episode 402]  Actor loss: 6.80832, Critic loss: 2.00497
[Episode 403]  Actor loss: -2.32874, Critic loss: 2.00733
[Episode 404]  Actor loss: -5.54784, Critic loss: 2.70122
[Episode 405]  Actor loss: -5.66454, Critic loss: 2.82554
[Episode 406]  Actor loss: -2.09339, Critic loss: 4.73558
[Episode 407]  Actor loss: -8.95384, Critic loss: 3.02128
[Episode 408]  Actor loss: 4.03257, Critic loss: 2.66044
[Episode 409]  Actor loss: -9.08668, Critic loss: 2.50530
[Episode 410]  Actor loss: -17.86114, Critic loss: 3.11535
[Episode 411]  Actor loss: -17.31636, Critic loss: 1.73757
[Episode 412]  Actor loss: 1.59686, Critic loss: 5.31768
[Episode 413]  Actor loss: 6.49036, Critic loss: 5.15502
[Episode 414]  Actor loss: -11.13320, Critic loss: 2.67676
[Episode 415]  Actor loss: 3.33570, Critic loss: 2.83443
[Episode 416]  Actor loss: 5.07042, Critic loss: 2.33238
[Episode 417]  Actor loss: 8.14934, Critic loss: 2.36102
[Episode 418]  Ac

100%|█████████▉| 513/514 [00:12<00:00, 39.60it/s]


[MoviePy] Done.
[MoviePy] >>>> Video ready: /content/drive/MyDrive/Lab17/Lab17_demo-500.webm 



100%|█████████▉| 513/514 [00:03<00:00, 167.57it/s]


[Episode 501]  Actor loss: 1.56557, Critic loss: 5.22552
[Episode 502]  Actor loss: 11.47033, Critic loss: 4.49066
[Episode 503]  Actor loss: -17.00399, Critic loss: 8.20125
[Episode 504]  Actor loss: -11.90815, Critic loss: 6.77610
[Episode 505]  Actor loss: -3.53258, Critic loss: 3.34089
[Episode 506]  Actor loss: -3.30710, Critic loss: 2.62025
[Episode 507]  Actor loss: -15.11455, Critic loss: 3.89085
[Episode 508]  Actor loss: -20.25179, Critic loss: 3.85983
[Episode 509]  Actor loss: -15.76460, Critic loss: 3.52232
[Episode 510]  Actor loss: -23.03700, Critic loss: 6.09453
[Episode 511]  Actor loss: -8.07550, Critic loss: 3.18982
[Episode 512]  Actor loss: 11.55306, Critic loss: 4.30068
[Episode 513]  Actor loss: -0.98997, Critic loss: 2.34242
[Episode 514]  Actor loss: 9.73242, Critic loss: 4.70334


相比於傳統的 RL algorithm（像是 Sarsa 或 Q-learning），Deep RL 每個 episode 的訓練時間更久。我認為 Deep RL 在比較困難的 task 才能展現出它的優勢。


影片連結
https://drive.google.com/file/d/11EdhyTV2mLxjXs4h-UxuZ1OMfuNGHO4_/view?usp=sharing