### Import libraries

In [1]:
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["SDL_VIDEODRIVER"] = "dummy"  # make pygame window not appear
import numpy as np
from ple.games.flappybird import FlappyBird
from ple import PLE

# Create the Flappy Bird game and wrap it in a PLE environment.
game = FlappyBird()
# display_screen=False keeps the run headless, matching the dummy SDL video driver set above.
env = PLE(game, fps=30, display_screen=False)  # environment interface to the game

couldn't import doomish
Couldn't import doom


In [2]:
# game.actions is a dictionary mapping an action description to its action code
print(game.actions)
# env.getActionSet() is the list of action codes the agent may pass to env.act();
# it also contains None (presumably "press nothing" -- confirm against the PLE docs)
print(env.getActionSet())

{'up': 119}
[119, None]


In [3]:
game.getGameState()  # a dictionary describing the current game state

{'next_next_pipe_bottom_y': 207,
 'next_next_pipe_dist_to_player': 427.0,
 'next_next_pipe_top_y': 107,
 'next_pipe_bottom_y': 129,
 'next_pipe_dist_to_player': 283,
 'next_pipe_top_y': 29,
 'player_vel': 0,
 'player_y': 256}

In [4]:
import math
import copy
from collections import defaultdict
MIN_EXPLORING_RATE = 0.01  # lower bound for epsilon in epsilon-greedy selection
MIN_LEARNING_RATE = 0.05  # lower bound for the step size of the value update


class Agent:
  """Tabular, epsilon-greedy value-learning agent over a discretized state.

  The game state is a dict of numeric features. Each state is mapped to a
  tuple of bucket indices (see `get_state_idx`) which is used as the key of
  a lazily-populated Q-table holding one value per action.
  """

  def __init__(self,
               bucket_range_per_feature,
               num_action,
               t=0,
               discount_factor=0.99):
    """Create an agent.

    Args:
      bucket_range_per_feature: bucket width for each feature, in the
        alphabetical order of the state's keys. Larger widths merge more raw
        values into one bucket: a smaller table and faster training, at the
        cost of a coarser policy. E.g. with width 2 the values 0 and 1 share a
        bucket because int(0 / 2) == int(1 / 2).
      num_action: number of discrete actions (length of each Q-table row).
      t: episode index used to initialise the exploring and learning rates.
      discount_factor: weight of the bootstrapped next-state value.
    """
    self.update_parameters(t)  # init exploring rate and learning rate
    # Unseen states get an all-zero row on first access.
    self.q_table = defaultdict(lambda: np.zeros(num_action))
    self.discount_factor = discount_factor
    self.num_action = num_action
    self.bucket_range_per_feature = bucket_range_per_feature

  def select_action(self, state):
    """Return an action index for `state` using epsilon-greedy selection."""
    state_idx = self.get_state_idx(state)
    if np.random.rand() < self.exploring_rate:
      # Explore. Use the agent's own action count: the previous code read a
      # notebook-global `num_action`, which breaks outside that kernel state.
      return np.random.choice(self.num_action)
    # Exploit: action with the highest estimated value.
    return np.argmax(self.q_table[state_idx])

  def update_policy(self, state, action, reward, state_plum):
    """Move Q(state, action) toward the one-step bootstrapped target.

    The target is reward + discount_factor * mean(Q(state_plum, .)).

    NOTE: the bootstrap term is the *unweighted mean* over the next state's
    action values. That is neither Q-learning (which would use the max) nor
    SARSA (which would use the value of the next action actually taken); it
    corresponds to the expected value under a uniformly random next action.

    Args:
      state: state dict before the action was executed.
      action: index of the executed action.
      reward: reward returned by the environment for that action.
      state_plum: state dict observed after the action.
    """
    state_idx = self.get_state_idx(state)
    next_idx = self.get_state_idx(state_plum)
    # Q-learning alternative: bootstrap = np.max(self.q_table[next_idx])
    bootstrap = np.average(self.q_table[next_idx])
    target = reward + self.discount_factor * bootstrap
    td_error = target - self.q_table[state_idx][action]
    self.q_table[state_idx][action] += self.learning_rate * td_error

  def get_state_idx(self, state):
    """Discretize `state` into a hashable tuple of bucket indices.

    Pipe heights are first made relative to the player's height. Features are
    then ordered alphabetically by key and each is divided by its bucket
    width and truncated with int(). `state` itself is not modified.

    NOTE(review): int() truncates toward zero, so values in (-width, width)
    all land in bucket 0, making that bucket twice as wide as the others.
    """
    player_y = state['player_y']
    relative = dict(state)  # shallow copy; values are rebound, never mutated
    for key in ('next_next_pipe_bottom_y', 'next_next_pipe_top_y',
                'next_pipe_bottom_y', 'next_pipe_top_y'):
      relative[key] = state[key] - player_y

    # Keys are unique, so sorting the items orders the values by key name.
    ordered = [value for _, value in sorted(relative.items())]

    return tuple(
        int(value / self.bucket_range_per_feature[i])
        for i, value in enumerate(ordered))

  def update_parameters(self, episode):
    """Set exploring and learning rates from the episode index.

    Both follow 0.99 ** (episode / 30), capped at 0.5 from above and at their
    respective minimum constants from below.
    """
    decayed = min(0.5, 0.99**(episode / 30))
    self.exploring_rate = max(MIN_EXPLORING_RATE, decayed)
    self.learning_rate = max(MIN_LEARNING_RATE, decayed)

  def shutdown_explore(self):
    """Make action selection purely greedy until rates are next updated."""
    self.exploring_rate = 0

In [5]:
# Number of discrete actions the environment accepts (including the None entry).
num_action = len(env.getActionSet())
# Bucket width per state feature. Agent.get_state_idx orders the features
# alphabetically by key, so the widths apply to, in order:
#   next_next_pipe_bottom_y, next_next_pipe_dist_to_player, next_next_pipe_top_y,
#   next_pipe_bottom_y, next_pipe_dist_to_player, next_pipe_top_y,
#   player_vel, player_y
bucket_range_per_feature = [40, 40, 512, 20, 20, 20, 4, 16]

# init agent (episode index and discount factor keep their defaults)
agent = Agent(bucket_range_per_feature, num_action)

In [None]:
def make_anim(images, fps=60, true_image=False):
  """Build a moviepy VideoClip that plays `images` at `fps` frames per second.

  Args:
    images: non-empty sequence of frames (arrays supporting `.astype`).
    fps: playback rate; the clip lasts len(images) / fps seconds.
    true_image: if True, frames are cast to uint8 unchanged. Otherwise each
      frame is mapped with (x + 1) / 2 * 255 before the cast (presumably for
      frames scaled to [-1, 1] -- confirm against the caller).

  Returns:
    A `moviepy.editor.VideoClip` with its `fps` attribute set.

  Raises:
    ValueError: if `images` is empty.
  """
  # Fail early and clearly: an empty list would otherwise give a zero
  # duration and an opaque error inside the frame callback.
  if len(images) == 0:
    raise ValueError("make_anim requires at least one frame")

  duration = len(images) / fps
  # Imported lazily so the notebook only needs moviepy when rendering.
  # NOTE(review): newer moviepy releases may not provide `moviepy.editor`
  # -- confirm the installed version.
  import moviepy.editor as mpy

  def make_frame(t):
    """Return the uint8 frame to show at time `t` (seconds)."""
    try:
      x = images[int(len(images) / duration * t)]
    except IndexError:
      # t at (or rounded past) the end of the clip: hold the last frame.
      x = images[-1]

    if true_image:
      return x.astype(np.uint8)
    else:
      return ((x + 1) / 2 * 255).astype(np.uint8)

  clip = mpy.VideoClip(make_frame, duration=duration)
  clip.fps = fps
  return clip

In [None]:
%%time
from IPython.display import Image, display

# Metrics below are sampled once every `print_every_episode` episodes.
reward_per_epoch = []
lifetime_per_epoch = []
exploring_rates = []
learning_rates = []
print_every_episode = 1000
show_gif_every_episode = 5000
NUM_EPISODE = 100000

# Looked up once instead of on every step (assumed not to change during
# training -- it is a plain list of action codes).
action_set = env.getActionSet()

for episode in range(0, NUM_EPISODE):
  is_report_episode = episode % print_every_episode == 0
  is_gif_episode = episode % show_gif_every_episode == 0

  # Reset the environment
  env.reset_game()

  # Frames are only needed for episodes that render an animation; grabbing
  # the screen on every step of every episode is wasted work.
  frames = [env.getScreenRGB()] if is_gif_episode else []

  # every `print_every_episode` episodes, shut down exploration to see the
  # performance of the greedy policy (rates are restored after the episode)
  if is_report_episode:
    agent.shutdown_explore()

  # the initial state
  state = game.getGameState()
  cum_reward = 0  # cumulative reward for this episode
  t = 0

  while not env.game_over():

    # select an action
    action = agent.select_action(state)

    # execute the action and get reward
    reward = env.act(action_set[action])  # reward = +1 when pass a pipe, -5 when die

    if is_gif_episode:
      frames.append(env.getScreenRGB())

    # accumulate reward
    cum_reward += reward

    # observe the result
    state_plum = game.getGameState()  # get next state

    # update agent
    agent.update_policy(state, action, reward, state_plum)

    # Setting up for the next iteration
    state = state_plum
    t += 1

  # update exploring rate and learning rate
  agent.update_parameters(episode)

  if is_report_episode:
    print("Episode %d finished after %f time steps" % (episode, t))
    print("cumulated reward: %f" % cum_reward)
    print("exploring rate %f" % agent.exploring_rate)
    print("learning rate %f" % agent.learning_rate)
    reward_per_epoch.append(cum_reward)
    exploring_rates.append(agent.exploring_rate)
    learning_rates.append(agent.learning_rate)
    lifetime_per_epoch.append(t)

  if is_gif_episode:  # every `show_gif_every_episode` episodes, record an animation
    print("len frames:", len(frames))
    clip = make_anim(frames, fps=30, true_image=True).rotate(-90)
    display(clip.ipython_display(fps=60, autoplay=0, loop=0))

Episode 0 finished after 62.000000 time steps
cumulated reward: -5.000000
exploring rate 0.500000
learning rate 0.500000
len frames: 63


 99%|█████████▉| 126/127 [00:00<00:00, 192.99it/s]


Episode 1000 finished after 62.000000 time steps
cumulated reward: -5.000000
exploring rate 0.500000
learning rate 0.500000
Episode 2000 finished after 62.000000 time steps
cumulated reward: -5.000000
exploring rate 0.500000
learning rate 0.500000
Episode 3000 finished after 62.000000 time steps
cumulated reward: -5.000000
exploring rate 0.366032
learning rate 0.366032
Episode 4000 finished after 98.000000 time steps
cumulated reward: -4.000000
exploring rate 0.261834
learning rate 0.261834


 12%|█▏        | 25/207 [00:00<00:00, 248.27it/s]

Episode 5000 finished after 102.000000 time steps
cumulated reward: -3.000000
exploring rate 0.187298
learning rate 0.187298
len frames: 103


100%|█████████▉| 206/207 [00:00<00:00, 296.78it/s]


Episode 6000 finished after 39.000000 time steps
cumulated reward: -5.000000
exploring rate 0.133980
learning rate 0.133980
Episode 7000 finished after 62.000000 time steps
cumulated reward: -5.000000
exploring rate 0.095840
learning rate 0.095840
Episode 8000 finished after 98.000000 time steps
cumulated reward: -4.000000
exploring rate 0.068557
learning rate 0.068557
Episode 9000 finished after 78.000000 time steps
cumulated reward: -4.000000
exploring rate 0.049041
learning rate 0.050000


 17%|█▋        | 22/127 [00:00<00:00, 217.00it/s]

Episode 10000 finished after 62.000000 time steps
cumulated reward: -5.000000
exploring rate 0.035080
learning rate 0.050000
len frames: 63


 99%|█████████▉| 126/127 [00:00<00:00, 231.87it/s]


Episode 11000 finished after 98.000000 time steps
cumulated reward: -4.000000
exploring rate 0.025094
learning rate 0.050000


In [None]:
# plot lifetime against training episodes
import matplotlib.pyplot as plt
# Metrics were logged once every `print_every_episode` episodes (the greedy
# evaluation episodes), so scale the x-axis back to real episode numbers.
logged_episodes = range(0,
                        len(lifetime_per_epoch) * print_every_episode,
                        print_every_episode)
fig, ax1 = plt.subplots(figsize=(20, 5))
ax1.plot(logged_episodes, lifetime_per_epoch)
ax1.set_xlabel("episode")
ax1.set_ylabel("lifetime (time steps)")
ax1.set_title("Lifetime of greedy evaluation episodes during training")
fig.tight_layout()
plt.show()

In [None]:
# plot reward against training episodes
import matplotlib.pyplot as plt
# Metrics were logged once every `print_every_episode` episodes (the greedy
# evaluation episodes), so scale the x-axis back to real episode numbers.
logged_episodes = range(0,
                        len(reward_per_epoch) * print_every_episode,
                        print_every_episode)
fig, ax1 = plt.subplots(figsize=(20, 5))
ax1.plot(logged_episodes, reward_per_epoch)
ax1.set_xlabel("episode")
ax1.set_ylabel("cumulative reward")
ax1.set_title("Reward of greedy evaluation episodes during training")
fig.tight_layout()
plt.show()

### Q-Learning vs. SARSA

SARSA is an on-policy algorithm (it follows the policy that it is learning), whereas Q-learning is an off-policy algorithm (it can follow any behavior policy that fulfills some convergence requirements).

In SARSA, we use the same policy (e.g., epsilon-greedy) that generated the previous action A(t) to generate the next action A(t+1), whose Q-value we then use in the update. Intuitively, SARSA is on-policy because the same policy generates both the current action A(t) and the next action A(t+1). We then evaluate our policy’s action selection and improve upon it by improving the Q-function estimates.

相對而言，Q-learning 比較大膽且勇敢，對錯誤並不那麼在乎。而 SARSA 則是一種保守的算法，相對比較在乎每一步決策，對於錯誤比較敏感。