In [1]:
import gym
# Build the ShootingAirplane environment registered under the `gym_examples`
# package; render_mode='text' is requested so env.render() draws the boards as text.
env = gym.make('gym_examples:gym_examples/ShootingAirplane-v0', render_mode='text')

In [2]:
# Begin a new episode: reset() hands back the initial observation and an info dict.
obs, info = env.reset()

In [3]:
env.observation_space

Box(0, 255, (8, 8, 1), uint8)

In [4]:
obs.shape

(8, 8, 1)

In [5]:
env.render()

         |         
         |         
         |         
         |    HHH  
         |     H   
         |   HHHHH 
         |     H   
         |         



In [6]:
# Take the action (0, 0). step() returns five values here; the third is kept as
# `done` and the fourth is discarded (presumably the truncation flag — confirm
# against the environment implementation).
obs, reward, done, _, info = env.step((0, 0))

  if not isinstance(terminated, (bool, np.bool8)):


In [7]:
env.render()

M        |         
         |         
         |         
         |    HHH  
         |     H   
         |   HHHHH 
         |     H   
         |         



In [8]:
# Take the action (4, 4); in the saved run the next render marks row 4, column 4
# of the left board with 'H'.
obs, reward, done, _, info = env.step((4, 4))

In [9]:
env.render()

M        |         
         |         
         |         
         |    HHH  
    H    |     H   
         |   HHHHH 
         |     H   
         |         



In [10]:
# Take the action (3, 3) and redraw the boards; in the saved run this cell is
# marked 'H' and the reward inspected in the next cell is 1.
obs, reward, done, _, info = env.step((3, 3))
env.render()

M        |         
         |         
         |         
   H     |    HHH  
    H    |     H   
         |   HHHHH 
         |     H   
         |         



In [11]:
reward

1

In [12]:
# Take the action (1, 3) and redraw the boards; in the saved run the mark lands on
# row 1, column 3 (so actions are (row, col)), is drawn as 'M', and the reward
# inspected in the next cell is -1.
obs, reward, done, _, info = env.step((1, 3))
env.render()

M        |         
   M     |         
         |         
   H     |    HHH  
    H    |     H   
         |   HHHHH 
         |     H   
         |         



In [13]:
reward

-1

# DQN

In [14]:
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as T

In [15]:
# Discount factor applied to the bootstrapped value of the next state.
gamma = 0.99

# Epsilon-greedy schedule: exploration starts at epsilon_max and is annealed
# down to epsilon_min; epsilon_interval is the total amount to anneal.
epsilon_min, epsilon_max = 0.1, 1.0
epsilon = 1.0
epsilon_interval = epsilon_max - epsilon_min

# Replay mini-batch size and the limits of a training run.
batch_size = 16
max_steps_per_episode = 60
max_episodes = 10000

In [16]:
env.observation_space

Box(0, 255, (8, 8, 1), uint8)

In [17]:
env.action_space

MultiDiscrete([8 8])

In [18]:
# One flat action index per cell of the 8 x 8 board (MultiDiscrete([8 8]) flattened).
num_actions = 8 * 8

In [19]:
class QModel(nn.Module):
  """Convolutional Q-network: one-hot board planes in, one Q-value per action out.

  Architecture: two 'same'-padded 3x3 convolutions, one unpadded 3x3
  convolution, then two fully connected layers. Dropout (p=0.3) is applied
  after the second convolution and after the first fully connected layer.

  Args:
    num_actions: size of the output layer (one Q-value per discrete action).
    in_channels: number of input planes. Defaults to 3, the number of one-hot
      classes produced by ``preprocess_state``.
    board_size: side length of the square board. Defaults to 8. Must be at
      least 3 because the last convolution is unpadded.

  Raises:
    ValueError: if ``board_size`` is smaller than 3.
  """

  def __init__(self, num_actions, in_channels=3, board_size=8):
    super().__init__()
    if board_size < 3:
      raise ValueError(f'board_size must be >= 3, got {board_size}')

    self.dropout = nn.Dropout(p=0.3)
    self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=3, stride=1, padding='same')
    self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding='same')
    self.conv3 = nn.Conv2d(32, 32, kernel_size=3, stride=1)
    self.flatten = nn.Flatten()

    # conv1/conv2 keep the spatial size; the unpadded 3x3 conv3 shrinks each
    # side by 2, so an 8x8 board yields 32 * 6 * 6 = 1152 features.
    flat_features = 32 * (board_size - 2) ** 2
    self.fc1 = nn.Linear(flat_features, 512)
    self.fc2 = nn.Linear(512, num_actions)

  def forward(self, x):
    """Return Q-values of shape (batch, num_actions) for a (batch, C, H, W) input."""
    x = nn.functional.relu(self.conv1(x))
    x = nn.functional.relu(self.conv2(x))
    x = self.dropout(x)
    x = nn.functional.relu(self.conv3(x))
    x = self.flatten(x)
    x = nn.functional.relu(self.fc1(x))
    x = self.dropout(x)
    return self.fc2(x)

In [20]:
# Online Q-network: the one the optimizer updates and the one used to pick actions.
model = QModel(num_actions)

In [21]:
# Target Q-network used only to compute bootstrapped TD targets.
model_target = QModel(num_actions)
# Start as an exact copy of the online network; otherwise the targets come from
# an unrelated random initialisation until the first periodic sync.
model_target.load_state_dict(model.state_dict())
# The target network is never trained, so keep it in eval mode: this disables
# dropout, which would otherwise add noise to every TD target.
model_target.eval()

In [22]:
# Smooth L1 loss between predicted and target Q-values.
loss_function = nn.SmoothL1Loss()
# Adam over the online network's parameters only; the target network is not optimised.
optimizer = torch.optim.Adam(params=model.parameters(), lr=2.5e-4)

In [23]:
# Replay memory kept as six parallel lists: the entries at index i across all
# of them together describe one stored transition.
(action_history,
 action_mask_history,
 state_history,
 state_next_history,
 rewards_history,
 done_history) = ([] for _ in range(6))

In [24]:
# Training progress bookkeeping.
episode_reward_history = []       # total reward of recent episodes
running_reward = 0.0              # mean of episode_reward_history
episode_count = frame_count = 0   # episodes finished / environment steps taken

In [25]:
# Frame budget for purely random actions (not currently consulted by get_greedy_epsilon)
epsilon_random_frames = 50_000
# Number of frames for exploration: epsilon is annealed over this many frames
epsilon_greedy_frames = 200_000.0
# Maximum replay length
# Note: The DeepMind paper suggests 1000000, however this causes memory issues
max_memory_length = 500_000
# Train the model after 4 actions
update_after_actions = 4
# How often (in frames) to update the target network
update_target_network = 10_000

In [26]:
def preprocess_state(obs):
  """Turn a raw board observation into a channels-first one-hot float tensor.

  All size-1 axes of ``obs`` are dropped, every remaining cell value is
  one-hot encoded over 3 classes (so values must be 0, 1 or 2), and the class
  axis is moved to the front. An (8, 8, 1) uint8 board therefore becomes a
  float32 tensor of shape (3, 8, 8).

  Args:
    obs: numpy array holding the board, e.g. of shape (8, 8, 1).

  Returns:
    torch.float32 tensor with the one-hot class planes as its first axis.
  """
  cells = torch.from_numpy(obs).squeeze().long()
  planes = torch.nn.functional.one_hot(cells, num_classes=3)
  return planes.permute(2, 0, 1).float()

In [27]:
# Fresh episode whose first observation is used to try out preprocess_state below.
board, info = env.reset()

In [28]:
board.shape

(8, 8, 1)

In [29]:
# One-hot encode the freshly reset board and check the resulting tensor shape.
st = preprocess_state(board)
st.shape

torch.Size([3, 8, 8])

In [30]:
st

tensor([[[1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.]],

        [[0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0.],
       

In [31]:
def get_greedy_epsilon(model, state, mask):
  """Choose an action epsilon-greedily among the valid ones and decay epsilon.

  With probability ``epsilon`` a valid action is drawn uniformly at random;
  otherwise the valid action with the highest predicted Q-value is taken.
  Invalid actions are excluded by setting their Q-value to -inf, which is
  correct for any Q-value magnitude (adding a fixed bonus to valid actions
  breaks once Q-values differ by more than that bonus).

  The ``epsilon_random_frames`` warm-up is not applied. The model's
  train/eval mode is left untouched.

  Args:
    model: Q-network, called on a batch containing the single ``state``.
    state: state tensor without a batch axis.
    mask: 1-D array of length ``num_actions``; entries equal to 1 mark valid
      actions.

  Returns:
    The chosen flat action index as a plain Python ``int`` in both branches.

  Side effects:
    Decreases the global ``epsilon`` by
    ``epsilon_interval / epsilon_greedy_frames``, never below ``epsilon_min``.
  """
  global epsilon

  if np.random.rand(1)[0] < epsilon:
    # Explore: uniform choice among the currently valid actions.
    action = np.random.choice([i for i in range(num_actions) if mask[i] == 1])
  else:
    # Exploit: best valid action according to the current Q-network.
    with torch.no_grad():
      # add a batch axis, evaluate, then drop the batch axis again
      q_values = model(state.unsqueeze(0)).squeeze(0)
      valid = torch.as_tensor(mask) == 1
      action = torch.argmax(q_values.masked_fill(~valid, float('-inf')), dim=0)

  epsilon = max(epsilon - epsilon_interval / epsilon_greedy_frames, epsilon_min)
  return int(action)

In [32]:
def get_greedy_action(model, state, mask):
  """Return the valid action with the highest predicted Q-value.

  The model is evaluated in eval mode so that dropout does not randomise the
  "greedy" choice; its previous train/eval mode is restored afterwards.
  Invalid actions are excluded by setting their Q-value to -inf, which is
  correct for any Q-value magnitude.

  Args:
    model: ``nn.Module`` Q-network, called on a batch containing ``state``.
    state: state tensor without a batch axis.
    mask: 1-D array-like with one entry per action; entries equal to 1 mark
      valid actions.

  Returns:
    The chosen flat action index as a plain Python ``int``. If no action is
    marked valid, index 0 is returned (argmax over all -inf).
  """
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      # add a batch axis, evaluate, then drop the batch axis again
      q_values = model(state.unsqueeze(0)).squeeze(0)
  finally:
    # leave the caller's train/eval mode exactly as it was
    model.train(was_training)

  valid = torch.as_tensor(mask) == 1
  return int(torch.argmax(q_values.masked_fill(~valid, float('-inf')), dim=0))

In [33]:
def sample_batch(_batch_size):
  """Draw a random mini-batch of distinct transitions from the replay lists.

  Args:
    _batch_size: number of transitions to draw (without replacement).

  Returns:
    Tuple of numpy arrays, each with ``_batch_size`` leading entries:
    (states, next states, rewards as float32, actions,
    action masks, done flags as floats).
  """
  picked = np.random.choice(range(len(done_history)), size=_batch_size, replace=False)

  def _take(history):
    # entries of one replay list at the sampled positions, in sampled order
    return [history[k] for k in picked]

  states = np.array([s.squeeze(0).numpy() for s in _take(state_history)])
  next_states = np.array([s.squeeze(0).numpy() for s in _take(state_next_history)])
  rewards = np.array(_take(rewards_history), dtype=np.float32)
  actions = np.array(_take(action_history))

  # the stored mask describes the valid actions at the *next* state
  next_masks = np.array(_take(action_mask_history))
  dones = np.array([float(flag) for flag in _take(done_history)])
  return states, next_states, rewards, actions, next_masks, dones

In [34]:
def update_network():
  """Run one DQN gradient step on a mini-batch sampled from replay memory.

  The TD target is ``r + gamma * max_a' Q_target(s', a') * (1 - done)``, where
  the max ranges over the actions that are valid in the next state. Invalid
  actions are excluded by setting their Q-value to -inf, which is correct for
  any Q-value magnitude (a fixed +100/-100 offset breaks once Q-values differ
  by more than 100).

  Reads the module-level ``model``, ``model_target``, ``optimizer``,
  ``loss_function``, ``gamma`` and ``batch_size``; updates ``model`` in place.
  """
  state_sample, state_next_sample, rewards_sample, action_sample, action_mask_sample, done_sample = \
    sample_batch(batch_size)

  state_sample = torch.tensor(state_sample, dtype=torch.float32)
  state_next_sample = torch.tensor(state_next_sample, dtype=torch.float32)
  action_sample = torch.tensor(action_sample, dtype=torch.int64)
  # True where the action is valid in the next state
  next_valid = torch.tensor(action_mask_sample) == 1
  rewards_sample = torch.tensor(rewards_sample, dtype=torch.float32)
  done_sample = torch.tensor(done_sample, dtype=torch.float32)

  with torch.no_grad():
    # Off-policy target: evaluated with the periodically synced target network.
    future_rewards = model_target(state_next_sample)
    max_q_values = future_rewards.masked_fill(~next_valid, float('-inf')).max(dim=1).values
    # A next state without any valid action has nothing to bootstrap from; use 0
    # there so that -inf (or -inf * 0 = nan) never reaches the target.
    max_q_values = torch.where(
      next_valid.any(dim=1), max_q_values, torch.zeros_like(max_q_values))
    target_q_values = rewards_sample + gamma * max_q_values * (1. - done_sample)

  # Q-value the online network assigns to the action that was actually taken.
  q_values = model(state_sample)
  q_values_action = q_values.gather(dim=1, index=action_sample.unsqueeze(1)).squeeze(1)

  loss = loss_function(q_values_action, target_q_values)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()


In [35]:
# Main DQN training loop: play episodes epsilon-greedily, store every transition
# in the replay lists, and periodically train the online network and sync the
# target network.
for _ in range(max_episodes):
  state, info = env.reset()
  state = preprocess_state(state)
  # Copy the mask in case the environment reuses/mutates its own mask buffer
  # (not visible from here); stored masks must stay as they were at that step.
  action_mask = info['action_mask'].reshape((-1,)).copy()
  episode_reward = 0

  # Exactly max_steps_per_episode steps at most (range(1, n) was one short).
  for timestep in range(max_steps_per_episode):
    frame_count += 1

    # Normalise to a plain int so the replay memory holds one action type.
    action = int(get_greedy_epsilon(model, state, action_mask))

    # Flat index -> (row, col) on the 8-wide board.
    state_next, reward, done, truncated, info = env.step(divmod(action, 8))
    state_next = preprocess_state(state_next)
    # Mask of the actions valid in the *next* state (used for the TD target).
    action_mask = info['action_mask'].reshape((-1,)).copy()

    episode_reward += reward

    action_history.append(action)
    action_mask_history.append(action_mask)
    state_history.append(state)
    state_next_history.append(state_next)
    rewards_history.append(reward)
    # Only true termination is stored: it is what switches off bootstrapping.
    done_history.append(done)

    state = state_next

    if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
      update_network()

    if frame_count % update_target_network == 0:
      model_target.load_state_dict(model.state_dict())

    # Drop the oldest transition once the replay memory is over capacity.
    if len(rewards_history) > max_memory_length:
      for history in (rewards_history, state_history, state_next_history,
                      action_history, action_mask_history, done_history):
        del history[:1]

    # End the episode on termination and also when the environment truncates it.
    if done or truncated:
      break

  episode_count += 1
  episode_reward_history.append(episode_reward)

  # Running reward is the mean over (at most) the last 100 episodes.
  if len(episode_reward_history) > 100:
    del episode_reward_history[0]

  running_reward = np.mean(episode_reward_history)

  if episode_count % 10 == 0:
    print(f'Episode {episode_count} reward: {episode_reward} running reward: {running_reward} epsilon: {epsilon}')


Episode 10 reward: -36 running reward: -36.9 epsilon: 0.9974935000000207
Episode 20 reward: -39 running reward: -37.9 epsilon: 0.9949240000000419
Episode 30 reward: -41 running reward: -38.266666666666666 epsilon: 0.992359000000063
Episode 40 reward: -34 running reward: -38.075 epsilon: 0.9898525000000837
Episode 50 reward: -35 running reward: -38.4 epsilon: 0.9872470000001052
Episode 60 reward: -20 running reward: -37.78333333333333 epsilon: 0.9848305000001252
Episode 70 reward: -41 running reward: -37.857142857142854 epsilon: 0.9822520000001465
Episode 80 reward: -38 running reward: -37.9625 epsilon: 0.9796915000001676
Episode 90 reward: -29 running reward: -37.94444444444444 epsilon: 0.9771445000001886
Episode 100 reward: -39 running reward: -38.13 epsilon: 0.9745255000002102
Episode 110 reward: -33 running reward: -38.17 epsilon: 0.9719830000002312
Episode 120 reward: -43 running reward: -37.7 epsilon: 0.9695710000002511
Episode 130 reward: -38 running reward: -37.4 epsilon: 0.9670

KeyboardInterrupt: 

In [None]:
# Watch the trained network play one episode greedily, redrawing the boards
# once per second.
import sys
import time
from IPython.display import clear_output

board, info = env.reset()
state = preprocess_state(board)
action_mask = info['action_mask'].reshape((-1,))
done = False
env.render()
while not done:
  # Plain ints so the coordinates print as numbers rather than tensor(...).
  action = int(get_greedy_action(model, state, action_mask))
  row, col = divmod(action, 8)  # flat index -> (row, col) on the 8-wide board
  print("action: ({}, {})".format(row, col))
  sys.stdout.flush()
  time.sleep(1.0)
  clear_output(wait=False)
  board, reward, terminated, truncated, info = env.step((row, col))
  # Stop on truncation as well, otherwise a truncated episode would loop forever.
  done = terminated or truncated
  state = preprocess_state(board)
  action_mask = info['action_mask'].reshape((-1,))
  env.render()


    MH   |      H  
   H H   |    H H  
  MHHHH  |    HHHH 
   HMH   |    H H  
   MMH   |      H  
     MM  |         
         |         
         |         

