In [30]:
import numpy as np
import tensorflow as tf
import collections
import statistics
import tqdm

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

from fourInRowGame import Chip, FourInRowGame
from randomAgent import randomFiarAgent

In [31]:
class ActorCritic(tf.keras.Model):
  """Combined actor-critic network."""

  def __init__(
      self,
      num_actions: int,
      num_hidden_units: int):
    """Initialize."""
    super().__init__()

    self.common = layers.Dense(num_hidden_units, activation="relu")
    self.actor = layers.Dense(num_actions)
    self.critic = layers.Dense(1)

  def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    x = self.common(inputs)
    return self.actor(x), self.critic(x)

In [32]:
game_cols = 6
game_rows = 5

num_actions = game_cols
num_hidden_units = 128

eps = np.finfo(np.float32).eps.item()

model = ActorCritic(num_actions, num_hidden_units)
env = FourInRowGame(game_rows, game_cols)
randomAgent = randomFiarAgent(env)
env.print()


- - - - - - 
- - - - - - 
- - - - - - 
- - - - - - 
- - - - - - 

In [33]:
randomAgent.select_col()

0

In [34]:
def env_step(action):
    # invalid move
    if env.column_height(action) == env.rows:
        return np.array(env.get_simple_slots_negative()).reshape(-1), -100, True
    # rf agent victory
    agent_victory = env.play_game(Chip.YELLOW, action)
    if agent_victory:
        return np.array(env.get_simple_slots_negative()).reshape(-1), 100, True
    # random agent victory
    rand_col = randomAgent.select_col()
    random_victory = env.play_game(Chip.RED, rand_col)
    if random_victory:
        return np.array(env.get_simple_slots_negative()).reshape(-1), -50, True
    # no victory
    return np.array(env.get_simple_slots_negative()).reshape(-1), -1, False

In [35]:
print(env_step(0))
env.print()

(array([ 1,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), -1, False)

- - - - - - 
- - - - - - 
- - - - - - 
- - - - - - 
Y - - - R - 

In [36]:
@tf.numpy_function(Tout=[tf.int32, tf.int32, tf.int32])
def env_step_optimized(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """Returns state, reward and done flag given an action."""

  state, reward, done = env_step(action)
  return (state.astype(np.int32),
          np.array(reward, np.int32),
          np.array(done, np.int32))


In [37]:
def run_episode(
    initial_state: tf.Tensor,
    model: tf.keras.Model,
    max_steps: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
  """Runs a single episode to collect training data."""

  action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  rewards = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)

  initial_state_shape = initial_state.shape
  state = initial_state

  for t in tf.range(max_steps):
    # Convert state into a batched tensor (batch size = 1)
    state = tf.expand_dims(state, 0)

    # Run the model and to get action probabilities and critic value
    action_logits_t, value = model(state)

    # Sample next action from the action probability distribution
    action = tf.random.categorical(action_logits_t, 1)[0, 0]
    action_probs_t = tf.nn.softmax(action_logits_t)

    # Store critic values
    values = values.write(t, tf.squeeze(value))

    # Store log probability of the action chosen
    action_probs = action_probs.write(t, action_probs_t[0, action])

    # Apply action to the environment to get next state and reward
    state, reward, done = env_step_optimized(action)
    state.set_shape(initial_state_shape)

    # Store reward
    rewards = rewards.write(t, reward)

    if tf.cast(done, tf.bool):
      break

  action_probs = action_probs.stack()
  values = values.stack()
  rewards = rewards.stack()

  return action_probs, values, rewards



In [38]:
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.01)

def get_expected_return(
    rewards: tf.Tensor,
    gamma: float,
    standardize: bool = True) -> tf.Tensor:
  """Compute expected returns per timestep."""

  n = tf.shape(rewards)[0]
  returns = tf.TensorArray(dtype=tf.float32, size=n)

  # Start from the end of `rewards` and accumulate reward sums
  # into the `returns` array
  rewards = tf.cast(rewards[::-1], dtype=tf.float32)
  discounted_sum = tf.constant(0.0)
  discounted_sum_shape = discounted_sum.shape
  for i in tf.range(n):
    reward = rewards[i]
    discounted_sum = reward + gamma * discounted_sum
    discounted_sum.set_shape(discounted_sum_shape)
    returns = returns.write(i, discounted_sum)
  returns = returns.stack()[::-1]

  if standardize:
    returns = ((returns - tf.math.reduce_mean(returns)) /
               (tf.math.reduce_std(returns) + eps))

  return returns

huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

def compute_loss(
    action_probs: tf.Tensor,
    values: tf.Tensor,
    returns: tf.Tensor) -> tf.Tensor:
  """Computes the combined Actor-Critic loss."""

  advantage = returns - values

  action_log_probs = tf.math.log(action_probs)
  actor_loss = -tf.math.reduce_sum(action_log_probs * advantage)

  critic_loss = huber_loss(values, returns)

  return actor_loss + critic_loss

@tf.function
def train_step(
    initial_state: tf.Tensor,
    model: tf.keras.Model,
    optimizer: tf.keras.optimizers.Optimizer,
    gamma: float,
    max_steps_per_episode: int) -> tf.Tensor:
  """Runs a model training step."""

  with tf.GradientTape() as tape:

    # Run the model for one episode to collect training data
    action_probs, values, rewards = run_episode(
        initial_state, model, max_steps_per_episode)

    # Calculate the expected returns
    returns = get_expected_return(rewards, gamma)

    # Convert training data to appropriate TF tensor shapes
    action_probs, values, returns = [
        tf.expand_dims(x, 1) for x in [action_probs, values, returns]]

    # Calculate the loss values to update our network
    loss = compute_loss(action_probs, values, returns)

  # Compute the gradients from the loss
  grads = tape.gradient(loss, model.trainable_variables)

  # Apply the gradients to the model's parameters
  optimizer.apply_gradients(zip(grads, model.trainable_variables))

  episode_reward = tf.math.reduce_sum(rewards)

  return episode_reward


In [39]:

min_episodes_criterion = 100
max_episodes = 10000
max_steps_per_episode = int(game_cols*game_rows/2)

reward_threshold = 500
running_reward = 0

# The discount factor for future rewards
gamma = 0.99

# Keep the last episodes reward
episodes_reward: collections.deque = collections.deque(maxlen=min_episodes_criterion)

t = tqdm.trange(max_episodes)
for i in t:
    env.reset()
    initial_state = np.array(env.get_simple_slots_negative()).reshape(-1)
    initial_state = tf.constant(initial_state, dtype=tf.int32)
    episode_reward = int(train_step(
        initial_state, model, optimizer, gamma, max_steps_per_episode))

    episodes_reward.append(episode_reward)
    running_reward = statistics.mean(episodes_reward)


    t.set_postfix(
        episode_reward=episode_reward, running_reward=running_reward)

    # Show the average episode reward every 10 episodes
    if i % 10 == 0:
      pass # print(f'Episode {i}: average reward: {avg_reward}')

    if running_reward > reward_threshold and i >= min_episodes_criterion:
        break

print(f'\nSolved at episode {i}: average reward: {running_reward:.2f}!')


100%|██████████| 10000/10000 [00:16<00:00, 612.08it/s, episode_reward=-103, running_reward=22.4]


Solved at episode 9999: average reward: 22.35!





In [50]:






# Sample next action from the action probability distribution


print(action.numpy())

4


In [61]:
env.reset()
def play_against_model():
    start_state = np.array(env.get_simple_slots_negative()).reshape(-1)
    format_state = tf.expand_dims(start_state, 0)
    action_logits_t, value = model(format_state)
    model_action = tf.random.categorical(action_logits_t, 1)[0, 0]
    action_probs_t = tf.nn.softmax(action_logits_t)
    env.play_game(Chip.YELLOW, model_action)
    print(action_probs_t)

In [78]:
play_against_model()
env.print()

ValueError: Column 0 is already filled

In [77]:
env.play_game(Chip.RED,5)
env.print()


Y - - - Y - 
Y - - - Y R 
Y - - - Y R 
R - R - R R 
Y - R - Y R 