# Setup

In [1]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from keras import layers, Input

import numpy as np
import tensorflow as tf
from connect4 import Connect4

# Configuration parameters for the whole setup
# NOTE(review): seed, batch_size, max_steps_per_episode and max_episodes are
# not referenced anywhere in the visible code — confirm whether they are used.
seed = 42
gamma = 0.99  # Discount factor applied to the estimated future reward
epsilon = 1.0  # Epsilon greedy parameter
epsilon_min = 0.1  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
epsilon_interval = (
    epsilon_max - epsilon_min
)  # Total range over which epsilon is annealed (divided into decay steps later)
batch_size = 32  # Size of batch taken from replay buffer
max_steps_per_episode = 10000
max_episodes = 10  # Limit training episodes, will run until solved if smaller than 1

# Game environment shared by the whole notebook.
connect_4 = Connect4()

: 

# Deep Q-Network

In [None]:
# Size of the discrete action space (presumably one action per board column).
num_actions = 7
# Every valid action index, derived from num_actions so the two cannot drift.
possible_actions = list(range(num_actions))

def create_q_model():
    """Build a fresh, untrained Q-network.

    The network takes a flat vector of 43 features, passes it through one
    512-unit ReLU hidden layer and emits one linear output per action
    (``num_actions`` values in total).

    Returns:
        A new ``keras.Sequential`` model.
    """
    q_network = keras.Sequential()
    q_network.add(layers.Input(shape=(43,)))
    q_network.add(layers.Dense(512, activation="relu"))
    q_network.add(layers.Dense(num_actions, activation="linear"))
    return q_network


# The first (online) model makes the predictions for Q-values which are used
# to choose an action.
model = create_q_model()
# Build a target model for the prediction of future rewards.
# The training loop overwrites its weights with the online model's weights
# every `update_network` games, so the target Q-values stay stable while the
# loss between the Q-values is calculated.
# NOTE(review): the target model is created separately and is not synced with
# `model` until the first periodic update — confirm this is intended.
model_target = create_q_model()

# Train

In [None]:
# Optimizer used for the gradient step in the training loop.
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

num_games = 1  # Game counter; the training loop increments it once per game
max_games = 1000000  # The training loop runs while num_games < max_games
trainging_game_batch_size = 100  # Number of stored games that triggers a training step
reward_punishment_const = 100  # Amount added to / subtracted from each move's reward after a game

# Number of initial games in which every action is chosen at random
epsilon_random_games = 500
# Divisor of epsilon_interval that sets the size of one epsilon decay step.
# NOTE(review): the decay step is applied once per move, not once per game.
epsilon_greedy_games = 1000

# The target model is synced with the online model every this many games
update_network = 1000

# History of played games: each list holds one inner list per game, and each
# inner list holds one entry per move of that game.
games_states = []
games_next_states = []
games_actions = []
games_rewards = []
games_done = []

# Loss between the target Q-values and the predicted Q-values.
loss_function = keras.losses.Huber()

def transform_state(state):
    """Convert a raw environment state into a TensorFlow tensor.

    Args:
        state: Anything ``np.array`` accepts (e.g. a list of numbers).

    Returns:
        A tensor holding the same values as ``state``.
    """
    state_array = np.array(state)
    return tf.convert_to_tensor(state_array)

def flatten_list(lst):
    """Flatten one level of nesting.

    Args:
        lst: An iterable of iterables (e.g. a list of per-game lists).

    Returns:
        A new list with the items of every inner iterable, in order. Only the
        outermost level is flattened; deeper nesting is left untouched.
    """
    # Iterate directly instead of indexing so any iterable is accepted.
    return [item for inner in lst for item in inner]

# Self-play training loop: each iteration plays one complete game, records its
# transitions, and trains the online model once enough games are buffered.
while num_games < max_games:
  state = connect_4.reset()
  state = transform_state(state)

  done = False
  # Per-game transition buffers; index i in every list describes move i.
  game_states = []
  game_next_states = []
  game_actions = []
  game_rewards = []
  game_done = []

  while not done:
    # Act randomly during the warm-up games, and with probability epsilon
    # afterwards; otherwise take the action with the highest predicted Q-value.
    if num_games < epsilon_random_games or epsilon > np.random.rand(1)[0]:
      action = np.random.choice(num_actions)
    else:
      # The model expects a leading batch axis, so wrap the single state in a
      # batch of one before predicting its Q-values.
      action_probs = model(tf.expand_dims(state, 0), training=False)
      action = keras.ops.argmax(action_probs[0]).numpy()

    # Anneal epsilon after every move, never going below epsilon_min.
    epsilon -= epsilon_interval / epsilon_greedy_games
    epsilon = max(epsilon, epsilon_min)

    next_state, reward, done = connect_4.move(action)
    game_states.append(state)
    game_next_states.append(next_state)
    game_actions.append(action)
    game_rewards.append(reward)
    game_done.append(abs(done))

    next_state = transform_state(next_state)
    state = next_state

  # After the game, shift every move's reward by the constant: up when the
  # last entry of that move's state equals the final `done` value, down
  # otherwise.
  # presumably the last state entry is the player to move and `done` is the
  # winner's id — TODO confirm against Connect4.
  for i, played_state in enumerate(game_states):
    player = played_state[-1]
    if player == done:
      game_rewards[i] += reward_punishment_const
    else:
      game_rewards[i] -= reward_punishment_const

  games_states.append(game_states)
  games_next_states.append(game_next_states)
  games_actions.append(game_actions)
  games_rewards.append(game_rewards)
  games_done.append(game_done)

  # Train once a full batch of games is buffered. `>=` together with the
  # clear below makes this fire for every batch instead of only the first one.
  if len(games_states) >= trainging_game_batch_size:
    state_sample = np.array(flatten_list(games_states))
    next_state_sample = np.array(flatten_list(games_next_states))
    action_sample = np.array(flatten_list(games_actions))
    rewards_sample = np.array(flatten_list(games_rewards))
    done_sample = np.array(flatten_list(games_done))

    # Bootstrapped target: reward + gamma * max Q-value of the target model.
    future_reward = model_target.predict(next_state_sample)
    updated_q_values = rewards_sample + gamma * np.amax(
      future_reward, axis=1
    )
    # NOTE(review): every terminal transition gets the target
    # -reward_punishment_const, which discards the reward computed above even
    # for a game-winning move — confirm this is intended.
    updated_q_values = (
      updated_q_values * (1 - done_sample)
      - reward_punishment_const * done_sample
    )

    # One-hot mask so the loss only looks at the Q-value of the taken action.
    masks = tf.one_hot(action_sample, num_actions)

    with tf.GradientTape() as tape:
      q_values = model(state_sample)
      # Must be TensorFlow ops: NumPy calls are not recorded by the tape and
      # would leave the loss without any gradient path to the weights.
      q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
      loss = loss_function(updated_q_values, q_action)

    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Drop the consumed games so the next batch starts empty and the history
    # does not grow without bound.
    for history in (
      games_states,
      games_next_states,
      games_actions,
      games_rewards,
      games_done,
    ):
      history.clear()

  # Periodically copy the online weights into the target model.
  if num_games % update_network == 0:
    model_target.set_weights(model.get_weights())
    print("Updated Network")

  num_games += 1