# Model based Racing Kings Reinforcement Learning

In [1]:
!pip install numpy
!pip install tensorflow==2.3.0
!pip install keras
!pip install keras-rl2
!pip install chess



In [2]:
import sys
import numpy as np
import tensorflow as tf
import datetime
from statistics import mean
from racing_kings_env import RacingKingsEnvironment

2021-07-27 16:36:31.520591: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


## Testing the Environment

In [3]:
# Smoke test: drive the environment with uniformly sampled actions for a few
# episodes and report the accumulated reward plus the last `info` dict.
episodes = 10
env = RacingKingsEnvironment()
for episode in range(1, episodes + 1):
    state, score, done = env.reset(), 0, False

    # Step until the environment signals the end of the episode.
    while not done:
        env.render(mode=None)
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward
    print('Episode:{} Score:{} Info:{}'.format(episode, score, info))
env.close()

Episode:1 Score:-1 Info:{'msg': 'Action is not a valid move'}
Episode:2 Score:-1 Info:{'msg': 'Action is not a valid move'}
Episode:3 Score:-1 Info:{'msg': 'Action is not a valid move'}
Episode:4 Score:-1 Info:{'msg': 'Action is not a valid move'}
Episode:5 Score:-1 Info:{'msg': 'Action is not a valid move'}
Episode:6 Score:-1 Info:{'msg': 'Action is not a valid move'}
Episode:7 Score:-1 Info:{'msg': 'Action is not a valid move'}
Episode:8 Score:-1 Info:{'msg': 'Action is not a valid move'}
Episode:9 Score:-1 Info:{'msg': 'Action is not a valid move'}
Episode:10 Score:-1 Info:{'msg': 'Action is not a valid move'}
closing


## Building the RL-Model

In [4]:
class CustomModel(tf.keras.Model):
    """Q-value network: a stack of 3x3 Conv2D layers followed by a linear Dense head.

    Args:
        shape_states: Input shape handed to the Keras ``InputLayer``.
        hidden_layers_template: Iterable of filter counts; one ReLU ``Conv2D``
            layer with a 3x3 kernel is created per entry, in order.
        shape_actions: Number of units of the linear output layer.
    """

    def __init__(self, shape_states, hidden_layers_template, shape_actions):
        super().__init__()
        self.input_layer = tf.keras.layers.InputLayer(input_shape=shape_states)
        self.hidden_layers = [
            tf.keras.layers.Conv2D(
                filters,
                kernel_size=(3, 3),
                activation='relu',
                kernel_initializer='RandomNormal',
            )
            for filters in hidden_layers_template
        ]
        self.flatten_layer = tf.keras.layers.Flatten()
        self.output_layer = tf.keras.layers.Dense(
            shape_actions,
            activation='linear',
            kernel_initializer='RandomNormal',
        )

    @tf.function
    def call(self, inputs):
        """Run the forward pass and return the output of the Dense head."""
        features = self.input_layer(inputs)
        for conv in self.hidden_layers:
            features = conv(features)
        return self.output_layer(self.flatten_layer(features))

In [5]:
class DQN:
    """Deep Q-Network agent: a ``CustomModel``, an Adam optimizer and a replay buffer.

    The replay buffer is a dict of parallel lists keyed by 's' (state),
    'a' (action), 'r' (reward), 's2' (next state) and 'done'.
    """

    def __init__(self, shape_states, shape_actions, hidden_layers_template, gamma, max_experiences, min_experiences, batch_size, lr):
        """Store the hyperparameters and build the network, optimizer and empty buffer.

        Args:
            shape_states: Shape of a single state; forwarded to ``CustomModel``.
            shape_actions: Number of actions (size of the network output).
            hidden_layers_template: Forwarded to ``CustomModel``.
            gamma: Discount applied to the target network's next-state value.
            max_experiences: Buffer capacity; the oldest entry is dropped beyond it.
            min_experiences: Buffer size below which ``train`` does nothing.
            batch_size: Number of transitions sampled per training step.
            lr: Learning rate passed to the Adam optimizer.
        """
        self.shape_actions = shape_actions
        self.shape_states = shape_states
        self.batch_size = batch_size
        self.optimizer = tf.optimizers.Adam(lr)
        self.gamma = gamma
        self.model = CustomModel(shape_states, hidden_layers_template, shape_actions)
        self.experience = {key: [] for key in ('s', 'a', 'r', 's2', 'done')}
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences

    def predict(self, inputs):
        """Return the model output for ``inputs`` cast to float32.

        An input whose shape equals ``shape_states`` is treated as a single
        state and gets a leading batch axis; anything else is passed through.
        """
        is_single_state = inputs.shape == self.shape_states
        batch = np.expand_dims(inputs, axis=0) if is_single_state else inputs
        return self.model(batch.astype('float32'))

    def train(self, TargetNet):
        """Run one gradient step on a random minibatch from the replay buffer.

        Args:
            TargetNet: Agent whose ``predict`` supplies the next-state values.

        Returns:
            The int ``0`` while the buffer holds fewer than ``min_experiences``
            transitions, otherwise the mean squared error loss tensor.
        """
        buffer_size = len(self.experience['s'])
        if buffer_size < self.min_experiences:
            return 0

        # Indices are drawn with replacement from the whole buffer.
        sample_ids = np.random.randint(low=0, high=buffer_size, size=self.batch_size)

        def sample(key):
            # Gather the sampled entries of one buffer column into an array.
            return np.asarray([self.experience[key][i] for i in sample_ids])

        states = sample('s')
        actions = sample('a')
        rewards = sample('r')
        states_next = sample('s2')
        dones = sample('done')

        # Target: the plain reward for terminal transitions, otherwise the
        # reward plus the discounted maximum next-state value from TargetNet.
        value_next = np.max(TargetNet.predict(states_next), axis=1)
        bootstrapped = rewards + self.gamma * value_next
        actual_values = np.where(dones, rewards, bootstrapped)

        with tf.GradientTape() as tape:
            q_values = self.predict(states)
            # The one-hot mask keeps only the value of the action actually taken.
            action_mask = tf.one_hot(actions, self.shape_actions)
            selected_action_values = tf.math.reduce_sum(q_values * action_mask, axis=1)
            td_error = actual_values - selected_action_values
            loss = tf.math.reduce_mean(tf.square(td_error))
        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        return loss

    def get_action(self, states, epsilon):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned, otherwise the argmax of the predicted values.
        """
        explore = np.random.random() < epsilon
        if explore:
            return np.random.choice(self.shape_actions)
        return np.argmax(self.predict(states))

    def add_experience(self, exp):
        """Append one transition dict to the buffer, evicting the oldest when full."""
        if len(self.experience['s']) >= self.max_experiences:
            for column in self.experience.values():
                del column[0]
        for key, value in exp.items():
            self.experience[key].append(value)

    def copy_weights(self, TrainNet):
        """Overwrite this agent's trainable variables with those of ``TrainNet``."""
        pairs = zip(self.model.trainable_variables, TrainNet.model.trainable_variables)
        for own_var, source_var in pairs:
            own_var.assign(source_var.numpy())

In [6]:
def play_racing_kings(env, TrainNet, TargetNet, epsilon, copy_step):
    """Play one episode, training TrainNet after every step.

    Each transition is stored in TrainNet's replay buffer and followed by one
    call to ``TrainNet.train(TargetNet)``. TargetNet receives TrainNet's
    weights once every ``copy_step`` environment steps. That count is kept on
    ``TargetNet.steps_since_sync`` so it carries over between calls; a counter
    that restarts with every episode would never reach ``copy_step`` when
    episodes are shorter than ``copy_step`` steps, and TargetNet would never
    be updated.

    Args:
        env: Environment providing ``reset()`` and ``step(action)``; ``step``
            must return ``(observation, reward, done, info)``.
        TrainNet: Agent providing ``get_action``, ``add_experience`` and ``train``.
        TargetNet: Agent providing ``copy_weights``; passed to ``TrainNet.train``.
        epsilon: Exploration rate forwarded to ``TrainNet.get_action``.
        copy_step: Number of environment steps between two weight copies.

    Returns:
        Tuple ``(total_reward, mean_loss, steps)`` for the episode.
    """
    rewards = 0
    steps = 0  # named `steps` so the builtin `iter` is not shadowed
    losses = []
    done = False
    observations = env.reset()
    while not done:
        action = TrainNet.get_action(observations, epsilon)
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        rewards += reward
        if done:
            env.reset()

        exp = {'s': prev_observations, 'a': action, 'r': reward, 's2': observations, 'done': done}
        TrainNet.add_experience(exp)
        loss = TrainNet.train(TargetNet)
        # train() returns the plain int 0 while the replay buffer is still
        # warming up and a tensor afterwards.
        losses.append(loss if isinstance(loss, int) else loss.numpy())
        steps += 1

        # Sync the target network every `copy_step` steps, counted across
        # episodes rather than from the start of the current one.
        steps_since_sync = getattr(TargetNet, 'steps_since_sync', 0) + 1
        if steps_since_sync >= copy_step:
            TargetNet.copy_weights(TrainNet)
            steps_since_sync = 0
        TargetNet.steps_since_sync = steps_since_sync
    return rewards, mean(losses), steps

In [7]:
# --- Environment and hyperparameters ---------------------------------------
env = RacingKingsEnvironment()
gamma = 0.99               # discount factor handed to both DQN agents
copy_step = 5              # target-network sync interval for play_racing_kings
hidden_units = [256, 256]  # handed to DQN as hidden_layers_template
max_experiences = 10000    # replay buffer capacity
min_experiences = 100      # buffer size required before training starts
batch_size = 32
lr = 1e-2

# --- TensorBoard logging -----------------------------------------------------
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = 'Training/Logs/CustomDQN-' + current_time + '-min_epsilon-025-decay-1e-5_no-neg-reward'
summary_writer = tf.summary.create_file_writer(log_dir)

# --- Online and target networks ----------------------------------------------
TrainNet = DQN(env.state_shape, env.action_shape, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
TargetNet = DQN(env.state_shape, env.action_shape, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)

# --- Training schedule -------------------------------------------------------
N = 500000             # number of episodes to play
StorageInterval = 500  # episodes between two log entries
total_episode_rewards = np.empty(StorageInterval)
total_episode_lengths = np.empty(StorageInterval)
total_episode_losses = np.empty(StorageInterval)
# Exploration starts at 99 % random actions ...
epsilon = 0.99
# ... is multiplied by `decay` once per episode ...
decay = 1 - 1e-5
# ... and never drops below 25 % random actions.
min_epsilon = 0.25
for n in range(N):
    epsilon = max(epsilon * decay, min_epsilon)

    reward, loss, step = play_racing_kings(env, TrainNet, TargetNet, epsilon, copy_step)
    slot = n % StorageInterval
    total_episode_rewards[slot] = reward
    total_episode_lengths[slot] = step
    total_episode_losses[slot] = loss

    # Report only on every StorageInterval-th episode, and never on episode 0.
    if slot != 0 or n == 0:
        continue

    avg_episode_rewards = np.mean(total_episode_rewards)
    avg_episode_length = np.mean(total_episode_lengths)
    avg_episode_losses = np.mean(total_episode_losses)
    with summary_writer.as_default():
        tf.summary.scalar('mean_ep_length', avg_episode_length, step=n)
        tf.summary.scalar('mean_reward', avg_episode_rewards, step=n)
        tf.summary.scalar('loss', avg_episode_losses, step=n)
        tf.summary.scalar('epsilon', epsilon, step=n)
    print('episode:{} epsilon:{:.3} mean_reward:{:.3} mean_ep_length:{:.3} loss:{:.3}'
            .format(n, float(epsilon), float(avg_episode_rewards), float(avg_episode_length), float(avg_episode_losses)))
    # Start the next reporting window with fresh buffers.
    total_episode_rewards = np.empty(StorageInterval)
    total_episode_lengths = np.empty(StorageInterval)
    total_episode_losses = np.empty(StorageInterval)
env.close()

2021-07-27 16:36:33.292065: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2021-07-27 16:36:33.331441: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-07-27 16:36:33.331869: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1060 6GB computeCapability: 6.1
coreClock: 1.7085GHz coreCount: 10 deviceMemorySize: 5.93GiB deviceMemoryBandwidth: 178.99GiB/s
2021-07-27 16:36:33.331891: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-07-27 16:36:33.333440: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-07-27 16:36:33.334740: I tensorflow/stream_executo

episode:500 epsilon:0.985 mean_reward:-1.0 mean_ep_length:1.01 loss:0.091
episode:1000 epsilon:0.98 mean_reward:-1.0 mean_ep_length:1.01 loss:0.043
episode:1500 epsilon:0.975 mean_reward:-1.0 mean_ep_length:1.02 loss:0.0416
episode:2000 epsilon:0.97 mean_reward:-1.0 mean_ep_length:1.01 loss:0.0357
episode:2500 epsilon:0.966 mean_reward:-1.0 mean_ep_length:1.02 loss:0.0355
episode:3000 epsilon:0.961 mean_reward:-1.0 mean_ep_length:1.01 loss:0.0391
episode:3500 epsilon:0.956 mean_reward:-1.0 mean_ep_length:1.02 loss:0.0325
episode:4000 epsilon:0.951 mean_reward:-1.0 mean_ep_length:1.02 loss:0.0322
episode:4500 epsilon:0.946 mean_reward:-0.999 mean_ep_length:1.02 loss:0.0302
episode:5000 epsilon:0.942 mean_reward:-1.0 mean_ep_length:1.01 loss:0.0311
episode:5500 epsilon:0.937 mean_reward:-1.0 mean_ep_length:1.01 loss:0.0321
episode:6000 epsilon:0.932 mean_reward:-1.0 mean_ep_length:1.01 loss:0.0294
episode:6500 epsilon:0.928 mean_reward:-1.0 mean_ep_length:1.0 loss:0.0273
episode:7000 eps

In [17]:
# Watch the trained agent play: every move is chosen with epsilon 0, so
# get_action always takes the argmax of the predicted values.
episodes = 10
for episode in range(1, episodes+1):
    # Instead of env.reset() (kept below for reference) the board is reset
    # directly and the first observation is taken from a step with action None.
    # NOTE(review): presumably env.step(None) returns the current observation
    # without making a move - confirm against RacingKingsEnvironment.
    # obs = env.reset()
    env.board.reset()
    obs, _, _, _ = env.step(None)
    done = False
    score = 0

    while not done:
        env.render(mode="human")
        action = TrainNet.get_action(obs, 0)
        # Print the chosen action index translated by env.action_index_to_uci.
        print(">>{}".format(env.action_index_to_uci(action)))
        obs, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{} Info:{}'.format(episode, score, info))

>>a5f4
Episode:10 Score:-1 Info:{'msg': 'Action is not a valid move'}


## Save and Reload the Model Weights

In [9]:
# Persist the weights of both networks, replacing any existing files.
# NOTE(review): the paths end in '.h5f', not '.h5'; the CheckpointLoadStatus
# shown after the reload cell suggests the files are written in TensorFlow
# checkpoint format rather than HDF5 - confirm this is intended.
TrainNet.model.save_weights('Training/SavedModels/train_100k_net-025-1e-5_no-neg-reward.h5f', overwrite=True)
TargetNet.model.save_weights('Training/SavedModels/target_100k_net-025-1e-5_no-neg-reward.h5f', overwrite=True)

Delete the current models

In [14]:
# Remove the `model` attribute from both agents; the DQN objects themselves
# (including their replay buffers and optimizers) stay alive.
del TrainNet.model
del TargetNet.model



Reload the model weights from disk

In [15]:
# Rebuild both networks and restore their saved weights.
# The architecture comes from `hidden_units` (the same list the agents were
# created with in the training cell) instead of a second hard-coded copy, so
# the rebuilt models cannot drift away from the trained ones.
TrainNet.model = CustomModel(TrainNet.shape_states, hidden_units, TrainNet.shape_actions)
TrainNet.model.load_weights('Training/SavedModels/train_100k_net-025-1e-5_no-neg-reward.h5f')
TargetNet.model = CustomModel(TargetNet.shape_states, hidden_units, TargetNet.shape_actions)
TargetNet.model.load_weights('Training/SavedModels/target_100k_net-025-1e-5_no-neg-reward.h5f')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f9ea02fb430>