## Author: Ariel Guerrero
### Q-Learning Agent with ***Neural Network***


Implementation of a Q-Learning Agent with a ***neural network***, using an OpenAI Gym environment.


### Q Learning Algorithm

***Q(s<sub>t</sub>, a<sub>t</sub>) = R<sub>t+1</sub> + gamma*** * ***max(Q(s<sub>t+1</sub>))***

* s = state
* a = action
* R<sub>t+1</sub> = reward
* gamma = discount factor
* max(Q(s<sub>t+1</sub>)) = max Q value over all possible actions in state s<sub>t+1</sub>


### Imports

In [19]:
import gym
import time
import random
import numpy as np
import tensorflow as tf
import warnings
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense

from gym.envs.registration import register
from IPython.display import clear_output
from collections import deque

import pygame

warnings.filterwarnings('ignore')


#### Sometimes pygame doesn't close properly, ***run this cell to force close***

In [20]:
# if pygame is running, quit it
# `sys` is not imported by the imports cell, so bring it into scope here;
# without it the `sys.exit` reference below raises NameError.
import sys

if pygame.get_init():
    pygame.quit()
    # register sys.exit as a callback for pygame's next quit
    pygame.register_quit(sys.exit)



## Global Vars

In [21]:
# number of episodes run by the training cell and by the testing cell
train_episodes, test_episodes = 600, 200
# per-episode step cap used by both loops; batch_size is only referenced
# by the commented-out replay snippet at the end of the notebook
max_steps, batch_size = 200, 32

### Registering A Frozen Lake Environment with no slippery tiles

In [22]:
# frozen lake registration for non slippery lake
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78,  # optimum = .8196
    )
except gym.error.Error:
    # Re-running this cell re-registers the same id, which gym may reject;
    # that is harmless here. Any other exception (including
    # KeyboardInterrupt) is no longer swallowed.
    pass

### Functions to create a new environment and to display environment information

In [23]:
def new_env(env_name):
    """
        description: Build two independent instances of the same environment,
                     one for training and one for testing.
        @param env_name: id of the gym environment to instantiate
        @return: (train environment, test environment)
    """
    # the training instance is created first, then the testing instance
    return gym.make(env_name), gym.make(env_name)

def env_attributes(env):
    """
        description:    Prints the observation space and the action space
                        (or action range) of the environment
        @param env:      Gym environment
    """
    print("observation space: ", env.observation_space)
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of Discrete
    if isinstance(env.action_space, gym.spaces.discrete.Discrete):
        print("action space: ", env.action_space)
    else:
        # non-discrete spaces are reported by their low/high bounds
        print("action range: ",
              env.action_space.low, env.action_space.high)


### Defining the Agent and QNN-Agent classes

In [24]:
# Basic Agent
class Agent():
    """
        description:    Agent with discrete or continuous action space
                        that picks its actions at random
    """

    def __init__(self, env):
        """
            description: Record the shape of the action space and print
                         the environment attributes
            @param env: Gym environment
        """
        # is the agent discrete or continuous?
        # isinstance also accepts subclasses of Discrete
        self.is_discrete = isinstance(
            env.action_space, gym.spaces.discrete.Discrete)
        # if discrete, get the action size
        if self.is_discrete:
            self.action_size = env.action_space.n
        else:
            # continuous: keep the bounds and shape needed for sampling
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
        self.env = env
        env_attributes(env)

    def get_action(self):
        """
            description: Pick a random action
            @return: a sample from the action space when it is discrete,
                     otherwise an array drawn uniformly between the
                     action bounds
        """
        if self.is_discrete:
            return self.env.action_space.sample()
        return np.random.uniform(
            self.action_low,
            self.action_high,
            self.action_shape
        )

# Q Learning Agent with Neural Network
class QNNAgent(Agent):
    def __init__(self, env, discount_rate=0.90, learning_rate=0.01, epsilon=.99):
        """
            description: Q Learning Agent with Neural Network
            @param env: openai Gym environment
            @param discount_rate: how much future values lose weight based on how far they are
            @param learning_rate: rate at which an algorithm adjusts its estimates based on the new information
            @param epsilon: probability of prioritizing an exploratory action over a policy action (explore values vs acting greedy)
        """
        super().__init__(env)
        # action size is already defined in the parent Agent class
        # state size
        self.state_size = env.observation_space.n
        # learning rate (alpha), handed to the optimizer in build_model
        self.learning_rate = learning_rate
        # discount rate (gamma)
        self.discount_rate = discount_rate
        # epsilon, multiplied by epsilon_decay at the end of every episode
        self.epsilon = epsilon
        self.epsilon_decay = .99
        # memory
        self.memory = deque(maxlen=2500)
        # building the model
        self.build_model()

    def build_model(self):
        """
            description: Build and compile the network that maps a state
                         vector of length state_size to one value per action
        """
        self.model = keras.Sequential([
            keras.layers.Dense(16, input_dim=self.state_size, activation='relu'),
            keras.layers.Dense(self.action_size, activation='sigmoid')
        ])
        self.model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=self.learning_rate), metrics=['accuracy'])
        # summary() prints the table itself and returns None, so it is not
        # wrapped in print() (which would add a stray "None" line)
        self.model.summary()
        keras.utils.plot_model(self.model, show_layer_names=False)

    def print_model_bias(self):
        """
            description: Print the name, bias shape and bias values of every layer
        """
        for layer in self.model.layers:
            bias = layer.get_weights()[1]
            print(layer.name)
            print(f"shape: {bias.shape}")
            print(f"bias: {bias}")

    def print_model_weights(self):
        """
            description: Print the name, kernel shape and kernel values of every layer
        """
        for layer in self.model.layers:
            kernel = layer.get_weights()[0]
            print(layer.name)
            print(f"shape: {kernel.shape}")
            print(f"weights: {kernel}")

    def add_memory(self, new_state, reward, done, state, action):
        """
            description: Append one transition to the bounded memory deque
        """
        self.memory.append((new_state, reward, done, state, action))

    def get_action(self, state):
        """
            description: Epsilon-greedy action selection
            @param state: state vector accepted by the model
            @return: a random action with probability epsilon, otherwise
                     the greedy action for the state
        """
        if np.random.rand() < self.epsilon:
            return super().get_action()
        return self.predict(state)

    def _q_values(self, state):
        """
            description: Model output for a state, one value per action
            @param state: state vector accepted by the model
            @return: array returned by the model for that state
        """
        # verbose=0 keeps keras from printing a progress bar on every call
        return self.model.predict(state, verbose=0)

    def predict(self, state):
        """
            description: Greedy action for a state
            @param state: state vector accepted by the model
            @return: index of the action with the highest predicted value
        """
        return np.argmax(self._q_values(state))

    def train(self, experience):
        """
            description: Fit the model on a single transition using
                         target = reward + gamma * max(Q(s'))
            @param experience: tuple (new_state, reward, done, state, action)
        """
        new_state, reward, done, state, action = experience

        if done:
            # no future return after a terminal step
            target = reward
        else:
            # bootstrap from the best predicted VALUE of the next state;
            # predict() returns the index of the best action, which must
            # not be used as a value here
            target = reward + self.discount_rate * np.max(self._q_values(new_state))

        # start from the current estimates so that only the entry of the
        # action actually taken contributes to the loss
        q_target = self._q_values(state)
        q_target[0][action] = target

        self.model.fit(state, q_target, epochs=60, batch_size=5, verbose=0)

        if done:
            # explore a little less after every finished episode
            self.epsilon *= self.epsilon_decay



### Initializing the Agent and Environment

In [25]:
# Environment id to train on:
#   'FrozenLake-v1'       -> lake is slippery (hard)
#   'FrozenLakeNoSlip-v0' -> lake is not slippery (easy)
env_name = 'FrozenLakeNoSlip-v0'
# one environment for training, one for testing
env_train, env_test = new_env(env_name=env_name)

### Environment

![](img/frozen_lake_env.png)

In [26]:
# Q-learning agent (neural-network backed) bound to the training environment;
# discount rate, learning rate and epsilon keep their default values
agent = QNNAgent(env=env_train)

observation space:  Discrete(16)
action space:  Discrete(4)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 16)                272       
                                                                 
 dense_5 (Dense)             (None, 4)                 68        
                                                                 
Total params: 340
Trainable params: 340
Non-trainable params: 0
_________________________________________________________________
None
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


### Neural Network Model
![model](img/model.png)

### Driver Code

#### training the agent


In [27]:
total_reward = 0
# Rendering every training step takes forever, so it is off by default;
# set to True to watch the agent while it trains.
render_training = False

for episode in range(train_episodes):

    # one-hot encode the start state as a (1, state_size) row vector
    start_state = env_train.reset()
    state = np.eye(agent.state_size)[[start_state]]
    reward = step = 0
    done = False

    while not done and step < max_steps:
        action = agent.get_action(state)
        new_state, reward, done, info = env_train.step(action)
        # same one-hot encoding for the state that was just reached
        new_state = np.eye(agent.state_size)[[new_state]]
        agent.train(experience=(new_state, reward, done, state, action))
        state = new_state
        # accumulate every step's reward, not only the last one of the episode
        total_reward += reward
        if render_training:
            env_train.render()
        # time.sleep(.01)
        step += 1
    clear_output(wait=True)
    print(f'episode: {episode+1}/{train_episodes}, steps taken: {step}, total reward: {total_reward}, epsilon: {agent.epsilon}')
    # print(f'----------------------------- model bias --------------------------------')
    # agent.print_model_bias()
env_train.close()
print(f'Training score:  {100 * (total_reward / train_episodes)}')

episode: 95/600, steps taken: 15, total reward: 2.0, epsilon: 0.38104711810454983


KeyboardInterrupt: 

#### testing the agent

In [None]:
total_complete = 0
for episode in range(test_episodes):

    # one-hot encode the start state as a (1, state_size) row vector
    start_state = env_test.reset()
    state = np.eye(agent.state_size)[[start_state]]
    reward = step = 0
    done = False

    while not done and step < max_steps:

        # always act greedily while testing (no exploration)
        action = agent.predict(state)

        new_state, reward, done, info = env_test.step(action)
        state = np.eye(agent.state_size)[[new_state]]

        env_test.render()
        # short pause so the rendered moves can be followed
        time.sleep(.05)
        step += 1

    clear_output(wait=True)
    # the reward of the last step of the episode is counted as a completion
    total_complete += reward
    # 1-based episode counter, matching the training cell's progress line
    print(f'episode: {episode+1}/{test_episodes}, steps taken: {step}, completions: {total_complete}, epsilon: {agent.epsilon}')

env_test.close()

print(f'Testing score: {(total_complete/test_episodes) * 100}')

### Omitted code snippets, useful to keep just in case

        # print("action: ", action, " reward: ", reward, " done: ", done, " info: ", info, " new_state: ", new_state, " new_state_arr: ", new_state_arr, " state: ", state, " state_arr: ", state_arr)
        # print(f"agent memory len: {len(agent.memory)}")
        # if len(agent.memory) > batch_size:
        #         agent.train_replay(batch_size)   
        # def train_replay(self, batch_size):
        #     minibatch = random.sample(self.memory, batch_size)
        #     for new_state, reward, done, state, action in minibatch:
        #         target = reward
        #         if not done:
        #             target = reward + self.discount_rate * np.argmax(self.model.predict(new_state))
        #         target_f = self.model.predict(state)
        #         target_f[0][action] = target
        #         self.model.fit(state, target_f, epochs=10, verbose=0)
        #     if self.epsilon > self.min_epsilon:
        #         self.epsilon *= self.epsilon_decay 
        #     self.epsilon_lst.append(self.epsilon)
        
        
        
        
        # print(f"state: {state}")
        # print(f"new_state: {new_state}")
        # print(f"target: {target}")
        # print(f"action: {action}")
        # print(f"q_next: {q_next}")
        # print(f"q_next[action]: {q_next[0][action]}")
        # print(f"q_update: {q_update}")
        # print(f"updated q_next[action]: {q_next[0][action]}")
        # print(f'state {state}, q_next {q_next}')
        # wait 5 sec
        # time.sleep(5)