## Author: Ariel Guerrero
### ***Q-Learning Agent***


Implementation of a ***Q-Learning Agent*** using an OpenAI Gym environment.


### *Q Learning Algorithm*

***Q(s<sub>t</sub>, a<sub>t</sub>) = R<sub>t+1</sub> + gamma * max(Q(s<sub>t+1</sub>))***

* s = state
* a = action
* R<sub>t+1</sub> = reward
* gamma = discount factor
* max(Q(s<sub>t+1</sub>)) = max Q value ***for all possible actions*** in state s<sub>t+1</sub>

### Imports

In [8]:

import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output

import pygame



#### Sometimes pygame doesn't close properly; ***run this cell to force it to close***

In [None]:
# Force-close pygame: only call pygame.quit() when pygame.get_init() reports
# that pygame is currently initialised, otherwise there is nothing to shut down.
if pygame.get_init():
    pygame.quit()


### Registering A Frozen Lake Environment with no slippery tiles

In [2]:

# Register a non-slippery 4x4 Frozen Lake variant under its own id.
# Re-running this cell attempts to register the same id a second time; gym
# versions that reject duplicate ids signal that with gym.error.Error, so only
# that error is ignored. Anything else (a typo in the arguments, a keyboard
# interrupt, ...) is allowed to surface instead of being silently swallowed.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78,  # optimum = .8196
    )
except gym.error.Error:
    # the id is already registered from an earlier run of this cell
    pass



### Functions to create a new environment and to display environment information

In [3]:

def new_env(env_name):
    """
        description:        Build the Gym environment registered under env_name.
        @param env_name:    environment id string handed to gym.make
        @return:            the environment object produced by gym.make
    """
    return gym.make(env_name)


def env_attributes(env):
    """
        description:    Prints the observation space and the action space of
                        the environment. A discrete action space is printed
                        as-is; any other action space is printed as its
                        low/high bounds.
        @param env:      Gym environment
    """
    print("observation space: ", env.observation_space)
    # isinstance (instead of an exact type comparison) also recognises
    # subclasses of Discrete as discrete action spaces
    if isinstance(env.action_space, gym.spaces.discrete.Discrete):
        print("action space: ", env.action_space)
    else:
        print("action range: ",
              env.action_space.low, env.action_space.high)



### Defining the Agent and Q-Agent classes

In [4]:
# Basic Agent
class Agent:
    """
        description:    Agent with discrete or continuous action space that
                        picks its actions at random
    """

    def __init__(self, env):
        """
            description:    Records the action-space details needed to sample
                            actions and prints the environment attributes
            @param env:      Gym environment
        """
        # The environments used here are discrete, but the continuous case is
        # kept for future reference.
        # isinstance (instead of an exact type comparison) also recognises
        # subclasses of Discrete as discrete action spaces.
        self.is_discrete = isinstance(
            env.action_space, gym.spaces.discrete.Discrete)

        if self.is_discrete:
            # number of available actions
            self.action_size = env.action_space.n
        else:
            # bounds and shape used to sample a continuous action
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape

        env_attributes(env)

    def get_action(self, state):
        """
            description:    Samples a random action
            @param state:    current state; not used by this base agent, kept
                             so subclasses can override with a state-dependent
                             policy
            @return:         an int from random.randrange(action_size) when the
                             action space is discrete, otherwise an array drawn
                             with np.random.uniform between action_low and
                             action_high with shape action_shape
        """
        if self.is_discrete:
            return random.randrange(self.action_size)

        return np.random.uniform(
            self.action_low,
            self.action_high,
            self.action_shape
        )


# Q Learning Agent
class QAgent(Agent):
    """
        description:    Tabular Q-learning agent with epsilon-greedy action
                        selection
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01, epsilon=.75,
                 epsilon_decay=0.99):
        """
            description:    Q Learning Agent
            @param env:      Gym environment
            @param discount_rate:   represents how much future values lose weight based on how far away they are
            @param learning_rate:   represents the rate at which the algorithm adjusts its estimates based on new information
            @param epsilon:    represents the probability of choosing an exploratory (random) action over the greedy policy action
            @param epsilon_decay:   factor epsilon is multiplied by every time a finished episode is trained on
        """
        super().__init__(env)
        # action size is already defined in the parent Agent class
        # state size
        self.state_size = env.observation_space.n
        print("state size: ", self.state_size)
        # discount rate
        self.discount_rate = discount_rate
        # learning rate
        self.learning_rate = learning_rate
        # epsilon
        self.epsilon = epsilon
        # per-episode multiplicative decay applied to epsilon in train()
        self.epsilon_decay = epsilon_decay
        # building the model
        self.build_model()

    def build_model(self):
        """
            description:    Creates the Q table with one row per state and one
                            column per action, filled with small random values
                            in [0, 1e-4)
        """
        self.q_table = 1e-4 * \
            np.random.random([self.state_size, self.action_size])

    # redefining the get_action method to use the Q table selecting values based on the corresponding state
    def get_action(self, state):
        """
            description:    Epsilon-greedy action selection
            @param state:    index of the current state in the Q table
            @return:         a random action with probability epsilon, else the
                             action with the highest Q value for this state
        """
        q_state = self.q_table[state]
        action_greedy = np.argmax(q_state)
        action_explore = super().get_action(state)
        # return a random action with probability epsilon else return the greedy action
        return action_explore if random.random() < self.epsilon else action_greedy

    def train(self, experience, *, verbose=True, step_delay=4):
        """
            description:    Applies one Q-learning update to the Q table and
                            decays epsilon when the episode has finished
            @param experience:  tuple (state, action, next_state, reward, done)
            @param verbose:     when True, prints the intermediate values and
                                the Q table before and after the update
            @param step_delay:  seconds to pause after the update; 0 or None
                                disables the pause. The default of 4 keeps the
                                slow step-by-step pacing for existing callers;
                                pass verbose=False, step_delay=0 for fast
                                training
        """
        state, action, next_state, reward, done = experience

        # get the next q state
        q_next = self.q_table[next_state]
        # a finished episode has no future value, so bootstrap from zeros
        q_next = np.zeros([self.action_size]) if done else q_next
        q_target = reward + self.discount_rate * np.max(q_next)
        # calc distance from current q state to the target q state
        q_update = q_target - self.q_table[state, action]

        if verbose:
            print(f"target: {q_target}")
            print(f"action: {action}")
            print(f"q_next: {q_next}")
            print(f"q_next[action]: {q_next[action]}")
            print(f"state: {state}")
            print(f"new_state: {next_state}")
            print(f"q_table: {self.q_table}")
            print(f"q_table[state][actions]: {self.q_table[state][action]}")
            print(f"q_update: {q_update}")

        # move the stored value a learning_rate-sized step towards the target
        self.q_table[state, action] += self.learning_rate * q_update

        if verbose:
            print(f"q_table: {self.q_table}")

        # we need to reduce epsilon over time as we get more experience and the table is more optimal
        if done:
            # exponential decay to epsilon
            self.epsilon = self.epsilon * self.epsilon_decay

        # optional pause so the printed values can be read before the next step
        if step_delay:
            time.sleep(step_delay)


### Initializing the Agent and Environment

In [5]:
# Environment id to train on.
#  lake is slippery (hard)
# env_name = 'FrozenLake-v1'
#  lake is not slippery (easy); this is the id registered earlier in this notebook
env_name = 'FrozenLakeNoSlip-v0'
# create new environment
env = new_env(env_name)
#  new QAgent with its default hyperparameters; the constructor prints the
#  environment attributes and the state size
agent = QAgent(env)

observation space:  Discrete(16)
action space:  Discrete(4)
state size:  16


### Driver Code

In [6]:
# used to keep track of the cumulative reward (summed over all episodes, never reset)
total_reward = 0
# train agent over a number of episodes
for episode in range(100):
    # fresh state
    state = env.reset()
    # agent is finished or failed
    done = False
    while not done:
        # epsilon-greedy action for the current state
        action = agent.get_action(state)
        # env.step is unpacked as a 4-tuple here: (next_state, reward, done, info)
        next_state, reward, done, info = env.step(action)
        # one Q-table update from this transition (train also prints and pauses)
        agent.train(experience=(state, action, next_state, reward, done))
        state = next_state
        total_reward += reward
        print(f"state: {state}, action: {action}")
        print(
            f"episode: {episode}, total reward: {total_reward}, epsilon: {agent.epsilon}")
        # env.render()
        # short pause, then clear the cell output so each step replaces the previous one
        time.sleep(.05)
        clear_output(wait=True)


target: 8.763686624751133e-05
action: 1
q_next: [8.97646178e-06 9.03472848e-05 5.96939136e-05 7.66099532e-05]
q_next[action]: 9.034728479124879e-05
state: 0
new_state: 4
q_table: [[3.88907177e-05 2.95443311e-05 3.09162971e-05 1.04648915e-05]
 [5.41589639e-06 6.56875818e-05 7.15945535e-06 2.53001402e-05]
 [7.12741330e-05 7.81181762e-05 3.04170225e-05 8.57554077e-06]
 [2.08127846e-05 4.32472924e-05 9.83342561e-05 1.22447720e-05]
 [8.97646178e-06 9.03472848e-05 5.96939136e-05 7.66099532e-05]
 [1.70629732e-05 1.63644470e-05 5.34573245e-05 9.94798507e-05]
 [5.52145438e-05 6.83857442e-05 9.17767473e-06 1.50099803e-05]
 [5.68236839e-05 2.79478341e-05 6.58232214e-05 2.23804455e-05]
 [9.00728209e-05 1.35005627e-05 8.39464914e-05 5.06668516e-05]
 [1.91060263e-05 9.72176101e-05 1.25837108e-05 5.62348179e-05]
 [2.84159225e-05 7.28000724e-05 8.46431348e-05 9.66354304e-05]
 [1.38454249e-05 8.63095651e-05 1.71780381e-05 3.33302087e-05]
 [4.70389806e-05 3.44168153e-05 5.88337038e-05 4.16891086e-06]
 [

KeyboardInterrupt: 