In [1]:
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output

In [2]:
# Create the environment named by env_name and report its spaces.
env_name = "FrozenLake-v0"
env = gym.make(env_name)
for label, space in (
    ("Observation space:", env.observation_space),
    ("Action space:", env.action_space),
):
    print(label, space)
# Keep the type as the final expression so the notebook echoes it as the cell result.
type(env.action_space)


Observation space: Discrete(16)
Action space: Discrete(4)


gym.spaces.discrete.Discrete

In [3]:
# Register a 4x4 FrozenLake variant with slipping disabled under its own id.
# Re-running this cell attempts to register the same id a second time; gym
# releases that reject duplicate ids signal this with gym.error.Error, so only
# that exception is ignored. Anything else (including KeyboardInterrupt, which
# a bare `except:` would also swallow) is allowed to propagate.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery':False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
        )
except gym.error.Error:
    # Already registered earlier in this kernel session; nothing to do.
    pass

In [4]:
class Agent(object):
    """Baseline agent that picks actions at random.

    The constructor inspects ``env.action_space`` once and remembers either
    the number of discrete actions or the low/high/shape description of a
    non-discrete space, printing what it found.
    """

    def __init__(self, env):
        """Record the action-space description of ``env``.

        Args:
            env: object exposing an ``action_space`` attribute. For a
                discrete space only ``.n`` is read; otherwise ``.low``,
                ``.high`` and ``.shape`` are read.
        """
        # isinstance (instead of an exact type comparison) also recognises
        # subclasses of Discrete as discrete spaces.
        self.is_discrete = isinstance(env.action_space, gym.spaces.discrete.Discrete)
        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a random action; ``state`` is accepted but not used.

        Returns:
            For a discrete space, an integer drawn from
            ``range(self.action_size)``. Otherwise an array drawn with
            ``np.random.uniform`` between ``action_low`` and ``action_high``
            with shape ``action_shape``.
        """
        if self.is_discrete:
            return random.choice(range(self.action_size))
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_shape)

In [5]:
class QAgent(Agent):
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Keeps a ``state_size x action_size`` table of action values. Actions are
    random with probability ``eps`` and greedy otherwise; ``eps`` starts at 1
    and is multiplied by ``eps_decay`` each time ``train`` sees ``done``.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01, eps_decay=0.99):
        """Set up hyper-parameters and the Q-table.

        Args:
            env: environment whose ``observation_space.n`` gives the number
                of states and whose action space is discrete.
            discount_rate: weight applied to the best next-state value.
            learning_rate: step size of each Q-table update.
            eps_decay: factor ``eps`` is multiplied by at the end of each
                episode (default keeps the previous fixed 0.99).

        Raises:
            ValueError: if the action space is not discrete, since the
                Q-table needs a finite ``action_size``.
        """
        super().__init__(env)
        if not self.is_discrete:
            # Fail here with a clear message instead of an AttributeError on
            # the missing action_size inside buildQtable.
            raise ValueError("QAgent requires a discrete action space to build its Q-table.")
        self.total_reward = 0
        # Number of states; needed for the Q-table's first dimension.
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)

        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.eps_decay = eps_decay
        # Probability of taking a random action instead of the greedy one.
        self.eps = 1
        # Last exploration draw made by get_action; None until the first call
        # so that reading it early does not raise AttributeError.
        self.random = None

        self.buildQtable()

    def buildQtable(self):
        """(Re)create the Q-table with small random values scaled by 1e-4."""
        self.Qtable = 1e-4*np.random.random([self.state_size, self.action_size])

    def get_action(self, state):
        """Choose an action for ``state`` epsilon-greedily.

        The random candidate is drawn before the exploration draw, matching
        the original order of random-number consumption. The exploration draw
        is stored in ``self.random`` for inspection.
        """
        greedy_action = np.argmax(self.Qtable[state])
        random_action = super().get_action(state)
        self.random = random.random()
        if self.random < self.eps:
            return random_action
        return greedy_action

    def train(self, experience):
        """Apply one Q-learning update.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``
                assembled by the caller from the environment step.
        """
        state, action, next_state, reward, done = experience
        # When the episode has ended, the next state contributes no value.
        best_next_value = 0.0 if done else np.max(self.Qtable[next_state])

        # Target = received reward + discounted best value of the next state.
        q_value_target = reward + self.discount_rate * best_next_value

        # Move the stored value for (state, action) towards the target.
        q_value_error = q_value_target - self.Qtable[state, action]
        self.Qtable[state, action] += self.learning_rate * q_value_error

        # Explore less as more episodes finish.
        if done:
            self.eps = self.eps * self.eps_decay
            


In [6]:
# Build the Q-learning agent for env; the constructors print the action and state sizes.
agent = QAgent(env)

Action size: 4
State size: 16


In [16]:

# Start this run's cumulative reward from zero.
agent.total_reward = 0

for ep in range(100):
    state = env.reset()
    done = False
    while not done:
        # Ask the agent for a move (random or taken from the Q-table).
        action = agent.get_action(state)

        # Apply the move and unpack what the environment returns.
        next_state, reward, done, info = env.step(action)

        # Let the agent learn from this single transition.
        experience = (state, action, next_state, reward, done)
        agent.train(experience)

        agent.total_reward += reward
        state = next_state

        # Live view: latest transition, running totals, board and Q-table.
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, Current Reward:{}, eps: {},random: {}".format(ep,agent.total_reward,reward,agent.eps,agent.random))
        env.render()

        print(agent.Qtable)
        time.sleep(0.05)
        # Wipe the cell output once the next frame is ready to be drawn.
        clear_output(wait=True)
    

s: 5 a: 2
Episode: 99, Total reward: 15.0, Current Reward:0.0, eps: 4.317124741065784e-05,random: 0.8044647191927314
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
[[  5.53749618e-05   6.47889807e-05   7.31317303e-03   5.55118436e-05]
 [  4.66643883e-05   4.45402729e-05   4.21649673e-05   7.06509626e-03]
 [  1.14666721e-02   8.07703444e-05   4.36964231e-05   5.26522957e-05]
 [  2.64235338e-05   2.45365543e-05   5.28714503e-05   6.40195880e-05]
 [  1.11511251e-02   5.32839721e-05   5.35541622e-05   2.95071983e-05]
 [  9.37334233e-05   7.39120275e-05   5.20014982e-05   1.61823298e-06]
 [  4.93850295e-05   2.68049006e-02   1.73231655e-05   5.00712512e-05]
 [  8.16033934e-05   7.34803010e-05   6.19597746e-05   1.08050944e-05]
 [  5.39976356e-05   2.43426517e-02   2.48577846e-05   1.02595144e-05]
 [  2.49363585e-05   4.73007874e-05   7.34016698e-02   5.85563955e-05]
 [  7.73823231e-05   1.26170174e-01   4.49948135e-05   3.78390478e-05]
 [  2.44313231e-06   1.46117895e-05   1.53476018e-05   6.626430

In [None]:
# NOTES - The number of rows in Qtable = 4*4 = 16, which are the states.
# Similarly there are 4 Actions - Left,Right,Up,Down.
#  Observe the qtable after a lot of iterations: the maximum in the first state is at index 2, which is Right, and the minimum is the last, which is Up.
# LEFT = 0
# DOWN = 1
# RIGHT = 2
# UP = 3
# LINK  - https://gym.openai.com/envs/FrozenLake-v0/