In [1]:
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output

In [2]:
# Create the environment registered under the id "FrozenLake-v0" and print
# its observation and action spaces.
env_name = "FrozenLake-v0"
env = gym.make(env_name)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)
# Last expression of the cell: the notebook displays the class of the action space.
type(env.action_space)


Observation space: Discrete(16)
Action space: Discrete(4)


gym.spaces.discrete.Discrete

In [3]:
# Register a non-slippery 4x4 FrozenLake variant under its own id.
# Re-running this cell in a live kernel attempts to register the same id again,
# which older gym releases reject with gym.error.Error; only that error is
# ignored so that genuine failures (bad entry point, interrupts, ...) still surface.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78,  # optimum = .8196
    )
except gym.error.Error:
    # Already registered by a previous run of this cell.
    pass

In [4]:
class Agent(object):
    """Baseline agent that ignores the state and picks actions uniformly at random.

    Supports two kinds of action space:
    * discrete spaces, from which it reads ``n`` (number of actions);
    * any other space, from which it reads ``low``, ``high`` and ``shape``
      and samples a uniform array inside those bounds.
    """

    def __init__(self, env):
        """Inspect ``env.action_space`` and cache what is needed for sampling.

        Prints the action size (discrete) or the action range (otherwise).
        """
        # isinstance also accepts subclasses of Discrete, unlike an exact
        # type comparison.
        self.is_discrete = isinstance(env.action_space, gym.spaces.discrete.Discrete)
        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a random action; ``state`` is accepted but not used.

        Returns an int in ``[0, action_size)`` for discrete spaces, otherwise
        an array of shape ``action_shape`` drawn uniformly between
        ``action_low`` and ``action_high``.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_shape)

In [5]:
class QAgent(Agent):
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Keeps a ``state_size x action_size`` table of action-value estimates.
    Reads ``env.observation_space.n`` and the parent's ``action_size``, so it
    needs an environment with discrete observation and action spaces.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01,
                 eps=1, eps_decay=0.99):
        """Set up hyper-parameters and the Q-table.

        Args:
            env: environment whose spaces size the Q-table.
            discount_rate: weight of the best next-state value in the target.
            learning_rate: step size applied to the error in ``train``.
            eps: initial probability of taking a random action.
            eps_decay: factor ``eps`` is multiplied by at the end of each episode.
        """
        super().__init__(env)
        self.total_reward = 0
        # Number of states = number of rows of the Q-table.
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)

        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        # Probability of choosing a random action instead of the greedy one.
        self.eps = eps
        self.eps_decay = eps_decay
        # Last exploration draw made by get_action; defined here so that
        # reading it before the first action does not raise AttributeError.
        self.random = None

        # Build the Q(state, action) table.
        self.buildQtable()

    def buildQtable(self):
        """(Re)create the Q-table with small random values in [0, 1e-4)."""
        self.Qtable = 1e-4 * np.random.random([self.state_size, self.action_size])

    def get_action(self, state):
        """Return a random action with probability ``eps``, else the greedy one.

        The draw used for the decision is stored in ``self.random``.
        """
        q_action_greedy = np.argmax(self.Qtable[state])
        # Random action from the parent class; sampled on every call, before
        # the exploration draw.
        q_action_random = super().get_action(state)
        self.random = random.random()
        if self.random < self.eps:
            return q_action_random
        return q_action_greedy

    def train(self, experience):
        """Apply one Q-learning update from a single transition.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``
                built by the caller from the values returned by ``env.step``.

        Update rule:
            Q[s, a] += learning_rate * (reward + discount_rate * max(Q[s']) - Q[s, a])
        where the ``max(Q[s'])`` term is taken as 0 when ``done`` is true.
        When ``done`` is true, ``eps`` is also multiplied by ``eps_decay``.
        """
        state, action, next_state, reward, done = experience

        # A terminal transition has no future value to bootstrap from.
        best_next_value = 0.0 if done else np.max(self.Qtable[next_state])

        # Target: received reward plus the discounted value of the next state.
        q_value_target = reward + self.discount_rate * best_next_value

        # Move the current estimate a fraction of the way towards the target.
        q_value_error = q_value_target - self.Qtable[state, action]
        self.Qtable[state, action] += self.learning_rate * q_value_error

        # Take fewer random actions as more episodes are completed.
        if done:
            self.eps = self.eps * self.eps_decay
            


In [6]:
# Instantiate the Q-learning agent for `env`; the constructors print the action and state sizes.
# NOTE(review): `env` was created from "FrozenLake-v0"; the "FrozenLakeNoSlip-v0" id registered
# in this notebook is not used by any visible cell — confirm which environment was intended.
agent = QAgent(env)

Action size: 4
State size: 16


In [7]:

# Start the running reward tally from zero for this training run.
agent.total_reward = 0

for ep in range(100):
    state = env.reset()
    while True:
        # Pick an action for the current state (random or greedy, decided by the agent).
        action = agent.get_action(state)

        # Apply the action and unpack the four values returned by env.step.
        next_state, reward, done, info = env.step(action)

        # Let the agent learn from this transition.
        agent.train((state, action, next_state, reward, done))

        agent.total_reward += reward
        state = next_state

        # Progress display; "s:" shows the state the agent has just moved into.
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, Current Reward:{}, eps: {},random: {}".format(ep,agent.total_reward,reward,agent.eps,agent.random))
        env.render()

        print(agent.Qtable)
        # Short pause so each frame is visible, then wipe the cell output for the next one.
        time.sleep(0.05)
        clear_output(wait=True)

        # The episode ends once the environment reports done.
        if done:
            break
    

s: 5 a: 2
Episode: 99, Total reward: 3.0, Current Reward:0.0, eps: 0.36603234127322926,random: 0.7260121830415334
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
[[  4.71234151e-05   4.40461816e-05   8.22838847e-05   6.52600806e-05]
 [  7.37844218e-05   6.62907675e-05   3.20340685e-05   5.92107531e-05]
 [  8.61115079e-05   5.79039445e-05   4.18332078e-05   1.01058110e-05]
 [  1.81538271e-05   7.45616373e-05   4.37230378e-05   6.36754249e-05]
 [  3.91060948e-05   3.76135380e-05   7.45634154e-05   1.44663123e-05]
 [  3.59309423e-05   2.19484065e-06   3.17624914e-05   8.66676135e-05]
 [  8.76981060e-05   9.24203525e-05   5.22548715e-05   6.23069067e-05]
 [  5.11125233e-05   2.56901347e-05   9.91837043e-05   6.36427490e-05]
 [  5.27270506e-05   1.53894701e-05   5.46592264e-05   4.54312695e-06]
 [  1.31257418e-05   4.95206373e-05   8.35953143e-06   7.27835457e-06]
 [  1.73346477e-05   2.25370597e-04   2.63257243e-04   1.62451918e-05]
 [  4.86447650e-05   1.10210084e-05   5.72303203e-06   1.90509864e

In [None]:
# NOTES - The number of rows in Qtable = 4*4 = 16, which are the states (one per cell of the 4x4 map).
# Similarly there are 4 Actions - Left,Right,Up,Down.
#  Observe the Q-table after a lot of iterations: the maximum in the first state's row is at index 2, which is Right, and the minimum is the last entry (index 3), which is Up.
# LEFT = 0
# DOWN = 1
# RIGHT = 2
# UP = 3
# LINK  - https://gym.openai.com/envs/FrozenLake-v0/