In [None]:
# Imports
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output

In [None]:
# Single source of truth for the environment id: the same name is used to
# register the environment and to instantiate it. Previously the registered id
# ('...-v0') and the id passed to gym.make ('...-v1') differed, so gym.make was
# asked for an environment that had never been registered.
env_name = "FrozenLakeNoSlip-v1"

# Map layout handed to FrozenLakeEnv as `desc`.
# Per the FrozenLake documentation: S = start, F = frozen, H = hole, G = goal.
custom_map = ["SFFFFFFF", 
              "FFFFFFFF", 
              "FFFHFFFF",
              "FFFFFHFF", 
              "FFFHFFFF", 
              "FHHFFFHF", 
              "FHFFHFHF", 
              "FFFHFFFG"]
# Number of rows in the map; reused later to build the Q-table file name.
N = len(custom_map)

# NOTE(review): re-running this cell in a live kernel may fail on some gym
# versions because the id is already registered -- confirm for the gym
# version in use.
register(
    id=env_name,
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'is_slippery':False},
    max_episode_steps=400,
    reward_threshold=0.78,
)

env = gym.make(env_name, desc = custom_map)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

In [None]:
class Agent():
    """Baseline agent that picks actions uniformly at random.

    Supports two kinds of action space:

    * discrete -- ``env.action_space.n`` actions, sampled as an integer index;
    * anything else -- treated as a box described by ``low``, ``high`` and
      ``shape``, sampled element-wise from a uniform distribution.
    """

    def __init__(self, env):
        """Inspect ``env.action_space`` and cache what sampling needs.

        Args:
            env: environment object exposing an ``action_space`` attribute.
        """
        # isinstance (instead of an exact ``type(...) ==`` comparison) also
        # recognises subclasses of Discrete, which would otherwise fall into
        # the continuous branch and fail on the missing low/high attributes.
        self.is_discrete = isinstance(env.action_space,
                                      gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a random action; ``state`` is accepted but not used.

        Returns:
            An integer in ``[0, action_size)`` for discrete spaces, otherwise
            an array of shape ``action_shape`` drawn uniformly between
            ``action_low`` and ``action_high``.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low,
                                 self.action_high,
                                 self.action_shape)

In [None]:
class QAgent(Agent):
    """Tabular Q-learning agent with epsilon-greedy exploration.

    Keeps a ``[state_size, action_size]`` table of action values. With
    probability ``epsilon`` it explores via the random policy inherited from
    ``Agent``; otherwise it takes the highest-valued action for the state.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01,
                 epsilon=1.0, epsilon_decay=0.99):
        """Create the agent and its Q-table.

        Args:
            env: environment with a discrete ``observation_space`` (its ``n``
                is used as the number of table rows).
            discount_rate: factor applied to the best next-state value.
            learning_rate: step size of each Q-table update.
            epsilon: initial exploration probability.
            epsilon_decay: factor ``epsilon`` is multiplied by every time a
                transition with ``done=True`` is trained on.
        """
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)

        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        # Tiny random values rather than zeros, so argmax over an untrained
        # row does not always resolve to action 0.
        self.q_table = 1e-4 * np.random.random([self.state_size, self.action_size])

    def get_action(self, state):
        """Return an epsilon-greedy action for ``state``."""
        if random.random() < self.epsilon:
            return super().get_action(state)
        return np.argmax(self.q_table[state])

    def train(self, experience):
        """Apply one Q-learning update.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        # When the episode has ended, the next state contributes no value.
        best_next = 0.0 if done else np.max(self.q_table[next_state])
        q_target = reward + self.discount_rate * best_next

        delta_q = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * delta_q

        if done:
            # Explore a little less after every finished episode.
            self.epsilon *= self.epsilon_decay

In [None]:
# Build the Q-learning agent for `env`, using QAgent's default hyperparameters.
agent = QAgent(env)

In [None]:
# Training settings passed to run().
# total_reward: starting value of the reward accumulator; the evaluation loop
#               further down also adds to this module-level variable.
# rounds:       number of outer training rounds.
# num_eps:      number of episodes played in each round.
total_reward = 0
rounds = 5
num_eps = 100

def run(rounds, num_eps, total_reward):
    """Train the module-level ``agent`` on the module-level ``env``.

    Plays ``rounds`` rounds of ``num_eps`` episodes each. Every transition is
    passed to ``agent.train`` as ``(state, action, next_state, reward, done)``.

    Args:
        rounds: number of training rounds.
        num_eps: number of episodes per round.
        total_reward: starting value of the reward accumulator for the first
            round; every later round starts again from 0.

    Returns:
        list: one entry per round, holding the reward accumulated during that
        round. Previously the accumulator was reset and discarded, so the
        function produced no observable result.
    """
    round_rewards = []

    for _round in range(rounds):
        for _episode in range(num_eps):
            state = env.reset()
            done = False
            while not done:
                action = agent.get_action(state)
                next_state, reward, done, _info = env.step(action)
                agent.train((state, action, next_state, reward, done))
                state = next_state
                total_reward += reward

        # Record this round's total before the accumulator is reset.
        round_rewards.append(total_reward)
        total_reward = 0

    return round_rewards
        


In [None]:
# Train the agent: `rounds` rounds of `num_eps` episodes each.
run(rounds, num_eps, total_reward)

In [None]:
import pickle

# Persist the learned Q-table. The path has to be an f-string so that {N} is
# interpolated: the cell that reloads the table opens the interpolated name,
# and without the `f` prefix the file was written under the literal name
# "{N}x{N}custom_frozen_lake_qtable.pkl" and could not be found again.
with open(f"./{N}x{N}custom_frozen_lake_qtable.pkl", "wb") as f:
    pickle.dump(agent.q_table, f)


In [None]:
# Read the pickled Q-table back from disk into `qtest`.
# Only unpickle files you created yourself: pickle.load can execute code.
qtable_path = f"./{N}x{N}custom_frozen_lake_qtable.pkl"
with open(qtable_path, "rb") as qtable_file:
    qtest = pickle.load(qtable_file)

In [None]:
# Play five episodes acting greedily on the reloaded Q-table (`qtest`),
# rendering each step. No learning happens in this loop.
for episode in range(5):
    state = env.reset()
    print("****************************************************")
    print("EPISODE ", episode)

    # At most 400 steps per episode (the same cap given as max_episode_steps
    # when the environment was registered).
    for step in range(400):
        # Greedy choice: the highest-valued action for the current state.
        action = np.argmax(qtest[state])
        state, reward, done, info = env.step(action)
        total_reward += reward

        env.render()
        time.sleep(0.05)
        print("Number of steps", step+1)

        # wait=True postpones the clear until new output is available.
        clear_output(wait=True)

        if done:
            break

env.close()