<a href="https://colab.research.google.com/github/c-quilo/RL-OpenAIGym/blob/main/RL_QLearning_FrozenLakeNoSlip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reinforcement learning exercises using OpenAI Gym
Author: César Quilodrán-Casas
Environment: FrozenLake 4x4, non-slippery (registered in this notebook as `FrozenLakeNoSlip-v0`)



In [47]:
!pip install gym
!pip install ipython



In [48]:
#Load packages
import gym
import matplotlib.pyplot as plt
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output

In [49]:
# Register a 4x4 FrozenLake variant with is_slippery=False under its own id so
# it can be created with gym.make('FrozenLakeNoSlip-v0').
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery':False},
        max_episode_steps=100,
        # NOTE(review): threshold and "optimum = .8196" look carried over from
        # the slippery FrozenLake registration -- confirm for this variant.
        reward_threshold=0.78, # optimum = .8196
    )
except gym.error.Error:
    # Re-running this cell registers the same id twice, which some gym
    # releases reject; that case is harmless and is ignored. Any other
    # failure (e.g. a misspelled keyword argument) is left to propagate
    # instead of being silently swallowed.
    pass

In [50]:
# Create the environment used by the rest of the notebook and show its spaces.
# Ids previously listed here and immediately overwritten: 'CartPole-v1',
# 'MountainCar-v0', 'MountainCarContinuous-v0'. Only the last assignment ever
# took effect, and QAgent reads env.observation_space.n, so the environment
# must expose that attribute.
envName = 'FrozenLakeNoSlip-v0'
env = gym.make(envName)
print('Observations:', env.observation_space)
print('Actions: ', env.action_space)
# Last expression of the cell: displays the concrete action-space class.
type(env.action_space)

Observations: Discrete(16)
Actions:  Discrete(4)


gym.spaces.discrete.Discrete

In [51]:
#Define Agent
class Agent():
    """Baseline agent that picks actions uniformly at random.

    For a Discrete action space the agent stores the number of actions in
    ``action_size``; for any other action space it stores ``action_low``,
    ``action_high`` and ``action_shape`` taken from the space.

    Parameters
    ----------
    env : object
        Environment exposing an ``action_space`` attribute.
    """

    def __init__(self, env):
        # isinstance instead of an exact type comparison, so subclasses of
        # Discrete are recognised as discrete too.
        self.is_discrete = isinstance(env.action_space,
                                      gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print('Action_size: ', self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print('Action_range: ', self.action_low, self.action_high)

    def get_action(self, state):
        """Perform an action: return one sampled uniformly at random.

        ``state`` is accepted for interface compatibility but not used.
        Discrete spaces yield an index in ``[0, action_size)``; otherwise an
        array of shape ``action_shape`` drawn between ``action_low`` and
        ``action_high`` is returned.
        """
        if self.is_discrete:
            action = np.random.choice(range(self.action_size))
        else:
            action = np.random.uniform(self.action_low,
                                        self.action_high,
                                        self.action_shape)
        return action

In [76]:
class QAgent(Agent):
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Requires an environment whose observation space exposes ``n`` and whose
    action space is Discrete (the Q-table is sized ``state_size`` x
    ``action_size``).

    Parameters
    ----------
    env : object
        Environment providing ``observation_space.n`` and ``action_space``.
    discount_rate : float
        Factor applied to the best next-state value in the update target.
    learning_rate : float
        Step size applied to the temporal-difference error.
    eps_decay : float
        Multiplier applied to ``eps`` each time ``train`` sees ``done``.
    """

    def __init__(self, env, discount_rate=0.95, learning_rate=0.001,
                 eps_decay=0.99):
        super().__init__(env)
        self.state_size = env.observation_space.n
        print('State size: ', self.state_size)

        # Exploration probability; starts fully random and shrinks in train().
        self.eps = 1.0
        self.eps_decay = eps_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()

    def build_model(self):
        """Create the Q-table, filled with small random values in [0, 1e-4)."""
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])

    def get_action(self, state):
        """Return a random action with probability ``eps``, else the greedy one."""
        q_state = self.q_table[state]
        action_greedy = np.argmax(q_state)
        # The random action is always drawn before the epsilon test, so the
        # number of random draws per call does not depend on the branch taken.
        action_random = super().get_action(state)
        return action_random if np.random.random() < self.eps else action_greedy

    def train(self, experience):
        """Apply one Q-learning update from a single transition.

        ``experience`` is the tuple ``(state, action, next_state, reward, done)``.
        The entry ``q_table[state, action]`` is moved towards
        ``reward + discount_rate * max(q_table[next_state])`` by a fraction
        ``learning_rate``; when ``done`` is true the next-state values are
        treated as zero and ``eps`` is multiplied by ``eps_decay``.
        """
        state, action, next_state, reward, done = experience
        q_next = self.q_table[next_state]
        # No future value is bootstrapped from a finished episode.
        q_next = np.zeros([self.action_size]) if done else q_next
        q_target = reward + self.discount_rate * np.max(q_next)

        q_update = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * q_update

        if done:
            self.eps = self.eps * self.eps_decay

agent = QAgent(env)
    

Action_size:  4
State size:  16


In [None]:
#Training
#Maximises reward
# Runs 100 episodes. After every step the agent is updated from the observed
# transition and the current state, running reward, rendered board and Q-table
# are shown; total_reward accumulates across all episodes.
total_reward = 0
for episode in range(100):
  # Older gym releases return a bare observation from reset(); gym 0.26 and
  # later return an (observation, info) tuple. Accept both. Observations for
  # this agent are plain state indices, so a tuple here is the newer form.
  reset_result = env.reset()
  state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
  done = False
  while not done:
    action = agent.get_action(state)
    # step() yields four values in older gym releases and five
    # (terminated and truncated split out) from gym 0.26 on.
    step_result = env.step(action)
    if len(step_result) == 5:
      next_state, reward, terminated, truncated, info = step_result
      done = terminated or truncated
    else:
      next_state, reward, done, info = step_result
    agent.train((state, action, next_state, reward, done))
    state = next_state
    total_reward += reward
    print('s: ', state, 'a: ', action)
    print('Episode: {}, Total reward: {}, epsilon: {}'.format(episode, total_reward, agent.eps))
    env.render()
    print(agent.q_table)
    # Short pause so each frame is visible before the output area is cleared.
    time.sleep(0.05)
    clear_output(wait=True)

s:  3 a:  2
Episode: 31, Total reward: 2.0, epsilon: 0.1481449915475795
  (Right)
SFF[41mF[0m
FHFH
FFFH
HFFG
[[6.23026865e-05 4.29058603e-05 6.22896812e-05 2.87342663e-05]
 [4.67385245e-05 3.28165439e-05 3.99445321e-05 3.02268352e-05]
 [4.47021322e-05 3.67907160e-05 6.60732893e-05 4.44708587e-05]
 [8.60359119e-05 3.63309901e-05 3.97824063e-05 2.76835846e-05]
 [8.57773367e-05 1.62212413e-05 6.90219228e-05 8.96596927e-06]
 [4.35776084e-06 1.20477969e-05 1.18164412e-05 1.58321130e-06]
 [3.60570202e-05 9.96382857e-05 8.40680390e-05 4.66637516e-05]
 [7.61507853e-05 9.59043520e-05 3.37883445e-05 6.85605395e-05]
 [4.75917760e-05 6.29448560e-05 1.39352581e-05 5.91309924e-05]
 [8.02283365e-05 8.03461301e-05 3.62696555e-06 5.66640894e-05]
 [4.94396363e-06 6.42638052e-05 3.81254002e-05 1.74625813e-05]
 [5.46584474e-05 2.26470156e-05 9.30775431e-05 1.20985111e-06]
 [9.11367965e-05 2.64259302e-05 3.74925292e-05 8.19754896e-05]
 [4.32642488e-05 4.58113904e-05 8.55651610e-05 5.43255033e-05]
 [7.300