In [1]:
!pip install -U catalyst-rl

Requirement already up-to-date: catalyst-rl in /usr/local/lib/python3.6/dist-packages (20.3)


In [2]:
!pip install wandb



## Setup environment

In [3]:
import numpy as np
import random
import catalyst_rl
from catalyst_rl import rl

In [14]:
from tqdm import tqdm

## Create environment

In [4]:
# Create the "Taxi-v3" environment through the catalyst_rl Gym wrapper.
# The rest of the notebook interacts with it through `env.env` (reset / step / render).
env = rl.environment.gym.GymEnvWrapper("Taxi-v3")

In [5]:
# Print the current state of the taxi grid as text
env.env.render()

+---------+
|R: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



## Create Q-table and initialize it

In [7]:
# Size of the discrete action space; it becomes the number of columns of the Q table
num_of_actions = env.env.action_space.n
print("Number of actions is", num_of_actions)

Number of actions is 6


In [8]:
# Size of the discrete observation space; it becomes the number of rows of the Q table
num_of_observations = env.env.observation_space.n
print("Number of observations is", num_of_observations)

Number of observations is 500


In [9]:
# Q table: one row per observation, one column per action, every entry starts at zero
Q_table = np.zeros(shape=(num_of_observations, num_of_actions))
Q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

## Create hyperparameters

In [18]:
# Training budget: number of episodes the agent learns for
num_of_episodes = 50_000
# Evaluation budget: number of episodes the trained agent is tested on
num_of_test_episodes = 100
# Upper bound on the number of steps taken inside a single episode
max_episode_steps = 100

In [11]:
# Learning rate: step size that scales the correction applied to a Q value on each update
lr = 0.7
# Discounting rate: weight given to the best Q value of the next state in the update target
gamma = 0.7

In [13]:
# Epsilon-greedy exploration schedule
max_epsilon = 1     # exploration rate at the start of training
min_epsilon = 0.01  # floor that the exploration rate decays towards
decay_rate = 0.01   # rate of the exponential decay applied after each episode
# Current exploration rate (threshold for taking a random action, between 0 and 1);
# it starts at the maximum and is lowered during training
epsilon = max_epsilon

## Q-Learning algorithm

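Implement the tabular Q-learning algorithm with an $\epsilon$-greedy policy. After every step the Q table is updated with $Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]$, where $\alpha$ is the learning rate `lr` and $\gamma$ is the discounting rate `gamma`.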

In [19]:
# Training: each pass of the outer loop is one episode
for episode in tqdm(range(num_of_episodes)):

  # Every episode starts from a freshly reset environment
  state = env.env.reset()
  done = False # Becomes True once the environment reports that the episode is over

  for step in range(max_episode_steps):

    # Epsilon-greedy action selection: draw a number between 0 and 1
    # and compare it with the current exploration rate
    exploration_draw = random.uniform(0, 1)

    if exploration_draw <= epsilon:
      # Exploration: sample a random action from the action space
      action = env.env.action_space.sample()
    else:
      # Exploitation: take the action with the largest Q value in the current state
      action = np.argmax(Q_table[state])

    # Apply the action and observe the next state and the reward
    new_state, reward, done, info = env.env.step(action)

    # Q-learning update: move Q(state, action) towards
    # reward + gamma * (best Q value of the next state)
    best_next_value = np.max(Q_table[new_state])
    td_error = reward + gamma * best_next_value - Q_table[state, action]
    Q_table[state, action] += lr * td_error

    # Continue from the state we just reached
    state = new_state

    # Stop the current episode as soon as the environment says it is finished
    if done:
      break

  # Lower the exploration rate after each episode: it decays exponentially from
  # max_epsilon towards min_epsilon, so later episodes exploit the learned
  # Q values more often than they explore
  epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

100%|██████████| 50000/50000 [00:28<00:00, 1773.61it/s]


## Evaluate our algorithm

In [22]:
# Evaluate the greedy policy given by the learned Q table.
# `rewards` holds the total reward of every test episode (one entry per episode).
rewards = []

for episode in range(num_of_test_episodes):

  # Every test episode starts from a freshly reset environment
  done = False
  state = env.env.reset()
  rewards_for_episode = 0

  for step in range(max_episode_steps):

    env.env.render()

    # No exploration during evaluation: always take the best known action
    action = np.argmax(Q_table[state, :])

    new_state, reward, done, info = env.env.step(action)

    rewards_for_episode += reward

    if done:
      break

    state = new_state

  # Record the episode whether or not it finished: an episode that runs out of
  # steps must count with the reward it actually collected instead of being
  # silently left out of the average
  rewards.append(rewards_for_episode)

env.env.close()
print("Average reward is", sum(rewards) / len(rewards))

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
| : | : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|[34;1mR[0m: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0