In [1]:
!pip install cmake 'gym[atari]' scipy

Collecting cmake
[?25l  Downloading https://files.pythonhosted.org/packages/ff/34/0a311fedffcc7a153bbc0390ef4c378dbc7f09f9865247137f82d62f8e7a/cmake-3.15.3-py3-none-manylinux2010_x86_64.whl (16.5MB)
[K    100% |████████████████████████████████| 16.5MB 933kB/s ta 0:00:011    69% |██████████████████████▏         | 11.5MB 4.2MB/s eta 0:00:02
[?25hCollecting gym[atari]
[?25l  Downloading https://files.pythonhosted.org/packages/1d/85/a7a462d7796f097027d60f9a62b4e17a0a94dcf12ac2a9f9a913333b11a6/gym-0.15.4.tar.gz (1.6MB)
[K    100% |████████████████████████████████| 1.6MB 2.6MB/s ta 0:00:01
Collecting pyglet<=1.3.2,>=1.2.0 (from gym[atari])
[?25l  Downloading https://files.pythonhosted.org/packages/1c/fc/dad5eaaab68f0c21e2f906a94ddb98175662cc5a654eee404d59554ce0fa/pyglet-1.3.2-py2.py3-none-any.whl (1.0MB)
[K    100% |████████████████████████████████| 1.0MB 4.6MB/s ta 0:00:01
[?25hCollecting cloudpickle~=1.2.0 (from gym[atari])
  Downloading https://files.pythonhosted.org/packages/c1/4

In [3]:
import gym

# Build the Taxi-v3 environment and keep the object exposed by its `.env`
# attribute rather than the object gym.make returns.
# NOTE(review): presumably this strips gym's outer wrapper so episodes are not
# cut off by a step limit (the random-agent run in this notebook lasts 1952
# steps) — confirm against the gym version in use.
env = gym.make("Taxi-v3").env

# Draw the current grid in the cell output.
env.render()

+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



In [4]:
# Reset the environment, draw the resulting grid, then report the sizes of the
# action and state spaces (recorded output: Discrete(6) and Discrete(500)).
env.reset()
env.render()
print("Action Space{}".format(env.action_space))
print("State Space{}".format(env.observation_space))

+---------+
|R: | :[43m [0m:[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+

Action SpaceDiscrete(6)
State SpaceDiscrete(500)


In [5]:
# Pack four components into a single integer state id (328 in the recorded
# output).
# NOTE(review): presumably the arguments are (taxi row, taxi column, passenger
# location index, destination index) — confirm against the Taxi env source.
state=env.encode(3,1,2,0)
print("State:",state)
# Force the environment into that state by assigning its `s` attribute
# directly, then draw it.
env.s=state
env.render()

State: 328
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



In [6]:
# Show the table entry for state 328: the recorded output maps each of the six
# actions to a list of 4-tuples.
# NOTE(review): presumably (probability, next_state, reward, done) — confirm.
env.P[328]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [7]:
# Baseline without RL: act uniformly at random from state 328 until the
# environment reports the episode as done, recording every step for replay.
env.s = 328

epochs = 0
penalties = 0
reward = 0
frames = []
done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # A reward of -10 is counted as a penalty.
    penalties += 1 if reward == -10 else 0

    # Snapshot of this step: rendered grid plus the transition details.
    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))
    epochs += 1

print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 1952
Penalties incurred: 595


In [13]:
from IPython.display import clear_output
from time import sleep
def print_frames(frames, delay=.1):
    """Replay recorded steps one at a time in the cell output.

    Each frame replaces the previous one on screen, so the cell shows an
    animation rather than a long scroll of grids.

    Args:
        frames: sequence of dicts with the keys 'frame' (rendered grid text),
            'state', 'action' and 'reward'.
        delay: seconds to pause after each frame. Defaults to 0.1, the value
            that used to be hard-coded.
    """
    for step, frame in enumerate(frames, start=1):
        # wait=True defers clearing until new output arrives, avoiding flicker.
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {step}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)
print_frames(frames)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 1952
State: 0
Action: 5
Reward: 20


In [15]:
#With Q-Learning
import numpy as np
# Q-table with one row per state and one column per action, all starting at 0.
q_table=np.zeros([env.observation_space.n,env.action_space.n])

In [19]:
import random
from IPython.display import clear_output
# Hyperparameters
alpha = 0.1            # learning rate: weight of the new estimate in each update
gamma = 0.6            # discount applied to the best value of the next state
epsilon = 0.1          # probability of taking a random (exploratory) action
num_episodes = 100000  # number of training episodes

# Per-episode metrics for plotting; one entry is appended after every episode.
all_epochs = []
all_penalties = []

for i in range(1, num_episodes + 1):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        # Epsilon-greedy: explore with probability epsilon, otherwise take the
        # action with the highest current Q-value for this state.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        next_state, reward, done, info = env.step(action)

        # Blend the old estimate with the observed reward plus the discounted
        # best value reachable from the next state.
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        # A reward of -10 is counted as a penalty.
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    # Record this episode; previously these lists were declared but never
    # filled, so there was nothing to plot after training.
    all_epochs.append(epochs)
    all_penalties.append(penalties)

    # Progress indicator, overwritten in place every 100 episodes.
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.



In [20]:
# Learned Q-values for state 328, one per action. In the recorded output the
# last two actions sit near -10 while the first four are around -2.3.
q_table[328]

array([ -2.39822645,  -2.27325184,  -2.40702552,  -2.3588204 ,
       -10.31327957, -10.06746382])

In [22]:
# Evaluate the learned table: always take the highest-valued action (no
# exploration) and average the step and penalty counts over several episodes.
episodes = 100
total_epochs = 0
total_penalties = 0

for _ in range(episodes):
    state = env.reset()
    epochs = penalties = reward = 0

    while True:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        # A reward of -10 is counted as a penalty.
        penalties += int(reward == -10)
        epochs += 1

        if done:
            break

    total_epochs += epochs
    total_penalties += penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 12.74
Average penalties per episode: 0.0
