**Sources**

[Introduction to reinforcement learning and OpenAI Gym (O'Reilly)](https://www.oreilly.com/learning/introduction-to-reinforcement-learning-and-openai-gym)

[Reinforcement Q-Learning from scratch in Python with OpenAI Gym (LearnDataSci)](https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/)

In [3]:
!pip install gym scipy 

Collecting gym
  Using cached https://files.pythonhosted.org/packages/9b/50/ed4a03d2be47ffd043be2ee514f329ce45d98a30fe2d1b9c61dea5a9d861/gym-0.10.5.tar.gz
Collecting pyglet>=1.2.0 (from gym)
  Using cached https://files.pythonhosted.org/packages/1c/fc/dad5eaaab68f0c21e2f906a94ddb98175662cc5a654eee404d59554ce0fa/pyglet-1.3.2-py2.py3-none-any.whl
Collecting future (from pyglet>=1.2.0->gym)
  Using cached https://files.pythonhosted.org/packages/00/2b/8d082ddfed935f3608cc61140df6dcbf0edea1bc3ab52fb6c29ae3e81e85/future-0.16.0.tar.gz
Building wheels for collected packages: gym, future
  Running setup.py bdist_wheel for gym: started
  Running setup.py bdist_wheel for gym: finished with status 'done'
  Stored in directory: C:\Users\blearn\AppData\Local\pip\Cache\wheels\cb\14\71\f4ab006b1e6ff75c2b54985c2f98d0644fffe9c1dddc670925
  Running setup.py bdist_wheel for future: started
  Running setup.py bdist_wheel for future: finished with status 'done'
  Stored in directory: C:\Users\blearn\AppData

You are using pip version 9.0.1, however version 10.0.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [1]:
import gym
import scipy
import numpy as np

In [2]:
# Create the Taxi environment and draw its current state as a text grid.
env = gym.make("Taxi-v2")
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |[35mB[0m: |
+---------+



In [7]:
# Rebind `env` to the CartPole environment and ask it to render.
env = gym.make("CartPole-v1")
env.render()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [8]:
# Close the environment currently bound to `env` (presumably releases its render
# window, if one was opened — confirm against the gym docs).
env.close()

In [4]:
# Work on the inner environment object exposed through `.env`.
env = gym.make("Taxi-v2").env
env.reset()

frames = []  # one record per step, kept for the animation
epochs = penalties = reward = 0

# Drive the taxi with uniformly random actions until the episode reports done.
while True:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    epochs += 1

    # A reward of -10 is what this notebook counts as a penalty.
    if reward == -10:
        penalties += 1

    # Snapshot of this step: text rendering plus the transition that produced it.
    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))

    if done:
        break


print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))
print("Frames: {}".format(len(frames)))

Timesteps taken: 1047
Penalties incurred: 314
Frames: 1047


In [9]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=.05):
    """Replay recorded frames in place in the notebook output area.

    Parameters
    ----------
    frames : iterable of dict
        Each item must provide the keys 'frame', 'state', 'action' and 'reward'.
        'frame' may be an object exposing ``getvalue()`` or an already
        rendered ``str``.
    delay : float, optional
        Seconds to pause after each frame (default 0.05, the value that used
        to be hard-coded).
    """
    for i, frame in enumerate(frames):
        rendered = frame['frame']
        # Accept both buffer-like renderings and plain strings.
        text = rendered.getvalue() if hasattr(rendered, 'getvalue') else rendered

        # wait=True defers clearing until new output arrives, which avoids flicker.
        clear_output(wait=True)
        print(text)
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)
        
#print_frames(frames)

In [30]:
# Look up the environment's table entry for state 479. The displayed result maps each
# action id to a list of 4-tuples; they look like (probability, next_state, reward,
# done) — field meanings inferred from the values shown, confirm against the gym source.
env.P[479]

{0: [(1.0, 479, -1, False)],
 1: [(1.0, 379, -1, False)],
 2: [(1.0, 499, -1, False)],
 3: [(1.0, 479, -1, False)],
 4: [(1.0, 479, -10, False)],
 5: [(1.0, 479, 20, True)]}

## Random Search

In [7]:
# Start from the illustration's state by writing the state id straight onto the env.
env.s = 328

frames = []  # per-step records for the animation
epochs = penalties = reward = 0

# Sample actions uniformly at random until the environment reports the episode is over.
while True:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    epochs += 1

    # Count every step that was rewarded with -10.
    penalties += 1 if reward == -10 else 0

    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))

    if done:
        break


print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 323
Penalties incurred: 103


In [10]:
# Replay the frames recorded in `frames`; the output shown is the last frame.
print_frames(frames)

+---------+
|[35m[42mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 323
State: 16
Action: 5
Reward: 20


### Linear Combination

In [11]:
# reset() hands back the initial observation; keep it in `o` and display it.
o = env.reset()

o

82

In [16]:
def softmax(x):
    """Return the softmax of a 1-D sequence of scores.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``np.exp`` from overflowing to ``inf``
    (and the quotient from becoming ``nan``) when the scores are large.

    Parameters
    ----------
    x : array-like of float
        Raw scores.

    Returns
    -------
    numpy.ndarray
        Non-negative weights that sum to 1; an empty input yields an empty array.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)


# Six random weights in [-1, 1), one per action.
parameters = np.random.rand(6) * 2 - 1
print(parameters)
print(softmax(parameters))
print(softmax(parameters).argmax())

[-0.41473231 -0.29221068 -0.19696372 -0.67037214 -0.03612746  0.79468366]
[ 0.11160918  0.12615672  0.13876362  0.08643248  0.16297685  0.37406115]
5


In [48]:
env.reset()
env.s = 328  # replace the freshly reset state with the illustration's state

frames = []  # per-step records for the animation
epochs = penalties = reward = 0
done = False

while not done:
    # Fresh weights in [-1, 1) on every step, one per action; act on the largest
    # softmax probability.
    parameters = 2 * np.random.rand(6) - 1
    action = np.argmax(softmax(parameters))
    state, reward, done, info = env.step(action)
    epochs += 1

    if reward == -10:
        penalties += 1

    # Keep the rendering together with the transition that produced it.
    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))


print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 200
Penalties incurred: 67


In [50]:
env.reset()
env.s = 328  # NOTE: env.reset() at the top of the loop below replaces this state

epochs = 0
penalties, reward = 0, 0

frames = [] # for animation (accumulates the steps of every episode)

# For plotting metrics: one entry per finished episode
all_epochs = []
all_penalties = []

for i in range(1, 100):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        parameters = np.random.rand(6) * 2 - 1 # one weight per possible action
        action = softmax(parameters).argmax() # softmax to decide which action
        state, reward, done, info = env.step(action)

        if reward == -10:
            penalties += 1

        # Put each rendered frame into dict for animation
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
            }
        )

        epochs += 1

    # Record this episode's totals; these lists used to be declared but never filled.
    all_epochs.append(epochs)
    all_penalties.append(penalties)


# The figures printed here are those of the last episode only.
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 200
Penalties incurred: 72


## Hill-Climb

In [52]:
env.reset()
env.s = 328  # jump to the illustration's state

frames = []  # per-step records for the animation
epochs = penalties = reward = 0

# One episode in which every step draws brand-new action weights.
while True:
    parameters = 2 * np.random.rand(6) - 1  # six weights in [-1, 1), one per action
    action = np.argmax(softmax(parameters))  # act on the highest softmax probability
    state, reward, done, info = env.step(action)
    epochs += 1

    penalties += 1 if reward == -10 else 0

    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))

    if done:
        break


print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 200
Penalties incurred: 50


In [53]:
# Replay the frames recorded in `frames`; the output shown is the last frame.
print_frames(frames)

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| |[42m_[0m: | : |
|[35mY[0m| : |B: |
+---------+
  (West)

Timestep: 200
State: 338
Action: 3
Reward: -1


In [69]:
env.s = 328  # NOTE: env.reset() at the top of the loop below replaces this state

epochs = 0
penalties, reward = 0, 0

frames = [] # for animation (accumulates the steps of every episode)

# For plotting metrics: one entry per finished episode
all_epochs = []
all_penalties = []

noise_scaling = 2.5
bestparams = np.random.rand(6) * 2 - 1 # one weight per possible action
bestreward = -1000000

for i in range(1, 1500):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False
    totalreward = 0
    # Candidate = best weights so far plus uniform noise in [-noise_scaling, noise_scaling).
    parameters = bestparams + (np.random.rand(6) * 2 - 1) * noise_scaling

    while not done:
        # Softmax is monotonic, so its argmax equals the argmax of the raw scores.
        # Taking it directly avoids np.exp overflowing (-> nan) for large state ids.
        action = np.argmax(state * parameters)
        state, reward, done, info = env.step(action)

        totalreward += reward
        if reward == -10:
            penalties += 1

        # Put each rendered frame into dict for animation
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
            }
        )

        epochs += 1

    all_epochs.append(epochs)
    all_penalties.append(penalties)

    # Judge the candidate right after its own episode. Doing this at the top of the
    # next iteration read `totalreward`/`parameters` before they were assigned on the
    # first pass and never evaluated the final episode.
    if totalreward > bestreward:
        bestreward = totalreward
        bestparams = parameters

    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")
        print(f"Reward: {bestreward}")
        print(f"Params: {parameters}")


#print("Timesteps taken: {}".format(epochs))
print("Best Reward: {}".format(bestreward))
print("Best Params: {}".format(bestparams))

Episode: 1400
Reward: -200
Params: [-3.97149099  0.14443596  0.46187537 -4.09137549 -2.74800509  0.72836405]
Best Reward: -200
Best Params: [-1.68542482 -0.08076862  2.1839019  -3.08635167 -1.23272213  1.38129388]


## Q-Learning

In [27]:
# Fresh Taxi environment for the Q-learning section (this time without `.env`);
# the value displayed is what reset() returns.
env = gym.make("Taxi-v2")
env.reset()

414

In [28]:
# Zero-filled table with one row per observation and one column per action.
q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [29]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1    # weight given to the new estimate in each update
gamma = 0.6    # discount applied to the best next-state value
epsilon = 0.1  # probability of taking a random action

# For plotting metrics: one entry per finished episode
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values

        next_state, reward, done, info = env.step(action)

        # Blend the old estimate with reward + discounted best value of the next state.
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    # Record this episode's totals; these lists used to be declared but never filled.
    all_epochs.append(epochs)
    all_penalties.append(penalties)

    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.

Wall time: 28.6 s


In [30]:
# Learned values for state 328, one entry per action.
q_table[328]

array([ -2.30510326,  -1.97092096,  -2.30796264,  -2.21346985,
        -9.63045697, -10.26734801])

In [31]:
"""Evaluate agent's performance after Q-learning"""

episodes = 100
total_epochs = total_penalties = 0

for _ in range(episodes):
    state = env.reset()
    epochs = penalties = reward = 0

    # Always take the highest-valued action from the table; no exploration here.
    while True:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)
        epochs += 1

        if reward == -10:
            penalties += 1

        if done:
            break

    total_epochs += epochs
    total_penalties += penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 12.69
Average penalties per episode: 0.0


## Write your own program

Turn this code into a program that can use multiple environments.

Use the CartPole-v0 Random Search example as a reference.