<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Exploring-OpenAI" data-toc-modified-id="Exploring-OpenAI-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Exploring OpenAI</a></span></li></ul></div>

# Reinforcement Learning <a class="tocSkip">

## Exploring OpenAI

In [1]:
# Seed NumPy's global random generator so sampled values repeat across runs,
# then report the installed version.
import numpy as np
np.random.seed(123)
print("NumPy:{}".format(np.__version__))

# Seed TensorFlow as well (tf.set_random_seed is the TF 1.x spelling; the
# recorded output below shows TensorFlow 1.4.1).
import tensorflow as tf
tf.set_random_seed(123)
print("TensorFlow:{}".format(tf.__version__))

# Keras is imported only to report its version here; model code imports
# the specific classes it needs later on.
import keras
print("Keras:{}".format(keras.__version__))

# OpenAI Gym supplies the environments used in the rest of the notebook.
import gym
print('OpenAI Gym:',gym.__version__)

NumPy:1.13.1
TensorFlow:1.4.1


Using TensorFlow backend.


Keras:2.0.9
OpenAI Gym: 0.9.4


# OpenAI 101

In [2]:
# Count every environment spec registered with the installed Gym.
all_env = list(gym.envs.registry.all())
n_env = len(all_env)

print('Total Environments in Gym version {} : {}'.format(gym.__version__, n_env))

Total Environments in Gym version 0.9.4 : 777


In [3]:
# Print each registered environment spec on its own line.
for spec in all_env:
    print(spec)

EnvSpec(Assault-ramDeterministic-v0)
EnvSpec(AirRaidNoFrameskip-v4)
EnvSpec(UpNDown-ram-v4)
EnvSpec(Robotank-ramNoFrameskip-v4)
EnvSpec(Kangaroo-ram-v0)
EnvSpec(Kangaroo-ramNoFrameskip-v0)
EnvSpec(CrazyClimber-ramNoFrameskip-v4)
EnvSpec(SemisuperPendulumNoise-v0)
EnvSpec(MsPacman-ramNoFrameskip-v4)
EnvSpec(CrazyClimber-ram-v0)
EnvSpec(InvertedDoublePendulum-v1)
EnvSpec(VentureNoFrameskip-v4)
EnvSpec(ChopperCommand-ramDeterministic-v0)
EnvSpec(OffSwitchCartpoleProb-v0)
EnvSpec(AssaultDeterministic-v0)
EnvSpec(Freeway-ram-v0)
EnvSpec(BoxingDeterministic-v4)
EnvSpec(Venture-ramNoFrameskip-v0)
EnvSpec(Hero-ramNoFrameskip-v0)
EnvSpec(Assault-v0)
EnvSpec(WizardOfWorNoFrameskip-v4)
EnvSpec(Freeway-ramDeterministic-v4)
EnvSpec(Berzerk-ram-v4)
EnvSpec(Boxing-ram-v0)
EnvSpec(RoadRunner-ramDeterministic-v0)
EnvSpec(Frostbite-ram-v4)
EnvSpec(AlienNoFrameskip-v4)
EnvSpec(Bowling-ram-v4)
EnvSpec(Breakout-ramDeterministic-v4)
EnvSpec(BeamRider-ramDeterministic-v4)
EnvSpec(Zaxxon-ramDeterministic-v0)


# Play the CartPole game with stochastic control

In [None]:
import matplotlib.pyplot as plt
import matplotlib.animation as anm
from matplotlib import rc
#rc('animation', html='html5')
rc('animation', ffmpeg_path='/usr/bin/ffmpeg')
%matplotlib inline

from matplotlib import animation
from JSAnimation.IPython_display import display_animation

def env_render(env_vis):
    """Display the frames in ``env_vis`` as a looping inline animation.

    env_vis -- indexable sequence of images accepted by ``plt.imshow``;
               it must hold at least one frame because frame 0 is drawn first.
    """
    fig = plt.figure()
    image = plt.imshow(env_vis[0])
    plt.axis('off')

    def show_frame(index):
        # Swap the pixel data of the existing image rather than re-plotting.
        image.set_data(env_vis[index])

    movie = anm.FuncAnimation(fig,
                              show_frame,
                              frames=len(env_vis),
                              interval=20,
                              repeat=True,
                              repeat_delay=20)
    display(display_animation(movie, default_mode='loop'))
    
# Roll out CartPole-v0 with randomly sampled actions, keeping one RGB frame
# per step so the run can be replayed with env_render afterwards.
env = gym.make('CartPole-v0')
n_episodes = 1
env_vis = []
for i_episode in range(n_episodes):
    observation = env.reset()
    for t in range(100):
        frame = env.render(mode='rgb_array')
        env_vis.append(frame)
        print(observation)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if not done:
            continue
        print("Episode finished at t{}".format(t+1))
        break
# Close the render window before showing the captured frames.
env.render(close=True)
env_render(env_vis)

# Simple Policies

In [4]:
def policy_logic(env,obs):
    """Return action 1 when ``obs[2]`` is strictly positive, otherwise 0.

    ``env`` is accepted only so every policy shares one call signature.
    """
    if obs[2] > 0:
        return 1
    return 0
def policy_random(env,obs):
    """Ignore the observation and return a sample from ``env.action_space``."""
    sampled_action = env.action_space.sample()
    return sampled_action

def experiment(policy, n_episodes, rewards_max):
    """Play ``n_episodes`` CartPole-v0 episodes and print reward statistics.

    policy      -- callable ``policy(env, obs)`` returning the next action
    n_episodes  -- number of episodes to play
    rewards_max -- an episode is cut short once its total reward exceeds this

    Nothing is returned; the min / max / mean episode reward is printed.
    """
    rewards = np.empty(shape=(n_episodes))
    env = gym.make('CartPole-v0')

    for episode_idx in range(n_episodes):
        obs = env.reset()
        total = 0
        while True:
            obs, reward, done, info = env.step(policy(env, obs))
            total += reward
            # Stop on the reward cap or when the environment ends the episode.
            if total > rewards_max or done:
                break
        rewards[episode_idx] = total

    stats = (policy.__name__, np.min(rewards), np.max(rewards), np.mean(rewards))
    print('Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}'.format(*stats))

# Evaluate the random baseline and the obs[2] sign rule with the same budget.
n_episodes, rewards_max = 100, 10000
for policy in (policy_random, policy_logic):
    experiment(policy, n_episodes, rewards_max)

Policy:policy_random, Min reward:9.0, Max reward:81.0, Average reward:22.52
Policy:policy_logic, Min reward:25.0, Max reward:58.0, Average reward:42.37


# Simple Policies with Parameters

In [5]:
def policy_logic(theta,obs):
    """Return 1 when ``obs[2]`` is strictly positive, else 0 (``theta`` is unused)."""
    positive = obs[2] > 0
    if positive:
        return 1
    return 0

def policy_random(theta,obs):
    """Linear policy: action 0 when ``theta . obs`` is negative, otherwise 1."""
    score = np.matmul(theta, obs)
    if score < 0:
        return 0
    return 1

def episode(env, policy, rewards_max):
    """Play one episode and return its accumulated reward.

    env         -- object exposing ``reset()`` and ``step(action)``
    policy      -- callable ``policy(theta, obs)`` returning the next action
    rewards_max -- the episode is cut short once the total exceeds this value

    A policy named ``policy_random`` gets a fresh 4-element ``theta`` drawn
    from NumPy's global generator; every other policy receives ``None``.
    """
    obs = env.reset()
    total = 0
    # np.random.rand is uniform on [0, 1); rescale it to [-1, 1).
    theta = np.random.rand(4) * 2 - 1 if policy.__name__ == 'policy_random' else None
    while True:
        obs, reward, done, info = env.step(policy(theta, obs))
        total += reward
        if total > rewards_max or done:
            break
    return total
    
def experiment(policy, n_episodes, rewards_max):
    """Run ``n_episodes`` CartPole-v0 episodes through ``episode`` and print
    the minimum, maximum and mean episode reward for ``policy``."""
    rewards = np.empty(shape=(n_episodes))
    env = gym.make('CartPole-v0')

    for idx in range(n_episodes):
        rewards[idx] = episode(env, policy, rewards_max)

    stats = (policy.__name__, np.min(rewards), np.max(rewards), np.mean(rewards))
    print('Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}'.format(*stats))

# Same comparison as before, now with the theta-based policy signatures.
n_episodes, rewards_max = 100, 10000
for policy in (policy_random, policy_logic):
    experiment(policy, n_episodes, rewards_max)

Policy:policy_random, Min reward:8.0, Max reward:200.0, Average reward:36.52
Policy:policy_logic, Min reward:25.0, Max reward:66.0, Average reward:44.18


# Simple Policies with Training

In [6]:
# train with random search

def policy_logic(theta,obs):
    """Fixed rule: 1 if ``obs[2]`` is above zero, 0 otherwise; ``theta`` is ignored."""
    if not obs[2] > 0:
        return 0
    return 1

def policy_random(theta,obs):
    """Return 0 when the product ``np.matmul(theta, obs)`` is negative, else 1."""
    projection = np.matmul(theta, obs)
    action = 0 if projection < 0 else 1
    return action

def episode(env,policy, rewards_max,theta):
    """Play one episode with fixed parameters and return the total reward.

    env         -- object exposing ``reset()`` and ``step(action)``
    policy      -- callable ``policy(theta, obs)`` returning the next action
    rewards_max -- the episode is cut short once the total exceeds this value
    theta       -- passed to ``policy`` unchanged on every step
    """
    obs = env.reset()
    total = 0

    while True:
        obs, reward, done, info = env.step(policy(theta, obs))
        total += reward
        if total > rewards_max or done:
            break
    return total

def train(policy, n_episodes, rewards_max):
    """Random search over policy parameters on CartPole-v0.

    Each episode uses a freshly drawn ``theta`` (uniform on [-1, 1) per
    component) when ``policy`` is named ``policy_random``; any other policy is
    run with ``theta=None``. The parameters of the highest-reward episode are kept.

    policy      -- callable ``policy(theta, obs)`` returning the next action
    n_episodes  -- number of candidate parameter vectors to try
    rewards_max -- forwarded to ``episode`` as the per-episode reward cap

    Returns ``(reward_best, theta_best)``. ``theta_best`` is a zero vector if
    no episode scored above 0, and ``None`` when the policy takes no parameters.

    Note: this reseeds NumPy's global generator with 0 on every call.
    """
    env = gym.make('CartPole-v0')
    np.random.seed(0)

    # Zeros rather than np.empty: an untouched result must not expose
    # uninitialised memory to the caller.
    theta_best = np.zeros(shape=[4])
    reward_best = 0

    for i in range(n_episodes):
        if policy.__name__ in ['policy_random']:
            theta = np.random.rand(4) * 2 - 1
        else:
            theta = None

        reward_episode = episode(env, policy, rewards_max, theta)
        if reward_episode > reward_best:
            reward_best = reward_episode
            # Parameter-free policies carry theta=None, which has no .copy().
            theta_best = None if theta is None else theta.copy()
    return reward_best, theta_best
    
def experiment(policy, n_episodes, rewards_max, theta=None):
    """Evaluate ``policy`` with fixed ``theta`` over ``n_episodes`` CartPole-v0
    episodes and print the minimum, maximum and mean episode reward."""
    rewards = np.empty(shape=[n_episodes])
    env = gym.make('CartPole-v0')

    for idx in range(n_episodes):
        rewards[idx] = episode(env, policy, rewards_max, theta)

    stats = (policy.__name__, np.min(rewards), np.max(rewards), np.mean(rewards))
    print('Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}'.format(*stats))

n_episodes, rewards_max = 100, 10000

# Search for a good theta, then evaluate the linear policy with it.
reward, theta = train(policy_random, n_episodes, rewards_max)
print('trained theta: {}, rewards: {}'.format(theta,reward))
experiment(policy_random, n_episodes, rewards_max, theta)

# Fixed-rule baseline for comparison (takes no parameters).
experiment(policy_logic, n_episodes, rewards_max)

trained theta: [-0.1526904   0.29178823 -0.12482558  0.783546  ], rewards: 200.0
Policy:policy_random, Min reward:200.0, Max reward:200.0, Average reward:200.0
Policy:policy_logic, Min reward:24.0, Max reward:68.0, Average reward:41.66


# Simple Policies with Training until Certain Rewards

In [7]:
# train with random search until the best episode reward reaches rewards_max (200)
def policy_logic(theta,obs):
    """Choose 1 when ``obs[2]`` is strictly positive and 0 otherwise.

    ``theta`` is accepted for signature compatibility and never read.
    """
    if obs[2] > 0:
        choice = 1
    else:
        choice = 0
    return choice

def policy_random(theta,obs):
    """Threshold the product of ``theta`` and ``obs`` at zero: negative -> 0, else 1."""
    if np.matmul(theta, obs) < 0:
        return 0
    return 1

def episode(env,policy, rewards_max,theta):
    """Run a single episode and return the summed reward.

    The loop ends when ``env.step`` reports ``done`` or as soon as the running
    total exceeds ``rewards_max``. ``theta`` is handed to ``policy`` as-is.
    """
    obs, total, finished = env.reset(), 0, False

    while not finished:
        next_action = policy(theta, obs)
        obs, reward, finished, info = env.step(next_action)
        total += reward
        if total > rewards_max:
            break
    return total

def train(policy, n_episodes, rewards_max):
    """Random search over policy parameters with early stopping.

    Each episode uses a freshly drawn ``theta`` (uniform on [-1, 1) per
    component) when ``policy`` is named ``policy_random``; any other policy is
    run with ``theta=None``. The search stops as soon as the best episode
    reward reaches ``rewards_max``, or after ``n_episodes`` attempts.

    policy      -- callable ``policy(theta, obs)`` returning the next action
    n_episodes  -- upper bound on the number of episodes tried
    rewards_max -- target reward; also forwarded to ``episode`` as its cap

    Returns ``(reward_best, theta_best)``. ``theta_best`` is a zero vector if
    no episode scored above 0, and ``None`` when the policy takes no parameters.

    Note: this reseeds NumPy's global generator with 0 on every call.
    """
    env = gym.make('CartPole-v0')
    np.random.seed(0)

    # Zeros rather than np.empty: an untouched result must not expose
    # uninitialised memory to the caller.
    theta_best = np.zeros(shape=[4])
    reward_best = 0

    # n_episodes acts as a max in this case
    for i in range(n_episodes):
        if policy.__name__ in ['policy_random']:
            theta = np.random.rand(4) * 2 - 1
        else:
            theta = None
        reward_episode = episode(env, policy, rewards_max, theta)
        if reward_episode > reward_best:
            reward_best = reward_episode
            # Parameter-free policies carry theta=None, which has no .copy().
            theta_best = None if theta is None else theta.copy()
        if reward_best >= rewards_max:
            break
    return reward_best, theta_best
    
def experiment(policy, n_episodes, rewards_max, theta=None):
    """Play ``n_episodes`` CartPole-v0 episodes with ``policy`` and ``theta``,
    then print the lowest, highest and average episode reward."""
    rewards = np.empty(shape=[n_episodes])
    env = gym.make('CartPole-v0')

    episode_idx = 0
    while episode_idx < n_episodes:
        rewards[episode_idx] = episode(env, policy, rewards_max, theta)
        episode_idx += 1

    summary = 'Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}'.format(
        policy.__name__,
        np.min(rewards),
        np.max(rewards),
        np.mean(rewards))
    print(summary)

# Training phase: n_episodes is only an upper bound, the search stops early
# once an episode reaches rewards_max.
n_episodes, rewards_max = 10000, 200
reward, theta = train(policy_random, n_episodes, rewards_max)
print('trained theta: {}, rewards: {}'.format(theta,reward))

# Evaluation phase: the trained linear policy, then the fixed-rule baseline.
n_episodes, rewards_max = 100, 10000
experiment(policy_random, n_episodes, rewards_max, theta)
experiment(policy_logic, n_episodes, rewards_max)

trained theta: [-0.1526904   0.29178823 -0.12482558  0.783546  ], rewards: 200.0
Policy:policy_random, Min reward:200.0, Max reward:200.0, Average reward:200.0
Policy:policy_logic, Min reward:24.0, Max reward:64.0, Average reward:41.96


# Neural Network Policy

In [19]:
# train with the neural network

def policy_random(theta,obs):
    """Linear threshold policy: 0 if ``np.matmul(theta, obs)`` is below zero, else 1."""
    weighted_sum = np.matmul(theta, obs)
    return 1 if not weighted_sum < 0 else 0

def policy_naive_nn(nn,obs):
    """Return the index of the largest value ``nn.predict`` gives for ``obs``.

    The observation is wrapped in a one-element batch before prediction.
    """
    batch = np.array([obs])
    scores = nn.predict(batch)
    return np.argmax(scores)


# returns obs -> actions -> rewards arrays
# specify t_max to run for t_max steps
# specify r_max to run until r_max is reached
# specify both t_max and r_max to run for t_max but break if r_max is reached
def episode(env, policy, theta, r_max=0, t_max=0, return_hist_reward=0):
    """Run one episode of ``env`` under ``policy``.

    Parameters
    ----------
    env : object exposing ``reset()`` and ``step(action)``; ``step`` must
        return ``(obs, reward, done, info)``.
    policy : callable ``policy(theta, obs)`` returning the next action.
    theta : passed to ``policy`` unchanged (weights, a model, or None).
    r_max : if > 0, stop once the accumulated reward exceeds it.
    t_max : if > 0, stop after exactly this many steps.
    return_hist_reward : if > 0, record the per-step history and return it,
        but only when the episode's total reward is <= this threshold.

    Returns
    -------
    ``[observations, actions, rewards]`` (three NumPy arrays) when history was
    requested and the total reward did not exceed ``return_hist_reward``;
    otherwise the scalar total reward.
    """
    record = return_hist_reward > 0
    o_list, a_list, r_list = [], [], []

    episode_reward = 0

    obs = env.reset()
    done = False
    t = 0
    while not done:
        action = policy(theta, obs)
        if record:
            # Store the observation the action was chosen from, not the next one.
            o_list.append(obs)
            a_list.append(action)
        obs, reward, done, info = env.step(action)
        if record:
            r_list.append(reward)
        episode_reward += reward
        if r_max > 0 and episode_reward > r_max:
            break
        t += 1
        if t_max > 0 and t == t_max:
            break

    # History is only meaningful when it was recorded. Without this guard a
    # zero or negative total with return_hist_reward=0 reached the history
    # branch and failed on the never-created lists.
    if record and return_hist_reward >= episode_reward:
        return [np.array(o_list), np.array(a_list), np.array(r_list)]
    return episode_reward

# experiment collect observations and rewards for each episode
def experiment(env, policy, n_episodes,theta=None, r_max=0, t_max=0, return_hist_reward=0):
    """Run ``n_episodes`` episodes and either collect training data or print statistics.

    Parameters
    ----------
    env, policy, theta, r_max, t_max, return_hist_reward :
        forwarded unchanged to ``episode`` for every run.
    n_episodes : number of episodes to play.

    Returns
    -------
    * ``return_hist_reward > 0``: ``[observations, actions]``, the histories of
      all episodes for which ``episode`` returned a history, concatenated
      along axis 0. Episodes that returned only a scalar are skipped.
    * otherwise: an empty list, after printing the min / max / mean reward.

    Raises
    ------
    ValueError
        In collection mode, when no episode produced a history.
    """
    # Decide the mode once, so a negative threshold is treated as statistics
    # mode everywhere rather than only in some of the checks.
    collect = return_hist_reward > 0

    obs_list = []
    action_list = []
    rewards = None if collect else np.empty(shape=[n_episodes])

    for i in range(n_episodes):
        val = episode(env, policy, theta, r_max, t_max, return_hist_reward)
        has_history = isinstance(val, list)
        if collect:
            if has_history:
                obs_list.append(val[0])
                action_list.append(val[1])
        else:
            rewards[i] = np.sum(val[2]) if has_history else val

    if collect:
        if not obs_list:
            # np.concatenate([]) would raise an opaque ValueError here; keep
            # the exception type but say what actually went wrong.
            raise ValueError(
                'no episode returned a history for return_hist_reward={}; '
                'nothing to collect'.format(return_hist_reward))
        return [np.concatenate(obs_list, axis=0), np.concatenate(action_list, axis=0)]

    print('Policy:{}, Min reward:{}, Max reward:{}, Average reward:{}'
          .format(policy.__name__,
                  np.min(rewards),
                  np.max(rewards),
                  np.mean(rewards)))
    return []

In [20]:
# build the model
from keras.models import Sequential
from keras.layers import Dense
# Two-layer classifier: 4 observation inputs -> 8 ReLU units -> 2-way softmax.
model = Sequential([
    Dense(8, input_dim=4, activation='relu'),
    Dense(2, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 18        
Total params: 58
Trainable params: 58
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Create training data: play the linear policy with one randomly drawn theta
# and keep the (observation, action) pairs that experiment() hands back.
env = gym.make('CartPole-v0')
n_obs = 4
n_actions = 2
theta = np.random.rand(4) * 2 - 1
n_episodes = 100
r_max = 0
t_max = 0

x_train, actions = experiment(env,
                              policy_random,
                              n_episodes,
                              theta=theta,
                              r_max=r_max,
                              t_max=t_max,
                              return_hist_reward=100)
# One-hot encode the actions by indexing rows of an identity matrix.
y_train = np.eye(n_actions)[actions]
print(x_train.shape,y_train.shape)

(5933, 4) (5933, 2)


In [22]:
# Fit the network on the recorded (observation, one-hot action) pairs.
model.fit(x=x_train, y=y_train, batch_size=10, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f3b48175cc0>

In [23]:
n_episodes = 200
r_max = 0
t_max = 0

# Evaluate the trained network first, then the linear policy it learned
# from, under identical settings. Each policy gets its own parameter object.
for policy, params in ((policy_naive_nn, model), (policy_random, theta)):
    _ = experiment(env,
                   policy,
                   n_episodes,
                   theta=params,
                   r_max=r_max,
                   t_max=t_max,
                   return_hist_reward=0)

Policy:policy_naive_nn, Min reward:40.0, Max reward:107.0, Average reward:65.595
Policy:policy_random, Min reward:41.0, Max reward:122.0, Average reward:65.175
