In [1]:
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [2]:
# Gym environment id.
# NOTE(review): not referenced by the cells visible in this notebook, which
# pass their environment ids to gym.make directly -- confirm before removing.
ENV_NAME = "CartPole-v0"

# Replay-buffer capacity: used as the deque maxlen in Agent.__init__.
MEMORY_SIZE = 1000000
# Number of stored transitions sampled per Agent.experience_replay call; also
# the minimum buffer length required before any replay update happens.
BATCH_SIZE = 20

In [22]:
class Agent:
    """Epsilon-greedy Q-learning agent backed by a small Keras network.

    Transitions are kept in a bounded replay memory and the network is fitted
    with plain SGD on the squared one-step TD error. The class relies on the
    module-level ``MEMORY_SIZE`` and ``BATCH_SIZE`` constants.
    """

    def __init__(self, state_shape, num_actions, num_envs, alpha=0.001, gamma=0.95, epsilon_i=1.0, epsilon_f=0.01, n_epsilon=0.1, hidden_sizes = [], use_bias=True):
        """Build the Q-network, the optimizer and an empty replay memory.

        Args:
            state_shape: Shape of a single observation. Currently unused; the
                network infers its input size on the first call.
            num_actions: Number of discrete actions (width of the output layer).
            num_envs: Number of parallel environments; sizes the random-action
                vector returned by ``act``.
            alpha: Learning rate handed to the SGD optimizer.
            gamma: Discount factor of the TD target.
            epsilon_i: Initial exploration rate.
            epsilon_f: Final (minimum) exploration rate.
            n_epsilon: Progress value at which epsilon reaches ``epsilon_f``
                (see ``decay_epsilon``).
            hidden_sizes: Widths of the hidden ReLU layers; an empty sequence
                gives a purely linear Q-function. Only iterated, never mutated.
            use_bias: Whether the Dense layers carry a bias term. The previous
                code passed the string ``'false'``, which is non-empty and
                therefore truthy, so biases were effectively enabled; the
                default ``True`` keeps that behaviour while a real ``False``
                now genuinely disables them.
        """
        self.epsilon_i = epsilon_i
        self.epsilon_f = epsilon_f
        self.n_epsilon = n_epsilon
        self.epsilon = epsilon_i
        self.gamma = gamma

        self.num_actions = num_actions
        self.num_envs = num_envs
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Always hand Keras a genuine boolean rather than a truthy string.
        use_bias = bool(use_bias)

        self.Q = Sequential()
        for size in hidden_sizes:
            self.Q.add(Dense(size, activation='relu', use_bias=use_bias, kernel_initializer='he_uniform', dtype='float64'))
        # Zero-initialised output layer: every Q-value starts at 0.
        self.Q.add(Dense(self.num_actions, activation="linear", use_bias=use_bias, kernel_initializer='zeros', dtype='float64'))

        self.optimizer = tf.keras.optimizers.SGD(alpha)

    def remember(self, s_t, a_t, r_t, s_t_next, done):
        """Append one transition tuple to the replay memory.

        The arguments are stored by reference (no copy is made), so callers
        must not mutate them afterwards.
        """
        self.memory.append((s_t, a_t, r_t, s_t_next, done))

    def act(self, s_t):
        """Pick one action per environment with an epsilon-greedy rule.

        A single random draw decides for the whole batch: with probability
        ``self.epsilon`` every environment gets a uniformly random action,
        otherwise every environment gets its greedy (arg-max Q) action.
        """
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.num_actions, size=self.num_envs)
        return np.argmax(self.Q(s_t), axis=1)

    def decay_epsilon(self, n):
        """Linearly anneal epsilon from ``epsilon_i`` to ``epsilon_f``.

        Args:
            n: Training progress; epsilon hits ``epsilon_f`` once
                ``n >= n_epsilon`` and is clamped there afterwards.
        """
        self.epsilon = max(
            self.epsilon_f,
            self.epsilon_i - (n/self.n_epsilon)*(self.epsilon_i - self.epsilon_f))

    def experience_replay(self):
        """Run one SGD step per sampled transition from the replay memory.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.
        The ``axis=1`` reductions imply each stored transition is batched
        along a leading axis (one row per environment).
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for s_t, a_t, r_t, s_t_next, d_t in batch:
            with tf.GradientTape() as tape:
                # Bootstrap value of the next state; excluded from the gradient.
                Q_next = tf.stop_gradient(tf.reduce_max(self.Q(s_t_next), axis=1))
                # Q-value of the action actually taken, selected via a one-hot mask.
                Q_pred = tf.reduce_sum(self.Q(s_t)*tf.one_hot(a_t, self.num_actions, dtype=tf.float64), axis=1)
                # (1 - d_t) removes the bootstrap term for finished episodes.
                loss = tf.reduce_mean(0.5*(r_t + (1-d_t)*self.gamma*Q_next - Q_pred)**2)
            grads = tape.gradient(loss, self.Q.trainable_variables)
            self.optimizer.apply_gradients(zip(grads, self.Q.trainable_variables))

In [23]:
class DiscreteToBoxWrapper(gym.ObservationWrapper):
    """Observation wrapper that one-hot encodes a Discrete observation space."""

    def __init__(self, env):
        """Wrap `env`, which must expose a `gym.spaces.Discrete` observation space."""
        super().__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Discrete), \
            "Should only be used to wrap Discrete envs."
        # Number of discrete states == length of the one-hot vector.
        num_states = self.observation_space.n
        self.n = num_states
        # Advertise the encoded observations as a Box with entries in [0, 1].
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(num_states,))

    def observation(self, obs):
        """Return a length-`n` vector of zeros with a 1 at index `obs`."""
        one_hot = np.zeros(self.n)
        one_hot[obs] = 1
        return one_hot

In [24]:
class VectorizedEnvWrapper(gym.Wrapper):
    """Steps several independent copies of an environment in lock-step.

    Observations, rewards, done flags and infos of the copies are stacked
    along a leading axis. Finished copies are NOT reset automatically; the
    caller is expected to call ``reset_at`` for them.
    """

    def __init__(self, make_env, num_envs=1):
        """Create the environment copies.

        Args:
            make_env: Zero-argument callable returning a fresh environment.
            num_envs: Number of copies to run side by side.
        """
        # One extra instance is wrapped by gym.Wrapper itself; this class
        # never steps it and only uses `self.envs` below.
        super().__init__(make_env())
        self.num_envs = num_envs
        self.envs = [make_env() for _ in range(num_envs)]

    def reset(self):
        """Reset every copy and return the stacked initial observations."""
        return np.asarray([env.reset() for env in self.envs])

    def reset_at(self, env_index):
        """Reset only the copy at `env_index` and return its observation."""
        return self.envs[env_index].reset()

    def step(self, actions):
        """Apply `actions[i]` to copy `i` and stack the results.

        Returns:
            Tuple ``(next_states, rewards, dones, infos)`` of numpy arrays
            with one entry per copy.
        """
        next_states, rewards, dones, infos = [], [], [], []
        for env, action in zip(self.envs, actions):
            next_state, reward, done, info = env.step(action)
            next_states.append(next_state)
            rewards.append(reward)
            dones.append(done)
            infos.append(info)
        return np.asarray(next_states), np.asarray(rewards), \
            np.asarray(dones), np.asarray(infos)

    def close(self):
        """Close every copy as well as the wrapped template environment.

        Without this override only the template instance would be closed and
        the copies held in `self.envs` would never be released.
        """
        for env in self.envs:
            env.close()
        return super().close()

In [25]:
def train(env, agent, T=20000, num_envs=32):
    """Run the agent in a vectorized environment for `T` steps.

    Args:
        env: Vectorized environment exposing `reset()`, `step(actions)`,
            `reset_at(i)` and `num_envs`.
        agent: Object exposing `act`, `remember`, `experience_replay`,
            `decay_epsilon` and an `epsilon` attribute.
        T: Total number of vectorized steps to run.
        num_envs: Unused; kept for backward compatibility. The number of
            environments is always read from `env.num_envs`.

    Returns:
        List with the total reward of every episode that finished, in order
        of completion.
    """
    rewards = []
    # Running return of the episode currently in progress in each environment.
    episode_rewards = np.zeros(env.num_envs)
    s_t = env.reset()
    for t in range(T):
        a_t = agent.act(s_t)
        s_t_next, r_t, d_t, info = env.step(a_t)
        agent.remember(s_t, a_t, r_t, s_t_next, d_t)
        # Work on a copy: rows of `s_t` are overwritten in place below when an
        # episode ends, and `s_t_next` itself is now owned by the replay
        # memory, so it must stay untouched.
        s_t = np.array(s_t_next)
        agent.experience_replay()
        # Progress is passed as a fraction of the total training length.
        agent.decay_epsilon(t/T)
        episode_rewards += r_t

        for i in range(env.num_envs):
            if d_t[i]:
                print("exploration: " + str(agent.epsilon) + ", score: " + str(episode_rewards[i]))
                rewards.append(episode_rewards[i])
                episode_rewards[i] = 0
                # Restart only the finished environment.
                s_t[i] = env.reset_at(i)
    return rewards

In [28]:
# FrozenLake experiment: 32 copies of FrozenLake-v0, each wrapped so that its
# discrete state is presented as a one-hot vector.
num_envs = 32
env = VectorizedEnvWrapper(lambda: DiscreteToBoxWrapper(gym.make("FrozenLake-v0")), num_envs)
state_shape = env.observation_space.shape
num_actions = env.action_space.n
# hidden_sizes=[] leaves only the output Dense layer, i.e. a linear Q-function
# over the one-hot state; alpha=0.8 is the SGD learning rate.
agent = Agent(state_shape, num_actions, num_envs, alpha=0.8, hidden_sizes=[])
# NOTE(review): the recorded output below ends in KeyboardInterrupt, so
# `rewards` was not assigned by this run.
rewards = train(env, agent, T=10000, num_envs=num_envs)

exploration: 0.99901, score: 0.0
exploration: 0.99901, score: 0.0
exploration: 0.99901, score: 0.0
exploration: 0.99802, score: 0.0
exploration: 0.99802, score: 0.0
exploration: 0.99802, score: 0.0
exploration: 0.99703, score: 0.0
exploration: 0.99703, score: 0.0
exploration: 0.99703, score: 0.0
exploration: 0.99604, score: 0.0
exploration: 0.99604, score: 0.0
exploration: 0.99604, score: 0.0
exploration: 0.99505, score: 0.0
exploration: 0.99505, score: 0.0
exploration: 0.99505, score: 0.0
exploration: 0.99505, score: 0.0
exploration: 0.99505, score: 0.0
exploration: 0.99406, score: 0.0
exploration: 0.99406, score: 0.0
exploration: 0.99406, score: 0.0
exploration: 0.99307, score: 0.0
exploration: 0.99307, score: 0.0
exploration: 0.99307, score: 0.0
exploration: 0.99307, score: 0.0
exploration: 0.99307, score: 0.0
exploration: 0.99307, score: 0.0
exploration: 0.99208, score: 0.0
exploration: 0.99208, score: 0.0
exploration: 0.99208, score: 0.0
exploration: 0.99109, score: 0.0
exploratio

exploration: 0.93466, score: 0.0
exploration: 0.93466, score: 0.0
exploration: 0.93466, score: 0.0
exploration: 0.93466, score: 0.0
exploration: 0.93466, score: 0.0
exploration: 0.93367, score: 0.0
exploration: 0.93268, score: 0.0
exploration: 0.93268, score: 1.0
exploration: 0.93268, score: 0.0
exploration: 0.93268, score: 0.0
exploration: 0.93169, score: 0.0
exploration: 0.93169, score: 0.0
exploration: 0.93169, score: 0.0
exploration: 0.9307, score: 0.0
exploration: 0.9307, score: 0.0
exploration: 0.9307, score: 0.0
exploration: 0.9307, score: 0.0
exploration: 0.9307, score: 0.0
exploration: 0.9307, score: 0.0
exploration: 0.9307, score: 0.0
exploration: 0.92971, score: 0.0
exploration: 0.92971, score: 0.0
exploration: 0.92971, score: 0.0
exploration: 0.92971, score: 0.0
exploration: 0.92971, score: 0.0
exploration: 0.92971, score: 0.0
exploration: 0.92971, score: 0.0
exploration: 0.92872, score: 0.0
exploration: 0.92872, score: 0.0
exploration: 0.92872, score: 0.0
exploration: 0.92

exploration: 0.8732800000000001, score: 0.0
exploration: 0.8732800000000001, score: 0.0
exploration: 0.8732800000000001, score: 0.0
exploration: 0.8732800000000001, score: 0.0
exploration: 0.8732800000000001, score: 0.0
exploration: 0.87229, score: 0.0
exploration: 0.87229, score: 0.0
exploration: 0.8713, score: 0.0
exploration: 0.8713, score: 0.0
exploration: 0.8713, score: 0.0
exploration: 0.87031, score: 0.0
exploration: 0.87031, score: 0.0
exploration: 0.87031, score: 0.0
exploration: 0.86932, score: 0.0
exploration: 0.86932, score: 0.0
exploration: 0.86932, score: 0.0
exploration: 0.86932, score: 0.0
exploration: 0.86932, score: 0.0
exploration: 0.86932, score: 0.0
exploration: 0.86833, score: 0.0
exploration: 0.86833, score: 0.0
exploration: 0.86734, score: 0.0
exploration: 0.86734, score: 0.0
exploration: 0.86734, score: 0.0
exploration: 0.86734, score: 0.0
exploration: 0.86734, score: 0.0
exploration: 0.86734, score: 0.0
exploration: 0.86734, score: 0.0
exploration: 0.86635, sc

exploration: 0.80497, score: 0.0
exploration: 0.80497, score: 0.0
exploration: 0.80497, score: 0.0
exploration: 0.80497, score: 0.0
exploration: 0.80497, score: 0.0
exploration: 0.80299, score: 0.0
exploration: 0.80299, score: 0.0
exploration: 0.80299, score: 0.0
exploration: 0.80299, score: 0.0
exploration: 0.802, score: 0.0
exploration: 0.802, score: 0.0
exploration: 0.79903, score: 0.0
exploration: 0.79903, score: 0.0
exploration: 0.79903, score: 0.0
exploration: 0.79903, score: 0.0
exploration: 0.79903, score: 0.0
exploration: 0.79903, score: 0.0
exploration: 0.79903, score: 0.0
exploration: 0.79903, score: 0.0
exploration: 0.79804, score: 0.0
exploration: 0.79804, score: 1.0
exploration: 0.79804, score: 0.0
exploration: 0.79804, score: 0.0
exploration: 0.79705, score: 0.0
exploration: 0.79705, score: 0.0
exploration: 0.79705, score: 0.0
exploration: 0.79606, score: 0.0
exploration: 0.79606, score: 0.0
exploration: 0.79606, score: 0.0
exploration: 0.79606, score: 0.0
exploration: 0

exploration: 0.7317100000000001, score: 0.0
exploration: 0.7317100000000001, score: 0.0
exploration: 0.7317100000000001, score: 0.0
exploration: 0.73072, score: 0.0
exploration: 0.73072, score: 0.0
exploration: 0.73072, score: 0.0
exploration: 0.73072, score: 0.0
exploration: 0.73072, score: 0.0
exploration: 0.72973, score: 0.0
exploration: 0.72874, score: 0.0
exploration: 0.72874, score: 0.0
exploration: 0.72874, score: 0.0
exploration: 0.72874, score: 0.0
exploration: 0.72874, score: 0.0
exploration: 0.72874, score: 0.0
exploration: 0.72775, score: 0.0
exploration: 0.72775, score: 0.0
exploration: 0.72775, score: 0.0
exploration: 0.72775, score: 0.0
exploration: 0.7267600000000001, score: 1.0
exploration: 0.72577, score: 0.0
exploration: 0.72577, score: 0.0
exploration: 0.72577, score: 0.0
exploration: 0.72577, score: 0.0
exploration: 0.72577, score: 0.0
exploration: 0.72577, score: 0.0
exploration: 0.72577, score: 0.0
exploration: 0.72478, score: 0.0
exploration: 0.72478, score: 0.0

exploration: 0.66241, score: 0.0
exploration: 0.66241, score: 0.0
exploration: 0.66142, score: 0.0
exploration: 0.66142, score: 0.0
exploration: 0.65944, score: 0.0
exploration: 0.65944, score: 0.0
exploration: 0.65944, score: 0.0
exploration: 0.65944, score: 0.0
exploration: 0.65944, score: 0.0
exploration: 0.65845, score: 0.0
exploration: 0.65845, score: 0.0
exploration: 0.65845, score: 0.0
exploration: 0.65845, score: 0.0
exploration: 0.65845, score: 0.0
exploration: 0.65746, score: 0.0
exploration: 0.65746, score: 0.0
exploration: 0.65746, score: 0.0
exploration: 0.65746, score: 0.0
exploration: 0.6564700000000001, score: 0.0
exploration: 0.6564700000000001, score: 0.0
exploration: 0.6564700000000001, score: 0.0
exploration: 0.6564700000000001, score: 0.0
exploration: 0.6564700000000001, score: 0.0
exploration: 0.6564700000000001, score: 0.0
exploration: 0.6564700000000001, score: 0.0
exploration: 0.6554800000000001, score: 0.0
exploration: 0.6554800000000001, score: 0.0
exploratio

exploration: 0.60004, score: 0.0
exploration: 0.60004, score: 0.0
exploration: 0.60004, score: 0.0
exploration: 0.5990500000000001, score: 0.0
exploration: 0.5990500000000001, score: 0.0
exploration: 0.59707, score: 0.0
exploration: 0.59707, score: 0.0
exploration: 0.59707, score: 1.0
exploration: 0.59707, score: 0.0
exploration: 0.59707, score: 0.0
exploration: 0.59509, score: 0.0
exploration: 0.59509, score: 0.0
exploration: 0.59509, score: 0.0
exploration: 0.59509, score: 0.0
exploration: 0.59509, score: 0.0
exploration: 0.59212, score: 0.0
exploration: 0.5911299999999999, score: 0.0
exploration: 0.5911299999999999, score: 0.0
exploration: 0.5911299999999999, score: 0.0
exploration: 0.5911299999999999, score: 0.0
exploration: 0.5911299999999999, score: 0.0
exploration: 0.5911299999999999, score: 0.0
exploration: 0.59014, score: 0.0
exploration: 0.59014, score: 0.0
exploration: 0.59014, score: 0.0
exploration: 0.59014, score: 0.0
exploration: 0.58816, score: 0.0
exploration: 0.58816,

exploration: 0.51787, score: 1.0
exploration: 0.51589, score: 1.0
exploration: 0.5149, score: 1.0
exploration: 0.5139100000000001, score: 0.0
exploration: 0.5139100000000001, score: 0.0
exploration: 0.5139100000000001, score: 0.0
exploration: 0.5139100000000001, score: 0.0
exploration: 0.5139100000000001, score: 0.0
exploration: 0.51292, score: 1.0
exploration: 0.51193, score: 0.0
exploration: 0.51193, score: 0.0
exploration: 0.51193, score: 0.0
exploration: 0.50797, score: 0.0
exploration: 0.50797, score: 0.0
exploration: 0.50797, score: 0.0
exploration: 0.50797, score: 0.0
exploration: 0.50698, score: 1.0
exploration: 0.50698, score: 0.0
exploration: 0.50698, score: 0.0
exploration: 0.5059899999999999, score: 0.0
exploration: 0.5059899999999999, score: 0.0
exploration: 0.5059899999999999, score: 0.0
exploration: 0.505, score: 0.0
exploration: 0.505, score: 0.0
exploration: 0.505, score: 0.0
exploration: 0.505, score: 0.0
exploration: 0.5040100000000001, score: 0.0
exploration: 0.5040

exploration: 0.43471000000000004, score: 0.0
exploration: 0.43471000000000004, score: 0.0
exploration: 0.43471000000000004, score: 0.0
exploration: 0.43471000000000004, score: 0.0
exploration: 0.43471000000000004, score: 0.0
exploration: 0.4337200000000001, score: 0.0
exploration: 0.4337200000000001, score: 0.0
exploration: 0.4337200000000001, score: 0.0
exploration: 0.43273000000000006, score: 0.0
exploration: 0.43273000000000006, score: 0.0
exploration: 0.43174, score: 0.0
exploration: 0.43174, score: 0.0
exploration: 0.4307500000000001, score: 0.0
exploration: 0.4307500000000001, score: 0.0
exploration: 0.42976000000000003, score: 0.0
exploration: 0.42976000000000003, score: 0.0
exploration: 0.42976000000000003, score: 0.0
exploration: 0.42976000000000003, score: 0.0
exploration: 0.4287700000000001, score: 1.0
exploration: 0.42778000000000005, score: 0.0
exploration: 0.42778000000000005, score: 0.0
exploration: 0.42778000000000005, score: 0.0
exploration: 0.42778000000000005, score:

exploration: 0.36639999999999995, score: 0.0
exploration: 0.36639999999999995, score: 0.0
exploration: 0.36639999999999995, score: 0.0
exploration: 0.36639999999999995, score: 0.0
exploration: 0.36639999999999995, score: 0.0
exploration: 0.36639999999999995, score: 0.0
exploration: 0.36541, score: 0.0
exploration: 0.36541, score: 0.0
exploration: 0.36541, score: 0.0
exploration: 0.36541, score: 0.0
exploration: 0.3644200000000001, score: 0.0
exploration: 0.3624400000000001, score: 0.0
exploration: 0.3624400000000001, score: 0.0
exploration: 0.3624400000000001, score: 0.0
exploration: 0.3624400000000001, score: 0.0
exploration: 0.36046, score: 1.0
exploration: 0.35947000000000007, score: 0.0
exploration: 0.35947000000000007, score: 0.0
exploration: 0.35947000000000007, score: 0.0
exploration: 0.35947000000000007, score: 0.0
exploration: 0.35848000000000013, score: 0.0
exploration: 0.35848000000000013, score: 0.0
exploration: 0.35848000000000013, score: 0.0
exploration: 0.358480000000000

exploration: 0.26938000000000006, score: 1.0
exploration: 0.26839000000000013, score: 0.0
exploration: 0.26542, score: 0.0
exploration: 0.26443000000000005, score: 1.0
exploration: 0.2634400000000001, score: 0.0
exploration: 0.2634400000000001, score: 0.0
exploration: 0.2634400000000001, score: 0.0
exploration: 0.2634400000000001, score: 0.0
exploration: 0.2634400000000001, score: 0.0
exploration: 0.2634400000000001, score: 0.0
exploration: 0.2634400000000001, score: 0.0
exploration: 0.2634400000000001, score: 0.0
exploration: 0.26146, score: 1.0
exploration: 0.25948000000000004, score: 0.0
exploration: 0.25948000000000004, score: 1.0
exploration: 0.2584900000000001, score: 1.0
exploration: 0.25750000000000006, score: 0.0
exploration: 0.25750000000000006, score: 1.0
exploration: 0.25651, score: 1.0
exploration: 0.2535400000000001, score: 0.0
exploration: 0.25255000000000016, score: 1.0
exploration: 0.25156, score: 0.0
exploration: 0.25156, score: 0.0
exploration: 0.24958000000000002, s

exploration: 0.14464, score: 0.0
exploration: 0.14464, score: 0.0
exploration: 0.14464, score: 0.0
exploration: 0.14464, score: 0.0
exploration: 0.14464, score: 0.0
exploration: 0.14365000000000017, score: 0.0
exploration: 0.14365000000000017, score: 1.0
exploration: 0.13968999999999998, score: 0.0
exploration: 0.13870000000000016, score: 0.0
exploration: 0.1377100000000001, score: 0.0
exploration: 0.13573000000000002, score: 0.0
exploration: 0.13573000000000002, score: 0.0
exploration: 0.13573000000000002, score: 0.0
exploration: 0.13573000000000002, score: 0.0
exploration: 0.13573000000000002, score: 0.0
exploration: 0.13473999999999997, score: 0.0
exploration: 0.13473999999999997, score: 0.0
exploration: 0.13473999999999997, score: 0.0
exploration: 0.13473999999999997, score: 0.0
exploration: 0.13473999999999997, score: 0.0
exploration: 0.13473999999999997, score: 0.0
exploration: 0.13375000000000015, score: 1.0
exploration: 0.13177000000000005, score: 1.0
exploration: 0.13078, scor

exploration: 0.014950000000000019, score: 1.0
exploration: 0.013960000000000083, score: 0.0
exploration: 0.012970000000000148, score: 1.0
exploration: 0.012970000000000148, score: 0.0
exploration: 0.010990000000000055, score: 1.0
exploration: 0.010990000000000055, score: 0.0
exploration: 0.010000000000000009, score: 1.0
exploration: 0.010000000000000009, score: 1.0
exploration: 0.010000000000000009, score: 0.0
exploration: 0.010000000000000009, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0


exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploratio

exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploratio

exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 0.0
exploration: 0.01, score: 1.0
exploratio

KeyboardInterrupt: 

In [27]:
# CartPole experiment: 32 copies of CartPole-v0 using the raw observations.
# NOTE(review): this cell rebinds `env`, `agent` and `rewards` from the
# FrozenLake cell, and its execution count (27) precedes that cell's (28) --
# the notebook was run out of order; confirm it survives Restart & Run All.
num_envs = 32
env = VectorizedEnvWrapper(lambda: gym.make("CartPole-v0"), num_envs)
state_shape = env.observation_space.shape
num_actions = env.action_space.n
# Two hidden ReLU layers of 24 units each; alpha=0.001 is the SGD learning rate.
agent = Agent(state_shape, num_actions, num_envs, alpha=0.001, hidden_sizes=[24, 24])
rewards = train(env, agent, T=10000, num_envs=num_envs)

exploration: 0.98911, score: 12.0
exploration: 0.98713, score: 14.0
exploration: 0.98713, score: 14.0
exploration: 0.98713, score: 14.0
exploration: 0.98614, score: 15.0
exploration: 0.98614, score: 15.0
exploration: 0.98614, score: 15.0
exploration: 0.98515, score: 16.0
exploration: 0.98515, score: 16.0
exploration: 0.98515, score: 16.0
exploration: 0.98515, score: 16.0
exploration: 0.98515, score: 16.0
exploration: 0.98416, score: 17.0
exploration: 0.98317, score: 18.0
exploration: 0.98218, score: 19.0
exploration: 0.98218, score: 19.0
exploration: 0.9802, score: 21.0
exploration: 0.97921, score: 22.0
exploration: 0.97921, score: 22.0
exploration: 0.97723, score: 24.0
exploration: 0.97723, score: 9.0
exploration: 0.97624, score: 25.0
exploration: 0.97624, score: 25.0
exploration: 0.97525, score: 12.0
exploration: 0.97525, score: 26.0
exploration: 0.97426, score: 11.0
exploration: 0.97426, score: 12.0
exploration: 0.97327, score: 28.0
exploration: 0.97327, score: 28.0
exploration: 0.9

exploration: 0.8119000000000001, score: 73.0
exploration: 0.81091, score: 40.0
exploration: 0.81091, score: 25.0
exploration: 0.80992, score: 9.0
exploration: 0.80992, score: 12.0
exploration: 0.80893, score: 16.0
exploration: 0.80893, score: 43.0
exploration: 0.80794, score: 25.0
exploration: 0.8069500000000001, score: 15.0
exploration: 0.80497, score: 10.0
exploration: 0.80398, score: 12.0
exploration: 0.802, score: 15.0
exploration: 0.80101, score: 14.0
exploration: 0.79903, score: 37.0
exploration: 0.79903, score: 26.0
exploration: 0.79804, score: 17.0
exploration: 0.7950699999999999, score: 34.0
exploration: 0.7921, score: 35.0
exploration: 0.79012, score: 46.0
exploration: 0.78913, score: 18.0
exploration: 0.7822, score: 55.0
exploration: 0.7822, score: 80.0
exploration: 0.78121, score: 20.0
exploration: 0.78022, score: 32.0
exploration: 0.77923, score: 26.0
exploration: 0.77725, score: 25.0
exploration: 0.77527, score: 35.0
exploration: 0.76933, score: 39.0
exploration: 0.76735,

exploration: 0.39709000000000005, score: 132.0
exploration: 0.3951100000000001, score: 182.0
exploration: 0.3772900000000001, score: 200.0
exploration: 0.3772900000000001, score: 110.0
exploration: 0.37432, score: 200.0
exploration: 0.36541, score: 133.0
exploration: 0.3644200000000001, score: 105.0
exploration: 0.3644200000000001, score: 119.0
exploration: 0.3485800000000001, score: 200.0
exploration: 0.3466, score: 200.0
exploration: 0.33175, score: 200.0
exploration: 0.32184999999999997, score: 200.0
exploration: 0.3079900000000001, score: 170.0
exploration: 0.30601, score: 129.0
exploration: 0.2980900000000001, score: 200.0
exploration: 0.2872000000000001, score: 188.0
exploration: 0.28423, score: 181.0
exploration: 0.28324000000000005, score: 192.0
exploration: 0.2822500000000001, score: 152.0
exploration: 0.27928, score: 200.0
exploration: 0.27631000000000006, score: 200.0
exploration: 0.27631000000000006, score: 200.0
exploration: 0.27532, score: 200.0
exploration: 0.27532, scor

KeyboardInterrupt: 