In [6]:
import gym
import random
import numpy as np
import tensorflow as tf
from gym.envs.registration import register
import time
from IPython.display import clear_output
from collections import deque



In [7]:
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )
except:
    pass

env_name = "FrozenLake-v0"
env_name = "CartPole-v1"
env = gym.make(env_name)
type(env.action_space)

gym.spaces.discrete.Discrete

In [8]:
class Agent():
    def __init__(self, env):
        self.is_discrete = (type(env.action_space) == gym.spaces.discrete.Discrete)
        if self.is_discrete:
            self.action_size = env.action_space.n
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
    def get_action(self, state):
        if self.is_discrete:
            action = random.choice(range(self.action_size))
        else:
            action = np.random.uniform(self.action_low,
                                  self.action_high, self.action_shape)
        return action
        

In [9]:
class QNAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.
        print(self.state_size)
        
        
    
        self.eps = 1.0
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        
        self.build_model()
        
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        
        self.replay_buffer = deque(maxlen = 1000)
    
    def build_model(self):
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        tf.reset_default_graph() 
        self.state_in = tf.placeholder(tf.int32, shape = [None])
        self.action_in = tf.placeholder(tf.int32, shape = [None])
        self.target_in = tf.placeholder(tf.float32, shape = [None ])
        
        #tf.one_hot(a, b) creates a b dimension tensor with value of index a=0
        self.state = tf.one_hot(self.state_in, depth=self.state_size)
        self.action = tf.one_hot(self.action_in, depth=self.action_size)
        
        #tf.layers.dense(input, units = output_dimension)
        self.q_state = tf.layers.dense(self.state, units=self.action_size, name="q_table")
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)
        
        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
        
        
        
    def get_action(self, state):
        q_state = self.sess.run(self.q_state, feed_dict = {self.state_in: [state]})
        #q_state = self.q_table[state]
        print(q_state)
        action_greedy = np.argmax(q_state)
        action_random = super().get_action(state)
        return action_random if np.random.random() < self.eps else action_greedy
    
    def train(self, experience, batchsize = 50):
        self.replay_buffer.append(experience)
        samples = random.choices(self.replay_buffer, k=batchsize)
        state, action, next_state, reward, done = (list(col) for col in zip(experience, *samples))
        
        #state, action, next_state, reward, done = ([exp] for exp in experience)
        
        #q_next = np.zeros([self.action_size]) if done else self.q_table[next_state]
        q_next = self.sess.run(self.q_state, feed_dict={self.state_in: next_state})
        q_next[done] = np.zeros(self.action_size)
        q_target = reward + self.discount_rate * np.max(q_next, axis = 1)
        
        '''
        q_update = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * q_update
        '''
        feed = {self.state_in: state, self.action_in:action, self.target_in: q_target}
        self.sess.run(self.optimizer, feed_dict = feed)
        
        if experience[4]:
            self.eps *= 0.99
            
    def __del__(self):
        self.sess.close();

agent = QNAgent(env)

AttributeError: 'Box' object has no attribute 'n'

In [5]:


print("observation space: ",  env.observation_space)
print("action size: ", env.action_space)
total_reward = 0
for ep in range(100):
    done = False
    state = env.reset()
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        agent.train([state, action, next_state, reward, done])
        print('s: ', state, 'a: ', action)
        state = next_state
        total_reward += reward
        env.render()
        with tf.variable_scope("q_table", reuse=True):
            weights = agent.sess.run(tf.get_variable("kernel"))
            print(weights)
        print('episode: ', ep, 'total reward: ', total_reward, 'epsilon: ', agent.eps)
        time.sleep(0.1)
        clear_output(wait=True)
        
env.close()

[[0.9119729 0.0089384 0.8786441 0.8901896]]
s:  3 a:  1
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
[[ 0.2576847   0.3809003   0.32898873  0.14447957]
 [ 0.26435217 -0.4904318   0.35624877  0.1839204 ]
 [ 0.29063606  0.4466845   0.3253152   0.20925392]
 [ 0.31721005 -0.4803118   0.32536942  0.18831977]
 [ 0.29051068  0.41355208 -0.5451436   0.15301499]
 [-0.28817979  0.43754506 -0.32566905 -0.34973836]
 [-0.5845974   0.46917808 -0.5417781   0.21133482]
 [-0.12304896 -0.15485492  0.20762599 -0.47755504]
 [ 0.30772644 -0.49212992  0.37766144  0.17811662]
 [ 0.3076532   0.4725821   0.40767333 -0.69207853]
 [ 0.35213917  0.49347663 -0.5546767   0.22202204]
 [ 0.09937453 -0.3407585   0.0960623  -0.3675157 ]
 [-0.35346127 -0.47275323  0.34889203  0.26617038]
 [-0.5661116   0.46492544  0.42357838  0.22346868]
 [ 0.363528    0.49836913  0.44829994  0.25793284]
 [-0.12196353 -0.43644816  0.12036645  0.18192303]]
episode:  99 total reward:  29.0 epsilon:  0.36603234127322926
