In [1]:
import tensorflow as tf 
import numpy as np 
import gym
from tensorflow.keras.models import load_model

In [2]:
# Create the MountainCar environment and keep its observation bounds handy.
env = gym.make("MountainCar-v0")
low, high = env.observation_space.low, env.observation_space.high

In [3]:
# Show the observation bounds, the number of discrete actions and the
# observation shape, one value per line.
print(high, low, env.action_space.n, env.observation_space.shape, sep="\n")

[0.6  0.07]
[-1.2  -0.07]
3
(2,)


In [17]:
def model(input_shape=(2,), n_actions=3, learning_rate=0.001):
  """Build and compile a small fully connected Q-network.

  The network has two hidden ReLU layers of 64 units and a linear output
  layer with one unit per action.

  Args:
    input_shape: Shape of a single observation. Defaults to (2,), the
      observation shape printed for MountainCar-v0 in this notebook.
    n_actions: Number of output units (one Q-value per discrete action).
      Defaults to 3, the action count printed for MountainCar-v0.
    learning_rate: Learning rate passed to the RMSprop optimizer.

  Returns:
    A compiled `tf.keras.Sequential` model using MSE loss and reporting
    MAE and MSE as metrics.
  """
  # Named `net` so the local does not shadow this function's own name.
  net = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=tuple(input_shape)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(n_actions)
  ])

  optimizer = tf.keras.optimizers.RMSprop(learning_rate)

  net.compile(loss='mse',
              optimizer=optimizer,
              metrics=['mae', 'mse'])
  return net


In [5]:
class experience():
  """Fixed-size ring buffer of environment transitions for experience replay.

  Transitions are written at `pointer % buffer_size`, so once the buffer is
  full the oldest entries are overwritten.

  Attributes are plain NumPy arrays indexed by slot:
    state_mem / next_state_mem: float32, shape (buffer_size, *state_dim)
    action_mem: int32, shape (buffer_size,)
    reward_mem: float32, shape (buffer_size,)
    done_mem: int32 continuation mask, 1 for a non-terminal transition and
      0 for a terminal one, so it can be multiplied directly into the
      bootstrapped part of a Q-learning target.
  """

  def __init__(self, buffer_size, state_dim):
    """Allocate storage.

    Args:
      buffer_size: Maximum number of transitions kept.
      state_dim: Iterable giving the shape of a single observation.
    """
    self.buffer_size = buffer_size
    self.pointer = 0  # total number of transitions ever added
    state_shape = (self.buffer_size, *state_dim)
    # Observations are continuous, so they must be stored as floats; an
    # integer dtype would truncate them on assignment.
    self.state_mem = np.zeros(state_shape, dtype=np.float32)
    self.action_mem = np.zeros(self.buffer_size, dtype=np.int32)
    self.next_state_mem = np.zeros(state_shape, dtype=np.float32)
    # float32 so fractional rewards are not truncated either.
    self.reward_mem = np.zeros(self.buffer_size, dtype=np.float32)
    self.done_mem = np.zeros(self.buffer_size, dtype=np.int32)

  def add_exp(self, state, action, reward, next_state, done):
    """Store one transition, overwriting the oldest slot when full."""
    idx = self.pointer % self.buffer_size
    self.state_mem[idx] = state
    self.action_mem[idx] = action
    self.reward_mem[idx] = reward
    self.next_state_mem[idx] = next_state
    # Continuation mask: 1 while the episode goes on, 0 at a terminal step,
    # so that `reward + gamma * max_q * mask` drops the bootstrap term when
    # there is no next state to bootstrap from.
    self.done_mem[idx] = 1 - int(done)
    self.pointer += 1

  def sample_exp(self, batch_size):
    """Draw `batch_size` distinct stored transitions uniformly at random.

    Args:
      batch_size: Number of transitions to draw; must not exceed the number
        currently stored.

    Returns:
      Tuple `(state, action, reward, next_state, done)` of arrays whose first
      dimension is `batch_size`. `done` is the continuation mask described
      on the class (1 = non-terminal, 0 = terminal).

    Raises:
      ValueError: from `np.random.choice` if `batch_size` is larger than the
        number of stored transitions (sampling is without replacement).
    """
    max_mem = min(self.pointer, self.buffer_size)
    batch = np.random.choice(max_mem, batch_size, replace=False)
    state = self.state_mem[batch]
    action = self.action_mem[batch]
    reward = self.reward_mem[batch]
    next_state = self.next_state_mem[batch]
    done = self.done_mem[batch]
    return state, action, reward, next_state, done


In [6]:
class agent():
  """DQN-style agent with an online Q-network, a target network and replay.

  Relies on the notebook-level `model()` factory, the `experience` buffer
  class and the global `env` for the observation shape and action sampling.
  """

  def __init__(self):
    """Create both networks, the replay buffer and the hyper-parameters."""
    self.q_net = model()
    self.target_net = model()
    self.epsilon = 1.0           # current exploration probability
    self.epsilon_decay = 1e-3    # amount subtracted per exploratory action
    self.min_epsilon = 0.01      # floor for epsilon
    self.memory = experience(buffer_size=1000000, state_dim=env.observation_space.shape)
    self.batch_size = 64
    self.gamma = 0.99            # discount factor


  def act(self, state):
    """Pick an action epsilon-greedily.

    With probability `epsilon` a random action is sampled from the
    environment (and epsilon is decayed); otherwise the action with the
    highest predicted Q-value from the online network is returned.
    """
    if np.random.rand() <= self.epsilon:
      # Epsilon only decays on exploratory steps.
      self.update_epsilon()
      action = env.action_space.sample()

    else:
      state = np.array([state])  # add a batch dimension for predict()
      action = self.q_net.predict(state)
      action = np.argmax(action)

    return action


  def update_mem(self, state, action, reward, next_state, done):
    """Store one transition in the replay buffer."""
    self.memory.add_exp(state, action, reward, next_state, done)


  def update_target(self):
    """Copy the online network's weights into the target network."""
    self.target_net.set_weights(self.q_net.get_weights())


  def update_epsilon(self):
    """Decay epsilon by `epsilon_decay`, never going below `min_epsilon`.

    Returns:
      The updated epsilon.
    """
    # max() clamps the result; a plain "subtract while above the floor"
    # could step past the floor on the final decrement.
    self.epsilon = max(self.min_epsilon, self.epsilon - self.epsilon_decay)
    return self.epsilon


  def train(self):
    """Run one gradient step on a random replay batch.

    Does nothing until at least `batch_size` transitions have been stored.
    The fifth array returned by the buffer is multiplied into the
    bootstrapped term, so it is expected to be 0 for terminal transitions
    and 1 otherwise.
    """
    if self.memory.pointer < self.batch_size:
      return
    state, action, reward, next_state, done = self.memory.sample_exp(self.batch_size)
    # Start from the current predictions so only the taken action's Q-value
    # contributes to the loss.
    target = self.q_net.predict(state)
    next_state_val = self.target_net.predict(next_state)
    batch_index = np.arange(self.batch_size)
    target[batch_index, action] = reward + self.gamma * np.amax(next_state_val, axis=1)*done
    self.q_net.train_on_batch(state, target)


  def save_model(self):
    """Save the online and target networks to HDF5 files in the working dir."""
    self.q_net.save("model.h5")
    self.target_net.save("target_model.h5")


  def load_model(self):
    """Restore both networks from the files written by `save_model`."""
    # Inside a method body the bare name `load_model` resolves to the
    # module-level Keras function, not to this method.
    self.q_net = load_model("model.h5")
    self.target_net = load_model("target_model.h5")



In [18]:
# Instantiate the agent and print the layer summary of its online Q-network.
agentoo7 = agent()
agentoo7.q_net.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 64)                192       
_________________________________________________________________
dense_7 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_8 (Dense)              (None, 3)                 195       
Total params: 4,547
Trainable params: 4,547
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Start training with the target network as an exact copy of the online one.
agentoo7.update_target()

In [None]:
#tf.compat.v1.disable_eager_execution()
# NOTE: despite the name, `steps` is the number of training *episodes*;
# each iteration of the outer loop plays one full episode.
steps = 5000
for s in range(steps):
  # Refresh the target network once every 10 episodes. This sits outside the
  # per-step loop on purpose: syncing on every environment step of such an
  # episode would copy the weights needlessly and leave the target network
  # with no lag behind the online one for that episode.
  if s % 10 == 0 and s != 0:
    agentoo7.update_target()
  done = False
  state = env.reset()
  total_reward = 0
  t = 0  # number of environment steps taken in this episode
  while not done:
    #env.render()
    action = agentoo7.act(state)
    next_state, reward, done, _ = env.step(action)
    agentoo7.update_mem(state, action, reward, next_state, done)
    agentoo7.train()
    state = next_state
    total_reward += reward
    t += 1
  # The loop only exits once the episode is done, so report here.
  print("total reward after {} steps is {}".format(s, total_reward))