In [3]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [4]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [5]:
import tensorflow as tf
import gym
import numpy as np
import tqdm
import matplotlib.pyplot as plt
from collections import namedtuple
from tensorflow.keras import Sequential
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import Dense

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [12, 4]

In [6]:
#GPU check
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [7]:
# enable tensor debugging
tf.debugging.set_log_device_placement(True)

In [8]:
env = gym.make('MountainCarContinuous-v0')

In [9]:
def render(env, policy=None):
    """Graphically render an episode using the given policy

    :param env:  Gym environment
    :param policy:  function which maps state to action.  If None, the random
                    policy is used.
    """

    if policy is None:

        def policy(state):
            return env.action_space.sample()

    state = env.reset()
    env.render()

    while True:
        action = policy(state)
        state, _, done, _ = env.step(action)
        env.render()

        if done:
            break

    env.close()

# Replay buffer

In [10]:
Batch = namedtuple(
    'Batch', ('states', 'actions', 'rewards', 'next_states', 'dones')
)

class ReplayMemory:
    
    def __init__(self, max_size, state_dim, action_dim):
        self.max_size = max_size
        self.state_dim = state_dim
        
        self.states = np.random.random((max_size, state_dim))
        self.actions = np.random.random((max_size, action_dim))
        self.rewards = np.random.random((max_size, 1))
        self.next_states = np.random.random((max_size, state_dim))
        self.dones = np.full((max_size, 1), True)
        
        self.idx = 0
        self.size = 0
    
    def add(self, state, action, reward, next_state, done):
        
        self.states[self.idx] = state
        self.actions[self.idx] = action
        self.rewards[self.idx] = reward
        self.next_states[self.idx] = next_state
        self.dones[self.idx] = done
        
        self.idx = (self.idx + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)
    
    def sample(self, batch_size):
        if self.size <= batch_size:
            sample_indices = np.random.choice(self.size, self.size, replace=False)
        else:
            sample_indices = np.random.choice(self.size, batch_size, replace=False)
        batch = Batch(
                        states = self.states[ sample_indices ,:],
                        actions = self.actions[ sample_indices ,:],
                        rewards = self.rewards[ sample_indices ,:],
                        next_states = self.next_states[ sample_indices ,:],
                        dones = self.dones[ sample_indices ,:]
                    )
        return batch

    def populate(self, env, num_steps):
        state = env.reset()
        for i in range(num_steps):
            action = env.action_space.sample()
            next_state, reward, done, info = env.step(action)
            self.add(state, action, reward, next_state, done)
            if done:
                state = env.reset()
            state = next_state
        

In [11]:
# r = ReplayMemory(1000, 2, 1)
# r.populate(env, 500)

In [12]:
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam

tf.keras.backend.set_floatx('float64')

# Critic

In [13]:
class Critic(Model):
    
    def __init__(self):
        super(Critic, self).__init__()
        perceptrons_count = 64
        self.inp = InputLayer(3)
        self.layer1 = Dense(perceptrons_count, activation='relu')
        self.layer2 = Dense(perceptrons_count, activation='relu')
        self.layer3 = Dense(1, activation='linear')
    
    def call(self, input_):
        states, actions = input_
        input_ = tf.concat([states, actions], axis=1)
        input_ = self.inp(input_)
        input_ = self.layer1(input_)
        input_ = self.layer2(input_)
        return self.layer3(input_)
    
    def make(self):
        x = self.call( [tf.constant([[1, 1], [2, 2]]), tf.constant([[1], [2]])] )
       
        

In [14]:
t = Critic()
t.make()
t.summary()

Executing op ConcatV2 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op RandomUniform in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Sub in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Add in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Fill in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Cast in device /job:localhost/replica:0/task:0/device

InternalError: Blas GEMM launch failed : a.shape=(2, 3), b.shape=(3, 64), m=2, n=64, k=3 [Op:MatMul]

# Actor

In [9]:
class Actor(Model):
    
    def __init__(self):
        super(Actor, self).__init__()
        perceptrons_count = 64
        self.inp = InputLayer(2)
        self.layer1 = Dense(perceptrons_count, activation='relu')
        self.layer2 = Dense(perceptrons_count, activation='relu')
        self.layer3 = Dense(1, activation='tanh')
    
    def call(self, input_):
        input_ = self.inp(input_)
        input_ = self.layer1(input_)
        input_ = self.layer2(input_)
        return self.layer3(input_)
    
    def make(self):
        self.call( tf.constant([[1, 1]]))
        

In [15]:
class DDPG:
    
    def __init__(self):
        self.gamma = 0.99
        self.tau = 0.001
        self.learning_rate = 0.01
        self.actor = Actor()
        self.actor.make()
        self.target_actor = Actor()
        self.target_actor.make()
        
        self.critic = Critic()
        self.critic.make()
        self.target_critic = Critic()
        self.target_critic.make()
        
        self.critic_optimizer = Adam(learning_rate=0.001)
        self.actor_optimizer = Adam(learning_rate=0.001)
        
    def update_weights(self, base_model, target_model):
        for base_layer, target_layer in zip(base_model.weights, target_model.weights):
            target_layer.assign(base_layer)
    
    def update_target_network(self, base_model, target_model):
        for base_layer, target_layer in zip(base_model.weights, target_model.weights):
            target_layer.assign(self.tau * base_layer + (1 - self.tau) * target_layer)
        
    
    def build(self):
        # setting same weights
        self.update_weights(self.actor, self.target_actor)
        self.update_weights(self.critic, self.target_critic)
    
    @tf.function
    def train(self, states, actions, rewards, next_states, dones):
        critic_loss = 0
        critic_loss_method = tf.keras.losses.MeanSquaredError()
        target_next_actions = self.target_actor(next_states)
        q_dash_next_state = self.target_critic([next_states, target_next_actions])
        intd_dones = tf.cast(dones, dtype=tf.float64)
        with tf.GradientTape() as critic_tape:
            q_dash = self.gamma * q_dash_next_state * intd_dones + rewards
            tf.stop_gradient(q_dash)
            q_ = self.critic([states, actions])
            critic_loss = critic_loss_method(q_dash, q_)
        critic_gradients = critic_tape.gradient(critic_loss, self.critic.trainable_variables)
        print(critic_gradients)
        self.critic_optimizer.apply_gradients( zip(critic_gradients, self.critic.trainable_variables))
        
        with tf.GradientTape() as actor_tape:
            predicted_actions = self.actor(states)
            actor_loss = - tf.reduce_mean(self.critic([states, predicted_actions]))
        actor_gradients = actor_tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor.trainable_variables))
        
        
        
        return critic_loss, actor_loss
    
    def actions(self, states):
        return self.actor(states)
        
        

In [17]:
class OUNoise(object):
    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu           = mu
        self.theta        = theta
        self.sigma        = max_sigma
        self.max_sigma    = max_sigma
        self.min_sigma    = min_sigma
        self.decay_period = decay_period
        self.action_dim   = action_space.shape[0]
        self.low          = action_space.low
        self.high         = action_space.high
        self.reset()
        
    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu
        
    def evolve_state(self):
        x  = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state
    
    def get_action(self, action, t=0): 
        ou_state = self.evolve_state()
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + ou_state, self.low, self.high)

noise = OUNoise(env.action_space)


In [42]:
import time

In [44]:
TIMESTEPS = 100000
BATCH_SIZE = 4
state_dim = 2
action_dim = 1
memory_size = 5
replay_memory = ReplayMemory(memory_size, state_dim, action_dim)
ddpg = DDPG()
ddpg.build()

critic_losses = []
actor_losses = []

start_time = time.time()
state = env.reset()
for timestep in range(TIMESTEPS):
    action = ddpg.actions(np.array([state]))
    action = noise.get_action(action[0])
    next_state, reward, done, info = env.step(action)
    replay_memory.add(state, action, reward, next_state, done)
    if timestep != 0 and timestep % 4 == 0:
        experience_batch = replay_memory.sample(4)
        critic_loss, actor_loss = ddpg.train(experience_batch.states, experience_batch.actions, experience_batch.rewards, experience_batch.next_states, experience_batch.dones)
        critic_losses.append(critic_loss)
        actor_losses.append(actor_loss)
    
    if timestep != 0 and timestep % 5000:
        ddpg.update_target_network(ddpg.critic, ddpg.target_critic)
        ddpg.update_target_network(ddpg.actor, ddpg.target_actor)
    if done:
        state = env.reset()
    else:
        state = next_state

end_time = time.time()
print(start_time - end_time)

[<tf.Tensor 'MatMul_4:0' shape=(3, 64) dtype=float64>, <tf.Tensor 'BiasAddGrad_2:0' shape=(64,) dtype=float64>, <tf.Tensor 'MatMul_3:0' shape=(64, 64) dtype=float64>, <tf.Tensor 'BiasAddGrad_1:0' shape=(64,) dtype=float64>, <tf.Tensor 'MatMul_1:0' shape=(64, 1) dtype=float64>, <tf.Tensor 'BiasAddGrad:0' shape=(1,) dtype=float64>]
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in 

In [45]:
actor_losses

[<tf.Tensor: id=140626, shape=(), dtype=float64, numpy=-0.022675784848910235>,
 <tf.Tensor: id=141057, shape=(), dtype=float64, numpy=-0.012890812345320033>,
 <tf.Tensor: id=141488, shape=(), dtype=float64, numpy=0.0020089205796699587>,
 <tf.Tensor: id=141919, shape=(), dtype=float64, numpy=0.00780244264007387>,
 <tf.Tensor: id=142350, shape=(), dtype=float64, numpy=0.009168045872406282>,
 <tf.Tensor: id=142781, shape=(), dtype=float64, numpy=0.007522673948606293>,
 <tf.Tensor: id=143212, shape=(), dtype=float64, numpy=0.005259218918138205>,
 <tf.Tensor: id=143643, shape=(), dtype=float64, numpy=0.00541325380180298>,
 <tf.Tensor: id=144074, shape=(), dtype=float64, numpy=0.005690371211416926>,
 <tf.Tensor: id=144505, shape=(), dtype=float64, numpy=0.004316524981515782>,
 <tf.Tensor: id=144936, shape=(), dtype=float64, numpy=-0.0006319788734867721>,
 <tf.Tensor: id=145367, shape=(), dtype=float64, numpy=-0.0063185189748377665>,
 <tf.Tensor: id=145798, shape=(), dtype=float64, numpy=-0.0

In [None]:
def render(env, policy=None):
    """Graphically render an episode using the given policy

    :param env:  Gym environment
    :param policy:  function which maps state to action.  If None, the random
                    policy is used.
    """

    if policy is None:

        def policy(state):
            return env.action_space.sample()

    state = env.reset()
    env.render()

    while True:
        action = policy(state)
        state, _, done, _ = env.step(action)
        env.render()

        if done:
            break

    env.close()

In [53]:
state = env.reset()
env.render()

while True:
    action = ddpg.target_actor(np.array([state]))
    state, _, done, _ = env.step(action[0])
    env.render()

    if done:
        break
env.close()

In [None]:
d.actor(tf.constant([[1, 1, 1]]))

In [None]:
d.target_actor(tf.constant([[1, 1, 1]]))

In [None]:
r = ReplayMemory(1000, 2, 1)

In [None]:
r.populate(env, 100)

In [None]:
r.sample(10).states

In [None]:
state = env.reset()

In [None]:
state

In [None]:
np.array([state])

In [None]:
x = env.action_space.sample()

In [None]:
env.step(np.array([0.1]))

In [None]:
t = 0
env.reset()
while True:
    t += 1
    ac = env.action_space.sample()
    next_state, reward, done, info = env.step(ac)
    if done:
        print(info)
        break
        

In [None]:
t

In [None]:
env.action_space.sample()

In [None]:
x = env.action_space.sample()

In [None]:
type(x)

In [None]:
a = tf.constant([[4], [5]], dtype=tf.float64)
b = tf.constant([[6], [6]], dtype=tf.float64)

In [None]:
tf.reduce_mean( tf.reduce_sum((b - a) ** 2) ** 0.5 )

In [33]:
noise.get_action(0.8)

array([1.])