In [1]:
import os
import time

import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
from tqdm import tnrange

import f2_noise
import ou_noise
from ReplayBuffer import ReplayBuffer

In [2]:
# Set CUDA_VISIBLE_DEVICES for this kernel so CUDA libraries only see the GPU with index 1.
# Intended to run before main() opens its tf.Session().
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_VISIBLE_DEVICES=1


In [3]:
from pendulumMulti_environment import PendulumEnvMulti
# Environment state size: length of one observation vector. train() and test()
# use it to reshape a single state to (1, State_size) before actor.predict().
State_size = 3

In [4]:
# Create the folder tree for debug info and plots.
# os.makedirs(..., exist_ok=True) keeps this cell re-runnable (a plain `mkdir`
# reports an error for every directory that already exists) and derives every
# path from save_plots_dir instead of repeating the folder name.
save_plots_dir = "results_pend_horizontal"
for plot_subdir in (
    "reward_plots",
    "action_plots",
    "loss_plots",
    "gradient_norm_plots",
    "term_stats",
    "pareto_front_progress",
):
    os.makedirs(os.path.join(save_plots_dir, plot_subdir), exist_ok=True)

In [5]:
# Gradient weights for the two reward components: the first weight sweeps
# np.linspace(0, 1, 21); the second is its complement to 1, rounded to two
# decimals so the pairs read 0.95, 0.9, ... instead of 0.94999...
gradient_weights = [
    np.array([first_weight, round(float(1 - first_weight), 2)])
    for first_weight in np.linspace(0, 1, 21)
]

In [6]:
# Inspect the generated weight pairs (21 entries, from [0, 1] to [1, 0]).
gradient_weights

[array([ 0.,  1.]),
 array([ 0.05,  0.95]),
 array([ 0.1,  0.9]),
 array([ 0.15,  0.85]),
 array([ 0.2,  0.8]),
 array([ 0.25,  0.75]),
 array([ 0.3,  0.7]),
 array([ 0.35,  0.65]),
 array([ 0.4,  0.6]),
 array([ 0.45,  0.55]),
 array([ 0.5,  0.5]),
 array([ 0.55,  0.45]),
 array([ 0.6,  0.4]),
 array([ 0.65,  0.35]),
 array([ 0.7,  0.3]),
 array([ 0.75,  0.25]),
 array([ 0.8,  0.2]),
 array([ 0.85,  0.15]),
 array([ 0.9,  0.1]),
 array([ 0.95,  0.05]),
 array([ 1.,  0.])]

In [7]:
# Render gym env during training (train() and test() call env.render() on every step when True)
RENDER_ENV = False
# Use Gym Monitor (when True, main() wraps the environment in wrappers.Monitor)
GYM_MONITOR_EN = False
# Gym environment id.
# NOTE(review): not referenced in the visible cells, which instantiate PendulumEnvMulti directly.

ENV_NAME = 'PendulumMulti-v0'

# Directory for storing gym results (passed to wrappers.Monitor in main())
MONITOR_DIR = './results_pend_horizontal/videos_pend'
# Directory for storing tensorboard summary results (tf.summary.FileWriter target in train())
SUMMARY_DIR = './results_pend_horizontal/tf_ddpg'

In [8]:
from actor_critic_networks import *
from TerminateChecker import TerminateChecker

In [9]:
def dominates(a, b):
    """Return True when point ``a`` Pareto-dominates point ``b``.

    ``a`` dominates ``b`` when no component of ``b`` is strictly greater than
    the matching component of ``a`` and the two points are not identical
    (larger values are treated as better).
    """
    # A single coordinate where b beats a rules out dominance.
    if any(b_i > a_i for a_i, b_i in zip(a, b)):
        return False
    # Identical points do not dominate each other.
    return not np.all(a == b)

def remove_dominated(xs):
    """Return the rows of ``xs`` that no other row dominates.

    Every ordered pair of distinct rows is compared with ``dominates``; a row
    is flagged when at least one other row dominates it. The surviving rows
    are returned in their original order via boolean indexing.
    """
    n_points = xs.shape[0]
    dominated_flags = np.zeros(n_points)

    for candidate in range(n_points):
        for rival in range(n_points):
            if rival != candidate and dominates(xs[rival], xs[candidate]):
                dominated_flags[candidate] = 1

    return xs[dominated_flags == 0]

In [10]:
def build_summaries():
    """Create the TensorBoard scalar summaries for reward and max-Q.

    Two scalar variables (initialised to 0.) are created and registered under
    the tags "Reward" and "Qmax Value".

    Returns:
        (summary_ops, summary_vars): the merged summary op from
        ``tf.summary.merge_all()`` and the list ``[reward_var, qmax_var]``.
    """
    reward_var = tf.Variable(0.)
    tf.summary.scalar("Reward", reward_var)
    qmax_var = tf.Variable(0.)
    tf.summary.scalar("Qmax Value", qmax_var)

    merged_ops = tf.summary.merge_all()
    return merged_ops, [reward_var, qmax_var]

def _save_curve(out_path, title, y_label, y_values, x_values=None, title_fontsize=None):
    """Save a single-panel line plot with an "Episode" x-axis and close the figure."""
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6))
    if title_fontsize is None:
        ax.set_title(title)
    else:
        ax.set_title(title, fontsize=title_fontsize)
    ax.set_xlabel("Episode", fontsize=14)
    ax.set_ylabel(y_label, fontsize=14)
    if x_values is None:
        ax.plot(y_values)
    else:
        ax.plot(x_values, y_values)
    fig.savefig(out_path)
    # Close right away so figures do not pile up while an episode is running.
    plt.close(fig)


def _plot_terminate_statistics(terminate_statistics, eps_reward, checker, first_episode, out_path):
    """Save the three-panel reward / MA / EMA plot used to follow the stop criterion."""
    stats = np.array(terminate_statistics)
    n_stats = len(terminate_statistics)
    episodes = np.arange(n_stats) + first_episode
    reward_min, reward_max = np.min(eps_reward), np.max(eps_reward)
    start_x, finish_x = first_episode, n_stats + first_episode

    fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(16, 15))
    ax[0].set_title("#Episode/Reward")
    ax[0].set_xlabel("Episode", fontsize=14)
    ax[0].set_ylabel("Reward", fontsize=14)
    ax[0].plot(episodes, eps_reward[-n_stats:])

    # Column 0 of the statistics holds checker.ma, column 1 holds checker.ema.
    panels = ((ax[1], 0, "MA", checker.best_ma), (ax[2], 1, "EMA", checker.best_ema))
    for panel, column, label, best_value in panels:
        panel.set_title("#Episode/{} value".format(label))
        panel.set_xlabel("Episode", fontsize=14)
        panel.set_ylabel("{} Value".format(label), fontsize=14)
        panel.plot(episodes, stats[:, column])
        # Red line: best value seen by the checker; green line: episode of the maximum.
        panel.plot([start_x, finish_x], [best_value, best_value], color="red")
        panel.vlines(np.argmax(stats[:, column]) + first_episode, reward_min, reward_max, color="green")
        panel.set_ylim([reward_min - 100, reward_max + 100])

    fig.savefig(out_path)
    plt.close(fig)


def _run_training_episodes(env, actor, critic, action_dim, action_bound, grad_weights):
    """Run the episode loop of train(): act, store transitions, update networks, plot."""
    # constant for scaling the immediate reward
    REWARD_BURN_CONST = 1
    # the stop criterion is only consulted from this episode index on
    TERMINATE_TEST_SIZE = 50
    CHECK_FOR_PROFIT_TIMES = 50
    REWARD_PLOT_TIMESTEP = 1
    # constant for plotting smoothed reward (averaged over blocks of SMOOTH_REWARD episodes)
    SMOOTH_REWARD = 10

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
    # NOTE(review): this second buffer is filled on every step but never sampled in this function.
    small_replay_buffer = ReplayBuffer(SMALL_BUFFER_SIZE, RANDOM_SEED)

    eps_reward = list()
    noise_level = 1.0

    # per-episode averages of the loss proxy and the gradient norm, for plotting
    draw_loss = list()
    draw_global_norm = list()

    terminate_statistics = list()
    term_plot_path = save_plots_dir + "/term_stats/terminate_statistics_plot_" + str(grad_weights) + ".png"

    for i in range(MAX_EPISODES):

        # The checker is seeded with the first episode's reward, so it can only be built at i == 1.
        if i == 1:
            checker = TerminateChecker(eps_reward[0], 0.05, TERMINATE_TEST_SIZE, CHECK_FOR_PROFIT_TIMES)

        if i >= TERMINATE_TEST_SIZE:

            if checker.terminate_check():
                print("#" * 50)
                print("STOP TRAINING in {} episodes".format(str(i + 1)))
                print("Weights are:", grad_weights)
                break

            terminate_statistics.append(np.array([checker.ma, checker.ema]))
            _plot_terminate_statistics(terminate_statistics, eps_reward, checker,
                                       TERMINATE_TEST_SIZE, term_plot_path)

        # Decay the exploration noise once per episode, never below 1e-7.
        noise_level *= 0.95
        noise_level = max(1e-7, noise_level - 1e-4)

        grad_global_norm = list()
        loss = list()
        ep_reward = np.zeros(REWARD_SPACE_DIM)

        s = env.reset()
        for j in range(MAX_EP_STEPS):

            if RENDER_ENV:
                env.render()

            # Added exploration noise
            predicted_action = actor.predict(np.reshape(s, (1, State_size)))
            noise_action = actor.noise.one(action_dim, noise_level)
            a = np.clip(predicted_action + noise_action, -action_bound, action_bound)
            s2, r, terminal, info = env.step(a[0])
            r *= REWARD_BURN_CONST

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            small_replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                                    terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() >= MINIBATCH_SIZE:

                for _ in range(R):

                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(batch_size=MINIBATCH_SIZE)

                    # Calculate targets: plain reward on terminal transitions,
                    # otherwise reward plus the discounted target-network estimate.
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    y_i = [
                        r_batch[k] if t_batch[k] else r_batch[k] + GAMMA * target_q[k]
                        for k in range(MINIBATCH_SIZE)
                    ]

                    # Update the critic given the targets
                    critic.train(
                        s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, REWARD_SPACE_DIM)))

                    # Weighted mean critic prediction for the current policy; its
                    # negated episode average is what the "Loss" plot shows.
                    gain_predictions = np.mean(critic.predict(s_batch, actor.predict(s_batch)), axis=0)
                    loss.append(np.mean((gain_predictions).dot(grad_weights)))

                    # Update the actor policy using the sampled gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs, np.reshape(grad_weights, (REWARD_SPACE_DIM, 1)))
                    _train_op, global_norm, _layers_norm = actor.train(s_batch, grads[0])
                    grad_global_norm.append(global_norm)

                    actor.update_target_network()
                    critic.update_target_network()

            s = s2
            ep_reward += (np.array(r) * (GAMMA ** j)) / REWARD_BURN_CONST

            # Force the episode to end on the last allowed step.
            if j == MAX_EP_STEPS - 1:
                terminal = True

            if terminal:
                # Scalarise the per-component discounted return with the weights.
                cost = np.sum(ep_reward * grad_weights)
                eps_reward.append(cost)

                if i % REWARD_PLOT_TIMESTEP == 0:
                    _save_curve(save_plots_dir + "/reward_plots/episodes_reward_plot",
                                "Episode/Cumulative Reward", "Cumulative Reward", eps_reward,
                                x_values=(np.arange(len(eps_reward)) + 1) * REWARD_PLOT_TIMESTEP,
                                title_fontsize=18)

                # len(eps_reward) == i + 1 here, so the reshape below is exact.
                if (i + 1) % SMOOTH_REWARD == 0:
                    smoothed_reward = np.mean(
                        np.array(eps_reward).reshape((len(eps_reward) // SMOOTH_REWARD, SMOOTH_REWARD)),
                        axis=1)
                    _save_curve(save_plots_dir + "/reward_plots/smoothed_episodes_reward_plot",
                                "Episode/Smoothed Cumulative Reward", "Smoothed Cumulative Reward",
                                smoothed_reward,
                                x_values=np.arange(smoothed_reward.shape[0]) * SMOOTH_REWARD + 1,
                                title_fontsize=18)

                # An episode without any update step records NaN explicitly instead of
                # triggering numpy's "mean of empty slice" warning.
                draw_global_norm.append(np.mean(np.array(grad_global_norm)) if grad_global_norm else np.nan)
                _save_curve(save_plots_dir + "/gradient_norm_plots/episodes_gradient_norm_plot",
                            "#Episode/Gradient Norm", "Average gradient norm for all layers",
                            draw_global_norm, title_fontsize=18)

                draw_loss.append(-np.mean(np.array(loss)) if loss else np.nan)
                _save_curve(save_plots_dir + "/loss_plots/episodes_loss_plot",
                            "#Episode/Loss", "Loss", draw_loss)

                # Update Statistics
                if i >= 1:
                    checker.change_ma(eps_reward[-1])
                    checker.change_ema(eps_reward[-1])
                break


def train(sess, env, actor, critic, action_dim, action_bound, grad_weights):
    """Train one actor/critic pair for a fixed reward-weight vector and save diagnostic plots.

    Runs up to MAX_EPISODES episodes of at most MAX_EP_STEPS steps. On every
    step the actor's action plus exploration noise is clipped to
    [-action_bound, action_bound] and executed, and the transition is stored in
    the replay buffer. Once the buffer holds MINIBATCH_SIZE samples, R
    critic/actor updates are performed, each followed by target-network
    updates. After each episode the weighted discounted return is appended to
    the reward history, plots are written below ``save_plots_dir`` and the
    TerminateChecker statistics are updated. From episode 50 on, training stops
    early as soon as ``checker.terminate_check()`` returns true.

    Args:
        sess: TensorFlow session; ``tf.global_variables_initializer()`` is run
            in it, which re-initialises every variable of the graph.
        env: environment providing ``reset()``, ``step(action)`` and ``render()``.
        actor: actor network; uses ``predict``, ``predict_target``, ``train``,
            ``update_target_network``, ``noise.one``, ``s_dim`` and ``a_dim``.
        critic: critic network; uses ``predict``, ``predict_target``, ``train``,
            ``action_gradients`` and ``update_target_network``.
        action_dim: passed to ``actor.noise.one`` to size the exploration noise.
        action_bound: symmetric clipping bound for actions.
        grad_weights: weights of the reward components; reshaped to
            (REWARD_SPACE_DIM, 1) for ``critic.action_gradients``.

    Returns:
        None. Results are the trained networks and the plot files on disk.
    """
    # Set up summary Ops (must exist before the global initializer runs).
    # The returned ops/variables are not used further in this function.
    build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph)
    try:
        _run_training_episodes(env, actor, critic, action_dim, action_bound, grad_weights)
    finally:
        # train() is called once per weight vector; without closing, every call
        # would leave an open event file behind.
        writer.close()



def test(weights, actor, test_episodes_num, action_bound):
    """Evaluate ``actor`` without exploration noise and return its mean discounted reward.

    A fresh ``PendulumEnvMulti`` is created, ``test_episodes_num`` episodes of
    at most MAX_EP_STEPS steps are played with the actor's clipped actions, and
    the per-component discounted reward sums (discount GAMMA) are averaged
    over the episodes.

    Args:
        weights: weight vector the actor was trained with; only printed.
        actor: network providing ``predict(states)``.
        test_episodes_num: number of evaluation episodes. With 0 episodes
            ``np.mean`` of an empty list yields NaN.
        action_bound: symmetric clipping bound for actions.

    Returns:
        Array of REWARD_SPACE_DIM values: the episode-averaged discounted reward.
    """
    env = PendulumEnvMulti()
    eps_reward = list()

    print("!!!!!TEST WITH WEIGHTS : ", weights)

    try:
        for i in range(test_episodes_num):

            s = env.reset()
            cum_reward = np.zeros(REWARD_SPACE_DIM)

            for j in range(MAX_EP_STEPS):

                if RENDER_ENV:
                    env.render()
                a = actor.predict(np.reshape(s, (1, State_size)))
                a = np.clip(a, -action_bound, action_bound)
                s2, r, terminal, info = env.step(a[0])

                cum_reward += (r * (GAMMA ** j))
                s = s2

                # The last allowed step always ends the episode.
                if terminal or j == MAX_EP_STEPS - 1:
                    eps_reward.append(cum_reward)
                    break
    finally:
        # Release the environment even when an episode raises.
        env.close()

    return np.mean(eps_reward, axis=0)
  
def main():
    """Train and evaluate one policy per entry of ``gradient_weights``.

    For every weight vector a new environment, actor and critic are created in
    a shared TensorFlow session, trained with ``train`` and evaluated with
    ``test``. The evaluation result is stored in the module-level ``pareto``
    dict under ``str(weights)``. After each weight vector the current
    non-dominated set is saved with ``np.save`` under
    ``pareto_front_progress/`` and the whole dict is written to ``pareto_dict``.
    """
    with tf.Session() as sess:

        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        # Throwaway environment, used only to read the space dimensions.
        probe_env = PendulumEnvMulti()
        try:
            state_dim = probe_env.observation_space.shape[0]
            action_dim = probe_env.action_space.shape[0]
            action_bound = probe_env.action_space.high
        finally:
            probe_env.close()

        # Wrapping the list (not the enumerate object) lets tqdm know the total.
        for ind, w in enumerate(tqdm(gradient_weights)):
            env = PendulumEnvMulti()
            # Seed the environment that is actually trained on; seeding only a
            # discarded instance has no effect on the training environments.
            env.seed(RANDOM_SEED)

            actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                                 ACTOR_LEARNING_RATE, TAU)

            critic = CriticNetwork(sess, state_dim, action_dim,
                                   CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars())

            actor.noise = f2_noise.one_fsq_noise()

            if GYM_MONITOR_EN:
                if not RENDER_ENV:
                    env = wrappers.Monitor(
                        env, MONITOR_DIR, video_callable=False, force=True)
                else:
                    # Record a video every 20th episode.
                    video_cal = lambda x: x % 20 == 0
                    env = wrappers.Monitor(env, MONITOR_DIR, video_callable=video_cal, force=True)

            try:
                train(sess, env, actor, critic, action_dim, action_bound, w)
            finally:
                env.close()

            # Evaluate once and keep the result (a second, discarded evaluation
            # would only double the test time).
            pareto[str(w)] = test(w, actor, MAX_EPISODES_TEST, action_bound)

            front = remove_dominated(np.array(list(pareto.values())))
            np.save(save_plots_dir + "/pareto_front_progress/" + str(ind + 1) + str("_weights_samples"), front)
            with open(save_plots_dir + "/pareto_dict", "w") as file_pareto:
                file_pareto.write(str(pareto))

In [11]:
# Max number of training episodes per weight vector (upper bound of the episode loop in train())
MAX_EPISODES = 1000
# Number of test episodes that test() averages over for each trained policy
MAX_EPISODES_TEST = 400

In [13]:
# Storing the results of all found policies: main() fills this module-level
# dict with str(weight vector) -> mean discounted reward vector returned by test().
pareto = dict()

main()