In [1]:
import threading
import multiprocessing
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.contrib.slim as slim
import scipy.signal
from helper import *
from random import choice
from time import sleep
from time import time

import os
import glob
import time
from datetime import datetime

import numpy as np
import gymnasium as gym
from LocationEnv import *
import queue

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import numpy as np

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
# Copies one set of variables to another.
# Used to set worker network parameters to those of global network.
def update_target_graph(from_scope,to_scope):
    """Build the ops that copy trainable variables from one scope to another.

    The trainable variables collected under ``from_scope`` and ``to_scope``
    are paired positionally; for each pair an assign op is created that
    writes the source variable's value into the destination variable.

    Args:
        from_scope: scope name whose trainable variables are the source.
        to_scope: scope name whose trainable variables are overwritten.

    Returns:
        A list with one assign op per (source, destination) variable pair.
    """
    collection_key = tf.GraphKeys.TRAINABLE_VARIABLES
    source_vars = tf.get_collection(collection_key, from_scope)
    dest_vars = tf.get_collection(collection_key, to_scope)

    # zip() stops at the shorter list, so unmatched variables are skipped.
    return [dest.assign(src) for src, dest in zip(source_vars, dest_vars)]

# Discounting function used to calculate discounted returns.
def discount(x, gamma):
    """Return the discounted cumulative sum of ``x`` along axis 0.

    Element ``i`` of the result equals ``x[i] + gamma * x[i+1] +
    gamma**2 * x[i+2] + ...``. This is computed by running the first-order
    IIR filter ``y[n] = x[n] + gamma * y[n-1]`` over the reversed sequence
    and reversing the output again.

    Args:
        x: sliceable sequence/array of per-step values.
        gamma: discount factor.

    Returns:
        Array of the same length as ``x`` holding the discounted sums.
    """
    backwards = x[::-1]
    accumulated = scipy.signal.lfilter([1], [1, -gamma], backwards, axis=0)
    return accumulated[::-1]



In [3]:
class AC_Network():
    """Recurrent actor-critic network (policy head + value head).

    The graph is built inside ``variable_scope(scope)``. Per time step it
    consumes:
      * ``inputs``        - float32, shape [None, num_loc, 2]
      * ``prev_rewards``  - float32, shape [None, 1]
      * ``prev_actions``  - int32,   shape [None] (one-hot encoded to num_loc)
      * ``timestep``      - float32, shape [None, 1]

    The observation goes through a 1x2 convolution (32 filters) and a
    16-unit dense layer; the previous reward goes through a 12-unit dense
    layer. Both are concatenated with the one-hot previous action and the
    timestep and fed, as a single sequence (batch of 1), to an LSTM whose
    size equals the concatenated width. The LSTM output feeds a softmax
    policy head (``num_loc`` outputs) and a linear value head (1 output).

    For every scope other than ``'global'`` the loss, its gradients with
    respect to the local variables, and an op that applies those (clipped)
    gradients to the ``'global'`` variables are created as well.

    Args:
        scope: variable scope name; ``'global'`` builds the network without
            any loss/gradient ops.
        trainer: optimizer used to build ``apply_grads``; only used when
            ``scope != 'global'``.
        num_loc: number of actions / rows of the observation.
    """
    def __init__(self,scope,trainer,num_loc):
        with tf.compat.v1.variable_scope(scope):
            # ---- Inputs ------------------------------------------------
            self.inputs = tf.placeholder(shape=[None,num_loc,2],dtype=tf.float32)
            self.prev_rewards = tf.placeholder(shape=[None,1],dtype=tf.float32)
            self.prev_actions = tf.placeholder(shape=[None],dtype=tf.int32)
            self.timestep = tf.placeholder(shape=[None,1],dtype=tf.float32)

            self.prev_actions_onehot = tf.one_hot(self.prev_actions,num_loc,dtype=tf.float32)
            # Add a trailing channel axis so the observation can go through conv2d.
            self.imageIn = tf.reshape(self.inputs,shape=[-1,num_loc,2,1])

            # ---- Feature extraction ------------------------------------
            # NOTE: the order in which layers are created must stay fixed;
            # update_target_graph pairs global/local variables by position.
            self.conv1 = tf.layers.conv2d(
                activation=tf.nn.elu,
                inputs=self.imageIn,filters=32,
                kernel_size=[1,2],strides=[1,1],
                padding='VALID',
                kernel_initializer=tf.contrib.layers.xavier_initializer())

            hidden1 = slim.fully_connected(slim.flatten(self.conv1),16,activation_fn=tf.nn.elu,weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=None)

            hidden2 = slim.fully_connected(self.prev_rewards,12,activation_fn=tf.nn.elu,weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=None)

            hidden = tf.concat([hidden1,hidden2,self.prev_actions_onehot,self.timestep],1)
            # Width of `hidden`: 16 (hidden1) + 12 (hidden2) + one-hot action + timestep.
            hidd_full_size = 16+12+num_loc+1

            # ---- Recurrent core ----------------------------------------
            lstm_cell = tf.contrib.rnn.BasicLSTMCell(hidd_full_size,state_is_tuple=True)
            c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
            h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
            self.state_init = [c_init, h_init]
            c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
            h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
            self.state_in = (c_in, h_in)
            # The fed rows are treated as the time axis of a single sequence.
            rnn_in = tf.expand_dims(hidden, [0])

            step_size = tf.shape(self.imageIn)[:1]
            state_in = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
            lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
                lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size,
                time_major=False)
            lstm_c, lstm_h = lstm_state
            self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
            rnn_out = tf.reshape(lstm_outputs, [-1, hidd_full_size])

            # ---- Output heads ------------------------------------------
            self.policy = slim.fully_connected(rnn_out,num_loc,
                activation_fn=tf.nn.softmax,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=None)
            self.value = slim.fully_connected(rnn_out,1,
                activation_fn=None,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                biases_initializer=None)

            self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
            self.actions_onehot = tf.one_hot(self.actions,num_loc,dtype=tf.float32)

            #Only the worker network need ops for loss functions and gradient updating.
            if scope != 'global':

                self.target_v = tf.placeholder(shape=[None],dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],dtype=tf.float32)

                # Probability the policy assigned to the action actually taken.
                self.responsible_outputs = tf.reduce_sum(self.policy * self.actions_onehot, [1])

                # A softmax output can underflow to exactly 0 in float32;
                # log(0) = -inf would turn the entropy (0 * -inf) and the
                # policy loss into NaN/inf and poison every gradient.
                # Clipping is the identity (gradient 1) for p in [1e-10, 1],
                # so ordinary updates are unchanged.
                log_policy = tf.log(tf.clip_by_value(self.policy, 1e-10, 1.0))
                log_responsible = tf.log(
                    tf.clip_by_value(self.responsible_outputs, 1e-10, 1.0))

                #Loss functions
                self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1])))
                self.entropy = - tf.reduce_sum(self.policy * log_policy)
                self.policy_loss = -tf.reduce_sum(log_responsible*self.advantages)
                self.loss = 0.1 * self.value_loss + self.policy_loss - self.entropy * 0.005

                #Get gradients from local network using local losses
                local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss,local_vars)
                self.var_norms = tf.global_norm(local_vars)
                # Rescale gradients so their global norm does not exceed 10.
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,10.0)

                #Apply local gradients to global network
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(zip(grads,global_vars))

In [4]:
class Worker():
    """One asynchronous actor-learner.

    A worker owns a local ``AC_Network`` (scope ``worker_<name>``) and an
    environment. In ``work`` it repeatedly syncs the local network from the
    ``'global'`` scope, collects experience, and calls ``train`` to push
    gradients to the global network.

    Args:
        name: worker index; the scope name becomes ``"worker_" + str(name)``.
        trainer: optimizer handed to the local ``AC_Network``.
        global_episodes: variable incremented once per outer loop iteration.
        env: environment object; ``reset_day``, ``reset`` and ``step`` are
            called on it.
        num_loc: number of actions, forwarded to ``AC_Network``.
        shuffle: stored on the instance (not read inside this class).
        max_reward: stored on the instance (not read inside this class).
    """
    def __init__(self,name,trainer,global_episodes,env,num_loc,shuffle,max_reward):
        self.name = "worker_" + str(name)
        self.number = name
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []

        #Create the local copy of the network and the tensorflow op to copy global parameters to local network
        self.local_AC = AC_Network(self.name,trainer,num_loc)
        self.update_local_ops = update_target_graph('global',self.name)

        self.env = env

        self.max_reward = max_reward
        self.shuffle = shuffle

    @staticmethod
    def _sample_action(a_dist):
        """Sample an action index from the policy output ``a_dist``.

        ``a_dist`` is the policy output for a single step (first axis of
        length 1); row 0 holds the action probabilities.

        The index is drawn directly. Drawing a probability *value* and
        mapping it back with ``argmax(a_dist == value)`` always resolves
        ties to the lowest index, so equally likely actions would not be
        sampled uniformly.
        """
        probs = a_dist[0]
        return np.random.choice(len(probs), p=probs)

    def train(self,rollout,sess,gamma,bootstrap_value):
        """Run one gradient update on the global network from ``rollout``.

        Args:
            rollout: list of steps ``[s, a, r, s1, d, v, t]`` as appended
                by ``work``.
            sess: TensorFlow session used to run the update.
            gamma: discount factor.
            bootstrap_value: value estimate appended after the last step.

        Returns:
            Tuple ``(value_loss, policy_loss, entropy, grad_norm, var_norm)``
            where the first three are divided by ``len(rollout)``.
        """
        # Rows mix arrays and scalars, so the array must be an object array;
        # being explicit avoids NumPy's deprecated implicit ragged handling.
        rollout = np.array(rollout, dtype=object)
        observations = rollout[:,0]
        actions = rollout[:,1]
        rewards = rollout[:,2]
        values = rollout[:,5]
        timesteps = rollout[:,6]
        # Inputs at step i are the reward/action of step i-1 (0 for the first).
        # NOTE(review): work() resets r, a and the RNN state at the start of
        # every episode, while here the whole buffer is treated as one
        # sequence - confirm that this mismatch is intended.
        prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = [0] + actions[:-1].tolist()

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus,gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        # One-step TD errors, then discounted-summed.
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages,gamma)

        self.batch_rnn_state = self.local_AC.state_init

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {self.local_AC.target_v:discounted_rewards,
            self.local_AC.inputs:np.stack(observations,axis=0),
            self.local_AC.prev_rewards:np.vstack(prev_rewards),
            self.local_AC.prev_actions:prev_actions,
            self.local_AC.timestep:np.vstack(timesteps),
            self.local_AC.actions:actions,
            self.local_AC.advantages:advantages,
            self.local_AC.state_in[0]:self.batch_rnn_state[0],
            self.local_AC.state_in[1]:self.batch_rnn_state[1]}
        v_l,p_l,e_l,g_n,v_n, self.batch_rnn_state,_ = sess.run([self.local_AC.value_loss,
            self.local_AC.policy_loss,
            self.local_AC.entropy,
            self.local_AC.grad_norms,
            self.local_AC.var_norms,
            self.local_AC.state_out,
            self.local_AC.apply_grads],
            feed_dict=feed_dict)
        return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n

    def work(self,max_episode_length,gamma,sess,coord,saver):
        """Collect experience and train until ``coord`` requests a stop.

        Each outer iteration (a "day") syncs the local network from the
        global one, runs ``max_day_length`` episodes of
        ``max_episode_length`` steps each, then trains once on all
        collected steps with a bootstrap value of 0.0.

        ``max_day_length`` is read from module-level globals (it is set in
        the notebook's configuration cell), and ``random`` must be
        importable in the notebook namespace. ``saver`` is accepted but not
        used here.
        """
        print ("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            day_count = 0
            while not coord.should_stop():
                # Pull the latest global parameters into the local network.
                sess.run(self.update_local_ops)
                episode_buffer = []
                episode_count = 0
                location_constant  =  random.choice([1])
                self.env.reset_day(location_constant)
                rnn_state = self.local_AC.state_init
                day_rewards = []

                while episode_count< max_day_length :

                    s = self.env.reset()
                    episode_step_count = 0

                    # Previous reward/action start at 0; the timestep input
                    # is the index of the episode within the day.
                    r = 0
                    a = 0
                    t = episode_count
                    episode_rewards = []
                    rnn_state = self.local_AC.state_init


                    while episode_step_count < max_episode_length:
                        a_dist,v,rnn_state = sess.run([self.local_AC.policy,self.local_AC.value,self.local_AC.state_out],
                            feed_dict={self.local_AC.inputs:[s],
                            self.local_AC.prev_rewards:[[r]],
                            self.local_AC.timestep:[[t]],
                            self.local_AC.prev_actions:[a],

                            self.local_AC.state_in[0]:rnn_state[0],
                            self.local_AC.state_in[1]:rnn_state[1]})
                        a = self._sample_action(a_dist)
                        s1, r, d, _ = self.env.step(a)
                        episode_rewards.append(r)

                        episode_buffer.append([s,a,r,s1,d,v[0,0],t])

                        s = s1
                        episode_step_count += 1

                    day_rewards.append(sum(episode_rewards))
                    episode_count += 1


                if len(episode_buffer) != 0:
                    v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0)

                # Only the first worker reports progress.
                if self.name == 'worker_0':
                    print(day_count,sum(day_rewards[:10]),sum(day_rewards[10:]) ,day_rewards)

                sess.run(self.increment)
                day_count = day_count+1
                
            

In [5]:
# --- Run configuration ------------------------------------------------------
gamma = 0.99                      # discount factor passed to the workers
num_loc = 4                       # number of locations (= number of actions)
max_episode_length = num_loc      # steps per episode
max_day_length = 20               # episodes per "day"; read as a global by Worker.work
num_workers = 10                  # number of worker threads to create
load_model = False                # True -> restore from a checkpoint instead of initializing

In [6]:
# Build the graph: one global network plus one local network per worker.
tf.reset_default_graph()
shuffle = True
with tf.device("/cpu:0"):
    global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False)
    trainer = tf.train.AdamOptimizer(learning_rate=1e-3)
    master_network = AC_Network('global',None,num_loc) # Generate global network
    workers = []
    # Create worker classes
    for i in range(num_workers):
        tempenv = LocationEnvironment3()
        workers.append(Worker(i,trainer,global_episodes,tempenv,num_loc,shuffle,0))
    saver = tf.train.Saver(max_to_keep=5)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    if load_model:
        print ('Loading Model...')
        # NOTE(review): model_path is not defined in the visible notebook
        # cells - it must be set before enabling load_model.
        ckpt = tf.train.get_checkpoint_state(model_path)
        saver.restore(sess,ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    # This is where the asynchronous magic happens.
    # Start the "work" process for each worker in a separate thread.
    worker_threads = []
    for worker in workers:
        # Bind the method of *this* worker when the thread is created. A
        # `lambda: worker.work(...)` looks `worker` up only when it runs, so
        # a thread that starts late could run the next worker's loop.
        t = threading.Thread(
            target=worker.work,
            args=(max_episode_length,gamma,sess,coord,saver))
        t.start()
        # Stagger worker start-up.
        sleep(0.5)
        worker_threads.append(t)
    # Blocks until every worker thread has finished.
    coord.join(worker_threads)




Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where









2023-11-14 18:53:09.550421: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-14 18:53:09.554786: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2918400000 Hz
2023-11-14 18:53:09.555487: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x4ba7e60 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-11-14 18:53:09.555497: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
OMP: Info #155: KMP_AFFINITY: Initial OS proc set respected: 0-19
OMP: Info #216: KMP_AFFINITY: decoding x2APIC ids.
OMP: Info #157: KMP_AFFINITY: 20 available OS procs
OMP: Info #159: KMP_AFFINITY: Nonuniform topology
OMP: Info 

Starting worker 0


OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52795 thread 2 bound to OS proc set 4
OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52819 thread 3 bound to OS proc set 6
OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52820 thread 4 bound to OS proc set 8
OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52821 thread 5 bound to OS proc set 10
OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52822 thread 6 bound to OS proc set 12
OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52823 thread 7 bound to OS proc set 13
OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52824 thread 8 bound to OS proc set 14
OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52825 thread 9 bound to OS proc set 15
OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52826 thread 10 bound to OS proc set 16
OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52827 thread 11 bound to OS proc set 17
OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52828 thread 12 bound to OS proc set 18
OMP: Info #254: KMP_AFFINITY: pid 52722 tid 52829 thread 13 bound to OS proc set 19
OMP

Starting worker 1
Starting worker 2
0 11 15 [4, 1, 0, 0, 1, 1, 2, 0, 2, 0, 1, 2, 1, 2, 1, 1, 2, 1, 3, 1]
Starting worker 3
Starting worker 4
Starting worker 5
Starting worker 6
1 16 7 [2, 2, 2, 1, 1, 2, 3, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 3, 0, 2]
Starting worker 7
Starting worker 8
Starting worker 9
2 10 14 [1, 1, 1, 2, 0, 2, 0, 1, 2, 0, 2, 1, 1, 0, 2, 0, 4, 2, 1, 1]
3 8 11 [0, 0, 0, 1, 2, 0, 1, 1, 2, 1, 2, 1, 2, 0, 1, 2, 1, 0, 1, 1]
4 6 14 [0, 1, 0, 0, 2, 1, 2, 0, 0, 0, 1, 1, 2, 1, 2, 2, 2, 0, 1, 2]
5 13 6 [1, 2, 1, 1, 3, 1, 1, 0, 2, 1, 0, 0, 2, 0, 1, 1, 0, 0, 1, 1]
6 11 13 [0, 3, 1, 2, 1, 1, 0, 2, 0, 1, 2, 1, 2, 3, 1, 0, 1, 0, 1, 2]
7 12 16 [0, 3, 1, 0, 2, 1, 1, 1, 0, 3, 3, 1, 2, 3, 2, 0, 2, 2, 0, 1]
8 15 5 [1, 2, 0, 1, 1, 2, 2, 1, 2, 3, 0, 2, 1, 0, 0, 0, 0, 1, 1, 0]
9 8 12 [1, 1, 0, 0, 2, 1, 0, 1, 1, 1, 2, 1, 0, 3, 2, 0, 1, 2, 0, 1]
10 7 8 [1, 0, 2, 1, 2, 0, 0, 0, 1, 0, 2, 1, 0, 0, 2, 1, 0, 0, 0, 2]
11 8 11 [0, 0, 2, 0, 2, 1, 0, 1, 0, 2, 1, 0, 1, 1, 2, 1, 2, 0, 0, 3]
12 12 8 [0, 1, 0,

: 