#### Mounting Google Drive and Mapping the Module Path

In [3]:
# Colab-only setup: `drive` mounts Google Drive, `files` is used by the
# download cells further down.
from google.colab import drive, files
# Expose the user's Drive under /content/gdrive inside the Colab VM.
drive.mount('/content/gdrive')
import sys
# Let Python import modules that live in this Drive folder.
sys.path.append('/content/gdrive/MyDrive/Colab/reinforcement_learning/')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### Install Libraries

In [4]:
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!pip3 install box2d-py > /dev/null 2>&1
!pip install pyyaml h5py  # Required to save models in HDF5 format



#### Importing other dependencies

In [5]:
import numpy as np
import tensorflow as tf
import gym

#### Creating Classes

In [6]:
class Memory:
    """Fixed-capacity replay memory for (s, a, r, s', done) transitions.

    Transitions are stored in pre-allocated NumPy arrays. Once ``size``
    transitions have been added, the oldest entries are overwritten (ring
    buffer) instead of indexing past the end of the arrays.

    Attributes:
        s, sp: arrays of shape (size, states) with states / next states.
        a: array of shape (size, actions) with the actions taken.
        r, done: arrays of shape (size, 1) with rewards / terminal flags.
        n: number of valid transitions currently stored (never exceeds size).
        size: capacity of the buffer.
    """

    def __init__(self, states, actions, size=1000000):
        """Allocate storage.

        Args:
            states: number of state dimensions.
            actions: number of action dimensions.
            size: maximum number of transitions kept.

        Raises:
            ValueError: if ``size`` is not positive.
        """
        self.size = int(size)
        if self.size <= 0:
            raise ValueError("Memory size must be positive")

        # Zero-initialised so that no slot ever contains arbitrary memory.
        self.s = np.zeros((self.size, states))
        self.a = np.zeros((self.size, actions))
        self.r = np.zeros((self.size, 1))
        self.sp = np.zeros((self.size, states))
        self.done = np.zeros((self.size, 1))
        self.n = 0
        # Index of the slot the next transition is written to.
        self._next = 0

    def __len__(self):
        """Return the number of transitions available for sampling."""
        return self.n

    def add(self, s, a, r, sp, done):
        """Store one transition, overwriting the oldest one when full."""
        slot = self._next
        self.s[slot, :] = s
        self.a[slot, :] = a
        self.r[slot, :] = r
        self.sp[slot, :] = sp
        self.done[slot, :] = done

        # Advance the write pointer circularly; the fill level saturates.
        self._next = (slot + 1) % self.size
        self.n = min(self.n + 1, self.size)

    def sample(self, size):
        """Draw ``size`` stored transitions uniformly at random, with replacement.

        Returns:
            Tuple ``(s, a, r, sp, done)`` of arrays with ``size`` rows each.

        Raises:
            ValueError: if the memory is empty.
        """
        if self.n == 0:
            raise ValueError("Cannot sample from an empty memory")

        idx = np.random.randint(0, self.n, size)

        return self.s[idx], self.a[idx], self.r[idx], self.sp[idx], self.done[idx]


class Network:
    """Base class for networks that take a concatenated (state, action) input.

    Subclasses are expected to keep their underlying model in a private
    ``__model`` attribute (name-mangled by Python to ``_<Class>__model``)
    exposing ``get_weights()`` and ``set_weights()``.
    """

    def __init__(self, states, actions):
        """Remember the number of state and action dimensions."""
        self.states = states
        self.actions = actions

    def _wrapped_model(self):
        """Return the private ``__model`` set by a class in the MRO, or None.

        ``self.__model`` written inside this class would be mangled to
        ``_Network__model``, which subclasses never set, so the attribute is
        looked up under each class's own mangled name instead.
        """
        for klass in type(self).__mro__:
            mangled = "_%s__model" % klass.__name__.lstrip("_")
            found = getattr(self, mangled, None)
            if found is not None:
                return found
        return None

    def __ilshift__(self, other):
        """Copy the weights of ``other`` into this network (``self <<= other``).

        A network without an underlying model is left untouched.

        Raises:
            TypeError: if this network has a model but ``other`` does not
                provide one to copy from.
        """
        mine = self._wrapped_model()
        if mine is None:
            return self

        theirs = other._wrapped_model() if isinstance(other, Network) else None
        if theirs is None:
            raise TypeError("Right-hand side of <<= has no model to copy weights from")

        mine.set_weights(theirs.get_weights())

        return self

    def combine(self, s, a, force=False):
        """Build the network input matrix from states and actions.

        When the number of state rows differs from the number of action rows
        (or ``force`` is true) every state is paired with every action;
        otherwise rows are paired one-to-one. Note that a batch whose size
        happens to equal the number of actions is paired one-to-one unless
        ``force=True`` is passed.

        Args:
            s: scalar, vector or matrix of states.
            a: scalar, vector or matrix of actions.
            force: always build the full state x action product.

        Returns:
            ``(m, reshape)`` where ``m`` is a float32 matrix with
            ``states + actions`` columns and ``reshape`` is the shape
            ``(n_states, n_actions)`` or ``(n_states, 1)`` into which one
            output value per row of ``m`` can be reshaped.

        Raises:
            ValueError: if the inputs have more than two dimensions or their
                widths do not match ``self.states`` / ``self.actions``.
        """
        # Convert scalars to vectors
        s = np.atleast_1d(np.asarray(s, dtype=np.float32))
        a = np.atleast_1d(np.asarray(a, dtype=np.float32))

        # A 1-D vector is a column of samples when there is a single
        # state (or action) dimension, otherwise it is one sample.
        if self.states == 1 and len(s.shape) == 1 and s.shape[0] > 1:
            s = np.atleast_2d(s).transpose()

        if self.actions == 1 and len(a.shape) == 1 and a.shape[0] > 1:
            a = np.atleast_2d(a).transpose()

        # Normalize to matrices
        s = np.atleast_2d(s)
        a = np.atleast_2d(a)

        # Sanity checking
        if len(s.shape) > 2 or len(a.shape) > 2:
            raise ValueError("Input dimensionality not supported")

        if s.shape[1] != self.states:
            raise ValueError("State dimensionality does not match network")

        if a.shape[1] != self.actions:
            raise ValueError("Action dimensionality does not match network")

        # Replicate if necessary
        if s.shape[0] != a.shape[0] or force:
            reshape = (s.shape[0], a.shape[0])
            # Each state row is repeated once per action, and the action
            # block is tiled once per state, so rows are ordered state-major.
            s = np.repeat(s, reshape[1], axis=0)
            a = np.tile(a, (reshape[0], 1))
        else:
            reshape = (s.shape[0], 1)

        m = np.hstack((s, a))

        return m, reshape

    def combine2(self, s, a, force=False):
        """Horizontally stack ``s`` and ``a`` as float32 without any pairing.

        ``force`` is accepted for signature parity with ``combine`` and is
        not used.
        """
        # Convert scalars to vectors
        s = np.atleast_1d(np.asarray(s, dtype=np.float32))
        a = np.atleast_1d(np.asarray(a, dtype=np.float32))

        m = np.hstack((s, a))

        return m



class DQN(Network):
    """Keras network estimating a single value for a (state, action) input.

    The model takes ``states + actions`` inputs and has one linear output.
    It is either built from scratch or loaded from ``<model_name>.h5``.
    """

    def __init__(self, states, actions=1, hiddens=(25, 25), model_name='generic', load_model=False):
        """Create or load the underlying Keras model.

        Args:
            states: number of state dimensions.
            actions: number of action dimensions.
            hiddens: width of each hidden ReLU layer, in order. Only used
                when a new model is created.
            model_name: base file name (without ``.h5``) used for loading
                and for ``save_model``.
            load_model: load ``<model_name>.h5`` instead of building a model.
        """
        super(DQN, self).__init__(states, actions)

        self.load_model = load_model
        self.model_name = model_name

        if load_model:
            print('Loading previously created model...')
            self.__model = tf.keras.models.load_model(model_name+".h5")
        else:
            print('Creating model...')

            # One ReLU layer per entry of `hiddens`, then one linear output.
            specs = [(units, 'relu') for units in hiddens] + [(1, 'linear')]
            layers = []
            for position, (units, activation) in enumerate(specs):
                # Only the first layer declares the input width.
                extra = {'input_shape': (states+actions,)} if position == 0 else {}
                layers.append(tf.keras.layers.Dense(units, activation=activation, **extra))

            self.__model = tf.keras.models.Sequential(layers)

        # Both paths use the same loss and a fresh Adam optimizer.
        self.__model.compile(loss=tf.keras.losses.MeanSquaredError(),
                             optimizer=tf.keras.optimizers.Adam())

    def train(self, s, a, target):
        """Run one gradient step on inputs built with ``combine(s, a)``."""
        # combine() returns (matrix, reshape); only the matrix is model input.
        inputs, _ = self.combine(s, a)
        self.__model.train_on_batch(inputs, np.atleast_1d(target))

    def train2(self, s, a, target):
        """Run one gradient step on inputs built with ``combine2(s, a)``."""
        self.__model.train_on_batch(self.combine2(s, a), np.atleast_1d(target))

    def __call__(self, s, a):
        """Evaluate the model and return values shaped as ``combine`` reports.

        Returns:
            NumPy array of shape ``(n_states, n_actions)`` when states and
            actions were cross-paired, otherwise ``(n_states, 1)``.
        """
        inp, reshape = self.combine(s, a)
        return np.reshape(np.asarray(self.__model(inp)), reshape)

    def __ilshift__(self, other):
        """Copy the weights of another ``DQN`` into this one (``self <<= other``)."""
        self.__model.set_weights(other.__model.get_weights())

        return self

    def save_model(self):
        """Save the model to ``<model_name>.h5``."""
        self.__model.save(self.model_name+".h5")


#### Auxiliary Functions

In [7]:
def log(file_name, text=None, act='a'):
    """Write one line to ``<file_name>.csv``.

    Args:
        file_name: path of the log file without the ``.csv`` extension.
        text: value to write, followed by a newline. Non-string values are
            converted with ``str()``. When ``None`` the file is only opened
            (which creates or truncates it depending on ``act``).
        act: mode passed to ``open`` (``'a'`` appends, ``'w'`` overwrites).
    """
    # The context manager closes the file even if the write fails.
    with open(file_name + ".csv", act) as fout:
        if text is not None:
            fout.write(str(text) + '\n')

#### Creating simulation

In [11]:
# Simulation control variables
episode = 200    # number of training episodes
epsilon = 0.1    # probability of taking a random action (epsilon-greedy)
gamma = 0.98     # discount factor used in the TD target
interval = 200   # environment steps between target-network weight syncs
batch = 32       # number of transitions sampled per training step

# Creating the environment
env = gym.make("LunarLander-v2")
num_states = 8    # width of the observation vector fed to the network
num_actions = 1   # the action enters the network as a single scalar input
network_layers = [200,200]
network_name = 'Lunar_01_colab'   # base name for the saved model and the log file
target_name = 'Target'

"""
Permissible Actions
0- Do nothing
1- Fire left engine
2- Fire down engine
3- Fire right engine
"""
# Discrete action ids; every candidate is scored by the network per state.
actions = [0,1,2,3]

# System objects: online network, target network and replay memory.
# Both networks are created independently here; the target only receives the
# online weights at the first sync inside the training loop.
network = DQN(num_states,num_actions,network_layers, model_name=network_name)
target = DQN(num_states,num_actions,network_layers, model_name=target_name)
memory = Memory(num_states, num_actions)

# System counters: global step count and per-episode return.
n = 0
curve = np.zeros(episode)

# Log file
# NOTE(review): this handle is never written to or closed in the visible
# cells (logging goes through log()); confirm it is still needed.
fout = open("training_" + network_name + ".csv", 'a')


print('Model name:',network_name,'\nEnv States:',num_states,'\nEnv Actions:',num_actions)

Creating model...
Creating model...
Model name: Lunar_01_colab 
Env States: 8 
Env Actions: 1


#### Starting training

In [12]:
print('System read!\nStarting Training...')

# Training loop: one iteration per episode.
for i in range(episode):
    observation = env.reset()

    done = False

    while not done:

        # Global step counter, shared across episodes.
        n+=1
        # Render environment (disabled).
        if False:#i%episode == episode -2:
            env.render()

        # Greedy action: score every candidate action for the current
        # observation and keep the highest-valued one.
        action = actions[np.argmax(network(observation,actions))]

        # Epsilon-greedy exploration: occasionally replace it by a random action.
        if np.random.rand() < epsilon:
            action = np.random.choice(actions)

        # Keep the previous observation so the full transition can be stored.
        prev_obs = observation
        observation, reward, done, info = env.step(action)

        # Accumulate the episode return and save the transition to the replay memory.
        curve[i] += reward
        memory.add(prev_obs,action, reward, observation, done)

        # Start learning once more than 1000 transitions are stored.
        if len(memory) > 1000:
            bs, ba, br, bsp, bd = memory.sample(batch)

            # TD target: r + gamma * max_a' Q_target(s', a'); the (1 - bd)
            # factor drops the bootstrap term for terminal transitions.
            qsp = np.amax(target(bsp, actions), axis=1, keepdims=True)
            y = br + (1 - bd) * gamma * qsp

            # train2 stacks the sampled states and actions column-wise.
            network.train2(bs, ba, y)

            # Update the target network every <interval> steps of the training cycle.
            if n % interval == 0:
                target <<= network

    # Logging the episode return.
    log(network_name,str(curve[i]))
    
    # Saving the model every 10 training episodes.
    if i%10 == 0:
        network.save_model()

    print('%', i, curve[i])
  
# Saving the final trained network.
network.save_model()
print('Training done!')

System read!
Starting Training...
% 0 -633.9535294045146
% 1 -157.61806052361987
% 2 -621.7364698347156
% 3 -110.4155039049271
% 4 -106.3854060050523
% 5 -751.5113570525798
% 6 -641.9002266531268
% 7 -117.55186296076158
% 8 -535.8017206623601
% 9 -113.98614643142022
% 10 -566.0540093942907
% 11 -526.4377386924466
% 12 -584.4529921515475
% 13 -124.16306376017539
% 14 -137.29991245143268
% 15 -119.36373873682734
% 16 -123.36699722645484
% 17 -137.85147847401754
% 18 -162.86372072214297
% 19 -231.644141249262
% 20 -145.44308740126647
% 21 -389.90868328672013
% 22 -180.7341798006476
% 23 -504.49418448371756
% 24 -158.65352653275568
% 25 -198.3867931696583
% 26 -257.1344210373293
% 27 -61.143656571271485
% 28 -180.1558928088796
% 29 -201.98362994569288
% 30 -58.34031712633768
% 31 -217.39702446531572
% 32 -86.62836974523525
% 33 -189.39543482724758
% 34 -164.39795260285635
% 35 -120.74690273979013
% 36 -160.65525994910456
% 37 -131.6569949472182
% 38 -199.21882481178528
% 39 -159.5547076403

#### Downloading Logs and trained network files

In [13]:
# Download the per-episode return log and the saved model from the Colab VM.
files.download(network_name+'.csv')
files.download(network_name+'.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### Validating the agent

In [14]:
# Evaluation control variables
eval_episode = 5   # number of evaluation episodes

# Loading the trained network from <network_name>.h5; the layer sizes are
# not passed because they are only used when a new model is built.
net_eval = DQN(num_states,num_actions, model_name=network_name, load_model=True)

# System counters: step count and per-episode return (re-used names from training).
n = 0
curve = np.zeros(eval_episode)

print('Model name:',network_name,'\nEnv States:',num_states,'\nEnv Actions:',num_actions)
print('System read!\nStarting Rover Control...')

# Evaluation loop: no exploration and no learning.
for i in range(eval_episode):
    observation = env.reset()

    done = False

    while not done:

        n+=1
        # Render environment (disabled).
        if False:
            env.render()

        # Always take the highest-valued action.
        action = actions[np.argmax(net_eval(observation,actions))]


        # Performing the action and accumulating the episode return.
        observation, reward, done, info = env.step(action)
        curve[i] += reward

    # Final observation, last action and total return of the episode.
    print(observation, action, curve[i])

    # Logging the final observation of the episode.
    # NOTE(review): the printed arrays above wrap over several lines, so one
    # log entry may span more than one line of the file — confirm this is
    # intended, or whether curve[i] should be logged instead.
    log(network_name+'_eval',str(observation))

Loading previously created model...
Model name: Lunar_01_colab 
Env States: 8 
Env Actions: 1
System read!
Starting Rover Control...
[ 0.9241554  -0.03042135  0.14874214  0.02532626  0.17411397  0.07806657
  1.          0.        ] 3 -127.67678427551279
[ 3.3448793e-02 -7.5240375e-04  0.0000000e+00  0.0000000e+00
  1.0010009e-03  0.0000000e+00  1.0000000e+00  1.0000000e+00] 0 235.4905645296238
[0.33497256 0.04659971 0.         0.         0.25174886 0.
 1.         1.        ] 3 127.28208031848548
[ 1.1057711e-01 -7.8830955e-04  0.0000000e+00  0.0000000e+00
  2.8207485e-04  0.0000000e+00  0.0000000e+00  1.0000000e+00] 0 179.0705855262222
[-4.2154692e-02 -6.8466901e-04  0.0000000e+00  0.0000000e+00
 -1.2299978e-03  0.0000000e+00  1.0000000e+00  1.0000000e+00] 0 171.5192779077768


#### Download Evaluation log file

In [15]:
# Download the evaluation log written by the evaluation cell.
files.download(network_name+'_eval.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>