In [None]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import tensorflow as tf
import random
import os

# Model Definition

In [None]:
# Hyper-parameters shared by every DQNSolver instance.
GAMMA = 0.95            # discount applied to the best next-state Q-value
LEARNING_RATE = 0.001   # step size handed to the Adam optimizer

MEMORY_SIZE = 1000000   # maximum number of transitions kept in the replay buffer
BATCH_SIZE = 20         # transitions sampled per experience_replay() call

EXPLORATION_MAX = 1.0     # initial epsilon of the epsilon-greedy policy
EXPLORATION_MIN = 0.01    # epsilon never decays below this floor
EXPLORATION_DECAY = 0.995  # epsilon is multiplied by this after each replay


class DQNSolver:
    """Epsilon-greedy deep Q-learning agent with a replay buffer.

    The Q-function is a fully connected network (16-32-64-32-16 ReLU units,
    linear output with one value per action) trained with MSE loss and Adam.

    Attributes:
        exploration_rate: Current epsilon; starts at ``EXPLORATION_MAX`` and
            is decayed by ``experience_replay``.
        action_space: Number of discrete actions.
        memory: Bounded deque of ``(state, action, reward, next_state, done)``
            tuples.
        model: The compiled Keras model.
    """

    def __init__(self, observation_space, action_space):
        """Build the network.

        Args:
            observation_space: Length of one observation vector.
            action_space: Number of discrete actions (size of the output).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(16, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(32, activation="relu"))
        self.model.add(Dense(64, activation="relu"))
        self.model.add(Dense(32, activation="relu"))
        self.model.add(Dense(16, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))

        # Current Keras spells the argument ``learning_rate`` (the legacy
        # ``lr`` keyword is rejected by recent releases); old releases only
        # know ``lr`` and raise TypeError for unknown optimizer keywords.
        try:
            optimizer = Adam(learning_rate=LEARNING_RATE)
        except TypeError:
            optimizer = Adam(lr=LEARNING_RATE)
        self.model.compile(loss="mse", optimizer=optimizer)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Observation in the form accepted by ``model.predict``;
                only the first row of the prediction is used.

        Returns:
            int: A uniformly random action with probability
            ``exploration_rate``, otherwise the index of the largest
            predicted Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        # verbose=0: recent Keras versions otherwise print a progress bar
        # for every single call.
        q_values = self.model.predict(state, verbose=0)
        # Plain int so both branches return the same type.
        return int(np.argmax(q_values[0]))

    def experience_replay(self):
        """Train on a random minibatch from memory and decay epsilon.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.
        For each sampled transition the Q-value of the taken action is
        regressed towards ``reward`` (terminal) or
        ``reward + GAMMA * max(Q(next_state))`` (non-terminal).

        All predictions are made in two batched calls and all updates in one
        ``fit`` call instead of three Keras calls per transition.  The
        consequence is that every target in the minibatch is computed from
        the network as it was before this minibatch's updates.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)

        # vstack turns the stored observations into one
        # (BATCH_SIZE, observation_size) array per side of the transition.
        states = np.vstack([transition[0] for transition in batch])
        next_states = np.vstack([transition[3] for transition in batch])

        q_values = self.model.predict(states, verbose=0)
        next_q_values = self.model.predict(next_states, verbose=0)

        for row, (_, action, reward, _, terminal) in enumerate(batch):
            if terminal:
                q_update = reward
            else:
                q_update = reward + GAMMA * np.amax(next_q_values[row])
            # Only the taken action's value changes; the other outputs keep
            # their predicted value and so contribute zero error.
            q_values[row][action] = q_update

        # batch_size=1 / shuffle=False keeps one gradient step per sampled
        # transition, in sampling order.
        self.model.fit(states, q_values, batch_size=1, shuffle=False, verbose=0)

        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

# Training

In [None]:
# Trains two DQN agents on the same environment: one picks the first
# component of the action (2 choices), the other picks the second component
# (3 choices).  Both agents observe the same state and receive the same
# reward.  Checkpoints of both networks are written every 25 episodes.
max_timesteps = 300   # hard cap on steps per episode
max_episodes = 1000

# Directory that receives the periodic checkpoints.
model_dir = '../models/lunar_lander/dqn_mirror'
# Maps the horizontal agent's discrete choice to the value sent as action[1].
hori_thrust = (0, 1, -1)

env = gym.make('LunarLanderContinuous-v2')
observation_space = env.observation_space.shape[0]
dqn_solver_vert = DQNSolver(observation_space, 2)
dqn_solver_hori = DQNSolver(observation_space, 3)
run = 0
cur_path = os.getcwd()
# Create the checkpoint directory up front: not every Keras version creates
# missing parent directories when saving, and failing at episode 25 would
# throw away the training done so far.
os.makedirs(model_dir, exist_ok=True)
print("training started")
for run in range(1, max_episodes + 1):
    # NOTE(review): this uses the gym API where reset() returns only the
    # observation and step() returns a 4-tuple — confirm the installed gym
    # version matches.
    state = env.reset()
    state = np.reshape(state, [1, observation_space])
    total_reward = 0
    for i in range(max_timesteps):
        #env.render()
        dqn_solver_vert_action = dqn_solver_vert.act(state)
        dqn_solver_hori_action = dqn_solver_hori.act(state)

        # action[0] is 0 or 1 (vertical agent), action[1] is 0, 1 or -1
        # (horizontal agent) — presumably main-engine and lateral throttle;
        # verify against the environment's action space.
        action = [dqn_solver_vert_action, hori_thrust[dqn_solver_hori_action]]

        state_next, reward, terminal, info = env.step(action)
        total_reward += reward
        state_next = np.reshape(state_next, [1, observation_space])
        dqn_solver_vert.remember(state, dqn_solver_vert_action, reward, state_next, terminal)
        dqn_solver_hori.remember(state, dqn_solver_hori_action, reward, state_next, terminal)
        state = state_next
        if terminal:
            # The terminal transition is already stored; no replay is run on
            # the final step of an episode.
            break
        dqn_solver_vert.experience_replay()
        dqn_solver_hori.experience_replay()

    # episode number, current epsilon of the vertical agent, episode return
    print(run, dqn_solver_vert.exploration_rate, total_reward, sep="\t")

    if run % 25 == 0:
        file_path_vert = f"{model_dir}/lunar_lander_dqn_vert_{run}.h5"
        file_path_hori = f"{model_dir}/lunar_lander_dqn_hori_{run}.h5"
        dqn_solver_vert.model.save(file_path_vert)
        dqn_solver_hori.model.save(file_path_hori)