In [None]:
import os
from copy import deepcopy

import numpy as np

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

In [None]:
import gym
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import pandas as pd

In [None]:
class ReplayBuffer():
    """Fixed-capacity circular store of environment transitions.

    Each transition is (state, action, reward, new_state, done). Once
    ``mem_size`` transitions have been written, new ones overwrite the
    oldest slots.

    Args:
        mem_size: Maximum number of transitions kept at any time.
        batch_size: Number of transitions returned by ``sample_buffer``.
        input_dims: Shape of a single state, as an iterable of ints.
    """

    def __init__(self, mem_size, batch_size, input_dims):
        self.mem_size = mem_size
        # Total number of transitions ever stored (not capped at mem_size).
        # The write slot is this counter modulo mem_size. The attribute name
        # is kept unchanged because code outside this class reads it.
        self.mem_centr = 0
        self.batch_size = batch_size

        self.state_memory = np.zeros(
            (self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros(
            (self.mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # Holds 1 for a non-terminal transition and 0 for a terminal one
        # (see store_transitions), so it can be used directly as a mask.
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)

    def store_transitions(self, state, action, reward, new_state, done):
        """Write one transition into the next slot, wrapping around when full."""
        index = self.mem_centr % self.mem_size

        self.state_memory[index] = state
        self.new_state_memory[index] = new_state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        # Stored inverted: 0 when the episode ended on this transition.
        self.terminal_memory[index] = 1 - int(done)

        self.mem_centr += 1

    def is_sampleable(self):
        """Return True when a full batch can be drawn without replacement.

        The number of stored transitions is capped at ``mem_size``, so a
        buffer whose capacity is smaller than ``batch_size`` is never
        sampleable, no matter how many transitions have been written.
        """
        return min(self.mem_size, self.mem_centr) >= self.batch_size

    def sample_buffer(self):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns:
            A tuple ``(states, new_states, actions, rewards, terminals)`` of
            arrays whose first dimension is ``batch_size``; ``terminals`` is
            the 1/0 not-done mask. Returns an empty list when the buffer is
            not sampleable, so callers should check ``is_sampleable`` first.
        """
        if not self.is_sampleable():
            return []

        # Only slots that have actually been written are eligible.
        max_mem = min(self.mem_size, self.mem_centr)

        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        terminals = self.terminal_memory[batch]

        return states, new_states, actions, rewards, terminals

In [None]:
def build_dqn(lr, n_actions):
    """Create and compile the Q-value network.

    The network is two 100-unit ReLU hidden layers followed by a linear
    output layer with one unit per action. No input shape is declared here.

    Args:
        lr: Learning rate handed to the Adam optimizer.
        n_actions: Number of output units (one Q-value per action).

    Returns:
        A compiled ``keras.Sequential`` model using mean-squared-error loss.
    """
    hidden_layers = [
        keras.layers.Dense(100, activation='relu') for _ in range(2)
    ]
    q_value_head = keras.layers.Dense(n_actions, activation=None)

    network = keras.Sequential(hidden_layers + [q_value_head])
    network.compile(loss='mean_squared_error',
                    optimizer=Adam(learning_rate=lr))
    return network

In [None]:
class Agent():
    """Epsilon-greedy Q-learning agent with replay memory and a target network.

    The policy network is trained on batches sampled from a ReplayBuffer.
    The target network supplies the bootstrap values and is overwritten with
    the policy network's weights every ``target_update_frequency`` stored
    transitions.

    Args:
        n_actions: Number of discrete actions.
        input_dims: Shape of a single observation, as an iterable of ints.
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor applied to the bootstrapped next-state value.
        mem_size: Capacity of the replay buffer.
        batch_size: Number of transitions per training batch.
        epsilon_decay: Factor applied to epsilon after every training batch.
        target_update_frequency: Target network sync interval, counted in
            stored transitions.
    """

    def __init__(self, n_actions, input_dims,
                 lr=1e-4, gamma=0.9, mem_size=128, batch_size=64,
                 epsilon_decay=0.995, target_update_frequency=256):

        self.n_actions = n_actions
        # Kept so that networks can be rebuilt later (see load_model) without
        # depending on variables defined outside the class.
        self.input_dims = input_dims
        self.lr = lr

        self.gamma = gamma
        self.epsilon_decay = epsilon_decay

        self.batch_size = batch_size
        self.target_update_freq = target_update_frequency

        self.policy_network = self._build_network()
        # Build the target network separately and copy the weights across,
        # so both networks are guaranteed to start from identical parameters.
        self.target_network = self._build_network()
        self.target_network.set_weights(self.policy_network.get_weights())

        self.replay_mem = ReplayBuffer(
            mem_size=mem_size, batch_size=batch_size, input_dims=input_dims)

        # Start fully exploratory; train() decays this down to a floor of 0.1.
        self.epsilon = 1

    def _build_network(self):
        """Create a Q-network whose weights exist for this agent's input shape."""
        network = build_dqn(lr=self.lr, n_actions=self.n_actions)
        # build_dqn declares no input shape, so the model has no weights until
        # it is built; set_weights() needs them to exist.
        network.build((None, *self.input_dims))
        return network

    def choose_action(self, obs):
        """Pick an action for a single observation using epsilon-greedy.

        With probability ``self.epsilon`` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        if np.random.random() < self.epsilon:
            action = np.random.randint(self.n_actions)
        else:
            # predict() expects a batch dimension, so wrap the observation.
            obs = np.array([obs])
            policy_values = self.policy_network.predict(obs)
            action = np.argmax(policy_values)

        return action

    def store_memory(self, state, action, reward, new_state, done):
        """Record one transition in the replay buffer."""
        self.replay_mem.store_transitions(state, action, reward, new_state, done)

    def train(self):
        """Run one training step on a sampled batch.

        Returns:
            0 when the replay buffer cannot yet supply a batch; otherwise the
            loss value returned by ``train_on_batch``.
        """
        if not self.replay_mem.is_sampleable():
            return 0

        # The last array is the buffer's 1/0 mask: 0 marks a terminal
        # transition, which removes the bootstrap term below.
        states, new_states, actions, rewards, not_done = \
            self.replay_mem.sample_buffer()

        q_eval = self.policy_network.predict(states)
        q_next = self.target_network.predict(new_states)

        batch_index = np.arange(self.batch_size)

        # Start from the current predictions so that only the entries of the
        # actions actually taken contribute to the loss.
        q_target = q_eval.copy()
        q_target[batch_index, actions] = rewards + \
            self.gamma * np.max(q_next, axis=1) * not_done

        loss = self.policy_network.train_on_batch(states, q_target)

        self.epsilon = max(self.epsilon * self.epsilon_decay, 0.1)

        # The sync schedule is keyed on the number of transitions stored so far.
        if self.replay_mem.mem_centr % self.target_update_freq == 0:
            self.target_network.set_weights(self.policy_network.get_weights())

        return loss

    def save_model(self, file_path='./model/tf_ddqn_model.model'):
        """Save the policy network to ``file_path``."""
        self.policy_network.save(file_path)

    def load_model(self, file_path='./model/tf_ddqn_model.model'):
        """Load the policy network from ``file_path`` and resync the target.

        The target network is rebuilt from this agent's own settings and then
        given the loaded policy weights.
        """
        self.policy_network = load_model(file_path)

        self.target_network = self._build_network()
        self.target_network.set_weights(self.policy_network.get_weights())
        

In [None]:
# Switch TensorFlow to TF1-style graph execution. This has to run before any
# model is created, so keep this cell ahead of the Agent construction.
tf.compat.v1.disable_eager_execution()
# Only let TensorFlow log messages of level ERROR and above through.
tf.get_logger().setLevel('ERROR')

In [None]:
# Learning rate passed to the Agent (used for the Adam optimizer).
lr = 3e-4
# Discount factor applied to the bootstrapped next-state value.
gamma = 0.99

# Multiplied into epsilon after every training batch; the Agent floors epsilon at 0.1.
epsilon_decay = 1 - (2e-5)

# Number of training episodes to run.
episodes = 1000

In [None]:
# Replay buffer capacity (transitions) and the size of each sampled training batch.
mem_size = 1024
batch_size = 32

# The target network is synced whenever the count of stored transitions is a multiple of this.
target_update_frequency = 300

In [None]:
# Training environment; its action and observation spaces size the Agent below.
env = gym.make('LunarLander-v2')

In [None]:
# Training agent, sized from the environment and configured with the
# hyperparameters defined in the cells above.
agent = Agent(
    n_actions=env.action_space.n,
    input_dims=env.observation_space.shape,
    lr=lr,
    gamma=gamma,
    mem_size=mem_size,
    batch_size=batch_size,
    epsilon_decay=epsilon_decay,
    target_update_frequency=target_update_frequency,
)

In [None]:
# Per-episode training metrics, appended to by the training loop:
# total reward, epsilon at the end of the episode, and the list of per-step losses.
scores = []
eps = []
losses = []

In [None]:
# Training: play `episodes` episodes, storing every transition and running
# one training step after each environment step.
pbar = tqdm(range(episodes))

for i in pbar:
    obs = env.reset()
    done, score, ep_loss = False, 0, []

    while not done:
        action = agent.choose_action(obs)
        new_obs, reward, done, _ = env.step(action)
        score += reward

        agent.store_memory(
            state=obs,
            action=action,
            reward=reward,
            new_state=new_obs,
            done=done,
        )
        obs = deepcopy(new_obs)

        # train() returns 0 until the replay buffer can supply a full batch.
        loss = agent.train()
        ep_loss.append(loss)

    # Record this episode's metrics for the plots further down.
    scores.append(score)
    eps.append(agent.epsilon)
    losses.append(ep_loss)
    pbar.set_description("Current_score = %s" % score)

In [None]:
# Save the trained policy network to the Agent's default path, then release the environment.
agent.save_model()
env.close()

In [None]:
# Exploration rate recorded at the end of every training episode.
# savefig() does not create missing directories, so make sure the target exists.
os.makedirs('./plots/tf/ddqn', exist_ok=True)

plt.plot(eps, label="epsilon")
plt.xlabel("episode")
plt.ylabel("epsilon")
plt.legend()
plt.savefig('./plots/tf/ddqn/epsilon.png')
plt.show()

In [None]:
# Collapse each episode's list of per-step losses into a single mean value.
losses_array = [np.mean(np.array(x)) for x in losses]

In [None]:
# Mean training loss of each episode.
# savefig() does not create missing directories, so make sure the target exists.
os.makedirs('./plots/tf/ddqn', exist_ok=True)

plt.plot(losses_array, label="loss")
plt.xlabel("episode")
plt.ylabel("mean loss")
plt.legend()
plt.savefig('./plots/tf/ddqn/losses.png')
plt.show()

In [None]:
# Rolling-window length, in episodes.
resolution = 50

# NOTE: despite the name, this holds a rolling mean, not a cumulative sum.
# The first `resolution - 1` entries are NaN because the window is not yet full.
cumsum_losses = (
    pd.Series(np.array(losses_array))
    .rolling(window=resolution)
    .mean()
    .to_numpy()
)

# savefig() does not create missing directories, so make sure the target exists.
os.makedirs('./plots/tf/ddqn', exist_ok=True)

plt.plot(cumsum_losses, label="loss")
plt.xlabel("episode")
plt.ylabel("mean loss (rolling mean)")
plt.legend()
plt.savefig('./plots/tf/ddqn/losses_trend.png')
plt.show()

In [None]:
# Total reward collected in each training episode.
# savefig() does not create missing directories, so make sure the target exists.
os.makedirs('./plots/tf/ddqn', exist_ok=True)

plt.plot(scores, label="rewards")
plt.xlabel("episode")
plt.ylabel("total reward")
plt.legend()
plt.savefig('./plots/tf/ddqn/rewards.png')
plt.show()

In [None]:
# Rolling-window length, in episodes.
resolution = 50

# NOTE: despite the name, this holds a rolling mean, not a cumulative sum.
# The first `resolution - 1` entries are NaN because the window is not yet full.
cumsum_rewards = (
    pd.Series(np.array(scores))
    .rolling(window=resolution)
    .mean()
    .to_numpy()
)

# savefig() does not create missing directories, so make sure the target exists.
os.makedirs('./plots/tf/ddqn', exist_ok=True)

plt.plot(cumsum_rewards, label="rewards")
plt.xlabel("episode")
plt.ylabel("total reward (rolling mean)")
plt.legend()
plt.savefig('./plots/tf/ddqn/rewards_trend.png')
plt.show()

In [None]:
# Separate environment and agent for evaluating the saved model.
test_env = gym.make('LunarLander-v2')

test_agent = Agent(
    n_actions=test_env.action_space.n,
    input_dims=test_env.observation_space.shape,
)

# Restore the saved policy, then turn exploration off so every action is greedy.
test_agent.load_model()
test_agent.epsilon = 0.0

In [None]:
# Evaluation: play rendered episodes with the loaded agent and report each score.
test_episodes = 10

pbar = tqdm(range(test_episodes))

for i in pbar:
    obs = test_env.reset()
    test_env.render()
    score, done = 0, False

    while not done:
        action = test_agent.choose_action(obs)
        new_obs, reward, done, _ = test_env.step(action)
        test_env.render()

        score += reward
        obs = deepcopy(new_obs)

    pbar.set_description("Current_score = %s" % score)
    print("score in episode ", (i+1), " : ", score)

test_env.close()