In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import gym

from torch import nn
from torch.nn import functional as F
from torch.optim import Adam
from collections import deque

# Gym environment id and the single environment instance used by the cells below.
GAME = 'LunarLander-v2'
env = gym.make(GAME)

# NOTE(review): LOG_PATH is not referenced anywhere else in the visible
# notebook — confirm it is still needed.
LOG_PATH = 'log/'

In [2]:
# number of discrete actions; used as the output size of the Q-network
ACTIONS = env.action_space.n
ACTIONS

4

In [3]:
# shape of one observation; its first entry is used as the Q-network input size
STATE_SHAPE = env.observation_space.shape
STATE_SHAPE

(8,)

In [4]:
# number of units in each fully connected hidden layer of the DQN
FC_SHAPE = 128

In [5]:
class DQN(nn.Module):
    """Fully connected Q-network: three ReLU layers followed by a linear head.

    Args:
        in_shape: number of input features (length of the state vector).
        out_shape: number of outputs (one value per action).
    """

    def __init__(self, in_shape, out_shape):
        super().__init__()
        # Attribute names and creation order are kept stable: they determine
        # the state_dict keys and the order in which weights are initialised.
        self.fc1 = nn.Linear(in_features=in_shape, out_features=FC_SHAPE)
        self.fc2 = nn.Linear(in_features=FC_SHAPE, out_features=FC_SHAPE)
        self.fc3 = nn.Linear(in_features=FC_SHAPE, out_features=FC_SHAPE)
        self.out = nn.Linear(in_features=FC_SHAPE, out_features=out_shape)

    def forward(self, x):
        """Return the raw (un-activated) outputs of the head for input ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.out(hidden)
    

In [6]:
BATCH_SIZE = 32             # transitions sampled for each gradient update
LEARNING_RATE = 0.001       # learning rate passed to Adam

EPSILON_DECAY = 0.996       # multiplicative decay, applied at each environment step
MAX_EPSILON = 1             # initial exploration rate
MIN_EPSILON = 0.01          # lower bound of the exploration rate
GAMMA = 0.99                # discount
SAVE_FREQUENCY = 50         # frequency(in episodes) in which the model is saved
MIN_EXP_REPLAY = 1000       # min. size of exp. replay before training starts

In [7]:
# Q-network (state vector in, one value per action out) and its Adam optimizer
model = DQN(STATE_SHAPE[0], ACTIONS)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

In [8]:
class Memory(deque):
    """Replay buffer: a ``deque`` extended with uniform random sampling."""

    def sample(self, n):
        """Return a list of ``n`` stored items drawn without replacement.

        Positions are drawn with NumPy's global random state; asking for more
        items than are stored makes ``np.random.choice`` raise ``ValueError``.
        """
        picked = np.random.choice(len(self), size=n, replace=False)
        return [self[position] for position in picked]

    @property
    def size(self):
        """Number of items currently stored."""
        return self.__len__()

In [9]:
# replay buffer; bounded deque, so once 500000 items are stored each append
# discards the oldest one
experience_replay = Memory(maxlen=500000)

In [10]:
# Discount factor as a float32 tensor. ``as_tensor`` accepts an existing
# tensor, so re-running this cell does not trigger torch's copy-construct warning.
GAMMA = torch.as_tensor(GAMMA, dtype=torch.float32)


def update_step(experiences):
    """Run one Q-learning gradient step on a minibatch.

    Args:
        experiences: sequence of ``(state, action, next_state, reward, done)``
            tuples. The batch size is taken from the sequence itself, so any
            length works; an empty sequence is a no-op.

    Side effects:
        Updates the global ``model`` in place through the global ``optimizer``.
    """
    if len(experiences) == 0:
        return

    # transpose the batch: one tuple per field instead of one per transition
    states, actions, next_states, rewards, dones = zip(*experiences)

    # prepare tensors with explicit dtypes so the whole loss stays in float32
    # (rewards would otherwise become float64 and promote everything)
    states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
    next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32)
    rewards = torch.as_tensor(np.asarray(rewards), dtype=torch.float32)
    dones = torch.as_tensor(np.asarray(dones), dtype=torch.bool)
    actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64)

    optimizer.zero_grad()

    # Q(s, a) of the action that was actually taken in each transition
    q_values = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # bootstrap target; according to the algorithm no gradient flows through it,
    # and terminal transitions (done) contribute the reward only
    with torch.no_grad():
        q_values_next = model(next_states).max(dim=1).values
        targets = rewards + GAMMA * q_values_next * (~dones)

    td_errors = targets - q_values

    # MSE Error
    loss = (0.5 * (td_errors ** 2)).mean()

    # backpropagation
    loss.backward()
    optimizer.step()

In [11]:
# epsilon greedy policy
def get_action(state, epsilon):
    """Choose an action with an epsilon-greedy policy.

    Args:
        state: current observation, convertible to a float32 tensor.
        epsilon: probability of taking a random action.

    Returns:
        A sample from ``env.action_space`` with probability ``epsilon``;
        otherwise the index (``int``) of the largest value predicted by the
        global ``model`` for ``state``.
    """
    # np.random.uniform() outputs a random float between 0 and 1
    if np.random.uniform() < epsilon:
        # exploring
        return env.action_space.sample()

    # exploiting: pure inference, so do not build an autograd graph
    with torch.no_grad():
        q_vals = model(torch.as_tensor(state, dtype=torch.float32))
    # select the action with the highest q-value
    return torch.argmax(q_vals).item()

In [12]:
def fill_experience_replay():
    """Populate ``experience_replay`` with transitions from random actions.

    Steps ``env`` with sampled actions until the buffer holds
    ``MIN_EXP_REPLAY`` transitions, starting a new episode whenever one ends,
    and prints the fill percentage each time it is a multiple of 10.
    """
    state = env.reset()

    while len(experience_replay) < MIN_EXP_REPLAY:
        # take a random action and record the resulting transition
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        transition = (state, action, next_state, reward, done)
        experience_replay.append(transition)

        # percentage of fullness
        tmp = len(experience_replay)/MIN_EXP_REPLAY*100
        if tmp % 10 == 0:
            print(f"Exp. replay {tmp}%")

        # continue the episode, or start a new one when it has ended
        state = env.reset() if done else next_state

In [13]:
def save_model(fname):
    """Save the weights of the global ``model`` to ``model/<fname>.h5``.

    The ``model/`` directory is created when missing, so the first save on a
    fresh checkout does not fail.

    Args:
        fname: file name without directory or extension.
    """
    import os  # local import keeps this cell self-contained

    os.makedirs("model", exist_ok=True)
    # NOTE: despite the .h5 suffix the file is written by torch.save
    # (PyTorch's own serialization), not HDF5; load it with torch.load.
    torch.save(model.state_dict(), f"model/{fname}.h5")

In [None]:
# Training loop: plays MAX_EPISODES episodes, doing one gradient update per
# environment step, and saves the model on new best episode rewards, every
# SAVE_FREQUENCY episodes, and on manual interruption.
MAX_EPISODES = 500

# per-run state; re-running this cell restarts the counters and epsilon
n_episodes = 0
epsilon = MAX_EPSILON
episode_reward = 0
total_rewards = []           # reward of every finished episode
total_mean_rewards = []      # mean of the last (up to) 100 episode rewards, one entry per episode
highest_reward = float('-inf')

# filling experience replay
if len(experience_replay) < MIN_EXP_REPLAY: fill_experience_replay()

state = env.reset()

try:
    while n_episodes < MAX_EPISODES:

        # action selection, according to the epsilon-greedy policy
        action = get_action(state, epsilon)

        # performing the action (and rendering every step)
        next_state, reward, done, _ = env.step(action)
        env.render()

        # adding experience to experience replay
        experience_replay.append((state, action, next_state, reward, done))

        # learning phase: one update on a randomly sampled minibatch
        minibatch = experience_replay.sample(min(len(experience_replay), BATCH_SIZE))
        update_step(minibatch)
        
        # decrease epsilon (once per step, never below MIN_EPSILON)
        epsilon = max(MIN_EPSILON, EPSILON_DECAY*epsilon)

        # update actual state
        state = next_state

        episode_reward += reward

        # end of the episode
        if done:
            n_episodes += 1
            total_rewards.append(episode_reward)
            # running mean over the most recent 100 episodes (fewer at the start)
            mean_reward = np.mean(total_rewards[-100:])
            total_mean_rewards.append(mean_reward)
            # the epsilon printed here is the value after this step's decay
            print("Episode %d\t Epsilon %.2f\t Ep.Reward %.2f\t Mean Rew. %.2f" % (n_episodes, epsilon, episode_reward, mean_reward))

            # ties also count as a new best, so the file is overwritten
            if episode_reward >= highest_reward:
                print("New best score: %.2f. Saving" % episode_reward)
                save_model("best")
                highest_reward = episode_reward

            # periodic checkpoint
            if n_episodes % SAVE_FREQUENCY == 0:
                save_model(f"auto_{n_episodes}_episodes")

            episode_reward = 0
            
            state = env.reset()
    
except KeyboardInterrupt:
    # manual stop: keep the current weights, named after the reward
    # accumulated so far in the unfinished episode
    print("Training interrupted manually...")
    save_model(f"manual_{int(episode_reward)}")

Exp. replay 10.0%
Exp. replay 20.0%
Exp. replay 30.0%
Exp. replay 40.0%
Exp. replay 50.0%
Exp. replay 60.0%
Exp. replay 70.0%
Exp. replay 80.0%
Exp. replay 90.0%
Exp. replay 100.0%
Episode 1	 Epsilon 0.67	 Ep.Reward -23.94	 Mean Rew. -23.94
New best score: -23.94. Saving
Episode 2	 Epsilon 0.42	 Ep.Reward -236.03	 Mean Rew. -129.98
Episode 3	 Epsilon 0.06	 Ep.Reward -72.55	 Mean Rew. -110.84
Episode 4	 Epsilon 0.04	 Ep.Reward -163.48	 Mean Rew. -124.00
Episode 5	 Epsilon 0.01	 Ep.Reward -133.90	 Mean Rew. -125.98
Episode 6	 Epsilon 0.01	 Ep.Reward -116.40	 Mean Rew. -124.38
Episode 7	 Epsilon 0.01	 Ep.Reward 85.55	 Mean Rew. -94.39
New best score: 85.55. Saving
Episode 8	 Epsilon 0.01	 Ep.Reward -177.47	 Mean Rew. -104.78
Episode 9	 Epsilon 0.01	 Ep.Reward -88.35	 Mean Rew. -102.95
Episode 10	 Epsilon 0.01	 Ep.Reward -74.69	 Mean Rew. -100.13
Episode 11	 Epsilon 0.01	 Ep.Reward -22.42	 Mean Rew. -93.06
Episode 12	 Epsilon 0.01	 Ep.Reward -62.89	 Mean Rew. -90.55
Episode 13	 Epsilon 0.0

In [None]:
# persist the per-episode running mean rewards in NumPy's .npy format
np.save('rewards.npy', total_mean_rewards)

In [None]:
# learning curve: running mean reward, one point per finished episode
fig, ax = plt.subplots()
ax.plot(total_mean_rewards)
ax.set_title("Mean rewards")
ax.set_xlabel('Episode')
ax.set_ylabel("Reward")
ax.grid(True)
plt.show()