In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import gym

from tqdm import trange, tqdm
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam
from collections import deque

# Gym environment id to train on.
GAME = 'LunarLander-v2'
# Shared environment instance; later cells read its spaces and step it for data collection and training.
env = gym.make(GAME)

# NOTE(review): LOG_PATH is not referenced by any of the visible cells — confirm it is still needed.
LOG_PATH = 'log/'

In [2]:
# Size of the environment's action space; used as the Q-network's output width.
ACTIONS = env.action_space.n
ACTIONS

4

In [3]:
# Shape of a single observation; its first entry is used as the Q-network's input width.
STATE_SHAPE = env.observation_space.shape
STATE_SHAPE

(8,)

In [4]:
# Number of units in each fully-connected hidden layer of the DQN.
FC_SHAPE = 128

In [5]:
class DQN(nn.Module):
    """Fully-connected Q-network: three ReLU hidden layers and a linear head.

    Given a batch of states, the network returns one Q-value estimate per
    action.
    """

    def __init__(self, in_shape, out_shape, hidden_shape=None):
        """Build the layers.

        Args:
            in_shape: number of input features (size of one state vector).
            out_shape: number of outputs (one Q-value per action).
            hidden_shape: width of the hidden layers. When omitted, the
                module-level ``FC_SHAPE`` is used, which reproduces the
                original architecture.
        """
        super().__init__()
        if hidden_shape is None:
            hidden_shape = FC_SHAPE
        # Attribute names (and creation order) are kept stable so that
        # previously saved state_dicts still load and initialisation under a
        # fixed seed is unchanged.
        self.fc1 = nn.Linear(in_shape, hidden_shape)
        self.fc2 = nn.Linear(hidden_shape, hidden_shape)  # hidden 1
        self.fc3 = nn.Linear(hidden_shape, hidden_shape)  # hidden 2
        self.out = nn.Linear(hidden_shape, out_shape)

    def forward(self, x):
        """Return the raw (no final activation) Q-values for input ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.out(x)

In [6]:
# Training hyper-parameters shared by the cells below.
BATCH_SIZE = 32             # transitions drawn from the replay buffer per update
LEARNING_RATE = 0.001       # learning rate passed to Adam
TAU = 1e-3                  # soft-update step
SOFT_UPDATE = True          # True: blend target weights with TAU; False: hard copy

# NOTE(review): epsilon is multiplied by EPSILON_DECAY after every environment
# step, so it drops from MAX_EPSILON to MIN_EPSILON in roughly 1150 steps
# (ln(0.01) / ln(0.996)) — confirm per-step rather than per-episode decay is intended.
EPSILON_DECAY = 0.996       # at each step
MAX_EPSILON = 1
MIN_EPSILON = 0.01
GAMMA = 0.99                # discount
SAVE_FREQUENCY = 50         # frequency(in episodes) in which the model is saved
MIN_EXP_REPLAY = 1000       # min. size of exp. replay

In [7]:
# Two networks with identical architecture: policy_model is the one being
# optimised, target_model is used to evaluate bootstrapped targets.
policy_model = DQN(STATE_SHAPE[0], ACTIONS)
target_model = DQN(STATE_SHAPE[0], ACTIONS)
# Only the policy network's parameters are handed to the optimizer.
optimizer    = Adam(policy_model.parameters(), lr=LEARNING_RATE)

# set target_model parameters equal to model ones
# (the trailing ';' suppresses the cell's output)
target_model.load_state_dict(policy_model.state_dict());

In [8]:
def update_target_parameters(soft=False):
    """Synchronise ``target_model`` with ``policy_model``.

    Args:
        soft: if True, move every target parameter a small step towards the
            matching policy parameter:
            ``target = TAU * policy + (1 - TAU) * target``.
            If False, copy the policy weights into the target network.
    """
    if not soft:
        target_model.load_state_dict(policy_model.state_dict())
        return

    # The blend is bookkeeping rather than part of the loss, so it is done
    # with autograd disabled instead of reaching through `.data`.
    with torch.no_grad():
        for target_param, policy_param in zip(target_model.parameters(),
                                              policy_model.parameters()):
            target_param.copy_(TAU * policy_param + (1.0 - TAU) * target_param)

In [9]:
class Memory(deque):
    """Replay buffer: a deque that can also hand out random minibatches."""

    def sample(self, n):
        """Return a list of ``n`` distinct stored items chosen at random.

        Positions are drawn without replacement using NumPy's global RNG.
        """
        positions = np.random.choice(len(self), size=n, replace=False)
        return list(map(self.__getitem__, positions))

    @property
    def size(self):
        """Number of items currently held in the buffer."""
        return self.__len__()

In [10]:
# Replay buffer shared by the random pre-fill and the training loop.
# Because Memory is a deque, appending beyond maxlen discards the oldest entries.
experience_replay = Memory(maxlen=500000)

In [11]:
# GAMMA is deliberately left as the plain float from the hyper-parameter cell:
# re-binding it to a tensor here made this cell non-idempotent on re-run and
# is not needed for the arithmetic below.
def update_step(experiences):
    """Run one Double-DQN optimisation step on a minibatch.

    Loss = R + γ·Q(s′, argmax_a Q(s′, a; θ); θ−) − Q(s, a; θ), where
    θ are the ``policy_model`` parameters and θ− the ``target_model`` ones.
    The squared TD error is averaged over the batch, one optimizer step is
    taken, and the target network is then updated.

    Args:
        experiences: sequence of ``(state, action, next_state, reward, done)``
            tuples. Any batch length is accepted; an empty batch is a no-op.
    """
    if not experiences:
        return

    # Transpose the list of transitions into one sequence per field. Using the
    # actual batch (not BATCH_SIZE) keeps this working for short minibatches.
    states, actions, next_states, rewards, dones = zip(*experiences)
    states = torch.as_tensor(np.array(states), dtype=torch.float32)
    actions = torch.as_tensor(np.array(actions), dtype=torch.int64).unsqueeze(-1)
    next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)
    rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32)
    dones = torch.as_tensor(np.array(dones), dtype=torch.bool)

    # Q(s,a;θ): Q-value of the action that was actually taken.
    q_values_policy = policy_model(states).gather(1, actions).squeeze(-1)

    # The bootstrap target is treated as a constant, so no graph is built.
    with torch.no_grad():
        # argmax_a Q(s′,a;θ): the policy network SELECTS the next action...
        policy_actions = policy_model(next_states).argmax(1, keepdim=True)
        # ...and the target network EVALUATES it: Q(s′, ·; θ−).
        q_values_next_target = target_model(next_states).gather(1, policy_actions).squeeze(-1)
        # Terminal transitions (done=True) contribute the reward only.
        targets = rewards + GAMMA * q_values_next_target * (~dones)

    td_errors = targets - q_values_policy

    # MSE Error (loss over the batch)
    loss = (0.5 * (td_errors ** 2)).mean()

    # optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # update target_model parameters
    update_target_parameters(SOFT_UPDATE)

In [12]:
# epsilon greedy policy
def get_action(state, epsilon):
    """Choose an action with an epsilon-greedy policy.

    Args:
        state: current observation, as returned by the environment.
        epsilon: probability (0..1) of taking a random exploratory action.

    Returns:
        The action to perform: a random sample from the action space when
        exploring, otherwise the index of the highest Q-value predicted by
        ``policy_model`` (as a plain ``int``).
    """
    # np.random.uniform() draws a float in [0, 1)
    if np.random.uniform() < epsilon:
        # exploring
        return env.action_space.sample()

    # exploiting: inference only, so skip building an autograd graph and make
    # the input dtype explicit instead of inheriting it from the environment.
    with torch.no_grad():
        q_vals = policy_model(torch.as_tensor(state, dtype=torch.float32))
    # select action with higher q_values
    return int(np.argmax(q_vals.numpy()))

In [13]:
def fill_experience_replay():
    """Append MIN_EXP_REPLAY random-policy transitions to the replay buffer.

    Actions are sampled uniformly from the action space; the environment is
    reset whenever an episode ends. A progress line is printed each time the
    buffer size hits an exact multiple of 10% of MIN_EXP_REPLAY.
    """
    state = env.reset()

    # fill experience replay buffer with random experiences
    for _ in tqdm(range(MIN_EXP_REPLAY)):
        # perform a random action
        action = env.action_space.sample()
        next_state, reward, done, _info = env.step(action)
        # append to experience_replay
        experience_replay.append((state, action, next_state, reward, done))

        # Integer arithmetic keeps the 10% milestones exact; the previous
        # float expression relied on `x / N * 100 % 10 == 0` being exact.
        filled = len(experience_replay)
        if (filled * 10) % MIN_EXP_REPLAY == 0:
            print(f"Exp. replay {filled * 100 / MIN_EXP_REPLAY}%")

        # start a fresh episode once the current one has ended
        state = env.reset() if done else next_state

In [14]:
def save_model(fname):
    """Save the policy network's state_dict to ``model/<fname>.h5``.

    Args:
        fname: file name without directory or extension.
    """
    import os  # local import keeps this cell self-contained

    # torch.save does not create missing directories; without this the first
    # save fails on a fresh checkout.
    os.makedirs("model", exist_ok=True)
    torch.save(policy_model.state_dict(), f"model/{fname}.h5")

In [None]:
# Training driver: interacts with the environment one step at a time, stores
# every transition, performs one update_step per environment step, and keeps
# per-episode statistics. Stops after MAX_EPISODES finished episodes or on Ctrl-C.
MAX_EPISODES = 500

n_episodes = 0                     # number of finished episodes
epsilon = MAX_EPSILON              # current exploration probability
episode_reward = 0                 # reward accumulated in the running episode
total_rewards = []                 # total reward of each finished episode
total_mean_rewards = []            # mean of the last (up to) 100 episode rewards, per episode
highest_reward = float('-inf')     # best single-episode reward seen so far

# filling experience replay
# (skipped when the buffer already holds at least MIN_EXP_REPLAY transitions)
if len(experience_replay) < MIN_EXP_REPLAY: fill_experience_replay()

state = env.reset()

try:
    while n_episodes < MAX_EPISODES:

        # action selection, according to the epsilon-greedy policy
        action = get_action(state, epsilon)

        # performing the action (the fourth return value is discarded)
        next_state, reward, done, _ = env.step(action)
        # NOTE(review): rendering happens on every step — presumably this slows
        # training; confirm it is wanted outside of debugging.
        env.render()

        # adding experience to experience replay
        experience_replay.append((state, action, next_state, reward, done))

        # learning phase: one update per environment step; the requested batch
        # is capped by the number of stored transitions
        minibatch = experience_replay.sample(min(len(experience_replay), BATCH_SIZE))
        update_step(minibatch)
        
        # decrease epsilon (once per step, never below MIN_EPSILON)
        epsilon = max(MIN_EPSILON, EPSILON_DECAY*epsilon)

        # update actual state
        state = next_state

        episode_reward += reward

        # end of the episode
        if done:
            n_episodes += 1
            total_rewards.append(episode_reward)
            # running mean over the most recent 100 episodes (fewer early on)
            mean_reward = np.mean(total_rewards[-100:])
            total_mean_rewards.append(mean_reward)
            print("Episode %d\t Epsilon %.2f\t Ep.Reward %.2f\t Mean Rew. %.2f" % (n_episodes, epsilon, episode_reward, mean_reward))

            # '>=' means a tie with the current best also overwrites the "best" checkpoint
            if episode_reward >= highest_reward:
                print("New best score: %.2f. Saving" % episode_reward)
                save_model("best")
                highest_reward = episode_reward

            # periodic checkpoint every SAVE_FREQUENCY episodes
            if n_episodes % SAVE_FREQUENCY == 0:
                save_model(f"auto_{n_episodes}_episodes")

            episode_reward = 0
            
            state = env.reset()
    
except KeyboardInterrupt:
    # Manual stop: keep the current weights, named after the reward
    # accumulated so far in the unfinished episode.
    print("Training interrupted manually...")
    save_model(f"manual_{int(episode_reward)}")

 86%|████████▌ | 860/1000 [00:00<00:00, 4363.10it/s]

Exp. replay 10.0%
Exp. replay 20.0%
Exp. replay 30.0%
Exp. replay 40.0%
Exp. replay 50.0%
Exp. replay 60.0%
Exp. replay 70.0%
Exp. replay 80.0%
Exp. replay 90.0%


100%|██████████| 1000/1000 [00:00<00:00, 4242.76it/s]


Exp. replay 100.0%
Episode 1	 Epsilon 0.66	 Ep.Reward -106.66	 Mean Rew. -106.66
New best score: -106.66. Saving
Episode 2	 Epsilon 0.44	 Ep.Reward -176.44	 Mean Rew. -141.55
Episode 3	 Epsilon 0.19	 Ep.Reward -237.97	 Mean Rew. -173.69
Episode 4	 Epsilon 0.12	 Ep.Reward -116.32	 Mean Rew. -159.35
Episode 5	 Epsilon 0.09	 Ep.Reward -306.76	 Mean Rew. -188.83
Episode 6	 Epsilon 0.06	 Ep.Reward -334.61	 Mean Rew. -213.13
Episode 7	 Epsilon 0.03	 Ep.Reward -79.76	 Mean Rew. -194.07
New best score: -79.76. Saving
Episode 8	 Epsilon 0.02	 Ep.Reward -90.22	 Mean Rew. -181.09
Episode 9	 Epsilon 0.01	 Ep.Reward -44.49	 Mean Rew. -165.91
New best score: -44.49. Saving
Episode 10	 Epsilon 0.01	 Ep.Reward -179.60	 Mean Rew. -167.28
Episode 11	 Epsilon 0.01	 Ep.Reward -90.98	 Mean Rew. -160.35
Episode 12	 Epsilon 0.01	 Ep.Reward -99.05	 Mean Rew. -155.24
Episode 13	 Epsilon 0.01	 Ep.Reward -274.34	 Mean Rew. -164.40
Episode 14	 Epsilon 0.01	 Ep.Reward -285.59	 Mean Rew. -173.06
Episode 15	 Epsilon

In [None]:
# Persist the running-mean reward curve so it can be reloaded without retraining.
with open('rewards.npy', 'wb') as rewards_file:
    np.save(rewards_file, total_mean_rewards)

In [None]:
# Learning curve: running mean of the episode reward, one point per episode.
fig, ax = plt.subplots()
ax.plot(total_mean_rewards)
ax.set_title("Mean rewards")
ax.set_xlabel('Episode')
ax.set_ylabel("Reward")
ax.grid(True)
plt.show()