# Lunar Landing

Training a double dueling deep Q-learning agent to solve the [Lunar Lander environment](https://gymnasium.farama.org/environments/box2d/lunar_lander/) from [Gymnasium](https://gymnasium.farama.org/).

## Installing packages and importing libraries

### Installing NumPy and PyTorch

In [None]:
%pip install numpy
%pip install torch
%pip install torchvision

### Installing Gymnasium

In [None]:
%pip install gymnasium
%pip install swig # Necessary to build the wheel for box2d-py
%pip install gymnasium[box2d] # Contains lunar lander environment

### Importing Libraries

In [3]:
import os
import random
import numpy as np
from collections import deque, namedtuple

# Pytorch stuff

import torch
import torch.nn as nn # Neural network library
import torch.optim as optim # Optimizer to train AI
import torch.nn.functional as F # Activation function
import torch.autograd as autograd # Automatic differentiation engine used for backpropagation
from torch.autograd import Variable


## Building AI

### Neural Net Architecture

In [4]:
class DuelingDQN(nn.Module):
    """
    Dueling Deep Q-Network.

    Dueling separates the value of the state from the value of the actions
    possible in that state. A shared two-layer trunk feeds two heads:

    * a value stream producing one scalar V(s), and
    * an advantage stream producing one value A(s, a) per action.

    The heads are recombined as Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).

    Args:
        state_size: Number of features in one state vector.
        action_size: Number of discrete actions; one Q-value is produced per action.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size

        # Shared fully connected layers
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)

        # Value stream
        self.value_fc = nn.Linear(128, 64)
        self.value_out = nn.Linear(64, 1)

        # Advantage stream
        self.advantage_fc = nn.Linear(128, 64)
        self.advantage_out = nn.Linear(64, action_size)

    def forward(self, state):
        """
        Compute the Q-values of every action for the given state(s).

        Args:
            state: Float tensor whose last dimension has size ``state_size``.
                Both a batch ``(batch, state_size)`` and a single unbatched
                state ``(state_size,)`` are accepted.

        Returns:
            Tensor with the same leading dimensions as ``state`` and a last
            dimension of size ``action_size``.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))

        # State value stream
        value = F.relu(self.value_fc(x))
        value = self.value_out(value)

        # Advantage stream
        advantage = F.relu(self.advantage_fc(x))
        advantage = self.advantage_out(advantage)

        # Combine value and advantage. The mean is taken over the action
        # dimension, addressed as the last axis so unbatched input also works.
        q_values = value + (advantage - advantage.mean(dim=-1, keepdim=True))
        return q_values

## Training AI

### Set up Lunar Lander environment

In [5]:
import gymnasium as gym

# Create the Lunar Lander environment the agent is trained on
env = gym.make("LunarLander-v3")

# The network's input and output sizes are read from the environment's spaces
lunar_state_size = env.observation_space.shape[0]
lunar_action_size = env.action_space.n

# Sanity check that the environment is set up as expected:
# as of Lunar Lander v3 there are 8 state features and 4 actions
assert lunar_state_size == 8
assert lunar_action_size == 4

### Initialize hyperparameters

In [6]:
# Step size of the Adam optimizer that trains the local Q-network
learning_rate = 0.0005
# Number of experiences drawn from replay memory for each learning update
mini_batch_size = 100
# Gamma: weight applied to the next-state Q-value when building the Q-target
discount_factor = 0.99
# Number of experiences stored in memory (1 million is too slow on Colab but would provide better results per episode)
replay_buffer_size = 100_000
# Interpolation weight of the local network in the soft update of the target network
tau = 0.001

### Experience Replay

In [7]:
class ReplayMemory(object):
    """
    Fixed-capacity buffer of experiences with uniform random sampling.

    Each experience is expected to be an indexable
    ``(state, action, reward, next_state, done)`` record. The buffer is a
    ``deque`` created with ``maxlen=capacity``, so appending to a full buffer
    discards the oldest experience automatically.

    Args:
        capacity: Maximum number of experiences kept in memory.
    """

    def __init__(self, capacity) -> None:
        # GPU acceleration if possible
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.capacity = capacity
        self.memory = deque(maxlen=capacity)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.memory)

    def push(self, event):
        """
        Add an experience to the replay memory buffer.

        No explicit capacity check is needed: the deque's ``maxlen`` already
        evicts the oldest entry when the buffer is full.
        """
        self.memory.append(event)

    def sample(self, batch_size):
        """
        Randomly sample experiences from memory.

        Args:
            batch_size: Number of experiences to draw (without replacement).

        Returns:
            Tuple ``(states, next_states, actions, rewards, dones)`` of tensors
            on ``self.device``, each with one row per sampled experience.
            Note the order differs from the order inside a stored experience.
            ``actions`` is a long tensor; all others are float tensors, with
            ``dones`` holding 0.0 / 1.0.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored experiences.
        """
        # Entries that are None are skipped, so the batch may be shorter than requested
        experiences = [e for e in random.sample(self.memory, batch_size) if e is not None]

        def column(index):
            # Stack field `index` of every experience into one 2-D array (one row per experience)
            return np.vstack([e[index] for e in experiences])

        # Convert elements of experience to PyTorch tensors and move them to device
        states = torch.from_numpy(column(0)).float().to(self.device)
        actions = torch.from_numpy(column(1)).long().to(self.device)  # Actions are discrete indices
        rewards = torch.from_numpy(column(2)).float().to(self.device)
        next_states = torch.from_numpy(column(3)).float().to(self.device)

        # Convert Boolean data to float tensor
        dones = torch.from_numpy(column(4).astype(np.uint8)).float().to(self.device)

        return (states, next_states, actions, rewards, dones)

### Double Deep Q Network

In [8]:
class Agent():
    """
    Use double deep Q-learning with gradient clipping to reduce overestimation bias.

    The agent owns a local Q-network (trained by Adam), a target Q-network
    (moved toward the local one by soft updates) and a replay memory. It reads
    the module-level hyperparameters ``learning_rate``, ``replay_buffer_size``,
    ``mini_batch_size``, ``discount_factor`` and ``tau``.

    Args:
        state_size: Number of features in one state vector.
        action_size: Number of discrete actions.
    """

    # A learning update is attempted once every this many calls to step()
    LEARN_EVERY = 4

    def __init__(self, state_size, action_size) -> None:
        # GPU acceleration if possible
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.state_size = state_size
        self.action_size = action_size

        # Q learning
        self.q_network = DuelingDQN(state_size, action_size).to(self.device)  # Local Q network
        self.target_network = DuelingDQN(state_size, action_size).to(self.device)  # Target Q network

        # Only the local network is optimized; the target network changes via soft_update()
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate)

        self.memory = ReplayMemory(replay_buffer_size)

        self.time_step = 0  # Counts calls to step() modulo LEARN_EVERY

    def step(self, state, action, reward, next_state, done):
        """
        Store one transition and, every ``LEARN_EVERY`` calls, learn from a sampled mini-batch.

        Learning is skipped until the memory holds more than ``mini_batch_size`` experiences.
        """
        self.memory.push((state, action, reward, next_state, done))  # Add experience to replay memory

        self.time_step = (self.time_step + 1) % self.LEARN_EVERY

        if self.time_step == 0 and len(self.memory.memory) > mini_batch_size:
            experiences = self.memory.sample(mini_batch_size)
            self.learn(experiences, discount_factor)

    def act(self, state, epsilon=0):
        """
        Select action based on given state in environment using epsilon-greedy action selection policy.

        Args:
            state: NumPy array holding a single (unbatched) state.
            epsilon: Probability-like threshold; a random action is taken when a
                uniform draw from [0, 1) is not greater than it.

        Returns:
            The chosen action index as a NumPy integer.
        """
        # Exploration: no forward pass is needed for a random action
        if random.random() <= epsilon:
            return random.choice(np.arange(self.action_size))

        # Important to add dimension that includes which batch the state belongs to:
        # first dimension of the new state tensor is the batch index
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        self.q_network.eval()
        with torch.no_grad():  # Disable gradient computation (inference only)
            action_values = self.q_network(state)
        self.q_network.train()

        # Exploitation: action with the highest predicted Q-value
        return np.argmax(action_values.cpu().numpy())

    def learn(self, experiences, gamma):
        """
        Update Q-values based on sampled experiences.

        Args:
            experiences: Tuple ``(states, next_states, actions, rewards, dones)`` of tensors,
                in the order returned by ``ReplayMemory.sample``.
            gamma: Discount factor applied to the next-state Q-values.
        """
        states, next_states, actions, rewards, dones = experiences

        # The target is a constant for the loss: build it without autograd so
        # backward() does not run through (or accumulate gradients in) the target network.
        with torch.no_grad():
            # Get best actions from the local Q-network
            next_actions = self.q_network(next_states).argmax(1).unsqueeze(1)

            # Get corresponding Q-values from the target Q-network
            next_q_targets = self.target_network(next_states).gather(1, next_actions)

            # Compute target Q-values; (1 - dones) removes the bootstrap term for finished episodes
            target_q_values = rewards + (gamma * next_q_targets * (1 - dones))

        # Compute current Q-values of the actions that were actually taken
        predicted_q_values = self.q_network(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(predicted_q_values, target_q_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.q_network.parameters(), 1.0)  # Gradient clipping
        self.optimizer.step()

        # Soft update target network
        self.soft_update(self.q_network, self.target_network, tau)

    def soft_update(self, local_model, target_model, tau):
        """
        Update target network parameters based on weighted average of local network and target network parameters.

        Each target parameter becomes ``tau * local + (1 - tau) * target``.
        Soft update prevents abrupt changes in target network parameters that could destabilize the training.
        """
        with torch.no_grad():
            for local_param, target_param in zip(local_model.parameters(), target_model.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)  # Soft update formula

### Train an Agent

In [9]:
# Create the agent, sized to the Lunar Lander state and action spaces
agent = Agent(
    state_size=lunar_state_size,
    action_size=lunar_action_size,
)

In [10]:
# Training budget: upper bounds on the number of episodes and on the steps within one episode
num_episodes, max_time_steps_per_episode = 2000, 1000

# Epsilon-greedy schedule: starting value, multiplicative decay factor (slow decay) and floor
epsilon_start, epsilon_decay, epsilon_min = 1.0, 0.995, 0.01

# Exploration rate currently in effect; begins at the start of the schedule
epsilon = epsilon_start

# Rolling window that keeps the scores of the 100 most recent episodes
window_of_scores = deque(maxlen=100)

In [None]:
# Final training loop: run episodes until the episode budget is spent or the
# average score over the rolling window reaches 200

for episode in range(1, num_episodes + 1):
    # Reset environment to initial state
    state, _ = env.reset()
    score = 0

    # Agent learning

    for _ in range(max_time_steps_per_episode):
        action = agent.act(state=state, epsilon=epsilon)

        next_state, reward, terminated, truncated, _ = env.step(action)

        # Only genuine termination is stored as `done`, so a truncated episode
        # does not teach the agent that the final state had no future value
        agent.step(state, action, reward, next_state, terminated)

        state = next_state
        score += reward

        # The episode is over in either case and the environment must be reset
        if terminated or truncated:
            break

    window_of_scores.append(score)
    epsilon = max(epsilon_min, epsilon_decay * epsilon)  # Decay epsilon
    average_score = np.mean(window_of_scores)

    # Print progress to get feedback that the agent is working
    # (\r lets the newly printed line overwrite the previous one)
    print(f"\rEpisode: {episode}\tScore: {score}\tAverage Score: {average_score}", end="")

    # Keep one progress line per 100 episodes
    if episode % 100 == 0:
        print("")

    # Successful episodes score 200 or above, so the model is successful on average
    if average_score >= 200:
        print(f"\nEnvironment solved in {episode} episodes!\t Average Score: {average_score}")

        torch.save(agent.q_network.state_dict(), "lunar_landing_model.pth")  # Save parameters to PyTorch file

        break  # No more training needed


## Visualization

### Imports

In [13]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display
from gymnasium.wrappers import RecordVideo

### Video

In [None]:

def show_video_of_model(agent, env_name):
    """
    Run one episode with the agent and save the rendered frames to ``video.mp4``.

    Args:
        agent: Object whose ``act(state)`` returns an action index for a state.
        env_name: Gymnasium environment id passed to ``gym.make``.
    """
    env = gym.make(env_name, render_mode='rgb_array')
    frames = []
    try:
        state, _ = env.reset()
        done = False
        while not done:
            frames.append(env.render())
            # act() is called with its default epsilon of 0, i.e. greedy action selection
            action = agent.act(state)
            # int() accepts both NumPy integer scalars and plain Python ints
            state, _, terminated, truncated, _ = env.step(int(action))
            # Stop on truncation too; otherwise a truncated episode would be stepped further
            done = terminated or truncated
    finally:
        # Release the environment even if acting or stepping raised
        env.close()

    imageio.mimsave('video.mp4', frames, fps=30)

# Run one episode with the trained agent and save the rendered frames as video.mp4
show_video_of_model(agent, 'LunarLander-v3')

def show_video():
    """
    Show a video in the notebook.

    Looks for ``*.mp4`` files in the current working directory and embeds the
    first match as a base64 data URI in an HTML ``<video>`` element. Prints a
    message when no ``.mp4`` file is found.
    """
    mp4list = glob.glob('*.mp4')
    if not mp4list:
        print("Could not find video")
        return

    mp4 = mp4list[0]
    # Read-only binary access is sufficient; the context manager closes the file
    with open(mp4, 'rb') as video_file:
        video = video_file.read()
    encoded = base64.b64encode(video)
    display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

# Embed the first .mp4 file found in the working directory in the notebook output
show_video()