<a href="https://colab.research.google.com/github/archit36/projectcdac/blob/main/CapstoneProject_IISc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Development and Optimization of Deep Q-Networks for Autonomous Lunar Lander Control

The project focuses on designing and optimizing a Deep Q-Network (DQN) to autonomously control a lunar lander in a simulated environment. The primary challenge is to enable the lander to reach its target safely by learning optimal policies for thrust control, orientation, and landing speed.

In [16]:
# @title Update/Upgrade the system and install libs
!apt-get update > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get install -y swig build-essential python-dev python3-dev > /dev/null 2>&1
!apt-get install x11-utils > /dev/null 2>&1
!apt-get install xvfb > /dev/null 2>&1

In [19]:
# @title Install dependencies
!pip install rarfile --quiet
!pip install 'stable-baselines3[extra]' --quiet
!pip install ale-py --quiet
!pip install swig
!pip install gym --quiet
!pip install pyvirtualdisplay --quiet
!pip install pyglet --quiet
!pip install pygame --quiet
!pip install minigrid --quiet
!pip install -q swig --quiet
!pip install -q gymnasium --quiet
!pip install 'minigrid<=2.1.1' --quiet
!pip3 install box2d-py --quiet

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for AutoROM.accept-rom-license (pyproject.toml) ... [?25l[?25hdone
Collecting swig
  Using cached swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.6 kB)
Using cached swig-4.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
Installing collected packages: swig
Successfully installed swig-4.2.1
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m936.6/936.6 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for box2d-py (setup.py) ...

**Importing necessary libraries**

In [20]:
# Imports
import io
import os
import glob
import torch
import base64

import numpy as np
import matplotlib.pyplot as plt

import sys
import gymnasium
sys.modules["gym"] = gymnasium

import stable_baselines3
from stable_baselines3 import DQN
from stable_baselines3.common.results_plotter import ts2xy, load_results
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_util import make_atari_env

import gymnasium as gym
from gym import spaces
from gym.envs.box2d.lunar_lander import *
from gym.wrappers.monitoring.video_recorder import VideoRecorder

In [21]:
nn_layers = [64, 64]  # This is the configuration of your neural network. Currently, we have two layers, each consisting of 64 neurons.
                      # If you want three layers with 64 neurons each, set the value to [64,64,64] and so on.

learning_rate = 0.001  # This is the step-size with which the gradient descent

In [22]:
log_dir = "/tmp/gym/"
os.makedirs(log_dir, exist_ok=True)

# Create environment
env_name = 'LunarLander-v2'
env = gym.make(env_name)
# You can also load other environments like cartpole, MountainCar, Acrobot.
# Refer to https://gym.openai.com/docs/ for descriptions.

# For example, if you would like to load Cartpole,
# just replace the above statement with "env = gym.make('CartPole-v1')".

env = stable_baselines3.common.monitor.Monitor(env, log_dir )

callback = EvalCallback(env, log_path=log_dir, deterministic=True)  # For evaluating the performance of the agent periodically and logging the results.
policy_kwargs = dict(activation_fn=torch.nn.ReLU,
                     net_arch=nn_layers)
model = DQN("MlpPolicy", env,policy_kwargs = policy_kwargs,
            learning_rate=learning_rate,
            batch_size=1,  # for simplicity, we are not doing batch update.
            buffer_size=1,  # size of experience of replay buffer. Set to 1 as batch update is not done
            learning_starts=1,  # learning starts immediately!
            gamma=0.99,  # discount facto. range is between 0 and 1.
            tau = 1,  # the soft update coefficient for updating the target network
            target_update_interval=1,  # update the target network immediately.
            train_freq=(1,"step"),  # train the network at every step.
            max_grad_norm = 10,  # the maximum value for the gradient clipping
            exploration_initial_eps = 1,  # initial value of random action probability
            exploration_fraction = 0.5,  # fraction of entire training period over which the exploration rate is reduced
            gradient_steps = 1,  # number of gradient steps
            seed = 1,  # seed for the pseudo random generators
            verbose=0)  # Set verbose to 1 to observe training logs. We encourage you to set the verbose to 1.

# You can also experiment with other RL algorithms like A2C, PPO, DDPG etc.
# Refer to  https://stable-baselines3.readthedocs.io/en/master/guide/examples.html
# for documentation. For example, if you would like to run DDPG, just replace "DQN" above with "DDPG".

The input state of the Lunar Lander consists of following components:

Horizontal Position
Vertical Position
Horizontal Velocity
Vertical Velocity
Angle
Angular Velocity
Left Leg Contact
Right Leg Contact
The actions of the agents are:

Do Nothing
Fire Main Engine
Fire Left Engine
Fire Right Engine

In [23]:
env_name = 'LunarLander-v2'
env = gym.make(env_name)
print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)

State shape:  (8,)
Number of actions:  4


In [28]:
from IPython.display import HTML
from base64 import b64encode

def render_mp4(path):
    """Render an MP4 file as HTML video."""
    video = open(path, "rb").read()
    encoded = b64encode(video).decode('ascii')
    return f'<video width="640" height="480" controls><source src="data:video/mp4;base64,{encoded}" type="video/mp4"></video>'

In [29]:
env = gym.make(env_name, render_mode="rgb_array")
# Ensure the 'video' directory exists
os.makedirs("video", exist_ok=True)

vid = VideoRecorder(env, path=f"video/{env_name}_pretraining.mp4")

observation = env.reset()[0]
total_reward = 0
done = False
while not done:
  frame = env.render()
  vid.capture_frame()
  action, states = model.predict(observation, deterministic=True)
  observation, reward, done, info, _ = env.step(action)
  total_reward += reward
vid.close()
env.close()
print(f"\nTotal reward: {total_reward}")

# show video
html = render_mp4(f"video/{env_name}_pretraining.mp4")
HTML(html)

  logger.deprecation(
  self.pid = _posixsubprocess.fork_exec(



Total reward: -392.1953049258847


In [None]:
model.learn(total_timesteps=100000, log_interval=10, callback=callback)
# The performance of the training will be printed every 10 episodes. Change it to 1, if you wish to
# view the performance at every training episode.

Eval num_timesteps=10000, episode_reward=-88.51 +/- 46.76
Episode length: 125.60 +/- 69.95
Eval num_timesteps=20000, episode_reward=-60.07 +/- 160.41
Episode length: 374.00 +/- 112.74
Eval num_timesteps=30000, episode_reward=162.01 +/- 110.51
Episode length: 638.60 +/- 295.11
New best mean reward!
Eval num_timesteps=40000, episode_reward=179.98 +/- 112.42
Episode length: 387.20 +/- 43.27
New best mean reward!
Eval num_timesteps=50000, episode_reward=-36.51 +/- 11.61
Episode length: 1000.00 +/- 0.00
Eval num_timesteps=60000, episode_reward=99.13 +/- 167.33
Episode length: 487.80 +/- 64.46
Eval num_timesteps=70000, episode_reward=-185.61 +/- 18.71
Episode length: 626.40 +/- 99.77
Eval num_timesteps=80000, episode_reward=199.03 +/- 22.18
Episode length: 542.40 +/- 151.46
New best mean reward!
Eval num_timesteps=90000, episode_reward=117.70 +/- 110.99
Episode length: 331.60 +/- 44.64


In [31]:
env = gym.make(env_name, render_mode="rgb_array")
vid = VideoRecorder(env, path=f"video/{env_name}_learned.mp4")
observation = env.reset()[0]

total_reward = 0
done = False
while not done:
  frame = env.render()
  vid.capture_frame()
  action, states = model.predict(observation, deterministic=True)
  observation, reward, done, info, _ = env.step(action)
  total_reward += reward
vid.close()
env.close()
print(f"\nTotal reward: {total_reward}")

# show video
html = render_mp4(f"video/{env_name}_learned.mp4")
HTML(html)

  logger.deprecation(
  self.pid = _posixsubprocess.fork_exec(



Total reward: -126.6917397967525
