<a href="https://colab.research.google.com/github/ameebhavsar/LunarLanding/blob/main/Amee_Bhavsar_Lunar_Landing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Q-Learning for Lunar Landing, Gym

## Part 0 - Installing the required packages and importing the libraries

### Installing Gymnasium

In [None]:
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
!apt-get install -y swig
!pip install gymnasium[box2d]

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting autorom[accept-rom-license]~=0.4.2 (from gymnasium[accept-rom-license,atari])
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting shimmy[atari]<1.0,>=0.1.0 (from gymnasium[accept-rom-license,atari])
  Downloading Shimmy-0.2.1-py3-none-any.whl (25 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]~=0.4.2->gymnasium[accept-rom-license,atari])
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m13.1 

### Importing the libraries

In [None]:
import os

# Random number generator
import random

# Works with arrays and mathematics
import numpy as np

# Pytorch library items related to training the AI model
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple

## Part 1 - Building the AI

### Creating the architecture of the Neural Network

In [None]:
 # Notes:

 # From torch's nn module we inherit from the Module class, which provides the base features that NeuralNetwork builds on

  # __init__: initialize the attributes and set up the necessary state for the class. It takes self, the state size (8), the action size (4) and a seed (arbitrarily chosen as 42)
    # call super() with the class and self so that nn.Module is initialized first
    # call manual_seed from the torch library and pass it the seed (42 by default)
    # define the first connection, from the input layer to the first fully connected layer, as fc1 using the nn library
      # here we pass the number of input features, which is the number of states. 64 was found through trial and error to be the optimal number of neurons IN THIS CASE
      # network:  (( number of states --> 64 outputs --> 64 outputs --> number of actions ))


In [None]:

# Defining the architecture (the "brain") of the agent
class NeuralNetwork(nn.Module):
  """Fully connected Q-network: state_size -> 64 -> 64 -> action_size.

  Args:
    state_size: number of input features (one per state component).
    action_size: number of outputs (one value per action).
    seed: value passed to torch.manual_seed before the layers are created.
  """

  def __init__(self, state_size, action_size, seed = 42):
    super().__init__()
    # Seed first so the layer weights below are initialized reproducibly;
    # whatever torch.manual_seed returns is kept on the instance.
    self.seed = torch.manual_seed(seed)
    self.fc1 = nn.Linear(state_size, 64)
    self.fc2 = nn.Linear(64, 64)
    self.fc3 = nn.Linear(64, action_size)

  def forward(self, state):
    """Propagate `state` through the three layers and return the raw outputs.

    ReLU is applied after the first and second layers; the last layer's
    output is returned without an activation.
    """
    hidden = F.relu(self.fc1(state))
    hidden = F.relu(self.fc2(hidden))
    return self.fc3(hidden)




## Part 2 - Training the AI

In [None]:
# Notes:

  # to check if the parameters are defined correctly: print('state shape', state_shape, 'and state size', state_size, 'and the number of actions is ', number_actions)


  #

### Setting up the environment

In [None]:
# Create the Lunar Lander environment through Gymnasium
import gymnasium as gym
env = gym.make('LunarLander-v2')

# Sizes used to build the network: the observation shape, its first
# dimension (inputs to the network) and the number of discrete actions (outputs)
state_shape = env.observation_space.shape
state_size = state_shape[0]
number_actions = env.action_space.n

# Sanity check that the parameters were read correctly
print('State shape: ', state_shape)
print('State size: ', state_size)
print('Number of actions: ', number_actions)




State shape:  (8,)
State size:  8
Number of actions:  4


### Initializing the hyperparameters

In [None]:

# Hyperparameters (standard values)
learning_rate = 0.0005            # lr handed to the Adam optimizer
minibatch_size = 100              # experiences needed/sampled for one learning step
discount_factor = 0.99            # weight of the next state's value in the Q target
replay_buffer_size = 100_000      # capacity of the replay memory
interpolation_parameter = 1e-3    # fraction of the local weights blended into the target network

### Implementing Experience Replay

In [None]:

class ReplayMemory(object):
  """Fixed-capacity FIFO buffer of experiences with uniform random sampling.

  An experience is a 5-tuple (state, action, reward, next_state, done).
  """

  def __init__(self, capacity):
    """Create an empty buffer that keeps at most `capacity` experiences."""
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.capacity = capacity
    # A bounded deque drops the oldest entry in O(1) once it is full, whereas
    # deleting index 0 of a list shifts every remaining element.
    self.memory = deque(maxlen = capacity)

  def push(self, event):
    """Append an experience; the oldest one is evicted when the buffer is full."""
    self.memory.append(event)

  def sample(self, batch_size):
    """Draw `batch_size` experiences at random and return them as tensors.

    Returns:
      (states, next_states, actions, rewards, dones) -- note the order.
      Each is a 2-D tensor on self.device with one row per experience;
      actions are long, the others float.

    Raises:
      ValueError: from random.sample when batch_size exceeds the buffer size.
    """
    batch = [e for e in random.sample(self.memory, k = batch_size) if e is not None]

    def column(index):
      # Stack field `index` of every sampled experience row-wise
      return np.vstack([e[index] for e in batch])

    states = torch.from_numpy(column(0)).float().to(self.device)
    actions = torch.from_numpy(column(1)).long().to(self.device)
    rewards = torch.from_numpy(column(2)).float().to(self.device)
    next_states = torch.from_numpy(column(3)).float().to(self.device)
    # np.uint8 is an 8-bit unsigned integer: casting the boolean done flags
    # through it yields 0/1 values before the conversion to float
    dones = torch.from_numpy(column(4).astype(np.uint8)).float().to(self.device)
    return states, next_states, actions, rewards, dones



### Implementing the DQN class

In [None]:
# The AI or agent
class bigBrainMans():
  """Deep Q-Network agent with experience replay and a soft-updated target network.

  The local network is the one being optimized and used to pick actions; the
  target network supplies the bootstrap values for the Q targets and slowly
  follows the local network through soft_update.
  """

  # A learning step is attempted once every LEARN_EVERY calls to step()
  LEARN_EVERY = 4

  def __init__(self, state_size, action_size):
    """Build both Q-networks, the Adam optimizer and the replay memory.

    Args:
      state_size: number of inputs of the Q-networks.
      action_size: number of actions (outputs of the Q-networks).
    """
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.state_size = state_size
    self.action_size = action_size

    self.local_qnetwork = NeuralNetwork(state_size, action_size).to(self.device)
    self.target_qnetwork = NeuralNetwork(state_size, action_size).to(self.device)

    # Only the local network is optimized; the target network is updated by soft_update
    self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr = learning_rate)
    self.memory = ReplayMemory(replay_buffer_size)
    # Counts calls to step() modulo LEARN_EVERY
    self.t_stamp = 0

  def step(self, state, action, reward, next_state, done):
    """Store one experience and, every LEARN_EVERY calls, learn from a minibatch.

    Learning only happens once the memory holds more than `minibatch_size`
    experiences.
    """
    self.memory.push((state, action, reward, next_state, done))
    self.t_stamp = (self.t_stamp + 1) % self.LEARN_EVERY

    if self.t_stamp == 0 and len(self.memory.memory) > minibatch_size:
      # Sample exactly the configured minibatch size so the guard above and
      # the batch actually drawn cannot drift apart.
      experiences = self.memory.sample(minibatch_size)
      self.learn(experiences, discount_factor)

  # Epsilon greedy method
  def act(self, state, epsilon = 0.):
    """Pick an action for `state` with an epsilon-greedy policy.

    Args:
      state: numpy array holding a single state.
      epsilon: probability threshold for exploring; 0 means always greedy.

    Returns:
      A numpy integer: the index of the largest predicted action value, or a
      uniformly random action index when exploring.
    """
    # numpy array -> float tensor with a leading batch dimension of size 1
    state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

    # Predict in evaluation mode without tracking gradients, then restore training mode
    self.local_qnetwork.eval()
    with torch.no_grad():
      predicted_action_values = self.local_qnetwork(state)
    self.local_qnetwork.train()

    if random.random() > epsilon:
      # Exploit: index of the largest predicted action value
      return np.argmax(predicted_action_values.cpu().data.numpy())
    else:
      # Explore: pick one of the actions uniformly at random
      return random.choice(np.arange(self.action_size))

  def learn(self, experiences, discount_factor):
    """Run one gradient step on the local network and soft-update the target network.

    Args:
      experiences: tuple (states, next_states, actions, rewards, dones) of
        tensors, in that order.
      discount_factor: multiplier applied to the next state's best target value.
    """
    states, next_states, actions, rewards, dones = experiences
    # Best action value of each next state according to the target network
    next_qtargets = self.target_qnetwork(next_states).detach().max(1)[0].unsqueeze(1)
    # (1 - dones) removes the bootstrap term for transitions flagged as done
    current_qtargets = rewards + discount_factor * next_qtargets * (1 - dones)
    # Local network's value for the action that was actually taken
    current_qexpected = self.local_qnetwork(states).gather(1, actions)

    loss = F.mse_loss(current_qexpected, current_qtargets)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    self.soft_update(self.local_qnetwork, self.target_qnetwork, interpolation_parameter)

  def soft_update(self, local_model, target_model, interpolation_parameter ):
    """Blend the local weights into the target model in place.

    For every parameter:
      target = interpolation_parameter * local + (1 - interpolation_parameter) * target
    """
    with torch.no_grad():
      for target_para, local_para in zip(target_model.parameters(), local_model.parameters()):
        target_para.copy_( interpolation_parameter * local_para + (1.0 - interpolation_parameter) * target_para )





### Initializing the DQN agent

In [None]:

# Create the agent from the state size and number of actions read from the environment
agent = bigBrainMans(state_size, number_actions)

### Training the DQN agent

In [None]:
# initialize training parameters
number_episodes = 2000
max_timesteps = 1000
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995
epsilon = epsilon_start
# scores of the most recent 100 episodes
score_per100 = deque(maxlen=100)


for episode in range(1, number_episodes + 1):
  state, _ = env.reset()
  score = 0
  for timestep in range(max_timesteps):
    action = agent.act(state, epsilon)
    # Gymnasium's step returns (observation, reward, terminated, truncated, info)
    next_state, reward, terminated, truncated, _ = env.step(action)
    # The stored done flag is `terminated` only (as before), so a time-limit
    # cut-off does not remove the bootstrap term in learn()
    agent.step(state, action, reward, next_state, terminated)
    state = next_state
    score += reward
    # End the episode on either signal; an ended environment must be reset
    if terminated or truncated:
      break
  score_per100.append(score)
  # Multiplicative epsilon decay, floored at epsilon_end
  epsilon = max(epsilon_end, epsilon_decay * epsilon)

  average_score = np.mean(score_per100)
  # '\r' rewrites the same output line; every 100th episode is kept on its own line
  print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score), end = "")
  if episode % 100 == 0:
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score))
  if average_score >= 200.0:
    # Reports episode - 100, presumably because the average spans the last 100 episodes
    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode - 100, average_score))
    torch.save(agent.local_qnetwork.state_dict(), 'checkpoint.pth')
    break





Episode 100	Average Score: -186.01
Episode 200	Average Score: -128.66
Episode 300	Average Score: -84.77
Episode 400	Average Score: -37.31
Episode 500	Average Score: 53.36
Episode 578	Average Score: 201.54
Environment solved in 478 episodes!	Average Score: 201.54


## Part 3 - Visualizing the results

In [None]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display
from gym.wrappers.monitoring.video_recorder import VideoRecorder

def show_video_of_model(agent, env_name, video_path='video.mp4', fps=30):
    """Run one greedy episode of `agent` in `env_name` and save it as a video.

    Args:
        agent: object whose act(state) returns a value with an .item() method.
        env_name: environment id passed to gym.make.
        video_path: file the frames are written to with imageio.mimsave.
        fps: frame rate passed to imageio.mimsave.
    """
    env = gym.make(env_name, render_mode='rgb_array')
    frames = []
    try:
        state, _ = env.reset()
        done = False
        while not done:
            frames.append(env.render())
            action = agent.act(state)
            state, _, terminated, truncated, _ = env.step(action.item())
            # Stop on truncation as well: otherwise an episode that never
            # terminates keeps the loop (and the frame list) growing forever
            done = terminated or truncated
    finally:
        # Release the environment even if acting or rendering fails
        env.close()
    imageio.mimsave(video_path, frames, fps=fps)

# Record one episode of the agent in LunarLander-v2
show_video_of_model(agent, 'LunarLander-v2')

def show_video():
    """Embed the first *.mp4 file found in the working directory in the notebook.

    The file is base64-encoded into an HTML <video> element; if no .mp4 file
    exists, "Could not find video" is printed instead.
    """
    mp4list = glob.glob('*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        # Read-only binary mode inside a context manager so the handle is always closed
        with open(mp4, 'rb') as video_file:
            video = video_file.read()
        encoded = base64.b64encode(video)
        display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

# Display the first .mp4 found in the working directory inline
show_video()

