# Deep Q-Learning for Lunar Landing

## Part 0 - Installing the required packages and importing the libraries

### Installing Gymnasium

In [None]:
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
!apt-get install -y swig
!pip install gymnasium[box2d]

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting autorom[accept-rom-license]~=0.4.2 (from gymnasium[accept-rom-license,atari])
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting shimmy[atari]<1.0,>=0.1.0 (from gymnasium[accept-rom-license,atari])
  Downloading Shimmy-0.2.1-py3-none-any.whl (25 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]~=0.4.2->gymnasium[accept-rom-license,atari])
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m11.1 

### Importing the libraries

In [None]:
import os
import random
import numpy as np
import torch #Pytorch is the framework using which we build this model.
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple

## Part 1 - Building the AI

### Creating the architecture of the Neural Network

In [None]:
class Network(nn.Module):
  """Fully connected Q-network: state vector in, one value per action out.

  Layout: state_size -> 64 -> 64 -> action_size, with a ReLU after each of
  the two hidden layers and no activation on the output layer.
  """

  def __init__(self, state_size, action_size, seed = 42) -> None:
    """Seed PyTorch's global RNG and create the three linear layers.

    state_size  -- number of input features (length of the state vector).
    action_size -- number of outputs (one per action).
    seed        -- value passed to torch.manual_seed before the layers are
                   created, so two networks built with the same seed start
                   with identical weights.
    """
    super().__init__()
    # torch.manual_seed seeds the global generator and returns it; the
    # returned generator object is what gets stored on the instance.
    self.seed = torch.manual_seed(seed)
    # The layers are created in this order on purpose: weight initialisation
    # consumes random numbers, so the order determines the initial weights.
    self.fc1 = nn.Linear(state_size, 64)   # input layer  -> hidden layer 1
    self.fc2 = nn.Linear(64, 64)           # hidden layer 1 -> hidden layer 2
    self.fc3 = nn.Linear(64, action_size)  # hidden layer 2 -> output layer

  def forward(self, state):
    """Forward-propagate `state` through the network and return the output of fc3."""
    hidden = F.relu(self.fc1(state))
    hidden = F.relu(self.fc2(hidden))
    # No activation on the last layer: the raw outputs are returned.
    return self.fc3(hidden)



## Part 2 - Training the AI

### Setting up the environment

In [None]:
import gymnasium as gym

# Create the Lunar Lander environment and read the sizes the agent needs from it.
env = gym.make("LunarLander-v2")

# Shape of a single observation (the state vector) and its first dimension,
# i.e. the number of elements in the state vector.
state_shape = env.observation_space.shape
state_size = state_shape[0]

# Number of discrete actions the agent can choose from.
number_actions = env.action_space.n

print("State shape: ", state_shape)
print("State size: ", state_size)
print("Number of actions: ", number_actions)

State shape:  (8,)
State size:  8
Number of actions:  4


### Initializing the hyperparameters

In [None]:
# Step size handed to the Adam optimizer.
learning_rate = 0.0005

# Number of experiences used for one learning update of the local network.
minibatch_size = 100

# Gamma in the Bellman equation: the weight given to future rewards relative to
# the immediate reward. Values close to 1 make future rewards count almost as
# much as immediate ones; values close to 0 make the agent short-sighted.
discount_factor = 0.99

# Maximum number of experiences kept in the replay memory (10^5).
replay_buffer_size = 100_000

# Tau: fraction of the local network's weights blended into the target network
# on every soft update (10^-3).
interpolation_parameter = 0.001

# These values were chosen by experimentation for this environment.


### Implementing Experience Replay

In [None]:
class ReplayMemory(object):
  """Fixed-capacity FIFO buffer of experiences with uniform random sampling.

  Every stored event is read by index as
  (state, action, reward, next_state, done).
  """

  def __init__(self, capacity):
    """Select the torch device and create an empty buffer holding at most `capacity` events."""
    # Use the first GPU when CUDA is available, otherwise the CPU.
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.capacity = capacity
    # A deque with maxlen drops the oldest event automatically in O(1) when a
    # new one is appended to a full buffer. Deleting index 0 of a list instead
    # shifts every remaining element on each push once the buffer is full.
    self.memory = deque(maxlen = capacity)

  def push(self, event):
    """Append `event`; when the buffer is already full the oldest event is discarded."""
    self.memory.append(event)

  def sample(self, batch_size):
    """Draw `batch_size` distinct events at random and return them as tensors.

    Returns the tuple (states, next_states, actions, rewards, dones). Note the
    order: next_states comes second. Each element is a tensor on self.device
    with one row per sampled event; actions are int64, the others float32.
    `dones` is converted through uint8, so True/False become 1.0/0.0.

    Raises ValueError (from random.sample) when fewer than `batch_size`
    events are stored.
    """
    # Events that are None are skipped, so the batch can be smaller than
    # batch_size if None was ever pushed.
    experiences = [e for e in random.sample(self.memory, k = batch_size) if e is not None]

    def column(index):
      # Stack field `index` of every sampled event into a 2-D numpy array.
      return np.vstack([e[index] for e in experiences])

    states = torch.from_numpy(column(0)).float().to(self.device)
    actions = torch.from_numpy(column(1)).long().to(self.device)
    rewards = torch.from_numpy(column(2)).float().to(self.device)
    next_states = torch.from_numpy(column(3)).float().to(self.device)
    dones = torch.from_numpy(column(4).astype(np.uint8)).float().to(self.device)
    return states, next_states, actions, rewards, dones


### Implementing the DQN class

In [None]:
class Agent():
  """Deep Q-Network agent with a local network, a target network and a replay memory.

  The class reads the notebook-level hyperparameters `learning_rate`,
  `replay_buffer_size`, `minibatch_size`, `discount_factor` and
  `interpolation_parameter`, and uses the `Network` and `ReplayMemory` classes.
  """

  # Number of calls to step() between two learning updates.
  LEARN_EVERY = 4

  def __init__(self, state_size, action_size):
    """Create both Q-networks, the optimizer and the replay memory."""
    # Use the first GPU when CUDA is available, otherwise the CPU.
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.state_size = state_size
    self.action_size = action_size
    # The local network is the one that is trained and used to pick actions.
    self.local_qnetwork = Network(state_size, action_size).to(self.device)
    # The target network supplies the Q-values of the next states when the
    # training targets are computed; it is only changed through soft_update.
    self.target_qnetwork = Network(state_size, action_size).to(self.device)
    # Only the local network's parameters are optimised.
    self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr = learning_rate)
    self.memory = ReplayMemory(replay_buffer_size)
    # Step counter, kept modulo LEARN_EVERY.
    self.t_step = 0

  def step(self, state, action, reward, next_state, done):
    """Store one experience and, every LEARN_EVERY calls, learn from a random minibatch."""
    self.memory.push((state, action, reward, next_state, done))
    self.t_step = (self.t_step + 1) % self.LEARN_EVERY
    if self.t_step != 0:
      return
    # Learn only once the memory holds more than one minibatch of experiences.
    if len(self.memory.memory) > minibatch_size:
      # Sample exactly `minibatch_size` experiences so the hyperparameter
      # controls both this gate and the batch that is actually drawn.
      experiences = self.memory.sample(minibatch_size)
      self.learn(experiences, discount_factor)

  def act(self, state, epsilon = 0.):
    """Choose an action for `state` with an epsilon-greedy policy.

    state   -- numpy array holding a single state vector.
    epsilon -- probability threshold for exploring: when random.random() is
               not greater than epsilon a random action is returned,
               otherwise the action with the highest predicted Q-value.
    """
    # unsqueeze(0) adds a leading batch dimension of size 1, so a state of
    # shape (state_size,) becomes a batch of shape (1, state_size).
    state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
    # Evaluate without tracking gradients, then put the network back in
    # training mode.
    self.local_qnetwork.eval()
    with torch.no_grad():
      action_values = self.local_qnetwork(state)
    self.local_qnetwork.train()

    if random.random() > epsilon:
      # Exploit: index of the largest Q-value. The tensor is moved to the CPU
      # because numpy cannot read GPU memory.
      return np.argmax(action_values.cpu().numpy())
    # Explore: uniformly random action index in [0, action_size).
    return random.choice(np.arange(self.action_size))

  def learn(self, experiences, discount_factor):
    """Run one gradient step on the local network, then soft-update the target network.

    experiences     -- tuple (states, next_states, actions, rewards, dones) of
                       tensors, in that order.
    discount_factor -- gamma used in the Bellman target.
    """
    states, next_states, actions, rewards, dones = experiences
    # Largest target-network Q-value of each next state, detached so no
    # gradient flows into the target network; unsqueeze(1) restores the
    # column dimension removed by max(1).
    next_q_targets = self.target_qnetwork(next_states).detach().max(1)[0].unsqueeze(1)
    # Bellman target; the (1 - dones) factor removes the future term where
    # dones is 1.
    q_targets = rewards + (discount_factor * next_q_targets * (1 - dones))
    # Q-value the local network currently assigns to the action that was taken.
    q_expected = self.local_qnetwork(states).gather(1, actions)
    loss = F.mse_loss(q_expected, q_targets)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    self.soft_update(self.local_qnetwork, self.target_qnetwork, interpolation_parameter)

  def soft_update(self, local_model, target_model, interpolation_parameter):
    """Blend the local weights into the target weights, in place.

    For every parameter pair: target = tau * local + (1 - tau) * target,
    where tau is `interpolation_parameter`.
    """
    # zip walks both parameter lists in lockstep (pairwise, not nested).
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
      blended = interpolation_parameter * local_param.data + (1.0 - interpolation_parameter) * target_param.data
      target_param.data.copy_(blended)



### Initializing the DQN agent

In [None]:
# Create the DQN agent, giving it the size of the state vector and the number of available actions.
agent = Agent(state_size, number_actions)


### Training the DQN agent

In [None]:
# Training configuration.
number_episodes = 2000 # Upper bound on the number of training episodes.
maximum_number_timesteps_per_episode = 1000 # Upper bound on the steps taken inside one episode.
# A timestep is one interaction between the agent and the environment; an episode is a sequence of timesteps.
epsilon_starting_value = 1.0 # Initial exploration rate.
epsilon_ending_value = 0.01 # Floor of the exploration rate.
epsilon_decay_value = 0.995 # Epsilon is multiplied by this factor after every episode until it reaches the floor.
epsilon = epsilon_starting_value
# Rolling window holding the scores (cumulative rewards) of the most recent 100 episodes;
# a deque with maxlen drops the oldest score automatically.
scores_on_100_episodes = deque(maxlen = 100)

for episode in range(1, number_episodes + 1): # Episodes are numbered from 1 to number_episodes.
  state, _ = env.reset() # reset() returns the initial state and an info value that is not needed here.
  score = 0
  for t in range(maximum_number_timesteps_per_episode):
    action = agent.act(state, epsilon) # Epsilon-greedy action for the current state.
    # env.step returns (next_state, reward, terminated, truncated, info).
    next_state, reward, done, truncated, _ = env.step(action)
    # Only the terminated flag is stored with the experience, exactly as before.
    agent.step(state, action, reward, next_state, done)
    state = next_state
    score += reward # The score is the cumulative reward of the episode.
    # Stop on termination, and also when the environment reports truncation,
    # so a finished environment is never stepped again.
    if done or truncated:
      break
  scores_on_100_episodes.append(score)
  epsilon = max(epsilon_ending_value, epsilon_decay_value * epsilon) # Decay epsilon, never below the floor.
  average_score = np.mean(scores_on_100_episodes) # Mean over the rolling window, computed once per episode.
  # '\r' with end="" overwrites the same console line, giving a live progress display.
  print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score), end = "")
  if episode % 100 == 0:
    # Every 100 episodes print the line again, this time with a newline so it stays visible.
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score))
  if average_score >= 200.0:
    # The run stops once the rolling average reaches 200; the reported episode
    # count is the current episode minus the 100-episode window.
    print('\n Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode - 100, average_score))
    # state_dict() holds the learned weights and biases of the local network.
    torch.save(agent.local_qnetwork.state_dict(), 'checkpoint.pth')
    break

Episode 100	Average Score: -152.27
Episode 200	Average Score: -90.26
Episode 300	Average Score: -46.67
Episode 400	Average Score: 24.85
Episode 500	Average Score: 139.92
Episode 600	Average Score: 183.46
Episode 639	Average Score: 200.60
 Environment solved in 539 episodes!	Average Score: 200.60


## Part 3 - Visualizing the results

In [None]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display
from gym.wrappers.monitoring.video_recorder import VideoRecorder

def show_video_of_model(agent, env_name):
    """Run one episode with `agent` in `env_name` and save the rendered frames to 'video.mp4'.

    agent    -- object whose act(state) returns an action exposing .item().
    env_name -- environment id passed to gym.make.

    The episode stops when the environment reports either termination or
    truncation, so an episode that only ends by truncation cannot keep the
    loop running.
    """
    env = gym.make(env_name, render_mode='rgb_array')
    frames = []
    try:
        state, _ = env.reset()
        done = False
        while not done:
            # Capture the frame for the current state before acting on it.
            frames.append(env.render())
            action = agent.act(state)  # epsilon keeps its default value here
            state, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated
    finally:
        # Release the environment even if rendering or stepping fails.
        env.close()
    imageio.mimsave('video.mp4', frames, fps=30)

# Record one episode of the agent in the Lunar Lander environment.
show_video_of_model(agent, 'LunarLander-v2')

def show_video():
    """Embed the first '*.mp4' file found in the working directory in the notebook output.

    The file is read as bytes, base64-encoded and placed inline in an HTML
    <video> tag. Prints "Could not find video" when no mp4 file exists.
    """
    mp4list = glob.glob('*.mp4')
    if not mp4list:
        print("Could not find video")
        return
    mp4 = mp4list[0]
    # Read-only binary mode is enough, and the with-block closes the file handle.
    with open(mp4, 'rb') as video_file:
        video = video_file.read()
    encoded = base64.b64encode(video)
    display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

show_video() # Display an mp4 file from the working directory inline in the notebook.

