# Lunar Lander with Cross-Entropy Method

In this notebook we look at the lunar lander environment and solve it with the cross-entropy method.

In [1]:
#!pip3 install 'gymnasium[box2d]'

In [2]:
import gymnasium as gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt
%matplotlib inline
from collections import deque

torch.manual_seed(1)
np.random.seed(1)

# Neural Network

We define a simple neural network that generates the action scores based on a given state.

In [3]:
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(obs_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, n_actions)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)

# Generate Episodes

We generate a batch of episodes and remember the traversed states, actions and rewards. To select the next action we use the output of the network. For this we first pass the scores through a softmax to get probabilites. In the second step we sampel from this distribution to get the next action to execute.

In [4]:
def generate_batch(env, batch_size, t_max=5000):
    
    activation = nn.Softmax(dim=1)
    batch_actions,batch_states, batch_rewards = [],[],[]
    
    for b in range(batch_size):
        states, actions = [], []
        total_reward = 0
        s, _ = env.reset()
        for t in range(t_max):
            
            s_v = torch.FloatTensor([s])
            act_probs_v = activation(net(s_v))
            act_probs = act_probs_v.data.numpy()[0]
            a = np.random.choice(len(act_probs), p=act_probs)

            new_s, r, done, _, _ = env.step(a)

            # record sessions like you did before
            states.append(s)
            actions.append(a)
            total_reward += r

            s = new_s
            if done:
                batch_actions.append(actions)
                batch_states.append(states)
                batch_rewards.append(total_reward)
                break
                
    return batch_states, batch_actions, batch_rewards

# Training

In the training step, we first use the neural network to generate a batch of episodes and then use the state-action pairs to improve the neural network.

In [5]:
batch_size = 100
session_size = 100
percentile = 80
hidden_size = 200
completion_score = 100
learning_rate = 0.01

env = gym.make("LunarLander-v2")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

net = Net(n_states, hidden_size, n_actions)
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

for i in range(session_size):
    
    # generate new episodes
    states, actions, rewards = generate_batch(env, batch_size, t_max=500)
    
    threshold = np.percentile(rewards, percentile)
    
    # train on the states using actions as targets
    for s_i in range(len(states)):
        if rewards[s_i] < threshold: # skip this iteration if rewards are too low
            continue
        optimizer.zero_grad()
        tensor_states = torch.FloatTensor(states[s_i])
        tensor_actions = torch.LongTensor(actions[s_i])
        action_scores_v = net(tensor_states)
        loss_v = objective(action_scores_v, tensor_actions)
        loss_v.backward()
        optimizer.step()

    #show results
    mean_reward = np.mean(rewards)
    print("%d: loss=%.3f, reward_mean=%.1f" % (
            i, loss_v.item(), mean_reward))
    
    #check if 
    if np.mean(rewards)> completion_score:
        print("Environment has been successfullly completed!")

  s_v = torch.FloatTensor([s])


0: loss=1.417, reward_mean=-177.1
1: loss=1.394, reward_mean=-160.9
2: loss=1.359, reward_mean=-157.7
3: loss=1.374, reward_mean=-112.8
4: loss=1.366, reward_mean=-113.7
5: loss=1.326, reward_mean=-120.6
6: loss=1.365, reward_mean=-101.1
7: loss=1.266, reward_mean=-100.8
8: loss=1.372, reward_mean=-100.5
9: loss=1.357, reward_mean=-94.2
10: loss=1.310, reward_mean=-83.6
11: loss=1.204, reward_mean=-86.4
12: loss=1.290, reward_mean=-74.0
13: loss=1.247, reward_mean=-52.2
14: loss=1.247, reward_mean=-39.2
15: loss=1.180, reward_mean=-26.0
16: loss=1.214, reward_mean=-26.4
17: loss=1.230, reward_mean=-29.9
18: loss=1.042, reward_mean=-22.7
19: loss=1.219, reward_mean=-23.5
20: loss=1.075, reward_mean=1.2
21: loss=1.072, reward_mean=-10.9
22: loss=1.088, reward_mean=-9.9
23: loss=1.065, reward_mean=-27.3
24: loss=1.017, reward_mean=-1.8
25: loss=1.106, reward_mean=0.7
26: loss=0.976, reward_mean=4.6
27: loss=1.083, reward_mean=11.1
28: loss=1.072, reward_mean=13.6
29: loss=0.999, reward_me

In [6]:
env = gym.make('LunarLander-v2', render_mode="rgb_array")
env = gym.wrappers.RecordVideo(env, "video")

state, _ = env.reset()
total_reward = 0.0
done = False

while not done:
        
        state = torch.FloatTensor([state])
        action_scores_v = net(state)
        action = np.argmax(action_scores_v.cpu().data.numpy())

        state, reward, done, _, _ = env.step(action)
        total_reward += reward

env.close()
print(f"Total reward: {total_reward}")

  logger.warn(


Moviepy - Building video /Users/klausbertsch/Documents/GitHub/rl-course-ws25/solutions/video/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/klausbertsch/Documents/GitHub/rl-course-ws25/solutions/video/rl-video-episode-0.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /Users/klausbertsch/Documents/GitHub/rl-course-ws25/solutions/video/rl-video-episode-0.mp4
Total reward: 177.56683536236716


