In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from DQN import DQN
import gymnasium as gym
import numpy as np
import random

In [2]:
env = gym.make(                          ## initialize the enverionment 
    "LunarLander-v2",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="human",
)

In [3]:
policy_net = DQN(8, 4).to("cuda")
#policy_net.load_state_dict(torch.load('policy_net_model.pth'))

target_net = DQN(8, 4).to("cuda")
target_net.load_state_dict(policy_net.state_dict())          # initialize the weights and bias
target_net.eval()

DQN(
  (fc1): Linear(in_features=8, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=4, bias=True)
)

In [4]:
optimizer = optim.Adam(policy_net.parameters(), lr=0.0001, weight_decay=1e-5)
replay_memory = []
criterion = nn.MSELoss()

batch_size = 64
gamma = 0.99
epsilon = 1.0
max_steps = 900


In [5]:
for episode in range(500):
    observation = env.reset()        # Get a new environment 
    observation = observation[0]     # Get the valid information ( filter the empty element in the tuple)
    done = False                    # If landed
    total_reward = 0                
    step_count = 0                   # Count the step
    hover_counter = 0
    last_position = None            # track the previous position
    total_loss = 0 
    
    while not done:
        if step_count >= max_steps:  # Check for maximum steps
            break
        
        current_position = (observation[0], observation[1])
        
        if last_position is not None:  # Avoid the lander from hovering too long
            distance = np.sqrt((current_position[0] - last_position[0])**2 + (current_position[1] - last_position[1])**2)
            if distance < 0.01:  
                hover_counter += 1
            else:
                hover_counter = 0      # Reset the counter if the lander has moved

        last_position = current_position
        
        state_tensor = torch.FloatTensor(observation).unsqueeze(0).to("cuda")

        # Epsilon-greedy action
        if np.random.rand() < epsilon:
            action = env.action_space.sample()                       # Which action the lander takes, 0,1,2,3
        else:
            with torch.no_grad():                                    # disable the gradient tracking  
                action = policy_net(state_tensor).max(1)[1].item()   # get the best action 

        next_observation, reward, done, truncated, info = env.step(action)
        
        if hover_counter >= 90:  # If the lander hovers for 40 steps
            reward -= 5         # Apply a penalty of 10 to the reward

        # Store transition
        replay_memory.append((observation, action, reward, next_observation, done))
        if len(replay_memory) > 5000:
            replay_memory.pop(0)

        # Sample mini-batch and update policy_net
        if len(replay_memory) >= batch_size:                  # When the length of buffer greater than the batch size
            batch = random.sample(replay_memory, batch_size)
            batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones = zip(*batch)

            batch_states = torch.FloatTensor(batch_states).to("cuda")
            batch_actions = torch.LongTensor(batch_actions).to("cuda")
            batch_rewards = torch.FloatTensor(batch_rewards).to("cuda")
            batch_next_states = torch.FloatTensor(batch_next_states).to("cuda")
            batch_dones = torch.FloatTensor(batch_dones).to("cuda")
            
            # The TD algo
            current_q_values = policy_net(batch_states).gather(1, batch_actions.unsqueeze(1)).squeeze()
            next_state_actions = policy_net(batch_next_states).max(1)[1]
            next_q_values = target_net(batch_next_states).gather(1, next_state_actions.unsqueeze(1)).squeeze()
            expected_q_values = batch_rewards + gamma * next_q_values * (1 - batch_dones)

            loss = criterion(current_q_values, expected_q_values.detach())
            total_loss += loss.item()  # Accumulate the loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            step_count += 1
        # Update target network
        if episode % 15 == 0:
            target_net.load_state_dict(policy_net.state_dict())        # update the weights and bias

        observation = next_observation
        total_reward += reward
        
        
    # Decay epsilon
    if epsilon > 0.1:
        epsilon *= 0.995
    
    average_loss = total_loss / step_count
    
    print(f"Episode {episode}: Total Reward = {total_reward}, Average Loss = {average_loss:.4f}, Epsilon: {epsilon}")

env.close()
torch.save(policy_net.state_dict(), 'policy_net_model.pth')

  batch_states = torch.FloatTensor(batch_states).to("cuda")


Episode 0: Total Reward = -249.1593968860744, Average Loss = 1.9183, Epsilon: 0.995
Episode 1: Total Reward = -101.90055885614142, Average Loss = 76.1729, Epsilon: 0.990025
Episode 2: Total Reward = -92.36726627956331, Average Loss = 82.9125, Epsilon: 0.985074875
Episode 3: Total Reward = -177.6413030335481, Average Loss = 86.7783, Epsilon: 0.9801495006250001
Episode 4: Total Reward = -90.33425445018669, Average Loss = 84.4105, Epsilon: 0.9752487531218751
Episode 5: Total Reward = -78.30168748069141, Average Loss = 108.3332, Epsilon: 0.9703725093562657
Episode 6: Total Reward = -411.57120090564933, Average Loss = 89.2919, Epsilon: 0.9655206468094844
Episode 7: Total Reward = -420.4554483686753, Average Loss = 92.6828, Epsilon: 0.960693043575437
Episode 8: Total Reward = -108.62154511326364, Average Loss = 82.3597, Epsilon: 0.9558895783575597
Episode 9: Total Reward = -306.6493789030952, Average Loss = 99.9629, Epsilon: 0.9511101304657719
Episode 10: Total Reward = -230.6528221530052, A

Episode 83: Total Reward = -102.22994800896582, Average Loss = 23.0008, Epsilon: 0.6563549768288433
Episode 84: Total Reward = -6.712249369789063, Average Loss = 20.2926, Epsilon: 0.653073201944699
Episode 85: Total Reward = -122.12179738121755, Average Loss = 23.0822, Epsilon: 0.6498078359349755
Episode 86: Total Reward = -466.74497017122314, Average Loss = 18.9262, Epsilon: 0.6465587967553006
Episode 87: Total Reward = -110.32338127651877, Average Loss = 17.7297, Epsilon: 0.6433260027715241
Episode 88: Total Reward = -36.94771726055634, Average Loss = 19.3767, Epsilon: 0.6401093727576664
Episode 89: Total Reward = -37.419843785609366, Average Loss = 17.2382, Epsilon: 0.6369088258938781
Episode 90: Total Reward = -299.631512736796, Average Loss = 20.8697, Epsilon: 0.6337242817644086
Episode 91: Total Reward = -69.55287596315758, Average Loss = 20.5716, Epsilon: 0.6305556603555866
Episode 92: Total Reward = -70.73080623684531, Average Loss = 22.0092, Epsilon: 0.6274028820538087
Episode

Episode 165: Total Reward = -911.9903722687558, Average Loss = 27.5942, Epsilon: 0.4351424010585501
Episode 166: Total Reward = -52.58547374737552, Average Loss = 22.9426, Epsilon: 0.43296668905325736
Episode 167: Total Reward = -3548.6141353045323, Average Loss = 21.3963, Epsilon: 0.43080185560799106
Episode 168: Total Reward = -16.824840768318992, Average Loss = 15.1520, Epsilon: 0.4286478463299511
Episode 169: Total Reward = -68.14208906591158, Average Loss = 12.8145, Epsilon: 0.42650460709830135
Episode 170: Total Reward = -3253.891473285829, Average Loss = 17.2326, Epsilon: 0.42437208406280985
Episode 171: Total Reward = 29.756639684723382, Average Loss = 17.5600, Epsilon: 0.4222502236424958
Episode 172: Total Reward = 11.124708532054157, Average Loss = 13.1657, Epsilon: 0.42013897252428334
Episode 173: Total Reward = -3555.732429070677, Average Loss = 17.2377, Epsilon: 0.4180382776616619
Episode 174: Total Reward = -3191.2178315293254, Average Loss = 17.6696, Epsilon: 0.415948086

Episode 247: Total Reward = -43.42316522922563, Average Loss = 36.4733, Epsilon: 0.2884855236625661
Episode 248: Total Reward = -55.157688914288116, Average Loss = 40.1694, Epsilon: 0.28704309604425327
Episode 249: Total Reward = 42.58411459269763, Average Loss = 31.1494, Epsilon: 0.285607880564032
Episode 250: Total Reward = 5.631982732541601, Average Loss = 42.8601, Epsilon: 0.28417984116121187
Episode 251: Total Reward = -317.1511354944861, Average Loss = 37.7134, Epsilon: 0.2827589419554058
Episode 252: Total Reward = 46.192760509129755, Average Loss = 36.1128, Epsilon: 0.28134514724562876
Episode 253: Total Reward = -80.08767767584621, Average Loss = 40.5044, Epsilon: 0.2799384215094006
Episode 254: Total Reward = -85.11894515388538, Average Loss = 48.2459, Epsilon: 0.27853872940185365
Episode 255: Total Reward = 6.450600426597816, Average Loss = 48.7585, Epsilon: 0.27714603575484437
Episode 256: Total Reward = -52.474560591722266, Average Loss = 52.5472, Epsilon: 0.27576030557607

Episode 329: Total Reward = -77.98133113552453, Average Loss = 11.1215, Epsilon: 0.1912566947289212
Episode 330: Total Reward = 13.308854581561121, Average Loss = 29.6953, Epsilon: 0.1903004112552766
Episode 331: Total Reward = -113.1366517292366, Average Loss = 14.9541, Epsilon: 0.18934890919900021
Episode 332: Total Reward = -39.432061402474446, Average Loss = 22.7792, Epsilon: 0.18840216465300522
Episode 333: Total Reward = -2.069457624535474, Average Loss = 29.8660, Epsilon: 0.18746015382974018
Episode 334: Total Reward = -8.183496252071421, Average Loss = 15.7896, Epsilon: 0.1865228530605915
Episode 335: Total Reward = -37.71877098606737, Average Loss = 12.3652, Epsilon: 0.18559023879528855
Episode 336: Total Reward = 15.967337766913971, Average Loss = 19.6892, Epsilon: 0.1846622876013121
Episode 337: Total Reward = 17.072433643081, Average Loss = 24.1560, Epsilon: 0.18373897616330553
Episode 338: Total Reward = -18.260569953605483, Average Loss = 16.8248, Epsilon: 0.1828202812824

Episode 411: Total Reward = -45.09221794918759, Average Loss = 7.2839, Epsilon: 0.12679708435358925
Episode 412: Total Reward = -61.16418968214671, Average Loss = 7.1989, Epsilon: 0.1261630989318213
Episode 413: Total Reward = -36.66600438278485, Average Loss = 7.5707, Epsilon: 0.1255322834371622
Episode 414: Total Reward = -45.82783217901107, Average Loss = 7.9669, Epsilon: 0.12490462201997637
Episode 415: Total Reward = -48.788654265195106, Average Loss = 6.3500, Epsilon: 0.1242800989098765
Episode 416: Total Reward = -60.65436167026924, Average Loss = 6.8525, Epsilon: 0.12365869841532712
Episode 417: Total Reward = -96.54502541676966, Average Loss = 7.9346, Epsilon: 0.12304040492325048
Episode 418: Total Reward = -40.25914535685565, Average Loss = 7.5574, Epsilon: 0.12242520289863423
Episode 419: Total Reward = -71.03363221479108, Average Loss = 9.7216, Epsilon: 0.12181307688414106
Episode 420: Total Reward = -45.56923575768813, Average Loss = 7.7656, Epsilon: 0.12120401149972035
Ep

Episode 494: Total Reward = -43.59172478417676, Average Loss = 3.1990, Epsilon: 0.0996820918179746
Episode 495: Total Reward = -44.1012782148952, Average Loss = 13.4261, Epsilon: 0.0996820918179746
Episode 496: Total Reward = -21.439819952032153, Average Loss = 8.0824, Epsilon: 0.0996820918179746
Episode 497: Total Reward = -35.576529254463196, Average Loss = 10.9099, Epsilon: 0.0996820918179746
Episode 498: Total Reward = -5.994283427876269, Average Loss = 6.2373, Epsilon: 0.0996820918179746
Episode 499: Total Reward = -21.048956366842745, Average Loss = 3.6378, Epsilon: 0.0996820918179746
