In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
env = gym.make(
    "LunarLander-v2",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="human",
)


In [3]:
n_actions = env.action_space.n
state_bounds = [(-1.5, 1.5), (-1.5, 1.5), (-5, 5), (-5, 5), (-3.1415927, 3.1415927), (-5, 5), (0, 1), (0, 1)]
n_states = [10, 10, 10, 10, 10, 10, 2, 2]
Q = np.random.rand(*(n_states + [n_actions]))

In [4]:
def discrete(observation):
    discrete_observation = []
    for i in range(len(observation)):
        o = np.clip(observation[i], state_bounds[i][0], state_bounds[i][1])    # 超出bound的值归位bound
        scale = (n_states[i] - 1) / (state_bounds[i][1] - state_bounds[i][0])        

        discrete_observation.append(int(np.round((o - state_bounds[i][0]) * scale)))
    return tuple(discrete_observation)

In [5]:
alpha = 0.1
gamma = 0.99
epsilon = 1.0

for episode in range(500):
    observation = env.reset()
    observation = observation[0]
    done = False
    episode_reward = 0
    while not done:
        # Discretize the current state
        current_state_index = discrete(observation)

        # Choose an action (Epsilon-greedy)
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[current_state_index])


        # Take the action
        next_observation, reward, done, truncated, info = env.step(action)

        # Discretize the next state
        next_state_index = discrete(next_observation)

        # Update the Q-value for the current state-action pair
        max_next_q_value = np.max(Q[next_state_index])
        current_q_value = Q[current_state_index][action]

        Q[current_state_index][action] = (1 - alpha) * current_q_value + alpha * (reward + gamma * max_next_q_value)

        # Prepare for the next iteration
        observation = next_observation
        
        episode_reward += reward
        
    print(f"Episode {episode}: Total Reward = {episode_reward}")
        
if epsilon > 0.1:
    epsilon *= 0.995

    # Some logging
if episode % 10 == 0:
    print(f"Episode: {episode}, Epsilon: {epsilon}")



env.close()

Episode 0: Total Reward = -75.29782363491016
Episode 1: Total Reward = -247.4729044954347
Episode 2: Total Reward = -373.52283927170424
Episode 3: Total Reward = -386.4227289675309
Episode 4: Total Reward = 25.842603492346726
Episode 5: Total Reward = -117.56634655269518
Episode 6: Total Reward = -297.139014484143
Episode 7: Total Reward = -313.73285816259664
Episode 8: Total Reward = -139.16356810159982
Episode 9: Total Reward = -299.9491053846564
Episode 10: Total Reward = -234.3081651404146
Episode 11: Total Reward = -137.50037796290582
Episode 12: Total Reward = -108.98795205177568
Episode 13: Total Reward = -396.7800372258599
Episode 14: Total Reward = -81.97139204031606
Episode 15: Total Reward = -317.1135916944681
Episode 16: Total Reward = -159.8783069123581
Episode 17: Total Reward = -109.87360900819831
Episode 18: Total Reward = -114.97823522730059


KeyboardInterrupt: 