In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
env = gym.make(
    "LunarLander-v2",
    continuous=False,
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="human",
)


In [3]:
n_actions = env.action_space.n
state_bounds = [(-1.5, 1.5), (-1.5, 1.5), (-5, 5), (-5, 5), (-3.1415927, 3.1415927), (-5, 5), (0, 1), (0, 1)]
n_states = [10, 10, 10, 10, 10, 10, 2, 2]
Q = np.random.rand(*(n_states + [n_actions]))

In [4]:
def discrete(observation):
    discrete_observation = []
    for i in range(len(observation)):
        o = np.clip(observation[i], state_bounds[i][0], state_bounds[i][1])    # 超出bound的值归位bound
        scale = (n_states[i] - 1) / (state_bounds[i][1] - state_bounds[i][0])        

        discrete_observation.append(int(np.round((o - state_bounds[i][0]) * scale)))
    return tuple(discrete_observation)

In [5]:
alpha = 0.1
gamma = 0.99
epsilon = 1.0

for episode in range(500):
    observation = env.reset()
    observation = observation[0]
    done = False
    episode_reward = 0
    while not done:
        # Discretize the current state
        current_state_index = discrete(observation)

        # Choose an action (Epsilon-greedy)
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[current_state_index])


        # Take the action
        next_observation, reward, done, truncated, info = env.step(action)

        # Discretize the next state
        next_state_index = discrete(next_observation)

        # Update the Q-value for the current state-action pair
        max_next_q_value = np.max(Q[next_state_index])
        current_q_value = Q[current_state_index][action]

        Q[current_state_index][action] = (1 - alpha) * current_q_value + alpha * (reward + gamma * max_next_q_value)

        # Prepare for the next iteration
        observation = next_observation
        
        episode_reward += reward
        
    print(f"Episode {episode}: Total Reward = {episode_reward}")
        
if epsilon > 0.1:
    epsilon *= 0.995

    # Some logging
if episode % 10 == 0:
    print(f"Episode: {episode}, Epsilon: {epsilon}")



env.close()

Episode 0: Total Reward = -318.000082388785
Episode 1: Total Reward = -395.3609787425669
Episode 2: Total Reward = -222.83964679483364
Episode 3: Total Reward = -203.0020879756267
Episode 4: Total Reward = -329.8805240874067
Episode 5: Total Reward = -170.07549514827815
Episode 6: Total Reward = -239.11078934056806
Episode 7: Total Reward = -116.71902423658855
Episode 8: Total Reward = -97.4949317512604
Episode 9: Total Reward = -483.36318440768997
Episode 10: Total Reward = -79.44452421171451
Episode 11: Total Reward = -235.71114398858532
Episode 12: Total Reward = -137.65370415099747
Episode 13: Total Reward = -82.06355983076512
Episode 14: Total Reward = -211.11286862572967
Episode 15: Total Reward = -187.06620076171805
Episode 16: Total Reward = -168.27992132655368
Episode 17: Total Reward = -124.73983405361466
Episode 18: Total Reward = -146.12123785520913
Episode 19: Total Reward = -142.7671708946028
Episode 20: Total Reward = -177.68942276913867
Episode 21: Total Reward = -111.8

Episode 175: Total Reward = -246.59561175727336
Episode 176: Total Reward = -169.64184547787318
Episode 177: Total Reward = -188.17827797768342
Episode 178: Total Reward = -93.65074811095353
Episode 179: Total Reward = -110.16942326418669
Episode 180: Total Reward = -67.20669805497216
Episode 181: Total Reward = -419.7952896169933
Episode 182: Total Reward = -71.18432882966465
Episode 183: Total Reward = -155.66912407302308
Episode 184: Total Reward = -138.45423106278534
Episode 185: Total Reward = -386.2553739303838
Episode 186: Total Reward = -209.44078507362988
Episode 187: Total Reward = -104.60382094933644
Episode 188: Total Reward = -66.85303826891428
Episode 189: Total Reward = -73.5639589055675
Episode 190: Total Reward = -8.033544839363714
Episode 191: Total Reward = -457.7174113740061
Episode 192: Total Reward = -203.2725370316076
Episode 193: Total Reward = -134.190060800271
Episode 194: Total Reward = -89.0945441105898
Episode 195: Total Reward = -86.21886782377356
Episode 

Episode 348: Total Reward = -117.314507807274
Episode 349: Total Reward = -90.2793376963329
Episode 350: Total Reward = -195.90353413301153
Episode 351: Total Reward = -320.98526678985957
Episode 352: Total Reward = -60.198409627442075
Episode 353: Total Reward = -99.47927821915982
Episode 354: Total Reward = -399.03433679111646
Episode 355: Total Reward = -217.53728177562675
Episode 356: Total Reward = -195.49955866126487
Episode 357: Total Reward = -296.28052993170957
Episode 358: Total Reward = -135.29189759803495
Episode 359: Total Reward = -106.47975218691386
Episode 360: Total Reward = -78.22045107593549
Episode 361: Total Reward = -96.05674978870093
Episode 362: Total Reward = -248.9456583965307
Episode 363: Total Reward = -143.55890558204817
Episode 364: Total Reward = -114.77552614320207
Episode 365: Total Reward = -141.16077016391347
Episode 366: Total Reward = -76.6975576605887
Episode 367: Total Reward = -419.34371950587365
Episode 368: Total Reward = -227.97256524720743
Ep