In [2]:
import gymnasium as gym
import math
import torch
import tqdm
from gymnasium import wrappers
from torch.optim import SGD
from torch.nn import Linear, Softmax, Sequential, ReLU

In [3]:
# Policy network: 8 input features -> 4 output probabilities (one per action).
model = Sequential(
    Linear(8, 128),
    ReLU(),
    Linear(128, 128),
    ReLU(),
    Linear(128, 16),
    ReLU(),
    Linear(16, 4),
    # Normalise over the last axis (the action axis). For a single 1-D
    # observation this is identical to dim=0, but it also stays correct when a
    # batch of observations with shape (N, 8) is passed: dim=0 would normalise
    # across the batch instead of across the actions.
    Softmax(dim=-1),
)
# NOTE: if this is enabled, the tensors built in policy() and the training
# loop must be moved to the same device as well.
#model = model.to("cuda")


def policy(observation, model):
    """Sample an action index from the distribution produced by ``model``.

    Args:
        observation: A single observation (array-like) that can be converted
            to a 1-D float32 tensor and fed to ``model``.
        model: Callable mapping that tensor to a 1-D tensor of action
            probabilities.

    Returns:
        int: The sampled action, always in ``[0, number_of_actions - 1]``.
    """
    obs_tensor = torch.tensor(observation, dtype=torch.float32)

    # Sampling is not a training step, so do not build an autograd graph.
    with torch.no_grad():
        action_probabilities = model(obs_tensor)

    # Inverse-CDF sampling: pick the first bucket whose cumulative probability
    # reaches a uniform random draw.
    cumulative = action_probabilities.cumsum(0)
    idx = torch.searchsorted(cumulative, torch.rand(1)).item()

    # Float rounding can leave cumulative[-1] marginally below 1.0; a draw
    # above it would make searchsorted return len(cumulative), which is not a
    # valid action. Cap the result at the last valid index.
    return min(int(idx), cumulative.numel() - 1)

def loss_function(observation, desired_probability_output, model):
    """Mean squared error between the model's output and a target distribution.

    Args:
        observation: Tensor passed straight to ``model``.
        desired_probability_output: Target tensor, broadcastable against the
            model's output.
        model: Callable producing the predicted distribution.

    Returns:
        Scalar tensor holding the element-wise squared error averaged over all
        entries.
    """
    predicted = model(observation)
    residual = desired_probability_output - predicted
    return residual.pow(2).mean()


In [4]:
# Training environment (no render mode requested), reset with a fixed seed.
env = gym.make(id="LunarLander-v2")
observation, info = env.reset(seed=42)

# Plain SGD over every parameter of the policy network.
optimizer = SGD(params=model.parameters(), lr=0.03)

In [5]:
def _sgd_step(state, target, model, optimizer):
    """Take one optimizer step pulling ``model(state)`` towards ``target``."""
    loss = loss_function(torch.tensor(state, dtype=torch.float32), target, model)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


n = 3                 # how many best / worst transitions to learn from per batch
steps_per_batch = 10  # environment steps collected before each update
num_actions = 4       # size of the model's output distribution

for i in tqdm.trange(10000):
    # Each entry is [state the action was chosen in, action, reward received].
    recent_rewards_array = []

    for j in range(steps_per_batch):
        # env.step() returns the *next* observation, so remember the state the
        # action was actually chosen in before it gets overwritten.
        state = observation
        action = policy(state, model)
        observation, reward, terminated, truncated, info = env.step(action)
        recent_rewards_array.append([state, action, reward])

        if terminated or truncated:
            observation, info = env.reset()

    reward_values = torch.tensor([entry[2] for entry in recent_rewards_array])

    # Indices of the n highest-reward and n lowest-reward transitions.
    top_n_indexes = torch.argsort(reward_values, descending=True)[:n].tolist()
    worst_n_indexes = torch.argsort(reward_values, descending=False)[:n].tolist()

    # Reinforce the best moves: the target is a one-hot on the action taken.
    for k in top_n_indexes:
        best_state, best_action, _reward = recent_rewards_array[k]
        desired_prob_dist = torch.tensor([0.0] * num_actions)
        desired_prob_dist[best_action] = 1
        _sgd_step(best_state, desired_prob_dist, model, optimizer)

    # Discourage the worst moves: the target is uniform over the *other*
    # actions and zero on the action taken. The state used here is the one the
    # bad action was taken in, not one of the best states.
    for k in worst_n_indexes:
        worst_state, worst_action, _reward = recent_rewards_array[k]
        desired_prob_dist = torch.tensor([1 / (num_actions - 1)] * num_actions)
        desired_prob_dist[worst_action] = 0
        _sgd_step(worst_state, desired_prob_dist, model, optimizer)

env.close()

100%|██████████| 10000/10000 [01:04<00:00, 154.81it/s]


In [6]:
# Roll out the current policy for 800 steps in a human-rendered environment so
# its behaviour can be watched.
env = gym.make("LunarLander-v2", render_mode="human")
try:
    observation, info = env.reset(seed=42)

    for _ in range(800):
        action = policy(observation, model)
        observation, reward, terminated, truncated, info = env.step(action)

        # Start a fresh episode as soon as the current one ends.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Always release the environment (and its renderer), even if a step or the
    # policy raises part-way through the rollout.
    env.close()