In [1]:
import gymnasium as gym
import random
from torch import nn, optim, tensor
import numpy as np
import torch

In [None]:
env = gym.make('CartPole-v1', render_mode="human")
env.reset()
for step_index in range(1000):
    env.render()
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    print("Step {}:".format(step_index))
    print("action: {}".format(action))
    print("observation: {}".format(observation))
    print("reward: {}".format(reward))
    if terminated or truncated:
        print("done")
        break

In [2]:
env = gym.make('CartPole-v1')
env.reset()
goal_steps = 500
score_requirement = 60
initial_games = 100000

def generate_training_data():
    training_data = []
    accepted_scores = []
    for game_index in range(initial_games):
        score = 0
        game_memory = []
        previous_observation = []
        for step_index in range(goal_steps):
            action = random.randrange(0, 2)
            observation, reward, terminated, truncated, info = env.step(action)
            
            if len(previous_observation) > 0:
                game_memory.append([previous_observation, action])
                
            previous_observation = observation
            score += reward
            if terminated or truncated:
                break
            
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                output = data[1]
                training_data.append([data[0], output])
        
        env.reset()
    
    return training_data
    
training_data = generate_training_data()

In [3]:
class NN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 2),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        y = self.linear_relu_stack(x)
        return y

def build_model(input_size, output_size):
    model = NN(input_size, output_size)
    
    return model

In [4]:
learning_rate = 0.001
epochs = 20

# Initialize the loss function
loss_fn = nn.CrossEntropyLoss()

def train_loop(model, X, y, loss_fn, optimizer):    
    # Compute prediction and loss
    pred = model(X)
    loss = loss_fn(pred, y)

    # Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

def train_model(training_data):
    # FIXME: these two lines are not clear, make the shape of the training dataset better
    X = tensor([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]))
    y = tensor([i[1] for i in training_data], dtype=torch.long)
    print(X)
    print(y)
    model = build_model(input_size=len(X[0]), output_size=2)
    print(model)
    print(model(X))
    print(loss_fn(model(X),y))
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    for t in range(epochs):
        loss = train_loop(model, X, y, loss_fn, optimizer)
        print(f"Epoch {t+1} - loss: {loss:>7f}")
    print("Done!")
    return model

In [5]:
model = train_model(training_data)

  X = tensor([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]))


tensor([[ 0.0447, -0.1743,  0.0488,  0.3394],
        [ 0.0413,  0.0201,  0.0555,  0.0624],
        [ 0.0417, -0.1757,  0.0568,  0.3721],
        ...,
        [ 0.7221,  0.9716, -0.1520, -0.8402],
        [ 0.7415,  1.1685, -0.1688, -1.1766],
        [ 0.7649,  0.9759, -0.1923, -0.9412]])
tensor([1, 0, 1,  ..., 1, 0, 0])
NN(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=256, bias=True)
    (5): ReLU()
    (6): Linear(in_features=256, out_features=512, bias=True)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=256, bias=True)
    (9): ReLU()
    (10): Linear(in_features=256, out_features=128, bias=True)
    (11): ReLU()
    (12): Linear(in_features=128, out_features=64, bias=True)
    (13): ReLU()
    (14): Linear(in_features=64, out_features=2, bias=True)
    (15): Softmax(dim=1)
  )
)
te

In [9]:
scores = []
choices = []
env = gym.make('CartPole-v1', render_mode="human")
# env = gym.make('CartPole-v1')
for each_game in range(10):
    score = 0
    game_memory = []
    prev_obs = []
    env.reset()
    for _ in range(goal_steps):
        # env.render()
        if len(prev_obs)== 0:
            action = random.randrange(0,2)
        else:
            prev_obs = torch.tensor([prev_obs])
            action = np.argmax(model(prev_obs).detach().numpy()[0])
        choices.append(action)

        new_observation, reward, terminated, truncated,info = env.step(action)
        prev_obs = new_observation
        game_memory.append([new_observation, action])
        score += reward
        if terminated or truncated:
            break
    scores.append(score)
print('Average Score', np.mean(scores))
print('Choice 1 : {}, choice 2: {}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))

Average Score 500.0
Choice 1 : 0.5, choice 2: 0.5
