# In [4]:
import numpy as np
import gym

# Hyperparameters
# -- learning signal --
learning_rate = 0.02  # step size used for both the policy and the value table
gamma = 0.99  # discount applied to the bootstrapped next-state value

# -- clipped surrogate objective --
eps_clip = 0.2  # probability ratio is clipped to [1 - eps_clip, 1 + eps_clip]
K_epochs = 4  # passes over the collected batch per update() call

# -- data collection --
# NOTE: despite its name, t_max is used as the number of episodes gathered
# before each update() call, not as a per-episode step limit.
t_max = 200
# Training stops once at least this many environment steps have been
# collected (checked after each batch of episodes).
update_timestep = 2000

# Create environment
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Discretize the state space
n_bins = [16, 16, 16, 16]  # Number of bin edges for each observation dimension

# CartPole reports its velocity components as (effectively) unbounded, so
# building the edges straight from observation_space.low/high yields bins so
# wide that every realistic velocity lands in the same one (or overflows in
# float32). Clamp the bounds to a finite range first. Per the Gym CartPole
# documentation the position (+/-4.8) and angle (+/-~0.418 rad) limits already
# lie inside this bound, so only the velocity dimensions are affected.
# obs_bound is a tunable choice, not a property of the environment.
obs_bound = 5.0
obs_low = np.clip(env.observation_space.low, -obs_bound, obs_bound)
obs_high = np.clip(env.observation_space.high, -obs_bound, obs_bound)
state_bins = [np.linspace(obs_low[i], obs_high[i], n_bins[i]) for i in range(state_dim)]

# Initialize policy and value function parameters
# One row per discretized state: uniform action probabilities and zero value.
policy_table = np.ones((np.prod(n_bins), action_dim)) / action_dim
value_table = np.zeros(np.prod(n_bins))

# Discretize function
def discretize(obs):
    """Convert a continuous observation into one flat table index.

    Each component of ``obs`` is binned against the matching edges in
    ``state_bins``; the per-dimension bin numbers are then combined in
    row-major order, so the result addresses exactly one row of
    ``policy_table`` and one entry of ``value_table``. (Returning the list of
    per-dimension bins would instead fancy-index several unrelated rows of
    those flat tables.)

    Args:
        obs: Sequence with at least ``state_dim`` numeric components.

    Returns:
        int: Flat index in ``[0, product of len(edges) over state_bins)``.
    """
    flat_index = 0
    for i in range(state_dim):
        edges = state_bins[i]
        # np.digitize counts the edges at or below the value, so after the
        # shift the raw bin lies in [-1, len(edges) - 1]. Clamp values below
        # the lowest edge into bin 0 instead of letting -1 wrap around.
        bin_id = int(np.digitize(obs[i], edges)) - 1
        bin_id = min(max(bin_id, 0), len(edges) - 1)
        flat_index = flat_index * len(edges) + bin_id
    return flat_index

# Policy function
def policy(state):
    """Return the greedy action for ``state``.

    Looks up the ``policy_table`` entry selected by ``state`` and returns the
    ``np.argmax`` of it; no random sampling takes place.
    """
    action_scores = policy_table[state]
    return np.argmax(action_scores)

# Value function
def value(state):
    """Return the current ``value_table`` estimate stored for ``state``."""
    estimate = value_table[state]
    return estimate

# Policy / value update over the collected batch
def update():
    """Run ``K_epochs`` passes of clipped-surrogate and TD(0) updates.

    Reads the module-level batch lists ``states_memory``, ``actions_memory``,
    ``rewards_memory`` and ``next_states_memory`` and modifies
    ``policy_table`` and ``value_table`` in place. Returns nothing.

    For every stored transition the policy entry of the taken action is moved
    by one gradient-ascent step on the clipped surrogate objective
    ``min(r * A, clip(r, 1 - eps_clip, 1 + eps_clip) * A)``, where ``r`` is
    the ratio of the current probability to the probability the action had
    when this function was entered, and ``A`` is the one-step TD advantage.
    The affected row is then renormalised so it stays a probability
    distribution.

    NOTE: the batch does not record episode termination, so the last
    transition of an episode still bootstraps from the value of its successor
    state.
    """
    # Freeze the policy that generated the batch. The ratio must compare the
    # current probabilities against these, not against the entry that is
    # being modified.
    old_policy = policy_table.copy()
    # Lower bound that keeps probabilities strictly positive, so the ratio's
    # denominator can never become zero on a later call.
    prob_floor = 1e-8

    batch = list(zip(states_memory, actions_memory, rewards_memory, next_states_memory))

    for _ in range(K_epochs):
        for state, action, reward, next_state in batch:
            # One-step TD target and advantage
            returns = reward + gamma * value(next_state)
            advantage = returns - value(state)

            # Probability ratio between the current and the frozen policy
            old_prob = old_policy[state, action]
            new_prob = policy_table[state, action]
            ratio = new_prob / old_prob

            # Clipped surrogate objective (to be maximised)
            surrogate1 = ratio * advantage
            surrogate2 = np.clip(ratio, 1 - eps_clip, 1 + eps_clip) * advantage

            # Gradient of min(surrogate1, surrogate2) w.r.t. new_prob:
            # advantage / old_prob while the unclipped term is the active
            # minimum, zero once the clipped term takes over.
            grad = np.where(surrogate1 <= surrogate2, advantage / old_prob, 0.0)

            # Ascent step, then project the row back onto valid probabilities.
            policy_table[state, action] = new_prob + learning_rate * grad
            row = np.clip(policy_table[state], prob_floor, None)
            policy_table[state] = row / row.sum(axis=-1, keepdims=True)

            # TD(0) step of the value estimate towards the bootstrapped target
            value_table[state] += learning_rate * advantage

# Main training loop
# Collects batches of t_max episodes, calls update() after each batch, and
# stops once update_timestep environment steps have been gathered in total.
total_timesteps = 0
episode_rewards = []
try:
    while total_timesteps < update_timestep:
        # Fresh batch buffers; update() reads these module-level lists.
        states_memory, actions_memory, rewards_memory, next_states_memory = [], [], [], []
        for episode in range(t_max):
            state = discretize(env.reset())
            done = False
            episode_reward = 0
            while not done:
                states_memory.append(state)

                # Choose the action with policy() (greedy, not sampled)
                action = policy(state)
                actions_memory.append(action)

                # Take action in the environment
                next_state, reward, done, _ = env.step(action)
                next_state = discretize(next_state)
                rewards_memory.append(reward)
                next_states_memory.append(next_state)

                state = next_state
                episode_reward += reward

                # Render the environment
                env.render()

            episode_rewards.append(episode_reward)
            print(f"Episode: {episode + 1}, Reward: {episode_reward}")

        update()
        total_timesteps += len(states_memory)
finally:
    # Release the environment even when the run is cut short, e.g. by a
    # KeyboardInterrupt raised while stepping or rendering.
    env.close()


# Cell output: KeyboardInterrupt