In [2]:
import gym
import numpy as np

# Create CartPole environment
env = gym.make('CartPole-v1')

# Define parameters
num_episodes = 1000
max_steps_per_episode = 500
learning_rate = 0.1
discount_rate = 0.99
epsilon = 0.1

# Discretize the observation space
num_bins = (6, 12, 6, 12)  # (cart position, cart velocity, pole angle, pole velocity)
num_actions = env.action_space.n

# Initialize Q-table with zeros
q_table = {}


def discretize(observation):
    # Discretization logic...
    discretized_values = tuple(int(obs * num_bins[i]) for i, obs in enumerate(observation))
    return discretized_values

    # discretized_values = []
    # for i, obs in enumerate(observation):
    #     discretized_value = int(obs * num_bins[i])
    #     discretized_values.append(discretized_value)

    # discretized_values = tuple(discretized_values)



# Helper function to select an action using epsilon-greedy policy
def choose_action(state):
    if np.random.uniform(0, 1) < epsilon:
        return np.random.choice(num_actions)
    else:
        return np.argmax(q_table[state])


# Q-learning algorithm
for episode in range(num_episodes):
    observation = env.reset()
    state = discretize(observation)
    done = False
    total_reward = 0

    print(f"State Awal: {observation}, Discret State: {state}")

    for step in range(max_steps_per_episode):
        env.render()  # Render the environment

        if state not in q_table:
            q_table[state] = np.zeros(num_actions)

        action = choose_action(state)
        next_observation, reward, done, _ = env.step(action)
        next_state = discretize(next_observation)

        if next_state not in q_table:
            q_table[next_state] = np.zeros(num_actions)

        # Update Q-value
        q_table[state][action] += learning_rate * (
            reward + discount_rate * np.max(q_table[next_state]) - q_table[state][action])

        total_reward += reward
        state = next_state

        if done:
            break

    if episode % 100 == 0:
        print(f"Episode: {episode}, Total Reward: {total_reward}")

print("Training finished.")
env.close()  # Close the environment after training


State Awal: [-0.01412086  0.0054601  -0.01491774  0.02925865], Discret State: (0, 0, 0, 0)
Episode: 0, Total Reward: 10.0
State Awal: [0.01590572 0.04053634 0.02004493 0.00258908], Discret State: (0, 0, 0, 0)
State Awal: [ 0.04615938 -0.03041425 -0.0092471  -0.0350586 ], Discret State: (0, 0, 0, 0)
State Awal: [ 0.00646361  0.0448469  -0.0224766   0.00698196], Discret State: (0, 0, 0, 0)
State Awal: [-0.04783678 -0.02852765 -0.04824876  0.03746404], Discret State: (0, 0, 0, 0)
State Awal: [-0.01981498  0.03974251  0.0316823  -0.02486813], Discret State: (0, 0, 0, 0)
State Awal: [-0.03643712  0.04877403 -0.03783381  0.04084685], Discret State: (0, 0, 0, 0)
State Awal: [-0.01555637 -0.03336356  0.02949278 -0.03405107], Discret State: (0, 0, 0, 0)
State Awal: [-0.03867324  0.02480264  0.00930715 -0.02354019], Discret State: (0, 0, 0, 0)
State Awal: [-0.04692174  0.04259641 -0.04323025  0.01407987], Discret State: (0, 0, 0, 0)
State Awal: [ 0.004784   -0.00516428  0.04478467  0.02614135], 