In [1]:
# Q-Learning in cartpole
# interesting tutorial for this example here
#https://medium.com/swlh/using-q-learning-for-openais-cartpole-v1-4a216ef237df

In [2]:
import numpy as np
import gymnasium as gym
import session_info
import time
import math

In [3]:
# Q-Learning parameters
alpha = 0.1  # Learning rate
gamma = 0.99  # Discount factor
epsilon = 0.9  # Epsilon-greedy policy
epsilon_decay_value = 0.99995

# Sepcific Cartpole parameters
# <cart position, cart velocity, pole angle, pole velocity>

Observation = [30, 30, 50, 50]                            # weights importance of features
np_array_win_size = np.array([0.25, 0.25, 0.01, 0.1])     # steps based -WHAT- PLEASE CORRECT

# Fix these

LEARNING_RATE = 0.1
EPISODES = 60000

gamma = 0.95
epsilon = 1
epsilon_decay_value = 0.9999

Observation = [30, 30, 50, 50]
np_array_win_size = np.array([0.25, 0.25, 0.01, 0.1])

total = 0
total_reward = 0
prior_reward = 0

In [4]:
# Support functions
def choose_action(Q, discrete_state):
    if np.random.random() > epsilon:
        action = np.argmax(Q[discrete_state])               # exploration 
    else:
        action = np.random.randint(0, env.action_space.n)         # explotaition
    return action

def get_discrete_state(state):
    discrete_state = state/np_array_win_size+ np.array([15,10,1,10])
    return tuple(discrete_state.astype(int))

In [5]:
# Initialize the environment and Q table

env = gym.make('CartPole-v1', render_mode='rgb_array')
state, _ = env.reset()
Q = np.random.uniform(low=0, high=1, size=(Observation + [env.action_space.n]))

In [6]:
for episode in range(EPISODES):
    
    t0 = time.time()  
    state, _ = env.reset()
    discrete_state = get_discrete_state(state) 
    done = False
    episode_reward = 0 
       
    while not done: 
        action = choose_action(Q, discrete_state)                    # Exploration or explotaition

        new_state, reward, done, _ , _ = env.step(action) 
        episode_reward += reward                                  # reward by episode
        new_discrete_state = get_discrete_state(new_state)

        if episode % 2000 == 0:                                   # one visualization every 2000 episodes
            env.render()

        if not done: 
            # Q[state, action] += alpha * (reward + gamma * np.max(Q[next_state, :]) - Q[state, action])
            
            max_future_q = np.max(Q[new_discrete_state])
            current_q    = Q[discrete_state + (action,)]
            new_q        = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + gamma * max_future_q)
            Q[discrete_state + (action,)] = new_q

        discrete_state = new_discrete_state

    if epsilon > 0.05:                                            #epsilon modification
        if episode_reward > prior_reward and episode > 10000:
            epsilon = math.pow(epsilon_decay_value, episode - 10000)

    t1 = time.time()                                             #episode has finished
    episode_total = t1 - t0                                      #episode total time
    total = total + episode_total

    total_reward += episode_reward      #episode total reward
    prior_reward = episode_reward

    if (episode % 5000) == 0:           #every 5000 episodes print the average time and the average reward
        mean = total / 1000
        total = 0
        mean_reward = total_reward / 1000
        print("Episode: " + str(episode))
        print("   Epsilon: " + str(epsilon))
        print("   Time Average: " + str(mean))
        print("   Mean Reward: " + str(mean_reward))
        total_reward = 0

env.close()

Episode: 0
   Epsilon: 1
   Time Average: 9.496903419494629e-05
   Mean Reward: 0.042
Episode: 5000
   Epsilon: 1
   Time Average: 0.0011664626598358154
   Mean Reward: 112.547
Episode: 10000
   Epsilon: 1
   Time Average: 0.001163356065750122
   Mean Reward: 112.201
Episode: 15000
   Epsilon: 0.6066368169217945
   Time Average: 0.0015657961368560791
   Mean Reward: 149.308
Episode: 20000
   Epsilon: 0.3680082276450068
   Time Average: 0.003136528253555298
   Mean Reward: 301.191
Episode: 25000
   Epsilon: 0.22313573847218987
   Time Average: 0.007736260890960693
   Mean Reward: 553.293
Episode: 30000
   Epsilon: 0.13534881789285041
   Time Average: 0.009004859209060669
   Mean Reward: 855.701
Episode: 35000
   Epsilon: 0.08208294625097338
   Time Average: 0.012764304161071778
   Mean Reward: 1130.143
Episode: 40000
   Epsilon: 0.04999912418608252
   Time Average: 0.013853009939193726
   Mean Reward: 1124.692
Episode: 45000
   Epsilon: 0.04999912418608252
   Time Average: 0.01339826154

In [7]:
# Testing results from Q Table

env = gym.make('CartPole-v1', render_mode='human')
state, _ = env.reset()
k = 0
# Run the environment for 100 steps
for i in range(1000):
    # Display the current state of the environment
    env.render()
    
    # Choose an action based on the learned Q-network
    discrete_state = get_discrete_state(state)
    action = choose_action(Q, discrete_state)
    
    # Take the chosen action and observe the next state, reward, and termination status
    state, reward, terminated, truncated, _ = env.step(action)
    
    # If the episode is terminated or truncated, reset the environment
    if terminated or truncated:
        print('Terminated', i-k)
        k = i
        state, info = env.reset()

# Close the environment after exploration
env.close()

Terminated 196
Terminated 24
Terminated 86
Terminated 183
Terminated 191
Terminated 169


In [8]:
exit() # closes pygame window
session_info.show(html=False)

-----
gymnasium           0.29.1
numpy               1.26.4
session_info        1.0.0
-----
IPython             8.26.0
jupyter_client      8.6.2
jupyter_core        5.7.2
-----
Python 3.12.3 (main, Sep 11 2024, 14:17:37) [GCC 13.2.0]
Linux-5.15.153.1-microsoft-standard-WSL2-x86_64-with-glibc2.39
-----
Session information updated at 2024-09-20 08:42
