In [2]:
import gymnasium as gym


# Create the CartPole-v1 environment used for training.
env = gym.make("CartPole-v1")
# NOTE: reset() returns an (observation, info) tuple, so `state` holds that
# whole tuple here rather than a bare observation array.
state = env.reset()

In [3]:
# `state` is the raw return value of env.reset(), so this prints the
# (observation, info) tuple, not just the observation vector.
print("Initial state: ", state)

Initial state:  (array([ 0.03787199,  0.00376896, -0.00967403, -0.00565756], dtype=float32), {})


In [4]:
import tensorflow as tf
from tensorflow.keras import layers

# Factory for the Q-network architecture
def create_model(input_shape, num_actions):
    """Build an uncompiled fully connected network.

    Args:
        input_shape: Shape of a single input sample, passed to the input layer.
        num_actions: Number of units in the output layer (one per action).

    Returns:
        A `tf.keras.Sequential` model: input layer, two 128-unit ReLU hidden
        layers, and a linear output layer with `num_actions` units.
    """
    q_network = tf.keras.Sequential()
    q_network.add(layers.InputLayer(shape=input_shape))

    # Two identical hidden layers.
    for hidden_units in (128, 128):
        q_network.add(layers.Dense(hidden_units, activation='relu'))

    # Linear head: one unbounded output per action, read as that action's Q-value.
    q_network.add(layers.Dense(num_actions, activation='linear'))
    return q_network

In [5]:
# Size the network from the environment: the input matches the observation
# shape and there is one output per available action.
input_shape = env.observation_space.shape
num_actions = env.action_space.n
# Online network: the one that is trained and used to select actions.
model = create_model(input_shape, num_actions)

# Display the layer-by-layer architecture.
model.summary()

In [6]:
import time
import numpy as np

# Micro-benchmark: average latency of `model.predict` on a single observation.
n_runs = 1000

# One random observation with a leading batch dimension of 1, shaped from the
# environment's observation space instead of a hard-coded size.
sample_state = np.random.random((1, *input_shape)).astype(np.float32)

# perf_counter() is the monotonic, high-resolution clock intended for timing
# short durations; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
for _ in range(n_runs):
    model.predict(sample_state, verbose=0)
end_time = time.perf_counter()

print(f"Average inference time: {(end_time - start_time) / n_runs:.6f} seconds")

Average inference time: 0.042786 seconds


In [7]:
from collections import deque


# --- Hyperparameters ---------------------------------------------------------
learning_rate = 0.0002  # Step size handed to the Adam optimizer
gamma = 0.95  # Discount factor for future rewards
epsilon = 1.0  # Initial exploration rate (probability of a random action)
epsilon_min = 0.1  # Exploration rate below which decay stops
# Multiplicative decay; the training loop applies it once per episode.
# NOTE(review): 0.999 ** 500 is about 0.61, so with num_episodes = 500 epsilon
# never gets close to epsilon_min -- confirm this is intended.
epsilon_decay = 0.999
batch_size = 32  # Number of transitions sampled per training step
memory_size = 5000  # Capacity of the replay buffer
num_episodes = 500  # Number of training episodes
max_steps_per_episode = 500  # Upper bound on steps taken within one episode

# Bounded replay buffer: once full, appending drops the oldest transition.
replay_memory = deque(maxlen=memory_size)
# Target network: same architecture as `model`, initialised with its weights.
target_model = create_model(input_shape, num_actions)
target_model.set_weights(model.get_weights())

In [8]:
# Train the online network with Adam at the configured learning rate, using
# mean squared error as the loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='mse')

# The target model is not compiled because its weights are never trained
# directly; they are only copied over from `model` via set_weights().

In [9]:
# Epsilon-greedy policy for action selection
def choose_action(state, epsilon):
    """Pick an action with an epsilon-greedy rule over the online network.

    Args:
        state: A single observation as a NumPy array (no batch dimension).
        epsilon: Probability of taking a uniformly random action.

    Returns:
        The chosen action index as a plain Python int.
    """
    if np.random.rand() < epsilon:
        return int(np.random.choice(num_actions))  # Explore

    # Call the model directly instead of `model.predict`: predict() is built
    # for large batched inputs and adds substantial per-call overhead, which
    # dominates when evaluating one observation per environment step.
    q_values = model(state[np.newaxis], training=False)
    return int(np.argmax(np.asarray(q_values)[0]))  # Exploit

In [10]:
import random

# Train the model using a batch of experiences
def train_model():
    """Run one Q-learning update on a random minibatch from the replay buffer.

    Does nothing until `replay_memory` holds at least `batch_size` transitions.
    Each stored transition is a (state, action, reward, next_state, flag)
    tuple; a truthy flag disables bootstrapping from `next_state`.

    Returns:
        None. The online `model` is updated in place.
    """
    if len(replay_memory) < batch_size:
        return  # Wait until enough experiences are stored

    batch = random.sample(replay_memory, batch_size)
    states, actions, rewards, next_states, dones = map(np.array, zip(*batch))

    # Bellman targets from the target network:
    #   r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed
    #   wherever the stored flag is set.
    next_q_values = target_model.predict_on_batch(next_states)
    max_next_q_values = np.max(next_q_values, axis=1)
    not_done = 1.0 - dones.astype(np.float32)
    target_q_values = rewards + gamma * max_next_q_values * not_done

    # Start from the current predictions so only the taken action's output
    # contributes to the loss; np.array() guarantees a writable copy.
    q_values = np.array(model.predict_on_batch(states))
    q_values[np.arange(len(actions)), actions] = target_q_values

    # Exactly one gradient step on the whole sampled batch, independent of
    # Keras' default fit() batch size.
    model.train_on_batch(states, q_values)


In [11]:
import os

# Training loop
# Make sure the checkpoint directory exists before the first save is attempted,
# so a long run does not abort on a missing folder.
os.makedirs("../model", exist_ok=True)

for episode in range(num_episodes):
    state, _ = env.reset()
    total_reward = 0

    for step in range(max_steps_per_episode):
        action = choose_action(state, epsilon)  # Choose action
        next_state, reward, terminated, truncated, _ = env.step(action)

        # The episode ends on either signal ...
        done = terminated or truncated

        # ... but only a true termination is stored as the terminal flag.
        # A time-limit truncation is not a terminal state, so the Q-target
        # for that transition must still bootstrap from next_state.
        replay_memory.append((state, action, reward, next_state, terminated))

        # Train the model
        train_model()

        state = next_state
        total_reward += reward

        if done:
            break

    # Decay exploration once per episode until the floor is reached
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    # Sync weights with the target model periodically
    if episode % 20 == 0:
        target_model.set_weights(model.get_weights())

    # Checkpoint any episode that scored well
    if total_reward >= 400:
        model.save(f"../model/dqn-model-{episode}.keras")

    print(f"Episode {episode + 1}: Total Reward = {total_reward}")


env.close()

Episode 1: Total Reward = 35.0
Episode 2: Total Reward = 29.0
Episode 3: Total Reward = 16.0
Episode 4: Total Reward = 29.0
Episode 5: Total Reward = 10.0
Episode 6: Total Reward = 13.0
Episode 7: Total Reward = 14.0
Episode 8: Total Reward = 15.0
Episode 9: Total Reward = 11.0
Episode 10: Total Reward = 20.0
Episode 11: Total Reward = 16.0
Episode 12: Total Reward = 20.0
Episode 13: Total Reward = 18.0
Episode 14: Total Reward = 19.0
Episode 15: Total Reward = 20.0
Episode 16: Total Reward = 20.0
Episode 17: Total Reward = 31.0
Episode 18: Total Reward = 19.0
Episode 19: Total Reward = 21.0
Episode 20: Total Reward = 26.0
Episode 21: Total Reward = 18.0
Episode 22: Total Reward = 33.0
Episode 23: Total Reward = 18.0
Episode 24: Total Reward = 31.0
Episode 25: Total Reward = 23.0
Episode 26: Total Reward = 20.0
Episode 27: Total Reward = 22.0
Episode 28: Total Reward = 16.0
Episode 29: Total Reward = 36.0
Episode 30: Total Reward = 25.0
Episode 31: Total Reward = 26.0
Episode 32: Total

In [12]:
import os

# Persist the final trained network. The target folder is created first so the
# save does not depend on the directory already existing.
os.makedirs("../model", exist_ok=True)
model.save("../model/dqn-model.keras")

In [13]:
import pygame


def test(episodes=10):
    """Run the trained network greedily in a rendered CartPole environment.

    Args:
        episodes: Number of evaluation episodes to play (default 10).

    Prints one line per episode with the number of steps survived. The
    environment and pygame are shut down even if an episode raises.
    """
    test_env = gym.make("CartPole-v1", render_mode="human")
    try:
        for e in range(episodes):
            state, _ = test_env.reset()
            done = False
            i = 0

            while not done:
                # With render_mode="human" the environment draws itself on
                # every step, so no explicit render() call is needed.
                # Direct model call avoids predict()'s per-call overhead.
                q_values = model(state[np.newaxis], training=False)
                action = int(np.argmax(np.asarray(q_values)))
                state, reward, terminated, truncated, _ = test_env.step(action)

                done = terminated or truncated
                i += 1

            print("episode: {}/{}, score: {}".format(e, episodes, i))
    finally:
        test_env.close()
        pygame.quit()

In [14]:
# Watch the trained agent play ten rendered evaluation episodes.
test()

2024-12-23 17:34:51.078 Python[29485:9695063] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to /var/folders/hc/_rm2mjw14y9_r7933xkcw3p80000gn/T/org.python.python.savedState


episode: 0/10, score: 210
episode: 1/10, score: 316
episode: 2/10, score: 215
episode: 3/10, score: 305
episode: 4/10, score: 243
episode: 5/10, score: 290
episode: 6/10, score: 218
episode: 7/10, score: 221
episode: 8/10, score: 257
episode: 9/10, score: 217


: 