In [2]:
from gym import Env
import gymnasium as gym
from gymnasium import spaces
from gym.spaces import Discrete, Box
import numpy as np
import random
import pygame

In [3]:
class TrainEnv(Env):
    metadata = {'render.modes': ['human']}
    
    def __init__(self):
        # Actions: accelerate, slow down, keep speed
        self.action_space = Discrete(3)
        # Store current speed, position x, position y, distance left
        self.observation_space = Box(low=np.array([0.0, 0.0, 0.0, 0.0]), high=np.array([1.0, 1000.0, 768.0, 20000.0]))

        # Initial state: [speed, x, y, distance_left]
        self.state = [0.0, 118.0, 110.0, 767.0]
        self.target = np.array([885, 110])

        # Initialize variables for Pygame
        self.screen = None
        self.clock = None
        self.is_pygame_initialized = False  # Flag to check if Pygame is initialized

    def step(self, action):
        reward = 0
        speed, x, y, distance_left = self.state
        current_position = np.array([x, y])
        direction = self.target - current_position
        distance_to_target = np.linalg.norm(direction)
        
        # Constants
        max_speed = 1.0  # Maximum speed
        max_acceleration = 0.05  # Maximum speed increase per step
        max_deceleration = 0.05  # Maximum speed decrease per step

        # Action: 0 = accelerate, 1 = decelerate, 2 = maintain speed
        if action == 0:  # Accelerate
            speed = min(max_speed, speed + max_acceleration)
        elif action == 1:  # Decelerate
            speed = max(0.0, speed - max_deceleration)

        # Move the train
        direction_unit = direction / distance_to_target if distance_to_target > 0 else np.array([0, 0])
        new_position = current_position + speed * direction_unit
        self.state = [speed, new_position[0], new_position[1], distance_to_target]

        if distance_to_target < 0.1:
            if speed < 0.1:  # Close enough and nearly stopped
                reward += 100  # Big reward for stopping at target
            else:  # Close enough but still moving too fast
                reward -= 100  # Penalty for overshooting or moving too fast near the target
            done = True
        else:
            reward -= 0.01  # Small time penalty to encourage faster arrival
            done = False

        return self.state, reward, done, {}

    def reset(self):
        self.state = [0.0, 118.0, 110.0, 767.0]
        return self.state
    
    def test(self):
        print('aaa')

    def render(self, mode='human'):
        if not self.is_pygame_initialized:
            print("Initializing Pygame...")
            pygame.init()  # Initialize Pygame only when necessary
            self.screen = pygame.display.set_mode((1000, 768))
            self.clock = pygame.time.Clock()
            self.is_pygame_initialized = True

        # Clear the screen
        self.screen.fill((0, 0, 0))

        # Draw the target
        pygame.draw.circle(self.screen, (255, 0, 0), (int(self.target[0]), int(self.target[1])), 10)

        # Draw the train (as a small circle)
        train_position = (int(self.state[1]), int(self.state[2]))
        pygame.draw.circle(self.screen, (0, 255, 0), train_position, 10)

        # Update the display
        pygame.display.flip()

        # Cap the frame rate
        self.clock.tick(60)

    def close(self):
        if self.is_pygame_initialized:
            print("Closing Pygame...")
            pygame.quit()
            self.is_pygame_initialized = False



In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from keras.optimizers import Adam  # Change made here
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Environment test
env = TrainEnv()

states = env.observation_space.shape[0]
actions = env.action_space.n

model = Sequential()
model.add(Flatten(input_shape=(1, states)))
model.add(Dense(24, activation="relu"))
model.add(Dense(24, activation="relu"))
model.add(Dense(actions, activation="linear"))

agent = DQNAgent(
    model=model,
    memory=SequentialMemory(limit=50000, window_length=1),
    nb_actions=actions,
    nb_steps_warmup=10,
    target_model_update=0.01
)

agent.compile(Adam(learning_rate=0.001), metrics=["mae"])  # Here, it should work now
agent.fit(env, nb_steps=100000, visualize=False, verbose=1)
# results = agent.test(env, nb_episodes=10, visualize=False)
# print(np.mean(results.history))

# Save the model weights after training
agent.save_weights('dqn_trainenv_weights.h5f', overwrite=True)

  import sre_constants
  from jax import xla_computation as _xla_computation
  import cgi
  from urllib3.contrib.pyopenssl import orig_util_SSLContext as SSLContext
  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Training for 100000 steps ...
Interval 1 (0 steps performed)
    1/10000 [..............................] - ETA: 7:33 - reward: -0.0100

  updates=self.state_updates,
  batch_idxs = np.random.random_integers(low, high - 1, size=size)


   58/10000 [..............................] - ETA: 1:40 - reward: -0.0100

  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=s

1 episodes - episode_reward: -182.790 [-182.790, -182.790] - loss: 4.578 - mae: 49.957 - mean_q: 75.596

Interval 2 (10000 steps performed)
3 episodes - episode_reward: -138.790 [-178.520, -115.580] - loss: 2.046 - mae: 27.302 - mean_q: 41.377

Interval 3 (20000 steps performed)
3 episodes - episode_reward: -130.853 [-139.260, -119.240] - loss: 1.463 - mae: 14.730 - mean_q: 22.310

Interval 4 (30000 steps performed)
3 episodes - episode_reward: -132.843 [-141.360, -126.940] - loss: 1.284 - mae: 7.740 - mean_q: 11.776

Interval 5 (40000 steps performed)
2 episodes - episode_reward: -148.125 [-164.770, -131.480] - loss: 1.082 - mae: 3.190 - mean_q: 4.810

Interval 6 (50000 steps performed)
2 episodes - episode_reward: -144.465 [-145.190, -143.740] - loss: 1.182 - mae: 1.025 - mean_q: 1.400

Interval 7 (60000 steps performed)
1 episodes - episode_reward: -169.490 [-169.490, -169.490] - loss: 1.327 - mae: 0.606 - mean_q: 0.584

Interval 8 (70000 steps performed)
3 episodes - episode_reward

In [17]:
env = TrainEnv()

# Reset the environment to get the initial state
state = env.reset()

# Test rendering without agent interaction
for _ in range(500):
    env.render()

# Close the environment properly
env.close()


Initializing Pygame...
Closing Pygame...


In [6]:
# Assuming the agent has been trained and saved
agent.load_weights('dqn_trainenv_weights.h5f')
env = TrainEnv()
# Visualize one episode with the trained agent
state = env.reset()
done = False
total_reward = 0

while not done:
    action = agent.forward(state)
    state, reward, done, _ = env.step(action)

    # Render the environment with Pygame
    env.render()

    total_reward += reward

print(f"Total reward: {total_reward}")
env.close()  # Ensure to close Pygame when done


Initializing Pygame...
Total reward: 25.72999999999891
Closing Pygame...
