<a href="https://colab.research.google.com/github/aadumbuya/Formative_2_Deep_Q_Learning/blob/main/FORMATIVE_2_Deep_Q.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install stable-baselines3[extra] gymnasium[atari] ale-py



In [None]:
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback
import torch
import os
import ale_py

In [None]:
# Directory that receives the Monitor episode logs (and, later, checkpoints).
log_dir = "./logs/"
os.makedirs(log_dir, exist_ok=True)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train a DQN agent that uses the MLP policy.
print("Training MLPPolicy Model...")
mlp_env = gym.make("ALE/Pong-v5", render_mode="rgb_array")
# Give this Monitor its own file prefix (-> ./logs/mlp.monitor.csv). When the
# bare directory is passed instead, every Monitor in the notebook resolves to
# ./logs/monitor.csv and each new environment truncates the previous log.
mlp_env = Monitor(mlp_env, os.path.join(log_dir, "mlp"))

try:
    mlp_model = DQN(
        "MlpPolicy",
        mlp_env,
        verbose=1,
        learning_rate=1e-4,
        gamma=0.99,
        tensorboard_log="./mlp_dqn_tensorboard/",
        device=device,
    )
    mlp_model.learn(total_timesteps=200000)  # Train for 200k steps
    mlp_model.save("dqn_pong_mlp.zip")
finally:
    # Release the emulator even when training fails or is interrupted.
    mlp_env.close()

Training MLPPolicy Model...
Using cuda device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




Logging to ./mlp_dqn_tensorboard/DQN_4
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 867      |
|    ep_rew_mean      | -21      |
|    exploration_rate | 0.835    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 355      |
|    time_elapsed     | 9        |
|    total_timesteps  | 3467     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0158   |
|    n_updates        | 841      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 930      |
|    ep_rew_mean      | -20.4    |
|    exploration_rate | 0.647    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 361      |
|    time_elapsed     | 20       |
|    total_timesteps  | 7440     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss       

In [None]:
# Build the CnnPolicy agent; the learn() call lives in the next cell.
print("Training CnnPolicy Model...")
cnn_env = gym.make("ALE/Pong-v5", render_mode="rgb_array")
# Use a dedicated Monitor file prefix (-> ./logs/cnn.monitor.csv). Passing the
# bare directory makes every Monitor write ./logs/monitor.csv, so this run
# would overwrite the episode log of any earlier environment.
cnn_env = Monitor(cnn_env, os.path.join(log_dir, "cnn"))

# NOTE(review): exploration_fraction is relative to the total_timesteps given
# to learn(). With the 50,000-step run used later in this notebook, epsilon
# reaches exploration_final_eps after about 5,000 steps, i.e. before
# learning_starts=10000 — consider a larger fraction or a longer run.
cnn_model = DQN(
    "CnnPolicy",
    cnn_env,
    verbose=1,
    buffer_size=1000000,
    learning_starts=10000,
    batch_size=16,
    gamma=0.99,
    learning_rate=1e-4,
    target_update_interval=1000,
    train_freq=(4, "step"),
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    tensorboard_log="./cnn_dqn_tensorboard/",
    device=device,
)

Training CnnPolicy Model...
Using cuda device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




In [None]:
# Snapshot the CNN agent into the log directory every 10,000 callback steps so
# that intermediate models remain available after the run.
checkpoint_callback = CheckpointCallback(
    save_freq=10000,
    save_path=log_dir,
    name_prefix="dqn_checkpoint",
)

# Train, then persist the final weights and release the environment.
cnn_model.learn(
    total_timesteps=50000,
    log_interval=10,
    callback=checkpoint_callback,
)
cnn_model.save("dqn_model.zip")
cnn_env.close()

Logging to ./cnn_dqn_tensorboard/DQN_4
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 900      |
|    ep_rew_mean      | -20.4    |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 10       |
|    fps              | 1048     |
|    time_elapsed     | 8        |
|    total_timesteps  | 8999     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 846      |
|    ep_rew_mean      | -20.6    |
|    exploration_rate | 0.01     |
| time/               |          |
|    episodes         | 20       |
|    fps              | 414      |
|    time_elapsed     | 40       |
|    total_timesteps  | 16914    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000222 |
|    n_updates        | 1728     |
----------------------------------
----------------------------------
| rollout/      

In [None]:
# Evaluate the trained CNN agent on a fresh Pong environment.
eval_env = gym.make("ALE/Pong-v5", render_mode="rgb_array")
# Log evaluation episodes to their own file (-> ./logs/eval.monitor.csv).
# Passing the bare directory would reopen ./logs/monitor.csv in write mode and
# wipe the episode statistics recorded during training.
eval_env = Monitor(eval_env, os.path.join(log_dir, "eval"))
try:
    mean_reward_cnn, std_reward_cnn = evaluate_policy(cnn_model, eval_env, n_eval_episodes=10)
finally:
    # Always release the emulator, even if evaluation raises.
    eval_env.close()
print(f"CnnPolicy - Mean Reward: {mean_reward_cnn} ± {std_reward_cnn}")

CnnPolicy - Mean Reward: -21.0 ± 0.0


Train with hyperparameters

In [None]:
"""Train a DQN agent on Atari Breakout with hyperparameter support."""

import argparse
import gym
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy

def parse_args(argv=None):
    """Parse the training hyperparameters from the command line.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``env``, ``learning_rate``,
        ``nb_steps``, ``nb_steps_warmup``, ``target_model_update``,
        ``memory_limit``, ``window_length``, ``eps`` and ``save_file``.
    """
    parser = argparse.ArgumentParser(description="Train a DQN agent for Atari Breakout using keras-rl")
    parser.add_argument("--env", type=str, default="Breakout-v0", help="Gym environment name")
    parser.add_argument("--learning_rate", type=float, default=0.001, help="Learning rate for the optimizer")
    parser.add_argument("--nb_steps", type=int, default=50000, help="Number of training steps")
    parser.add_argument("--nb_steps_warmup", type=int, default=1000, help="Warmup steps before training begins")
    parser.add_argument("--target_model_update", type=float, default=1e-2, help="Frequency of target model updates")
    parser.add_argument("--memory_limit", type=int, default=50000, help="Limit for SequentialMemory")
    parser.add_argument("--window_length", type=int, default=1, help="Window length for the memory")
    parser.add_argument("--eps", type=float, default=1.0, help="Initial epsilon for exploration")
    parser.add_argument("--save_file", type=str, default="policy.h5", help="Filename to save the trained model weights")
    # Notebook kernels are typically launched with their own arguments (for
    # example "-f <connection file>"). A strict parse_args() would abort on
    # those, so unrecognised arguments are ignored instead.
    args, _unknown = parser.parse_known_args(argv)
    return args

def build_model(input_shape, nb_actions, learning_rate):
    """Build and compile the fully connected Q-network.

    The network flattens an input of shape ``(1,) + input_shape``, applies two
    24-unit ReLU layers and ends in a linear layer with one output per action.

    Args:
        input_shape: Tuple with the shape of a single observation.
        nb_actions: Number of units in the output layer.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        The Sequential model, compiled with Adam and an MSE loss.
    """
    model = Sequential()
    # The leading 1 is a fixed window dimension placed in front of the
    # observation shape.
    model.add(Flatten(input_shape=(1,) + input_shape))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(nb_actions, activation='linear'))
    # Pass the rate positionally: the `lr=` keyword is deprecated in newer
    # Keras releases, while the first positional parameter of Adam is the
    # learning rate under either name.
    model.compile(optimizer=Adam(learning_rate), loss='mse')
    return model

def main():
    """Train a DQN agent with the command-line hyperparameters and save its weights.

    Creates the environment named by ``--env``, builds the Q-network, the
    replay memory and the epsilon-greedy policy, fits the agent for
    ``--nb_steps`` steps and writes the weights to ``--save_file``.
    """
    args = parse_args()

    # Create the gym environment based on the parameter
    env = gym.make(args.env)
    try:
        nb_actions = env.action_space.n

        # Build the model using the specified hyperparameters
        # NOTE(review): build_model hard-codes a leading window dimension of 1,
        # so a --window_length other than 1 looks likely to mismatch the model
        # input — confirm before exposing that option.
        model = build_model(env.observation_space.shape, nb_actions, args.learning_rate)

        # Setup memory and policy using command-line parameters
        memory = SequentialMemory(limit=args.memory_limit, window_length=args.window_length)
        # NOTE(review): no annealing is configured here, so epsilon appears to
        # stay at --eps (default 1.0) for the whole run — confirm this is
        # intended.
        policy = EpsGreedyQPolicy(eps=args.eps)

        dqn = DQNAgent(model=model, memory=memory, policy=policy,
                       nb_actions=nb_actions,
                       nb_steps_warmup=args.nb_steps_warmup,
                       target_model_update=args.target_model_update)

        # Learning rate passed positionally: the `lr=` keyword is deprecated in
        # newer Keras releases.
        dqn.compile(Adam(args.learning_rate), metrics=['mae'])
        dqn.fit(env, nb_steps=args.nb_steps, visualize=False, verbose=2)

        dqn.save_weights(args.save_file, overwrite=True)
    finally:
        # Release the environment even when training fails or is interrupted.
        env.close()

if __name__ == "__main__":
    main()


Play with the trained agent

In [None]:
"""Play Breakout with a trained DQN agent using GreedyQPolicy."""

import argparse
import gym
from keras.models import Sequential
from keras.layers import Dense, Flatten
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import GreedyQPolicy

def parse_args(argv=None):
    """Parse the playback options from the command line.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``env``, ``model_file``,
        ``episodes``, ``memory_limit`` and ``window_length``.
    """
    parser = argparse.ArgumentParser(description="Play Breakout with a trained DQN agent")
    parser.add_argument("--env", type=str, default="Breakout-v0", help="Gym environment name")
    parser.add_argument("--model_file", type=str, default="policy.h5", help="File with trained model weights")
    parser.add_argument("--episodes", type=int, default=5, help="Number of episodes to play")
    parser.add_argument("--memory_limit", type=int, default=50000, help="Memory limit for the agent")
    parser.add_argument("--window_length", type=int, default=1, help="Window length for the memory")
    # Notebook kernels are typically launched with their own arguments (for
    # example "-f <connection file>"). A strict parse_args() would abort on
    # those, so unrecognised arguments are ignored instead.
    args, _unknown = parser.parse_known_args(argv)
    return args

def build_model(input_shape, nb_actions):
    """Assemble the (uncompiled) Q-network used for playback.

    Args:
        input_shape: Tuple with the shape of a single observation.
        nb_actions: Number of units in the linear output layer.

    Returns:
        A Sequential model: Flatten over ``(1,) + input_shape``, two 24-unit
        ReLU layers and a linear output layer.
    """
    layers = [
        Flatten(input_shape=(1,) + input_shape),
        Dense(24, activation='relu'),
        Dense(24, activation='relu'),
        Dense(nb_actions, activation='linear'),
    ]
    return Sequential(layers)

def build_agent(model, nb_actions, memory_limit, window_length):
    """Wrap ``model`` in a DQNAgent that uses a greedy policy.

    Args:
        model: Q-network passed to the agent.
        nb_actions: Number of discrete actions.
        memory_limit: ``limit`` of the agent's SequentialMemory.
        window_length: ``window_length`` of the agent's SequentialMemory.

    Returns:
        The constructed DQNAgent.
    """
    return DQNAgent(
        model=model,
        memory=SequentialMemory(limit=memory_limit, window_length=window_length),
        policy=GreedyQPolicy(),
        nb_actions=nb_actions,
    )

def main():
    """Load trained weights and play ``--episodes`` rendered episodes.

    Builds the environment, the network and the agent, loads the weights from
    ``--model_file`` and prints the total reward of every episode.
    """
    # Local import: the optimizer is only needed to satisfy DQNAgent.compile().
    from keras.optimizers import Adam

    args = parse_args()

    # Create the gym environment
    env = gym.make(args.env)
    try:
        nb_actions = env.action_space.n

        # Build model and agent matching the training architecture
        model = build_model(env.observation_space.shape, nb_actions)
        dqn = build_agent(model, nb_actions, args.memory_limit, args.window_length)

        # keras-rl creates the agent's target network inside compile(), and
        # load_weights() copies the loaded weights into that network, so the
        # agent has to be compiled before the weights are loaded.
        dqn.compile(Adam())

        # Load the trained weights
        dqn.load_weights(args.model_file)

        # Run episodes
        # NOTE(review): reset() returning the observation and step() returning
        # a 4-tuple assume the legacy gym API — confirm the installed version.
        for episode in range(args.episodes):
            state = env.reset()
            done = False
            total_reward = 0
            while not done:
                env.render()
                action = dqn.forward(state)
                state, reward, done, _ = env.step(action)
                total_reward += reward
            print(f"Episode {episode + 1}: Total Reward = {total_reward}")
    finally:
        # Release the environment even if loading or playback fails.
        env.close()

if __name__ == "__main__":
    main()
