In [1]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

def train_and_save_model(env_id, total_timesteps, save_path):
    vec_env = make_vec_env(env_id)
    model = PPO("MlpPolicy", vec_env, verbose=1)
    model.learn(total_timesteps=total_timesteps)
    model.save(save_path)

def load_and_test_model(env_id, model_path):
    vec_env = make_vec_env(env_id)
    model = PPO.load(model_path)
    obs = vec_env.reset()
    done = False
    scores = 0
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, info = vec_env.step(action)
        scores += rewards
        vec_env.render("human")
    print('Score: {}'.format(scores[0]))

train_and_save_model(env_id="CartPole-v1", total_timesteps=10000, save_path="ppo_cartpole")

load_and_test_model(env_id="CartPole-v1", model_path="ppo_cartpole")




Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.7     |
|    ep_rew_mean     | 22.7     |
| time/              |          |
|    fps             | 3550     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 24.4        |
|    ep_rew_mean          | 24.4        |
| time/                   |             |
|    fps                  | 1930        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008228056 |
|    clip_fraction        | 0.0621      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | 0.00191     |
|    learning



Score: 265.0


In [2]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

def train_model(env_id, num_envs, total_timesteps, save_path):
    vec_env = make_vec_env(env_id, n_envs=num_envs)
    model = PPO('MlpPolicy', vec_env)
    model.learn(total_timesteps=total_timesteps)
    model.save(save_path)

def test_model(env_id, model_path, render=True):
    vec_env = make_vec_env(env_id)
    model = PPO.load(model_path)
    obs = vec_env.reset()
    done = False
    scores = 0
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, info = vec_env.step(action)
        scores += rewards
        if render:
            vec_env.render("human")
    print('Score: {}'.format(scores[0]))

if __name__ == "__main__":
    train_model(env_id="MsPacman-v0", num_envs=4, total_timesteps=10000, save_path="ppo_pacman")
    test_model(env_id="MsPacman-v4", model_path="ppo_pacman")


  logger.deprecation(
  logger.warn(


Score: 280.0


In [1]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.env_util import make_atari_env


def process_observation(observation):
    observation = observation[:, :, 0] * 0.2989 + observation[:, :, 1] * 0.587 + observation[:, :, 2] * 0.114
    resized_observation = torch.tensor(observation).unsqueeze(0).unsqueeze(0)
    resized_observation = nn.functional.interpolate(resized_observation, size=(84, 84))
    return resized_observation


env_name = 'PongNoFrameskip-v4'
env = make_atari_env(env_name, n_envs=1, seed=0)
env = VecFrameStack(env, n_stack=4)


class CustomCNN(nn.Module):
    def __init__(self, num_actions):
        super(CustomCNN, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )
        self.linear_layers = nn.Sequential(
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, num_actions)
        )

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        return x


num_actions = env.action_space.n
model = CustomCNN(num_actions)
optimizer = optim.Adam(model.parameters(), lr=1e-4)


agent = PPO("CnnPolicy", env, verbose=1, tensorboard_log="./ppo_space_invaders_tensorboard/", device="cuda")
agent.learn(total_timesteps=5000)  


agent.save("ppo_space_invaders_weights")


A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


Using cpu device
Wrapping the env in a VecTransposeImage.
Logging to ./ppo_space_invaders_tensorboard/PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.71e+03 |
|    ep_rew_mean     | -20      |
| time/              |          |
|    fps             | 239      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.52e+03   |
|    ep_rew_mean          | -20.2      |
| time/                   |            |
|    fps                  | 65         |
|    iterations           | 2          |
|    time_elapsed         | 62         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.00930294 |
|    clip_fraction        | 0.0408     |
|    clip_range           | 0.2        |
|    entropy_loss  

In [None]:
import gym


env = gym.make('PongNoFrameskip-v4')
observation = env.reset()

for _ in range(1000):
    env.render()

    action = env.action_space.sample()

   
    observation, reward, done, info = env.step(action)

   
    if done:
        break


env.close()


In [20]:
import numpy as np
import gym
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

class DQNAgent:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.memory = []
        self.epsilon, self.epsilon_decay, self.epsilon_min = 1.0, 0.995, 0.01
        self.gamma, self.learning_rate = 0.99, 0.001
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_shape=(self.state_dim,), activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_dim, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def select_action(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_dim)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def store_experience(self, state, action, reward, next_state, done):
        experience = (state, action, reward, next_state, done)
        self.memory.append(experience)

def train_agent_in_environment(environment, agent, episodes, batch_size=32):
    for episode in range(episodes):
        state = environment.reset()
        done = False
        total_reward = 0

        while not done:
            if np.random.rand() < agent.epsilon:
                action = environment.action_space.sample()
            else:
                state_input = np.expand_dims(state, axis=0)
                action = agent.select_action(state_input)

            next_state, reward, done, _ = environment.step(action)
            agent.store_experience(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

            if agent.epsilon > agent.epsilon_min:
                agent.epsilon *= agent.epsilon_decay

        print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {agent.epsilon}")

    environment.close()

def train_dqn_agent(environment_name, num_episodes):
    gym_env = gym.make(environment_name)
    state_space = gym_env.observation_space.shape
    action_space = gym_env.action_space.n
    state_dim = state_space[0]
    dqn_agent = DQNAgent(state_dim, action_space)
    
    train_agent_in_environment(gym_env, dqn_agent, episodes=num_episodes)
    
    gym_env.close()

if __name__ == "__main__":
    environment_name = 'CartPole-v1'
    num_episodes = 30
    train_dqn_agent(environment_name, num_episodes)


Episode: 1, Total Reward: 12.0, Epsilon: 0.9416228069143757
Episode: 2, Total Reward: 31.0, Epsilon: 0.8061065909263957
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode: 3, Total Reward: 13.0, Epsilon: 0.7552531090661897
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Episode: 4, Total Reward: 38.0, Epsilon: 0.6242658676435396
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Episode: 5, Total Reward: 33.0, Epsilon: 0.5290920728090721
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━