## Importing dependencies

In [2]:
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import  VecMonitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback
import metadrive


### Creating a MetaDrive environment

In [2]:
config = {
    "use_render": False,
    "manual_control": False,
    "traffic_density": 0.3,
    "map": "X",   
    "random_lane_width": True,
    "random_agent_model": True,
    "random_traffic": True,
    "map": 5,  
    "num_agents": 1,
    "allow_respawn": True
}

env = metadrive.MetaDriveEnv(config)

[38;20m[INFO] Environment: MetaDriveEnv[0m
[38;20m[INFO] MetaDrive version: 0.4.3[0m
[38;20m[INFO] Sensors: [lidar: Lidar(), side_detector: SideDetector(), lane_line_detector: LaneLineDetector()][0m
[38;20m[INFO] Render Mode: none[0m
[38;20m[INFO] Horizon (Max steps per agent): 1000[0m


## Exploring the environment

In [11]:
print(env.observation_space)   
print(env.action_space)        

Box(-0.0, 1.0, (261,), float32)
Box(-1.0, 1.0, (2,), float32)


In [24]:
obs = env.reset()
for step in range(10):
    action = env.action_space.sample()   # random action
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

    if done:
        obs, info = env.reset()


[38;20m[INFO] Assets version: 0.4.3[0m
[38;20m[INFO] Known Pipes: wglGraphicsPipe[0m
[38;20m[INFO] Start Scenario Index: 0, Num Scenarios : 1[0m


In [9]:
env.close()

## Training a PPO model 

In [3]:
model= PPO("MlpPolicy", env, verbose=1,tensorboard_log="./tb_logs",)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [4]:
ckpt_cb = CheckpointCallback(save_freq=100000, save_path="./checkpoints/", name_prefix="ppo_metadrive")

In [None]:
model.learn(total_timesteps=1000000, callback=ckpt_cb)


[38;20m[INFO] Assets version: 0.4.3[0m
[38;20m[INFO] Known Pipes: wglGraphicsPipe[0m
[38;20m[INFO] Start Scenario Index: 0, Num Scenarios : 1[0m


Logging to ./tb_logs\PPO_5
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 434      |
|    ep_rew_mean     | 0.834    |
| time/              |          |
|    fps             | 15       |
|    iterations      | 1        |
|    time_elapsed    | 132      |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 622          |
|    ep_rew_mean          | 4.02         |
| time/                   |              |
|    fps                  | 13           |
|    iterations           | 2            |
|    time_elapsed         | 294          |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0057820645 |
|    clip_fraction        | 0.0581       |
|    clip_range           | 0.2          |
|    entropy_loss         | -2.84        |
|    explained_variance   | 0.0

The model stopped training after 700k steps, so i had to load the last model from checkpoints and continue training it.

In [5]:

model = PPO.load("./checkpoints/ppo_metadrive_700000_steps", env=env, tensorboard_log="./tb_logs")

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [6]:
model.learn(total_timesteps=200000, callback=ckpt_cb)

[38;20m[INFO] Assets version: 0.4.3[0m
[38;20m[INFO] Known Pipes: wglGraphicsPipe[0m
[38;20m[INFO] Start Scenario Index: 0, Num Scenarios : 1[0m


Logging to ./tb_logs\PPO_6
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 81.4     |
|    ep_rew_mean     | 56.7     |
| time/              |          |
|    fps             | 9        |
|    iterations      | 1        |
|    time_elapsed    | 217      |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 84.4        |
|    ep_rew_mean          | 61.1        |
| time/                   |             |
|    fps                  | 9           |
|    iterations           | 2           |
|    time_elapsed         | 442         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007942596 |
|    clip_fraction        | 0.138       |
|    clip_range           | 0.2         |
|    entropy_loss         | 0.125       |
|    explained_variance   | 0.874       |
|  

<stable_baselines3.ppo.ppo.PPO at 0x2a56083b200>

In [8]:
model.save("ppo_highway")

## Visualizing how the model is performing

In [3]:
config = {
    "use_render": True,
    "manual_control": False,
    "traffic_density": 0.3,
    "map": "X",   
    "random_lane_width": True,
    "random_agent_model": True,
    "random_traffic": True,
    "map": 5,  
    "num_agents": 1,
}

nenv = metadrive.envs.MetaDriveEnv(config)
model = PPO.load("ppo_highway")

[38;20m[INFO] Environment: MetaDriveEnv[0m
[38;20m[INFO] MetaDrive version: 0.4.3[0m
[38;20m[INFO] Sensors: [lidar: Lidar(), side_detector: SideDetector(), lane_line_detector: LaneLineDetector(), main_camera: MainCamera(1200, 900), dashboard: DashBoard()][0m
[38;20m[INFO] Render Mode: onscreen[0m
[38;20m[INFO] Horizon (Max steps per agent): 1000[0m


In [4]:
for episode in range(2):  
    obs, info = nenv.reset()
    done = False
    total_reward = 0

    while not done:
        action, _ = model.predict(obs, deterministic=True)  
        obs, reward, terminated, truncated, info = nenv.step(action)
        total_reward += reward
        nenv.render(mode="human")   

    print(f"Episode {episode} reward: {total_reward}")

[38;20m[INFO] Assets version: 0.4.3[0m
[38;20m[INFO] Known Pipes: wglGraphicsPipe[0m
[38;20m[INFO] Start Scenario Index: 0, Num Scenarios : 1[0m


* interrupt by keyboard


KeyboardInterrupt: 

In [18]:
nenv.close()

## Using tensorboard to see metrics

In [9]:
#!tensorboard --logdir=./tb_logs   