# Grid4x4

In [13]:
import csv
import os
import subprocess

import pyautogui
import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import VecMonitor
from sumo_rl import parallel_env

from evaluate import evaluate

In [14]:
# Identifier of this experiment run. It is interpolated into the output CSV
# directory, the saved-model filename, the render folder and the video name
# further down, so bump it before a new run to avoid overwriting results.
TEST_NUM = 1

## Create Environment and Instantiate Agent

In [15]:
from pettingzoo.test import parallel_api_test
from reward_functions import diff_wait_time

# Both the network and the route file live in the RESCO grid4x4 folder.
scenario_dir = os.path.join("nets", "RESCO", "grid4x4")

# Keyword arguments for sumo_rl.parallel_env; kept in a dict because later
# cells read and reuse them.
env_params = dict(
    net_file=os.path.join(scenario_dir, "grid4x4.net.xml"),
    route_file=os.path.join(scenario_dir, "grid4x4_1.rou.xml"),
    num_seconds=3600,
    reward_fn=diff_wait_time,
    sumo_seed=42,
)
env = parallel_env(**env_params)

# Run the PettingZoo parallel-API test on the raw env before wrapping it.
parallel_api_test(env, num_cycles=10)

# Maybe add frame-stacking here
# Vectorize the multi-agent env, concatenate 8 copies in a single process,
# and wrap the result in VecMonitor.
env = VecMonitor(
    ss.concat_vec_envs_v1(
        ss.pettingzoo_env_to_vec_env_v1(env),
        8,
        num_cpus=1,
        base_class="stable_baselines3",
    )
)

In [16]:
# Using hyperparams from RESCO supplement/appendix Table 5
# https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/f0935e4cd5920aa6c7c996a5ee53a70f-Abstract-round1.html

# Optimization hyperparameters, grouped so they can be inspected or logged
# separately from the policy/env/logging arguments.
ppo_hyperparams = dict(
    learning_rate=2.5e-4,
    n_steps=1024,
    batch_size=256,
    n_epochs=4,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.1,
    ent_coef=1e-3,
    max_grad_norm=0.5,
)

# PPO agent with an MLP policy on the vectorized env; TensorBoard logs go to
# logs/grid4x4.
model = PPO(
    "MlpPolicy",
    env,
    tensorboard_log=os.path.join("logs", "grid4x4"),
    verbose=1,
    **ppo_hyperparams,
)

Using cuda device


In [5]:
# Evaluate untrained random agent
# This is the baseline that the trained agent's reward is compared against.
csv_dir = os.path.join("outputs", "grid4x4", f"test_{TEST_NUM}")
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs(csv_dir, exist_ok=True)

csv_path = os.path.join(csv_dir, "untrained.csv")
tb_log_dir = os.path.join("logs", "grid4x4", "eval_untrained")

# `evaluate` is the project helper imported from evaluate.py; it receives the
# CSV path and TensorBoard directory built above.
mean_untrained_reward, std_untrained_reward = evaluate(
    model, env, csv_path, tb_log_dir, n_eval_episodes=1
)

# Change made to SB3 > common > vec_env > vec_monitor.py > VecMonitor
# Line 76 - added extra item to self.venv.step_wait() return
# obs, rewards, dones, infos --> obs, rewards, dones, _, infos

print(mean_untrained_reward)
print(std_untrained_reward)

-107404.0
0.0


## Train and Evaluate Agent

In [18]:
# Train the PPO agent.
# NOTE(review): the output saved with this cell is a KeyboardInterrupt, so the
# recorded run was stopped before training finished -- re-run to completion
# before relying on the cells below.
# NOTE(review): the model is built with n_steps=1024 on a multi-copy vectorized
# env, so a single rollout presumably already exceeds 3600 timesteps; confirm
# that this budget is intended.
model.learn(total_timesteps=3600)

KeyboardInterrupt: 

In [None]:
# Evaluate trained agent
# `evaluate_policy` was never imported in this notebook (NameError). Use the
# same project `evaluate` helper as the untrained baseline so both numbers are
# produced the same way.
# NOTE(review): the argument order mirrors the untrained-evaluation call
# (model, env, csv_path, tb_log_dir, n_eval_episodes) -- confirm against
# evaluate.py.
trained_csv_dir = os.path.join("outputs", "grid4x4", f"test_{TEST_NUM}")
os.makedirs(trained_csv_dir, exist_ok=True)
trained_csv_path = os.path.join(trained_csv_dir, "trained.csv")
trained_tb_log_dir = os.path.join("logs", "grid4x4", "eval_trained")

mean_trained_reward, std_trained_reward = evaluate(
    model, env, trained_csv_path, trained_tb_log_dir, n_eval_episodes=1
)

print(mean_trained_reward)
print(std_trained_reward)

### Record Results

In [None]:
# Append one summary row for this run to test_results.csv:
# network file, test number, reward function, untrained mean, trained mean.
# The reward function is read from env_params (the wrapped vectorized `env` is
# not the object it was passed to) and logged by name, so the CSV holds a
# stable label instead of a function repr containing a memory address.
reward_fn = env_params["reward_fn"]
data = [
    env_params["net_file"],
    TEST_NUM,
    getattr(reward_fn, "__name__", str(reward_fn)),
    mean_untrained_reward,
    mean_trained_reward,
]

# newline="" is what the csv module expects from the file object so that it
# can control line endings itself.
with open("test_results.csv", "a", encoding="UTF8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(data)

### Save Model

In [None]:
# Persist the model under saved_models/, tagged with the test number.
model_path = os.path.join("saved_models", f"PPO_grid4x4_{TEST_NUM}")
model.save(model_path)

## Render Video

In [None]:
# Build a second, GUI-enabled environment used only for recording.
# Copy the parameters instead of mutating env_params so this cell is
# idempotent and the training configuration stays untouched.
render_params = {**env_params, "use_gui": True}
render_env = parallel_env(**render_params)

# render_env is a PettingZoo parallel env, so validate it with the same
# parallel_api_test used for the training env (SB3's check_env targets
# single-agent Gym environments).
parallel_api_test(render_env, num_cycles=10)

# Maybe add frame-stacking here
render_env = ss.pettingzoo_env_to_vec_env_v1(render_env)
# Wrap render_env itself -- the previous code passed the training `env` here,
# so the GUI environment was never the one being stepped.
render_env = ss.concat_vec_envs_v1(render_env, 2, num_cpus=1, base_class="stable_baselines3")
render_env = VecMonitor(render_env)

In [None]:
# Roll the policy out in the GUI environment and save one screenshot per step.

# SB3-style vectorized envs return only the observations from reset();
# tolerate an (obs, info) tuple as well in case the installed wrapper differs.
reset_result = render_env.reset()
obs = reset_result[0] if isinstance(reset_result, tuple) else reset_result

folder_path = os.path.join("renders", "grid4x4", f"test_{TEST_NUM}")
# makedirs creates missing parent folders too and is a no-op on re-runs
# (os.mkdir failed in both of those cases).
os.makedirs(folder_path, exist_ok=True)

# Number of frames = simulated duration / seconds per environment step.
# Both values are read from the training env, whose configuration this
# notebook reuses for the render env.
max_time = env.unwrapped.env.sim_max_time
delta_time = env.unwrapped.env.delta_time
vid_length = round(max_time / delta_time)

try:
    for i in range(vid_length):
        actions, _ = model.predict(obs)
        # Only the observations are needed; indexing avoids depending on
        # whether step() returns a 4- or 5-element tuple.
        obs = render_env.step(actions)[0]
        # Render the environment that is actually being stepped.
        render_env.render()

        # Capture a fixed screen region, saved as img<i>.jpg.
        # NOTE(review): the region is hard-coded to one monitor layout --
        # adjust it for other screens.
        pyautogui.screenshot(
            os.path.join(folder_path, f"img{i}.jpg"),
            region=(0, 0, 2560, 1542),
        )
finally:
    # Always shut the render env down, even if the rollout fails midway.
    render_env.close()  # clean up

In [None]:
# Stitch the saved screenshots into an mp4 with ffmpeg (5 frames per second).
# Paths are built with os.path.join so the cell also works on non-Windows
# machines; on Windows the resulting strings are the same as before.
frames_pattern = os.path.join("renders", "grid4x4", f"test_{TEST_NUM}", "img%d.jpg")
video_path = os.path.join("videos", f"grid4x4_{TEST_NUM}.mp4")
# Create the output folder up front so ffmpeg has somewhere to write.
os.makedirs(os.path.dirname(video_path), exist_ok=True)

# check=True raises CalledProcessError if ffmpeg exits with a non-zero status,
# so a failed encode is not silently ignored.
subprocess.run(
    ["ffmpeg", "-y", "-r", "5", "-i", frames_pattern, video_path],
    check=True,
)

## Clean-up

In [None]:
# Close the training env first, then the render env.
for vec_env in (env, render_env):
    vec_env.close()