In [8]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to modules on disk are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from ray.rllib.algorithms.ppo import PPOConfig 
import mediapy as media
from ray.tune.registry import register_env
from snake_env_ray import SnakeEnv
import yaml 


In [10]:
def _make_snake_env(env_config):
    """Environment creator: build a SnakeEnv from the config object handed to it."""
    return SnakeEnv(env_config)


# Expose the environment to RLlib under the id "snake-v0".
register_env("snake-v0", _make_snake_env)

In [11]:
# Load only the `env` section of the experiment YAML; it is used below as the
# environment configuration. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open until garbage collection.
with open("SnakeDeepQ.yaml") as config_file:
    configs = yaml.safe_load(config_file)["env"]

In [12]:
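# Reference PPO hyperparameters in YAML form, kept for comparison only:
# every line in this cell is a comment, so none of it is executed.
# NOTE(review): `env: cassie-v0` below suggests this was copied from another
# experiment — confirm before reusing any of these values.

# gamma: 0.99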

# lr: 0.005

# train_batch_size: 100000

# model:
#   fcnet_hiddens: [256, 256]
#   fcnet_activation: swish
#   vf_share_layers: false
#   free_log_std: true 
  
# optimizer: 
#   type: adam

# use_critic: true

# use_gae: true

# lambda_: 0.95

# kl_coeff: 0.2

# sgd_minibatch_size: 20000

# num_sgd_iter: 5

# shuffle_sequences: true

# vf_loss_coeff: 0.5

# entropy_coeff: 0.01

# clip_param: 0.3

# kl_target: 0.2

# vf_clip_param: 100.0

# env: cassie-v0

# disable_env_checking: true

# clip_actions: true


# framework: torch


# num_rollout_workers: 32

# num_envs_per_worker: 1

# rollout_fragment_length: auto

# batch_mode: truncate_episodes

# recreate_failed_workers: true  

# observation_filter: MeanStdFilter  

# evaluation_interval: 10

# evaluation_duration: 10

# num_gpus: 0
# num_cpus_per_worker: 1


In [13]:
# Network settings passed to PPO through the `model` training argument.
model_settings = {
    "fcnet_hiddens": [256, 256],
    "fcnet_activation": "swish",
    "vf_share_layers": False,
    "free_log_std": True,
}

# Assemble the PPO configuration step by step with the builder API.
ppo_config = (
    PPOConfig()
    .environment(env="snake-v0", env_config=configs)
    .resources(num_gpus=0, num_cpus_per_worker=1)
    .rollouts(num_rollout_workers=20, recreate_failed_workers=True)
    .training(
        gamma=0.9,
        kl_coeff=0.3,
        model=model_settings,
        train_batch_size=10000,
        sgd_minibatch_size=2000,
        num_sgd_iter=5,
    )
)
# Select the "MeanStdFilter" observation filter; it is set directly on the
# config object before building.
ppo_config.observation_filter = "MeanStdFilter"

# `trainer` is the built algorithm instance used by the training loop.
trainer = ppo_config.build()
  



[2m[1m[36m(autoscaler +5m43s)[0m Tip: use `ray status` to view detailed cluster status. To disable these messages, set RAY_SCHEDULER_EVENTS=0.






KeyboardInterrupt: 



In [None]:
import os

# --- Settings ---------------------------------------------------------------
fps = 30                    # frame rate of the recorded evaluation videos
checkpoint_frequency = 50   # save a checkpoint every N training iterations
simulation_frequency = 20   # record an evaluation episode every N training iterations
sim_dir = "ray sims"        # root folder; each run writes into its own test_<n> subfolder


def _next_test_index(root):
    """Return one more than the highest <n> among the `test_<n>` entries in `root`.

    Entries that start with "test_" but do not end in an integer (for example
    "test_old") are skipped instead of aborting the cell with a ValueError.
    Returns 1 when `root` contains no numbered entry.
    """
    indices = []
    for entry in os.listdir(root):
        if not entry.startswith("test_"):
            continue
        try:
            indices.append(int(entry.split("_")[-1]))
        except ValueError:
            continue
    return max(indices, default=0) + 1


def _record_episode(algo, eval_env, video_path, video_fps):
    """Play one episode with the current policy and write it to `video_path`.

    Args:
        algo: built algorithm; provides `compute_single_action` and the local
            worker's observation filter for "default_policy".
        eval_env: environment to roll out; `reset()` must return a sequence
            whose first item is the observation, `step()` a 5-tuple.
        video_path: destination of the video file.
        video_fps: frame rate passed to `media.write_video`.
    """
    obs_filter = algo.workers.local_worker().filters["default_policy"]
    obs = eval_env.reset()[0]
    frames = []
    done = False
    while not done:
        # update=False applies the filter without folding evaluation
        # observations into its running statistics.
        # NOTE(review): `compute_single_action` may already apply the local
        # worker's filter internally; if so this call filters twice — confirm
        # against the installed Ray version.
        action = algo.compute_single_action(obs_filter(obs, update=False))
        obs, _, terminated, truncated, _ = eval_env.step(action)
        # Stop on either flag; looking at `terminated` alone never ends an
        # episode that is only truncated.
        done = terminated or truncated
        frames.append(eval_env.render())
    media.write_video(video_path, frames, fps=video_fps)


# Separate environment instance used only for rendering evaluation episodes.
env = SnakeEnv(config=configs)
env.render_mode = "rgb_array"

# Create the output root if needed, then a fresh test_<n> folder for this run.
os.makedirs(sim_dir, exist_ok=True)
max_test_i = _next_test_index(sim_dir)
test_dir = os.path.join(sim_dir, "test_{}".format(max_test_i))
os.makedirs(test_dir, exist_ok=True)

# Continue counting from the algorithm's own iteration counter when it has one.
i = trainer.iteration if hasattr(trainer, "iteration") else 0

# Runs until the cell is interrupted; there is no stop condition in the loop.
while True:
    # Train for one iteration
    result = trainer.train()
    i += 1
    print(
        "Episode {} Reward Mean {}".format(
            i,
            result["episode_reward_mean"]
        )
    )

    # Save a checkpoint every `checkpoint_frequency` iterations
    if i % checkpoint_frequency == 0:
        checkpoint_path = trainer.save()
        print("Checkpoint saved at", checkpoint_path)

    # Record an evaluation episode every `simulation_frequency` iterations
    if i % simulation_frequency == 0:
        video_path = os.path.join(test_dir, "sim_{}.mp4".format(i))
        _record_episode(trainer, env, video_path, fps)
        print("Test saved at", video_path)


Episode 1 Reward Mean 7.1440136964265335
Episode 2 Reward Mean 7.1156700091130105
Test saved at ray sims/test_3/sim_2.mp4
Episode 3 Reward Mean 8.02253512319115
Episode 4 Reward Mean 7.118813108174454
Test saved at ray sims/test_3/sim_4.mp4
Episode 5 Reward Mean 7.836680958245234
Episode 6 Reward Mean 8.76912884245808
Test saved at ray sims/test_3/sim_6.mp4
Episode 7 Reward Mean 8.129996513260192
Episode 8 Reward Mean 9.865301426600558
Test saved at ray sims/test_3/sim_8.mp4
Episode 9 Reward Mean 9.328345619898885
Episode 10 Reward Mean 10.391821383714
Test saved at ray sims/test_3/sim_10.mp4
Episode 11 Reward Mean 11.414136237034793
Episode 12 Reward Mean 17.28870972681886
Test saved at ray sims/test_3/sim_12.mp4
Episode 13 Reward Mean 16.1825666937009
Episode 14 Reward Mean 19.395632861631547
Test saved at ray sims/test_3/sim_14.mp4
Episode 15 Reward Mean 19.105474507873637
Episode 16 Reward Mean 21.345048502209785
Test saved at ray sims/test_3/sim_16.mp4
Episode 17 Reward Mean 23.44

KeyboardInterrupt: 