In [1]:
import numpy as np
import pandas
import gym
import ray
from ray import tune, rllib, air
#from ray.tune.logger import pretty_print
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.utils.pre_checks.env import check_env
from IPython.display import display
print(f"ray: {ray.__version__}")
print(f"gym: {gym.__version__}")

ray: 2.2.0
gym: 0.21.0


In [6]:
# define the cartpole environment
env = gym.make("CartPole-v1")

In [8]:
# inspect the environment
print("action space: ", env.action_space)
print("observation space: ", env.observation_space)
num_episodes = 5
total_reward = 0

for ep in range(num_episodes):
    obs = env.reset()
    done = False
    while True:
        action = env.action_space.sample()
        new_obs, reward, done, info = env.step(action)
        total_reward += reward
        
        if done:
            break
        
        #print(f"episode: {ep}")
        #print(f"obs: {new_obs}, reward: {total_reward}, done: {done}")
        env.render()
env.close()

action space:  Discrete(2)
observation space:  Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)


In [5]:
print("checking environment ...")
try:
    check_env(env)
    print("All checks passed. No errors found.")
except:
    print("failed")

checking environment ...
All checks passed. No errors found.


In [11]:
# calculate the environment baseline
num_episodes = 3000
num_timesteps = 0
episode_rewards = []

for ep in range(num_episodes):
    obs = env.reset()
    done = False
    episode_reward = 0.0

    while True:
        action = env.action_space.sample()
        new_obs, reward, done, info = env.step(action)
        episode_reward += reward
        num_timesteps += 1

        if done:
            episode_rewards.append(episode_reward)
            break

# calculate mean_reward
env_mean_random_reward = np.mean(episode_rewards)
env_sd_reward = np.std(episode_rewards)
# calculate number of wins
total_reward = np.sum(episode_rewards)

print()
print("**************")
print(f"Baseline Mean Reward={env_mean_random_reward:.2f}+/-{env_sd_reward:.2f}", end="")
print()
print(f"Baseline won {total_reward} times over {num_episodes} episodes ({num_timesteps} timesteps)")
print(f"Approx {total_reward/num_episodes:.2f} wins per episode")
print("**************")


**************
Baseline Mean Reward=22.25+/-11.66
Baseline won 66745.0 times over 3000 episodes (66745 timesteps)
Approx 22.25 wins per episode
**************


In [26]:
# configure and build the algorithm
config = (PPOConfig()
          .environment("CartPole-v1")
          .rollouts(num_rollout_workers=2)
          .evaluation(evaluation_interval=15, evaluation_duration=5, evaluation_num_workers=1)
)
algo = config.build()




In [27]:
# train the agent using Tuner
stop_criteria = dict(time_total_s = 120) # trian for 2 min
tuner = tune.Tuner(
    config.algo_class,
    param_space = config.to_dict(),
    run_config = air.RunConfig(
        local_dir = "cartpole_logs",
        stop = stop_criteria,
        verbose = 2
    )
)
experiment_results = tuner.fit()

0,1
Current time:,2023-06-09 17:24:17
Running for:,00:02:13.29
Memory:,9.4/15.4 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_d35b7_00000,TERMINATED,192.168.1.69:20713,35,121.493,140000,451.82,500,119,451.82


[2m[36m(PPO pid=20713)[0m 2023-06-09 17:22:07,288	INFO algorithm_config.py:2503 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(PPO pid=20713)[0m 2023-06-09 17:22:07,425	INFO algorithm.py:501 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,agent_timesteps_total,counters,custom_metrics,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,info,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_trained,num_env_steps_trained_this_iter,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,timers
PPO_CartPole-v1_d35b7_00000,4000,"{'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}",{},22.3333,{},60,22.3333,8,177,"{'learner': {'default_policy': {'learner_stats': {'cur_kl_coeff': 0.20000000298023224, 'cur_lr': 4.999999873689376e-05, 'total_loss': 8.919505, 'policy_loss': -0.04005314, 'vf_loss': 8.954202, 'vf_explained_var': 0.0077485726, 'kl': 0.026782028, 'entropy': 0.66744447, 'entropy_coeff': 0.0, 'model': {}}, 'custom_metrics': {}, 'num_agent_steps_trained': 128.0, 'num_grad_updates_lifetime': 465.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 4000, 'num_env_steps_trained': 4000, 'num_agent_steps_sampled': 4000, 'num_agent_steps_trained': 4000}",4000,4000,4000,4000,4000,4000,0,2,0,0,4000,"{'cpu_util_percent': 46.03333333333333, 'ram_util_percent': 61.21666666666667}",{},{},{},"{'mean_raw_obs_processing_ms': 0.2436084438090651, 'mean_inference_ms': 0.49566054389481057, 'mean_action_processing_ms': 0.05542348405026465, 'mean_env_wait_ms': 0.04498101053013429, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 60.0, 'episode_reward_min': 8.0, 'episode_reward_mean': 22.333333333333332, 'episode_len_mean': 22.333333333333332, 'episode_media': {}, 'episodes_this_iter': 177, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [24.0, 21.0, 51.0, 21.0, 30.0, 10.0, 17.0, 15.0, 27.0, 22.0, 27.0, 15.0, 34.0, 43.0, 9.0, 45.0, 12.0, 17.0, 20.0, 12.0, 31.0, 19.0, 28.0, 26.0, 26.0, 33.0, 30.0, 12.0, 50.0, 14.0, 10.0, 17.0, 45.0, 30.0, 38.0, 21.0, 25.0, 8.0, 15.0, 23.0, 25.0, 23.0, 13.0, 11.0, 16.0, 14.0, 25.0, 17.0, 11.0, 12.0, 18.0, 19.0, 17.0, 15.0, 16.0, 14.0, 41.0, 23.0, 15.0, 46.0, 26.0, 20.0, 21.0, 19.0, 26.0, 17.0, 17.0, 16.0, 20.0, 15.0, 17.0, 23.0, 20.0, 18.0, 25.0, 17.0, 29.0, 17.0, 24.0, 11.0, 13.0, 21.0, 15.0, 10.0, 21.0, 23.0, 22.0, 19.0, 10.0, 17.0, 13.0, 21.0, 32.0, 37.0, 22.0, 24.0, 14.0, 35.0, 9.0, 22.0, 9.0, 15.0, 28.0, 16.0, 55.0, 11.0, 12.0, 33.0, 31.0, 60.0, 27.0, 12.0, 33.0, 10.0, 32.0, 11.0, 18.0, 10.0, 13.0, 15.0, 23.0, 44.0, 36.0, 15.0, 13.0, 30.0, 41.0, 36.0, 23.0, 24.0, 15.0, 12.0, 12.0, 18.0, 10.0, 27.0, 15.0, 20.0, 17.0, 55.0, 13.0, 17.0, 29.0, 25.0, 30.0, 18.0, 14.0, 10.0, 24.0, 14.0, 25.0, 16.0, 19.0, 12.0, 14.0, 18.0, 54.0, 46.0, 12.0, 17.0, 17.0, 30.0, 31.0, 24.0, 26.0, 17.0, 18.0, 35.0, 12.0, 16.0, 30.0, 13.0, 33.0, 21.0, 25.0, 25.0, 54.0], 'episode_lengths': [24, 21, 51, 21, 30, 10, 17, 15, 27, 22, 27, 15, 34, 43, 9, 45, 12, 17, 20, 12, 31, 19, 28, 26, 26, 33, 30, 12, 50, 14, 10, 17, 45, 30, 38, 21, 25, 8, 15, 23, 25, 23, 13, 11, 16, 14, 25, 17, 11, 12, 18, 19, 17, 15, 16, 14, 41, 23, 15, 46, 26, 20, 21, 19, 26, 17, 17, 16, 20, 15, 17, 23, 20, 18, 25, 17, 29, 17, 24, 11, 13, 21, 15, 10, 21, 23, 22, 19, 10, 17, 13, 21, 32, 37, 22, 24, 14, 35, 9, 22, 9, 15, 28, 16, 55, 11, 12, 33, 31, 60, 27, 12, 33, 10, 32, 11, 18, 10, 13, 15, 23, 44, 36, 15, 13, 30, 41, 36, 23, 24, 15, 12, 12, 18, 10, 27, 15, 20, 17, 55, 13, 17, 29, 25, 30, 18, 14, 10, 24, 14, 25, 16, 19, 12, 14, 18, 54, 46, 12, 17, 17, 30, 31, 24, 26, 17, 18, 35, 12, 16, 30, 13, 33, 21, 25, 25, 54]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 0.2436084438090651, 'mean_inference_ms': 0.49566054389481057, 'mean_action_processing_ms': 0.05542348405026465, 'mean_env_wait_ms': 0.04498101053013429, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0}","{'training_iteration_time_ms': 3631.338, 'load_time_ms': 0.215, 'load_throughput': 18641351.111, 'learn_time_ms': 1904.863, 'learn_throughput': 2099.889, 'synch_weights_time_ms': 3.206}"


2023-06-09 17:24:17,895	INFO tune.py:762 -- Total run time: 133.77 seconds (133.28 seconds for the tuning loop).


In [28]:
# examine the Ray Tune experiment results, get the appropriate learning rate
df = experiment_results.get_dataframe()

temp_columns = [
    "experiment_id", "config/lr", "config/gamma", "episode_reward_mean",
    "episode_len_mean", "timesteps_total", "training_iteration", 
    "done", "time_total_s", 
    "timers/training_iteration_time_ms", 
    "config/num_workers", "config/evaluation_num_workers",
    "config/num_envs_per_worker"
]
temp = df.loc[:, temp_columns].head()
temp.rename(columns={'config/evaluation_config/num_workers':'num_train_workers'}, inplace=True)
temp.rename(columns={'config/num_envs_per_worker':'num_envs_per_eval_worker'}, inplace=True)
temp.rename(columns={'config/evaluation_num_workers':'evaluation_num_workers'}, inplace=True)
temp.rename(columns={'config/num_workers':'num_rollout_workers'}, inplace=True)
display(temp)

Unnamed: 0,experiment_id,config/lr,config/gamma,episode_reward_mean,episode_len_mean,timesteps_total,training_iteration,done,time_total_s,timers/training_iteration_time_ms,num_rollout_workers,evaluation_num_workers,num_envs_per_eval_worker
0,6665276e1ebb45938385226891a89d1f,5e-05,0.99,451.82,451.82,140000,35,True,121.492769,3339.97,2,1,1


In [29]:
# To start fresh, restart Ray in case it is already running
if ray.is_initialized():
    ray.shutdown()

In [3]:
# configure a new agent based on the tuning results obtained above
config2 = (
    PPOConfig()
    .environment(env = "CartPole-v1")
    .rollouts(num_rollout_workers=2)
    .evaluation(evaluation_interval=15, evaluation_duration=5, evaluation_num_workers=1)
    .training(lr=0.00005)
)
algo = config2.build()

2023-06-11 10:10:22,424	INFO worker.py:1538 -- Started a local Ray instance.


In [3]:
# train the new agent using RLlib.train() in a loop
num_iterations = 10
rewards = []
checkpoint_dir = "saved_runs/ppo/"

for i in range(num_iterations):
    result = algo.train()
    rewards.append(result["episode_reward_mean"])

    # save a checkpoint and evaluate the policy at the end of the training
    if (i==num_iterations-1):
        checkpoint_file = algo.save(checkpoint_dir)
        eval_result = algo.evaluate()

'''print(eval_result)
# convert num_iterations to num_episodes
num_episodes = len(eval_result["hist_stats"]["episode_lengths"]) * num_iterations
# convert num_iterations to num_timesteps
num_timesteps = sum(result["hist_stats"]["episode_lengths"] * num_iterations)
# calculate number of wins
num_wins = np.sum(result["hist_stats"]["episode_reward"])

print(f"PPO won {num_wins} times over {num_episodes} episodes ({num_timesteps} timesteps)")
print(f"Approx {num_wins/num_episodes:.2f} wins per episode")'''

'print(eval_result)\n# convert num_iterations to num_episodes\nnum_episodes = len(eval_result["hist_stats"]["episode_lengths"]) * num_iterations\n# convert num_iterations to num_timesteps\nnum_timesteps = sum(result["hist_stats"]["episode_lengths"] * num_iterations)\n# calculate number of wins\nnum_wins = np.sum(result["hist_stats"]["episode_reward"])\n\nprint(f"PPO won {num_wins} times over {num_episodes} episodes ({num_timesteps} timesteps)")\nprint(f"Approx {num_wins/num_episodes:.2f} wins per episode")'

In [None]:
# check tensorboard!

In [4]:
# reload the policy from checkponit and run inference
checkpoint = "./saved_runs/ppo/checkpoint_000010"
#new_config = PPOConfig()
algo = config2.build()
algo.restore(checkpoint)

2023-06-11 10:10:40,483	INFO trainable.py:790 -- Restored on 192.168.1.69 from checkpoint: saved_runs/ppo/checkpoint_000010
2023-06-11 10:10:40,484	INFO trainable.py:799 -- Current state after restoring: {'_iteration': 10, '_timesteps_total': None, '_time_total': 30.973118782043457, '_episodes_total': 372}


In [7]:
# play and render the game
num_episodes = 50
total_reward = 0

for ep in range(num_episodes):
    obs = env.reset()
    done = False
    while True:
        action = algo.compute_single_action(observation=obs, explore=False)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        
        if done:
            break
        
        #print(f"episode: {ep}")
        #print(f"obs: {new_obs}, reward: {total_reward}, done: {done}")
        env.render()
env.close()


