In [1]:
from itertools import islice

import compiler_gym
import ray
from compiler_gym.wrappers import (
    ConstrainedCommandline,
    TimeLimit,
    CycleOverBenchmarks,
)
from matplotlib import pyplot as plt
from ray import tune
from ray.rllib.algorithms.ppo import PPO
from sklearn.model_selection import train_test_split

# import wandb
from train import config
# from ray.rllib.env.wrappers.multi_agent_env_compatibility import MultiAgentEnvCompatibility

In [2]:
def make_env() -> compiler_gym.envs.CompilerEnv:
    """Build a CompilerGym environment from the settings in ``config``.

    The base environment is created from the configured environment id,
    observation space and reward space. It is then wrapped twice: first in
    ``ConstrainedCommandline`` with the configured flags, then in ``TimeLimit``
    with the configured episode length.

    Returns:
        The fully wrapped environment.
    """
    base_env = compiler_gym.make(
        config["compiler_gym_env"],
        observation_space=config["observation_space"],
        reward_space=config["reward_space"],
    )
    # Restrict the action space to the flags listed in the configuration.
    flag_limited_env = ConstrainedCommandline(base_env, flags=config["actions"])
    # Cap every episode at the configured number of steps.
    return TimeLimit(flag_limited_env, max_episode_steps=config["episode_length"])

In [3]:
def prepare_datasets(
    env: compiler_gym.envs.CompilerEnv,
    max_train_benchmarks: int = 10000,
    val_fraction: float = 0.15,
) -> tuple:
    """Collect the train, validation and test benchmark lists.

    Up to ``max_train_benchmarks`` benchmarks are taken from the dataset named
    by ``config["train_benchmarks"]`` and split into a training and a
    validation part with ``train_test_split``, seeded by
    ``config["random_state"]``. The test list contains every benchmark of the
    dataset named by ``config["test_benchmarks"]``.

    Args:
        env: Environment whose ``datasets`` mapping provides the benchmarks.
        max_train_benchmarks: Upper bound on the number of benchmarks drawn
            from the training dataset before splitting (default 10000).
        val_fraction: Forwarded to ``train_test_split`` as ``test_size``; with
            the default 0.15 the validation part is 15% of the drawn pool.

    Returns:
        A ``(train_benchmarks, val_benchmarks, test_benchmarks)`` tuple.
    """
    train_dataset = env.datasets[config["train_benchmarks"]]
    # islice stops after ``max_train_benchmarks`` items, so the remainder of
    # the dataset's benchmark iterator is never consumed.
    benchmark_pool = list(
        islice(train_dataset.benchmarks(), max_train_benchmarks)
    )
    train_benchmarks, val_benchmarks = train_test_split(
        benchmark_pool,
        test_size=val_fraction,
        random_state=config["random_state"],
    )
    test_benchmarks = list(env.datasets[config["test_benchmarks"]].benchmarks())
    return train_benchmarks, val_benchmarks, test_benchmarks

In [4]:
def make_training_env(*args) -> compiler_gym.envs.CompilerEnv:
    """Create an environment that cycles over the training benchmarks.

    Any positional arguments are accepted and ignored, so the function can be
    used as an environment factory regardless of what the caller passes in.

    Returns:
        A new ``make_env()`` environment wrapped in ``CycleOverBenchmarks``
        over the notebook-level ``train_benchmarks`` list.
    """
    del args  # Intentionally unused.
    base_env = make_env()
    return CycleOverBenchmarks(base_env, train_benchmarks)

In [5]:
def run_agent_on_benchmarks(benchmarks):
    """Run the notebook-level ``agent`` on each benchmark and collect rewards.

    A single environment from ``make_env()`` is reused for all benchmarks. For
    each benchmark the environment is reset to it and the agent acts until the
    environment reports the episode as done. A progress line with the
    environment state is printed after every episode.

    Args:
        benchmarks: Sized collection of benchmarks to evaluate.

    Returns:
        List with the ``env.episode_reward`` of each benchmark, in input order.
    """
    episode_rewards = []
    with make_env() as env:
        for i, benchmark in enumerate(benchmarks, start=1):
            observation = env.reset(benchmark=benchmark)
            finished = False
            while not finished:
                chosen_action = agent.compute_single_action(observation)
                # Only the next observation and the done flag are needed here.
                observation, _, finished, _ = env.step(chosen_action)
            episode_rewards.append(env.episode_reward)
            print(f"[{i}/{len(benchmarks)}] {env.state}")

    return episode_rewards

In [6]:
def plot_results(x, y, name, ax):
    """Draw a bar chart of rewards on the given axes.

    Args:
        x: Tick labels for the x-axis (expected to line up with ``y``).
        y: Bar heights, i.e. the rewards to plot.
        name: Dataset name inserted into the plot title.
        ax: Matplotlib axes to draw on; it becomes the current pyplot axes.
    """
    # Make ``ax`` current so the pyplot calls below target it.
    plt.sca(ax)
    bar_positions = range(len(y))
    plt.bar(bar_positions, y)
    plt.ylabel("Reward (higher is better)")
    # Label each tick with the corresponding entry of ``x``, rotated vertically.
    tick_positions = range(len(x))
    plt.xticks(tick_positions, x, rotation=90)
    plt.title(f"Performance on {name} set")

In [7]:
# Open a temporary environment only to read the benchmark lists from its
# datasets; the environment is released when the `with` block exits.
with make_env() as env:
    train_benchmarks, val_benchmarks, test_benchmarks = prepare_datasets(env)

In [8]:
# Tear down any Ray runtime that is already running in this process before
# starting a new one.
if ray.is_initialized():
    ray.shutdown()
ray.init(num_gpus=1, include_dashboard=True, ignore_reinit_error=True)

# Register the training-environment factory under the name "compiler_gym" so
# it can be referred to by that name in algorithm configs.
tune.register_env("compiler_gym", make_training_env)

2024-01-28 11:43:48,886	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
[2m[36m(PPO pid=5115)[0m 2024-01-28 11:43:57,566	INFO algorithm.py:501 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(PPO pid=5115)[0m 2024-01-28 11:52:12,942	ERROR actor_manager.py:486 -- Ray error, taking actor 1 out of service. [36mray::RolloutWorker.apply()[39m (pid=5171, ip=192.168.0.12, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7f5dc9b3b790>)
[2m[36m(PPO pid=5115)[0m   File "/home/flint/diplom/experiments/venv/lib/python3.9/site-packages/ray/rllib/utils/actor_manager.py", line 183, in apply
[2m[36m(PPO pid=5115)[0m     raise e
[2m[36m(PPO pid=5115)[0m   File "/home/flint/diplom/experiments/venv/lib/python3.9/site-packages/ray/rllib/utils/actor_manager.py", line 174, in apply
[2m[36m(PPO pid=5115)[0m     return func(self, *args, 

In [None]:
# Train PPO on the registered "compiler_gym" environment with Ray Tune. The
# returned analysis object is used later to look up the trial's checkpoint.
# NOTE(review): only `checkpoint_at_end` is requested; a trial that errors
# (as in the recorded output) may leave no checkpoint behind -- confirm.
analysis = tune.run(
    "PPO",
    checkpoint_at_end=True,
    stop={
        # Stopping criterion on the trial's reported `episodes_total` metric.
        "episodes_total": 100,
    },
    max_concurrent_trials=1,
    config={
        "seed": 0xCC,
        "num_workers": 1,
        # Name under which the environment factory was registered.
        "env": "compiler_gym",
        # Rollout fragments, train batches and SGD minibatches are all 5 steps.
        "rollout_fragment_length": 5,
        "train_batch_size": 5,
        "sgd_minibatch_size": 5,
        "framework": "torch",
        # NOTE(review): RLlib documents `num_gpus_per_worker` as a top-level
        # config key; nested under "resources" it may be ignored -- confirm.
        "resources": {
            "num_gpus_per_worker": 1,
        }
    },
)

0,1
Current time:,2024-01-28 12:23:10
Running for:,00:19:54.24
Memory:,5.0/7.7 GiB

Trial name,# failures,error file
PPO_compiler_gym_131a8_00000,1,/home/flint/ray_results/PPO/PPO_compiler_gym_131a8_00000_0_2024-01-28_12-03-16/error.txt

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_compiler_gym_131a8_00000,ERROR,192.168.0.12:8224,1900,1067.7,9500,0.835251,1.08969,0,100


Trial name,agent_timesteps_total,counters,custom_metrics,date,done,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,episodes_total,experiment_id,experiment_tag,hostname,info,iterations_since_restore,node_ip,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_trained,num_env_steps_trained_this_iter,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,pid,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,time_since_restore,time_this_iter_s,time_total_s,timers,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
PPO_compiler_gym_131a8_00000,9500,"{'num_env_steps_sampled': 9500, 'num_env_steps_trained': 9500, 'num_agent_steps_sampled': 9500, 'num_agent_steps_trained': 9500}",{},2024-01-28_12-23-10,False,100,{},1.08969,0.835251,0,1,95,abb3d5a5e5a54ee18f080774d8483156,0,debian,"{'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 0.035421038261544406, 'cur_kl_coeff': 0.0, 'cur_lr': 4.999999999999999e-05, 'total_loss': 5.830952828277438e-05, 'policy_loss': 0.0, 'vf_loss': 5.830952828277438e-05, 'vf_explained_var': -1.0, 'kl': 1.7128623013007437e-09, 'entropy': 0.00030612235617203016, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 5.0, 'num_grad_updates_lifetime': 56985.5, 'diff_num_grad_updates_vs_sampler_policy': 14.5}}, 'num_env_steps_sampled': 9500, 'num_env_steps_trained': 9500, 'num_agent_steps_sampled': 9500, 'num_agent_steps_trained': 9500}",1900,192.168.0.12,9500,9500,9500,5,9500,5,0,1,0,0,5,{},8224,{},{},{},"{'mean_raw_obs_processing_ms': 4.823258006712416, 'mean_inference_ms': 5.619667996760971, 'mean_action_processing_ms': 0.31677730080241623, 'mean_env_wait_ms': 2.7842324865180443, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 1.0896860986547083, 'episode_reward_min': 0.0, 'episode_reward_mean': 0.8352514876889265, 'episode_len_mean': 100.0, 'episode_media': {}, 'episodes_this_iter': 1, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [0.9436619718309858, 0.7889908256880734, 0.7417582417582419, 1.0, 0.9999999999999999, 0.0, 1.0, 0.9888888888888889, 0.9705882352941176, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0896860986547083, 0.994949494949495, 1.0681818181818186, 1.0, 1.0, 0.9999999999999999, 0.9333333333333333, 0.908256880733945, 0.9904761904761904, 1.0, 0.9642857142857143, 1.0, 1.0, 0.9928057553956834, 1.0, 0.9082568807339451, 0.8, 0.8571428571428571, 1.0, 1.0, 1.0, 0.0, 1.0, 0.875, 0.54, 1.0, 0.8571428571428571, 
1.0, 0.9259259259259258, 0.875, 0.7868852459016393, 0.9493670886075949, 1.0, 1.0, 1.0, 1.0, 1.0, 0.3181818181818182, 0.9720279720279721, 1.0, 1.0, 0.9257425742574258, 0.5102040816326531, 0.9047619047619048, 0.0, 0.8131868131868132, 0.4666666666666667, 0.8188976377952756, 1.0, 1.0, 1.0, 0.875, 0.875, 0.7272727272727273, 0.5666666666666667, 0.40625, 0.9476744186046512, 0.8916666666666667, 0.7794117647058824, 1.0, 0.8740740740740741, 0.6666666666666666, 1.0, 0.782608695652174, 0.9166666666666666, 0.2605633802816901, 0.9, 0.7419354838709677, 0.826530612244898, 0.5625, 0.8888888888888888, 0.796875, 0.8082191780821918, 1.0, 1.0, 0.6875, 0.0, 0.56, 1.0, 0.86, 0.6666666666666666], 'episode_lengths': [100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 4.823258006712416, 'mean_inference_ms': 5.619667996760971, 'mean_action_processing_ms': 0.31677730080241623, 'mean_env_wait_ms': 2.7842324865180443, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0}",1067.7,0.304872,1067.7,"{'training_iteration_time_ms': 229.875, 'load_time_ms': 0.238, 'load_throughput': 20973.617, 'learn_time_ms': 202.232, 'learn_throughput': 24.724, 'synch_weights_time_ms': 1.542}",1706433790,0,9500,1900,131a8_00000,10.53


2024-01-28 12:23:10,831	ERROR trial_runner.py:1088 -- Trial PPO_compiler_gym_131a8_00000: Error processing event.
ray.exceptions.RayTaskError(ValueError): [36mray::PPO.train()[39m (pid=8224, ip=192.168.0.12, repr=PPO)
  File "/home/flint/diplom/experiments/venv/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 367, in train
    raise skipped from exception_cause(skipped)
  File "/home/flint/diplom/experiments/venv/lib/python3.9/site-packages/ray/tune/trainable/trainable.py", line 364, in train
    result = self.step()
  File "/home/flint/diplom/experiments/venv/lib/python3.9/site-packages/ray/rllib/algorithms/algorithm.py", line 749, in step
    results, train_iter_ctx = self._run_one_training_iteration()
  File "/home/flint/diplom/experiments/venv/lib/python3.9/site-packages/ray/rllib/algorithms/algorithm.py", line 2623, in _run_one_training_iteration
    results = self.training_step()
  File "/home/flint/diplom/experiments/venv/lib/python3.9/site-packages/ray/rllib

TuneError: ('Trials did not complete', [PPO_compiler_gym_131a8_00000])

> [0;32m/home/flint/diplom/experiments/venv/lib/python3.9/site-packages/ray/tune/tune.py[0m(756)[0;36mrun[0;34m()[0m
[0;32m    754 [0;31m    [0;32mif[0m [0mincomplete_trials[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    755 [0;31m        [0;32mif[0m [0mraise_on_failed_trial[0m [0;32mand[0m [0;32mnot[0m [0mstate[0m[0;34m[[0m[0;34m"signal"[0m[0;34m][0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 756 [0;31m            [0;32mraise[0m [0mTuneError[0m[0;34m([0m[0;34m"Trials did not complete"[0m[0;34m,[0m [0mincomplete_trials[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    757 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    758 [0;31m            [0mlogger[0m[0;34m.[0m[0merror[0m[0;34m([0m[0;34m"Trials did not complete: %s"[0m[0;34m,[0m [0mincomplete_trials[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


In [14]:
%pdb

Automatic pdb calling has been turned ON


In [None]:
# Build a PPO agent on the registered "compiler_gym" environment for
# evaluation; seed and framework match the values used in the training run.
agent = PPO(
    env="compiler_gym",
    config={
        "num_workers": 1,
        "seed": 0xCC,
        # Exploration is switched off for this agent.
        "explore": False,
        "framework": "torch",
    },
)

In [None]:
# Pick the first trial's checkpoint with the highest mean episode reward.
checkpoint = analysis.get_best_checkpoint(
    metric="episode_reward_mean", mode="max", trial=analysis.trials[0]
)
# get_best_checkpoint returns None when the trial has no usable checkpoint
# (for example because it errored before one was written). Fail here with a
# clear message rather than letting a later restore(None) fail obscurely.
if checkpoint is None:
    raise RuntimeError(
        f"No checkpoint found for trial {analysis.trials[0]}; "
        "re-run training before restoring the agent."
    )

In [None]:
# Restore the evaluation agent from the checkpoint selected from the training run.
agent.restore(checkpoint)

In [None]:
# val_rewards = run_agent_on_benchmarks(val_benchmarks)
# test_rewards = run_agent_on_benchmarks(test_benchmarks)

In [None]:
# Run the agent on every test benchmark and keep the per-benchmark episode rewards.
test_rewards = run_agent_on_benchmarks(test_benchmarks)

In [None]:
# One row with two panels on a 13x3 inch figure. The left panel (ax1) is
# reserved for the validation results, which are currently disabled.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(13, 3))
# plot_results(val_benchmarks, val_rewards, "val", ax1)
plot_results(test_benchmarks, test_rewards, "test", ax2)
plt.show()