In [38]:
from gymnasium import Env
import gymnasium
from gymnasium.spaces import Box, Dict, Discrete, MultiDiscrete, Tuple
from ray.rllib.utils.spaces.repeated import Repeated
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

In [32]:
class Architecture:
    """Toy model of data items stored on a small set of networked devices.

    Devices are named ``'c<i>'`` (``total_c`` of them, capacity ``c_capacity``)
    and ``'s<i>'`` (``total_s`` of them, capacity ``s_capacity``).  Four data
    types exist (``'a1'``, ``'a2'``, ``'a3'``, ``'mc'``); every stored item has
    a type, a device it is allocated to and an age counted in heart beats.

    Bookkeeping is kept in two redundant forms that must stay in sync:

    * parallel lists, one entry per stored item:
      ``data_types``, ``data_allocation`` (device index),
      ``data_allocation_dict`` (device name) and ``data_times`` (age);
    * count tables: ``devices[device][data_type]`` and
      ``data[data_type][device]``.

    ``latencies[u][v]`` holds the weighted shortest-path length between two
    devices, filled in by :meth:`visualization`.
    """

    def __init__(self):
        # Name of the last device that free_space() found with <= 20 % free
        # space; the integer 0 means "no device flagged".
        self.space_flag = 0
        self.c_capacity, self.s_capacity = 1000, 100                     # Capacity of the devices
        self.weights = [0.0125, 0.0010, 0.0200, 0.1000]                  # a1, a2, a3, mc
        self.lifetime = {'a1': 10000, 'a2': 20000, 'a3': 10000, 'mc': 1000}  # Max lifetime of the different data types
        self.total_c = 2
        self.total_s = 6
        self.max_actions = 20                                            # Length of the action history kept by generate()

        self.devices = {}
        self.data = {
            'a1': {},
            'a2': {},
            'a3': {},
            'mc': {},
        }
        self.latencies = {}

        # 'c' devices are registered first, then 's' devices; visualization()
        # relies on this ordering when it slices the key list.
        device_names = ['c' + str(i) for i in range(self.total_c)]
        device_names += ['s' + str(i) for i in range(self.total_s)]
        for name in device_names:
            for data_type in self.data.keys():
                self.data[data_type][name] = 0
            self.devices[name] = {'a1': 0, 'a2': 0, 'a3': 0, 'mc': 0}
            self.latencies[name] = {}

        # Pre-fill the full latency matrix so every row has the same key order
        # as self.devices (compute_total_latency zips on that order).
        for src in self.latencies.keys():
            for dst in self.latencies.keys():
                self.latencies[src][dst] = 0

        self.data_types = []
        self.data_allocation = []
        self.data_times = []

        self.data_allocation_dict = []

        # History of 'ai_*' actions: data type, device name and device index.
        self.action_type = []
        self.action_device_dev = []
        self.action_device = []

        # Randomly partition the devices into one cluster per data type; the
        # last cluster ('mc') receives whatever is left after three equal parts.
        randomized = list(self.devices.keys())
        random.shuffle(randomized)

        bound = round(len(self.devices)/4)

        self.clusters = {
            'a1': randomized[:bound],
            'a2': randomized[bound:bound*2],
            'a3': randomized[bound*2:bound*3],
            'mc': randomized[bound*3:]
        }

        self.visualization()

    def to_int(self, device):
        """Return the position of device name ``device`` in ``self.devices``."""
        return list(self.devices.keys()).index(device)

    def to_int_type(self, type):
        """Return the position of data type ``type`` in ``self.data``."""
        return list(self.data.keys()).index(type)

    def to_device(self, position):
        """Return the device name stored at index ``position`` of ``self.devices``."""
        return list(self.devices.keys())[position]

    def construct_dictionaries(self):
        """Rebuild both count tables from ``data_allocation_dict`` and ``data_types``."""
        for data_type in self.data:
            for device in self.data[data_type]:
                self.data[data_type][device] = 0
        for device in self.devices:
            for data_type in self.devices[device]:
                self.devices[device][data_type] = 0
        for device, d_type in zip(self.data_allocation_dict, self.data_types):
            self.data[d_type][device] += 1
            self.devices[device][d_type] += 1

    def update(self, data_type, device):
        """Store one new item of ``data_type`` on ``device`` with age 0."""
        self.data_types.append(data_type)
        self.data_allocation_dict.append(device)
        self.data_allocation.append(self.to_int(device))
        self.data_times.append(0)
        self.devices[device][data_type] += 1
        self.data[data_type][device] += 1

    def heart_beat(self):
        """Age every stored item by one tick and drop those reaching their lifetime."""
        expired = []
        for idx in range(len(self.data_times)):
            self.data_times[idx] += 1
            if self.data_times[idx] == self.lifetime[self.data_types[idx]]:
                expired.append(idx)

        # Delete from the back so the remaining indices stay valid.
        for idx in reversed(expired):
            device = self.data_allocation_dict[idx]
            data_type = self.data_types[idx]
            self.devices[device][data_type] -= 1
            self.data[data_type][device] -= 1
            self.data_types.pop(idx)
            self.data_allocation_dict.pop(idx)
            self.data_allocation.pop(idx)
            self.data_times.pop(idx)

    def free_space(self):
        """Return ``{device: free fraction}`` and flag devices that are nearly full.

        The used amount of a device is the sum of its per-type item counts
        multiplied by ``self.weights`` (both in a1, a2, a3, mc order).  Any
        device whose free fraction is <= 0.2 is written to ``self.space_flag``;
        if several qualify, the last one in device order wins.
        """
        load = {}
        for name, counts in self.devices.items():
            capacity = self.c_capacity if name.startswith('c') else self.s_capacity
            used = sum([count * weight for count, weight in zip(list(counts.values()), self.weights)])
            load[name] = (capacity - used) / capacity
            if load[name] <= 0.2:
                self.space_flag = name
        return load

    def visualization(self, draw=False):
        """Build the device graph and fill ``self.latencies`` from it.

        Consecutive 'c' devices are linked with weight 10 and the last one is
        linked back to the first.  The 's' devices are split evenly among the
        'c' devices with links of weight 4; the last 'c' device also takes the
        remainder.  ``latencies[u][v]`` becomes the weighted shortest-path
        length from ``u`` to ``v``.

        Args:
            draw: when true, also plot the graph with matplotlib.  Defaults to
                False, which only computes the latencies.
        """
        keys = list(self.devices.keys())
        cs = keys[:self.total_c]
        ss = keys[self.total_c:]

        graph = nx.Graph()

        index = 0
        for i in range(len(cs)):
            if i != len(cs) - 1:
                graph.add_edge(cs[i], cs[i+1], weight=10)
                attached = len(ss) // self.total_c
            else:
                graph.add_edge(cs[i], cs[0], weight=10)
                attached = len(ss) // self.total_c + len(ss) % self.total_c
            for _ in range(attached):
                graph.add_edge(cs[i], ss[index], weight=4)
                index += 1

        if draw:
            # Colours and sizes are derived from the node name so they always
            # line up with graph.nodes, whatever the insertion order was.
            colors = ['lightblue' if node.startswith('c') else 'orange' for node in graph.nodes]
            sizes = [1000 if node.startswith('c') else 100 for node in graph.nodes]
            pos = nx.spring_layout(graph)  # Position nodes using a spring layout algorithm
            nx.draw(graph, pos, with_labels=True, node_size=sizes, node_color=colors,
                    font_color='black', edge_color='gray')
            nx.draw_networkx_edge_labels(graph, pos, edge_labels=nx.get_edge_attributes(graph, 'weight'))
            plt.title("Architecture:")
            plt.axis('off')
            plt.show()

        for u in graph.nodes:
            shortest_paths = nx.shortest_path_length(graph, source=u, weight='weight')
            for v, weight in shortest_paths.items():
                self.latencies[u][v] = weight

    def compute_total_latency(self):
        """Return the summed latency of every action in the action history.

        For each recorded action (data type, issuing device) the cost is the
        sum over all devices of (items of that type on the device) x (latency
        from the issuing device to that device).
        """
        total_latency = 0
        for action_type, action_device in zip(self.action_type, self.action_device_dev):
            counts = self.data[action_type].values()
            distances = self.latencies[action_device].values()
            total_latency += sum([count * distance for count, distance in zip(counts, distances)])
        return total_latency

    def generate(self, operation):
        """Apply one workload operation.

        * ``'a1'``, ``'a2'``, ``'a3'``: store one item of that type on every device.
        * ``'mc'``: store one ``'mc'`` item on every 's' device.
        * ``'ai_<type>'``: record an action of ``<type>`` issued from a random
          device of that type's cluster, keeping at most ``max_actions``
          entries in the history (oldest dropped first).

        Any other operation string is ignored.
        """
        if operation in ('a1', 'a2', 'a3'):
            for device in self.devices.keys():
                self.update(operation, device)
        elif operation == 'mc':
            for device in self.devices.keys():
                if device.startswith('s'):
                    self.update('mc', device)
        elif operation.split('_')[0] == 'ai':
            data_type = operation.split('_')[1]
            device = random.sample(list(self.clusters[data_type]), 1)[0]
            self.action_device_dev.append(device)
            self.action_device.append(self.to_int(device))
            self.action_type.append(data_type)
            overflow = len(self.action_device_dev) - self.max_actions
            if overflow > 0:
                del self.action_device_dev[:overflow]
                del self.action_device[:overflow]
                del self.action_type[:overflow]

    def rl_update(self, allocation):
        """Replace the whole allocation with ``allocation`` (device indices).

        Returns the free-space fractions produced by :meth:`free_space`.
        """
        allocation_dict = [self.to_device(position) for position in allocation]
        self.data_allocation = allocation
        self.data_allocation_dict = allocation_dict
        self.construct_dictionaries()
        return self.free_space()

    def greedy_algorithm(self): # baseline?
        """Re-allocate every item to a random device of its data type's cluster.

        If a device was flagged as nearly full (``space_flag`` != 0), the first
        cluster containing it is first widened with every 'c' device not
        already in it, and the flag is cleared.

        Returns the free-space fractions produced by :meth:`free_space`.
        """
        if self.space_flag != 0:
            for members in self.clusters.values():
                if self.space_flag in members:
                    # Skip 'c' devices that are already members so repeated
                    # overflows do not keep growing the list with duplicates.
                    extra = [name for name in self.devices
                             if name[0] == 'c' and name not in members]
                    members.extend(extra)
                    break
            self.space_flag = 0

        # Draw ONE device per item and derive the index from it, so the name
        # list and the index list always describe the same placement.
        allocation_dev = [random.sample(list(self.clusters[data_type]), 1)[0]
                          for data_type in self.data_types]
        self.data_allocation = [self.to_int(device) for device in allocation_dev]
        self.data_allocation_dict = allocation_dev
        self.construct_dictionaries()
        return self.free_space()

In [28]:
class SimulatedArchitecture(Env):
    """Prototype environment wrapped around an ``Architecture`` instance.

    It uses the old gym calling convention (``reset`` returns only the
    observation, ``step`` returns a 4-tuple).  Every episode ends after one
    step with a constant reward of 1, and ``reset`` swaps the action space for
    one with a 10-dimensional Box.
    """

    def __init__(self):
        self.architecture = Architecture()
        self.executions = ['a1','a2','mc','a1','ai_a2','mc','a3','ai_a1','mc','a1','ai_a3','mc','a2','a3','ai_a2','mc','ai_a1','a1','mc','ai_a3','ai_mc']

        self.num_devices = len(self.architecture.devices)

        # Observation: the four data-type weights, one free-space fraction per
        # device, and a variable-length list (up to 5) of values in 0..3.
        weights_box = Box(low=0, high=1, shape=(4,), dtype=np.float32)
        free_space_box = Box(low=0, high=1, shape=(self.num_devices,), dtype=np.float32)
        allocation_list = Repeated(Discrete(4), 5)
        self.observation_space = Dict({
            "weights": weights_box,
            "free_space": free_space_box,
            "allocation": allocation_list,
        })

        # Initial action: a 2-dimensional Box plus a single-valued Discrete.
        initial_box = Box(low=0, high=1, shape=(2,), dtype=np.float32)
        self.action_space = Tuple((initial_box, Discrete(1)))

        # Both arrays are computed once here and reused for every observation.
        self.weights = np.array(self.architecture.weights, dtype=np.float32)
        free_fractions = self.architecture.free_space()
        self.free_space = np.array(list(free_fractions.values()), dtype=np.float32)

    def get_obs(self):
        """Return the cached weights/free space plus a randomly sampled allocation."""
        sampled_allocation = self.observation_space["allocation"].sample()
        return {
            "weights": self.weights,
            "free_space": self.free_space,
            "allocation": sampled_allocation,
        }

    def step(self, action):
        """Print ``action`` and return ``(observation, 1, True, {})``."""
        print(action)
        self.length = random.randint(0,5)
        observation = self.get_obs()
        return observation, 1, True, {}

    def render(self):
        """No rendering is implemented."""
        pass

    def reset(self):
        """Install the 10-dimensional action space and return an observation."""
        ten_dim_box = Box(low=0, high=1, shape=(10,), dtype=np.float32)
        self.action_space = Tuple((ten_dim_box, Discrete(1)))
        return self.get_obs()

    def close(self):
        """Nothing to release."""
        pass

In [29]:
# Check how the action space changes across reset(): sample once from the
# space created in __init__, call reset() (which installs a new action space),
# then sample again.
simulation = SimulatedArchitecture()
# for i in range(5):
#     print(simulation.observation_space.sample())
#     simulation.reset()
#     simulation.step(simulation.action_space.sample())
print(simulation.action_space.sample())
simulation.reset()
print(simulation.action_space.sample())

(array([0.24810933, 0.23443152], dtype=float32), 0)
(array([0.6265255 , 0.01530288, 0.6290112 , 0.6904504 , 0.2261503 ,
       0.39107797, 0.1684374 , 0.09549104, 0.1496204 , 0.70483094],
      dtype=float32), 0)


In [30]:
from ray.tune.registry import register_env
from gymnasium.wrappers import EnvCompatibility
from ray.rllib.algorithms.ppo import PPOConfig
from gymnasium.wrappers import TimeLimit
import ray

def env_creator(env_config):
    """Build a SimulatedArchitecture env capped at 10 steps per episode.

    ``env_config`` is accepted because the registry passes it, but it is not used.
    """
    legacy_env = EnvCompatibility(SimulatedArchitecture())
    return TimeLimit(legacy_env, max_episode_steps=10)


# Make the environment available to RLlib under the name "ACES".
register_env("ACES", env_creator)

# PPO on the registered "ACES" environment: torch backend, one rollout worker
# with one environment, CPU only, entropy coefficient 0.02.
config = (
    PPOConfig()
    .environment("ACES",disable_env_checking=True)
    .framework("torch")
    .rollouts(num_rollout_workers=1,
              num_envs_per_worker=1) # Max = 60
    .resources(num_gpus=0,
               num_cpus_per_worker=1)
    .training(entropy_coeff=0.02)
)

algorithm = config.build()

# Three training iterations; the per-iteration results are discarded.
for i in tqdm(range(3)):
    algorithm.train()

  0%|                                                                                                                           | 0/3 [00:00<?, ?it/s]

[2m[36m(RolloutWorker pid=1148102)[0m (array([0.12151152, 0.34698713], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([1., 1.], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.3172355, 1.       ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.5633605, 0.       ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.04510936, 0.7328065 ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([1., 1.], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.47753993, 0.24093029], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.9665103, 0.7734945], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.57291776, 0.0629358 ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.30777374, 0.10024071], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.80575436, 0.        ], dtype=float32), 0)
[2m[36m(Rollo

 33%|██████████████████████████████████████▎                                                                            | 1/3 [00:19<00:38, 19.33s/it]

[2m[36m(RolloutWorker pid=1148102)[0m (array([0.11579102, 0.36770535], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.4511677, 0.0169808], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.72097313, 0.5624326 ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([1., 1.], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.69723594, 0.79343   ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.43414587, 1.        ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.96890414, 0.6995577 ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.6917289 , 0.97540045], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0., 0.], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.08194268, 0.78716743], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.75193584, 0.        ], dtype=float32), 0)
[2m[36m(R

 67%|████████████████████████████████████████████████████████████████████████████▋                                      | 2/3 [00:37<00:18, 18.82s/it]

[2m[36m(RolloutWorker pid=1148102)[0m (array([0.7297218, 0.       ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.14146566, 0.5351989 ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.1837281, 1.       ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([1.       , 0.7809636], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.57053244, 0.56515217], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.50698066, 0.        ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.11243939, 0.6705893 ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.14011765, 0.8519641 ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.46969748, 1.        ], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.21012694, 0.36511844], dtype=float32), 0)
[2m[36m(RolloutWorker pid=1148102)[0m (array([0.5816479 , 0.14901173], dt

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:56<00:00, 18.77s/it]


In [30]:
class OfflineTesting(Env):
    """One-step parity task used to test the training pipelines.

    The observation is a vector of five integers in {0, 1}.  Action 0 is
    rewarded when the observation sum is even, any other action when it is
    odd.  The class follows the old gym calling convention (``reset`` returns
    only the observation, ``step`` returns a 4-tuple).
    """

    def __init__(self):
        self.observation_space = Box(low=0, high=1, shape=(5,), dtype=np.int32)
        self.action_space = Discrete(2)
        self.observation = []

    def step(self, action):
        """Score ``action`` against the parity of the current observation.

        Returns ``(observation, reward, True, {})``: the observation is
        unchanged and the episode always ends after this single step.
        """
        sum_is_even = bool(np.sum(self.observation) % 2 == 0)
        chose_zero = bool(action == 0)
        reward = 1 if sum_is_even == chose_zero else 0
        return self.observation, reward, True, {}

    def render(self):
        """No rendering is implemented."""
        pass

    def reset(self):
        """Sample a fresh observation, store it and return it."""
        self.observation = self.observation_space.sample()
        return self.observation

    def close(self):
        """Nothing to release."""
        pass

In [5]:
# Stand-alone instance of the parity environment; later cells sample from its
# observation and action spaces.  The last expression displays one sampled action.
testEnv = OfflineTesting()
testEnv.action_space.sample()

0

In [217]:
from ray.tune.registry import register_env
from gymnasium.wrappers import EnvCompatibility
from ray.rllib.algorithms.ppo import PPOConfig
from gymnasium.wrappers import TimeLimit
import ray

def env_creator(env_config):
    """Build an OfflineTesting env capped at 10 steps per episode.

    ``env_config`` is accepted because the registry passes it, but it is not used.
    """
    legacy_env = EnvCompatibility(OfflineTesting())
    return TimeLimit(legacy_env, max_episode_steps=10)


# Make the environment available to RLlib under the name "OfflineTest".
register_env("OfflineTest", env_creator)

# PPO on "OfflineTest": 10 rollout workers with 5 environments each, CPU only,
# entropy coefficient 0.02, separate value network, two hidden layers of 32.
config = (
    PPOConfig()
    .environment("OfflineTest",disable_env_checking=True)
    # .framework("torch")
    .rollouts(num_rollout_workers=10,
              num_envs_per_worker=5) # Max = 60
    .resources(num_gpus=0,
               num_cpus_per_worker=1)
    .training(entropy_coeff=0.02)
)
config['model']['vf_share_layers'] = False
config["model"]["fcnet_hiddens"] =[32,32]

algorithm = config.build()

# result = []
# for i in tqdm(range(10)):
#     result.append(algorithm.train()["episode_reward_mean"])
#     print(result)

# Checkpoint round trip: copy the weights of `algorithm` into a second
# instance, save it, then rebuild from the same config and restore the
# checkpoint before training.
algorithm2 = config.build()
algorithm2.set_weights(algorithm.get_weights())
checkpoint_path = algorithm2.save()
algorithm2 = config.build()
algorithm2.restore(checkpoint_path)

# Train the restored instance for 10 iterations, printing the growing list of
# mean episode rewards after each one.
result2 =[]
for i in tqdm(range(10)):
    result2.append(algorithm2.train()["episode_reward_mean"])
    print(result2)

2023-06-01 14:46:47,008	INFO trainable.py:791 -- Restored on 138.4.11.227 from checkpoint: /home/rcalzada/ray_results/PPO_OfflineTest_2023-06-01_14-46-3185tkliq_/checkpoint_000000
2023-06-01 14:46:47,010	INFO trainable.py:800 -- Current state after restoring: {'_iteration': 0, '_timesteps_total': None, '_time_total': 0.0, '_episodes_total': None}
 10%|███████████▍                                                                                                      | 1/10 [00:04<00:44,  4.90s/it]

[0.51]


 20%|██████████████████████▊                                                                                           | 2/10 [00:09<00:37,  4.69s/it]

[0.51, 0.5055]


 20%|██████████████████████▊                                                                                           | 2/10 [00:14<00:58,  7.28s/it]


KeyboardInterrupt: 

In [200]:
# Show the mean-episode-reward history collected in `result2`.
# NOTE(review): the saved output below lists ten values although the saved run
# of the training cell was interrupted after two -- presumably it comes from an
# earlier execution; confirm by re-running the cells in order.
print(result2)

[0.612, 0.63825, 0.6605, 0.7185, 0.75225, 0.76725, 0.825, 0.85075, 0.8525, 0.87125]


In [135]:
import os
import json

# Empty column container; this cell never fills it.  It is kept so that other
# cells referring to the name keep working.
offline_test_dataset = {
    "observations": [],
    "actions": [],
    "rewards": [],
    "next_observations": []
}

file_path = "/tmp/offline_test.json"
# os.remove(file_path)   # uncomment to start again from an empty file

# Append five random one-step transitions of the parity task to `file_path`,
# one JSON document per line.  Without the newline the documents run together
# on a single line, which cannot be parsed back.
# NOTE(review): the column names used here ("observations", "next_observations",
# ...) may differ from what RLlib's JSON reader expects -- confirm before
# passing this file to `offline_data(input_=...)`.
with open(file_path, "a") as file:
    for i in range(5):
        observation = testEnv.observation_space.sample()
        action = testEnv.action_space.sample()
        # Same rule as OfflineTesting.step: action 0 is correct for an even
        # observation sum, any other action for an odd one.
        sum_is_even = bool(np.sum(observation) % 2 == 0)
        chose_zero = bool(action == 0)
        reward = 1 if sum_is_even == chose_zero else 0
        line = {
            "observations": [observation.tolist()],
            # int() because a NumPy integer is not JSON serialisable.
            "actions": [int(action)],
            "rewards": [reward],
            "dones": [True],
            "infos": [{}],
            # The environment returns the observation unchanged after a step.
            "next_observations": [observation.tolist()],
        }
        json.dump(line, file)
        file.write("\n")

In [15]:
# Scratch check of append mode: each run of this cell adds another "hola" to
# the file, so the printed content grows with every execution.
with open("/tmp/invent", "a") as file:
    file.write("hola")
with open("/tmp/invent", "r") as file:
    a=file.read()
    print(a)

holaholahola


In [54]:
from ray.rllib.algorithms.bc import BCConfig
import ray
from ray.tune.registry import register_env
from gymnasium.wrappers import EnvCompatibility
from gymnasium.wrappers import TimeLimit
from ray.rllib.utils.test_utils import check_train_results

# Stop any Ray instance left over from earlier cells before configuring a new run.
ray.shutdown()


def env_creator(env_config):
    """Build an OfflineTesting env capped at 10 steps per episode.

    ``env_config`` is accepted because the registry passes it, but it is not used.
    """
    legacy_env = EnvCompatibility(OfflineTesting())
    return TimeLimit(legacy_env, max_episode_steps=10)


# Make the environment available to RLlib under the name "OfflineTest".
register_env("OfflineTest", env_creator)
# Directory holding the recorded experience used as offline training input.
path = "/tmp/test-output10"

# Behaviour cloning from the offline data in `path`.  Every 5 training
# iterations one evaluation worker runs 5 units of evaluation (episodes
# presumably) in the live environment (input_="sampler"), in parallel with training.
config = (
    BCConfig()
    .environment(env="OfflineTest",disable_env_checking=True)
    .offline_data(input_=path)
    .training(
        train_batch_size=2,
        # twin_q=False,
    )
    .evaluation(
        evaluation_interval=5,
        evaluation_num_workers=1,
        evaluation_duration=5,
        evaluation_parallel_to_training=True,
        evaluation_config=BCConfig.overrides(input_="sampler"),
    )
    .rollouts(num_rollout_workers=1)
)
config['model']['vf_share_layers'] = False
config["model"]["fcnet_hiddens"] = [64,64]

algorithm_bc = config.build()
# 200 training iterations; the mean evaluation reward is printed only on
# iterations whose result contains an "evaluation" entry.
for i in tqdm(range(200)):
    result = algorithm_bc.train()
    check_train_results(result)
    # print(result)
    eval_results = result.get("evaluation")
    if eval_results:
        print("iter={} R={}".format(i, eval_results["episode_reward_mean"]))

2023-06-01 16:54:00,009	INFO worker.py:1553 -- Started a local Ray instance.
2023-06-01 16:54:14,196	INFO trainable.py:172 -- Trainable.setup took 16.449 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
  2%|██▊                                                                                                              | 5/200 [00:02<01:52,  1.74it/s]

iter=4 R=0.4


  5%|█████▌                                                                                                          | 10/200 [00:05<01:49,  1.74it/s]

iter=9 R=0.6


  8%|████████▍                                                                                                       | 15/200 [00:08<01:50,  1.67it/s]

iter=14 R=0.8


 10%|███████████▏                                                                                                    | 20/200 [00:11<01:47,  1.68it/s]

iter=19 R=0.0


 12%|██████████████                                                                                                  | 25/200 [00:14<01:46,  1.65it/s]

iter=24 R=0.6


 15%|████████████████▊                                                                                               | 30/200 [00:17<01:43,  1.64it/s]

iter=29 R=0.6


 18%|███████████████████▌                                                                                            | 35/200 [00:20<01:39,  1.65it/s]

iter=34 R=0.4


 20%|██████████████████████▍                                                                                         | 40/200 [00:23<01:36,  1.65it/s]

iter=39 R=0.6


 22%|█████████████████████████▏                                                                                      | 45/200 [00:26<01:33,  1.65it/s]

iter=44 R=0.4


 25%|████████████████████████████                                                                                    | 50/200 [00:29<01:30,  1.66it/s]

iter=49 R=1.0


 28%|██████████████████████████████▊                                                                                 | 55/200 [00:32<01:28,  1.64it/s]

iter=54 R=0.6


 30%|█████████████████████████████████▌                                                                              | 60/200 [00:35<01:24,  1.66it/s]

iter=59 R=0.0


 32%|████████████████████████████████████▍                                                                           | 65/200 [00:38<01:20,  1.67it/s]

iter=64 R=0.6


 35%|███████████████████████████████████████▏                                                                        | 70/200 [00:41<01:17,  1.67it/s]

iter=69 R=0.4


 38%|██████████████████████████████████████████                                                                      | 75/200 [00:44<01:14,  1.67it/s]

iter=74 R=0.6


 40%|████████████████████████████████████████████▊                                                                   | 80/200 [00:47<01:12,  1.65it/s]

iter=79 R=0.6


 42%|███████████████████████████████████████████████▌                                                                | 85/200 [00:50<01:09,  1.66it/s]

iter=84 R=0.6


 45%|██████████████████████████████████████████████████▍                                                             | 90/200 [00:53<01:06,  1.65it/s]

iter=89 R=0.4


 48%|█████████████████████████████████████████████████████▏                                                          | 95/200 [00:56<01:03,  1.65it/s]

iter=94 R=0.4


 50%|███████████████████████████████████████████████████████▌                                                       | 100/200 [00:59<01:00,  1.64it/s]

iter=99 R=0.2


 52%|██████████████████████████████████████████████████████████▎                                                    | 105/200 [01:02<00:57,  1.66it/s]

iter=104 R=0.6


 55%|█████████████████████████████████████████████████████████████                                                  | 110/200 [01:06<01:06,  1.35it/s]

iter=109 R=0.4


 57%|███████████████████████████████████████████████████████████████▊                                               | 115/200 [01:09<00:53,  1.58it/s]

iter=114 R=0.4


 60%|██████████████████████████████████████████████████████████████████▌                                            | 120/200 [01:12<00:49,  1.62it/s]

iter=119 R=0.8


 62%|█████████████████████████████████████████████████████████████████████▍                                         | 125/200 [01:15<00:46,  1.62it/s]

iter=124 R=0.4


 65%|████████████████████████████████████████████████████████████████████████▏                                      | 130/200 [01:18<00:42,  1.63it/s]

iter=129 R=0.8


 68%|██████████████████████████████████████████████████████████████████████████▉                                    | 135/200 [01:21<00:39,  1.63it/s]

iter=134 R=0.2


 70%|█████████████████████████████████████████████████████████████████████████████▋                                 | 140/200 [01:24<00:36,  1.62it/s]

iter=139 R=0.4


 72%|████████████████████████████████████████████████████████████████████████████████▍                              | 145/200 [01:27<00:33,  1.64it/s]

iter=144 R=0.4


 75%|███████████████████████████████████████████████████████████████████████████████████▎                           | 150/200 [01:30<00:30,  1.63it/s]

iter=149 R=0.2


 78%|██████████████████████████████████████████████████████████████████████████████████████                         | 155/200 [01:33<00:27,  1.61it/s]

iter=154 R=0.4


 80%|████████████████████████████████████████████████████████████████████████████████████████▊                      | 160/200 [01:37<00:24,  1.61it/s]

iter=159 R=0.6


 82%|███████████████████████████████████████████████████████████████████████████████████████████▌                   | 165/200 [01:40<00:21,  1.64it/s]

iter=164 R=0.4


 85%|██████████████████████████████████████████████████████████████████████████████████████████████▎                | 170/200 [01:43<00:18,  1.63it/s]

iter=169 R=0.4


 88%|█████████████████████████████████████████████████████████████████████████████████████████████████▏             | 175/200 [01:46<00:15,  1.64it/s]

iter=174 R=0.2


 90%|███████████████████████████████████████████████████████████████████████████████████████████████████▉           | 180/200 [01:49<00:12,  1.64it/s]

iter=179 R=0.4


 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 185/200 [01:52<00:09,  1.62it/s]

iter=184 R=0.0


 95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 190/200 [01:55<00:06,  1.65it/s]

iter=189 R=0.6


 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 195/200 [01:58<00:03,  1.64it/s]

iter=194 R=0.6


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [02:01<00:00,  1.65it/s]

iter=199 R=0.4





In [59]:
from ray.tune.registry import register_env
from gymnasium.wrappers import EnvCompatibility
from ray.rllib.algorithms.ppo import PPOConfig
from gymnasium.wrappers import TimeLimit
import ray

# Stop any Ray instance left over from earlier cells before configuring a new run.
ray.shutdown()


def env_creator(env_config):
    """Build an OfflineTesting env capped at 10 steps per episode.

    ``env_config`` is accepted because the registry passes it, but it is not used.
    """
    legacy_env = EnvCompatibility(OfflineTesting())
    return TimeLimit(legacy_env, max_episode_steps=10)


# Make the environment available to RLlib under the name "OfflineTest".
register_env("OfflineTest", env_creator)

# PPO on "OfflineTest" with the same 64x64 network as the behaviour-cloning
# run, so the BC weights can be loaded into it.
config = (
    PPOConfig()
    .environment("OfflineTest",disable_env_checking=True)
    .rollouts(num_rollout_workers=10,
              num_envs_per_worker=5) # Max = 60
    .resources(num_gpus=0,
               num_cpus_per_worker=1)
    .training(entropy_coeff=0.02)
)
config['model']['vf_share_layers'] = False
config["model"]["fcnet_hiddens"] =[64,64]

# Warm start: copy the weights of `algorithm_bc` into a PPO instance, save a
# checkpoint, then rebuild from the same config and restore that checkpoint.
algorithm = config.build()
algorithm.set_weights(algorithm_bc.get_weights())
checkpoint_path = algorithm.save()
algorithm = config.build()
algorithm.restore(checkpoint_path)

# Train for 10 iterations, printing the growing list of mean episode rewards
# after each one.
result =[]
for i in tqdm(range(10)):
    result.append(algorithm.train()["episode_reward_mean"])
    print(result)

2023-06-01 16:59:43,058	INFO worker.py:1553 -- Started a local Ray instance.
2023-06-01 16:59:53,345	INFO trainable.py:172 -- Trainable.setup took 12.584 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
2023-06-01 17:00:00,754	INFO trainable.py:791 -- Restored on 138.4.11.227 from checkpoint: /home/rcalzada/ray_results/PPO_OfflineTest_2023-06-01_16-59-40n0_v02vw/checkpoint_000000
2023-06-01 17:00:00,757	INFO trainable.py:800 -- Current state after restoring: {'_iteration': 0, '_timesteps_total': None, '_time_total': 0.0, '_episodes_total': None}
 10%|███████████▍                                                                                                      | 1/10 [00:05<00:50,  5.62s/it]

[0.5385]


 20%|██████████████████████▊                                                                                           | 2/10 [00:10<00:40,  5.11s/it]

[0.5385, 0.608]


 30%|██████████████████████████████████▏                                                                               | 3/10 [00:15<00:34,  4.94s/it]

[0.5385, 0.608, 0.70975]


 40%|█████████████████████████████████████████████▌                                                                    | 4/10 [00:19<00:29,  4.87s/it]

[0.5385, 0.608, 0.70975, 0.77175]


 50%|█████████████████████████████████████████████████████████                                                         | 5/10 [00:24<00:24,  4.81s/it]

[0.5385, 0.608, 0.70975, 0.77175, 0.8265]


 60%|████████████████████████████████████████████████████████████████████▍                                             | 6/10 [00:29<00:19,  4.81s/it]

[0.5385, 0.608, 0.70975, 0.77175, 0.8265, 0.86475]


 70%|███████████████████████████████████████████████████████████████████████████████▊                                  | 7/10 [00:34<00:14,  4.98s/it]

[0.5385, 0.608, 0.70975, 0.77175, 0.8265, 0.86475, 0.8925]


 80%|███████████████████████████████████████████████████████████████████████████████████████████▏                      | 8/10 [00:39<00:09,  4.93s/it]

[0.5385, 0.608, 0.70975, 0.77175, 0.8265, 0.86475, 0.8925, 0.9085]


 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 9/10 [00:44<00:04,  4.85s/it]

[0.5385, 0.608, 0.70975, 0.77175, 0.8265, 0.86475, 0.8925, 0.9085, 0.93725]


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:48<00:00,  4.89s/it]

[0.5385, 0.608, 0.70975, 0.77175, 0.8265, 0.86475, 0.8925, 0.9085, 0.93725, 0.9475]





In [32]:
def env_creator(env_config):
    """Build an OfflineTesting env capped at 10 steps per episode.

    ``env_config`` is accepted because the registry passes it, but it is not used.
    """
    legacy_env = EnvCompatibility(OfflineTesting())
    return TimeLimit(legacy_env, max_episode_steps=10)


# Make the environment available to RLlib under the name "OfflineTest".
register_env("OfflineTest", env_creator)

# PPO on "OfflineTest" that also writes the experience it collects to
# /tmp/test-output20 through the offline-data output setting.
config = (
    PPOConfig()
    .environment("OfflineTest",disable_env_checking=True)
    # .framework("torch")
    .rollouts(num_rollout_workers=10,
              num_envs_per_worker=5) # Max = 60
    .resources(num_gpus=0,
               num_cpus_per_worker=1)
    # .offline_data(input_=offline_test_dataset)
    .offline_data(output="/tmp/test-output20")
    # .exploration(explore=False)
)
config["model"]["fcnet_hiddens"] =[64,64]

algorithm_output = config.build()

# 20 training iterations, printing the mean episode reward of each.
for i in range(20):
    result = algorithm_output.train()
    print(result["episode_reward_mean"])



0.4985
0.498
0.49875
0.53
0.56825
0.6375
0.727
0.79075
0.849
0.89375
0.9325
0.95575
0.9655
0.97375
0.9825
0.987
0.988
0.9885
0.99325
0.99575


In [73]:
# Inspect one of the experience files written by a training run.
file_path="/tmp/test-out/output-2023-06-01_11-19-13_worker-1_0.json"
PREVIEW_CHARS = 1000  # the file holds long encoded columns; show only its start

with open(file_path, "r") as file:
    contents = file.read()

# `contents` still holds the whole file; only the display is truncated.
print(contents[:PREVIEW_CHARS])
if len(contents) > PREVIEW_CHARS:
    print("... [{} more characters]".format(len(contents) - PREVIEW_CHARS))

{"type": "MultiAgentBatch", "count": 400, "policy_batches": {"default_policy": {"obs": "BCJNGGhAQUUAAAAAAAActBQAAGGABZU2RQABAPEajBJudW1weS5jb3JlLm51bWVyaWOUjAtfZnJvbWJ1ZmZlcpSTlCiWwEQuAAACABBABQBCAACAPwQADwIACSqgQCQADDQACBAAH0BYAAQAJAAMAgAAWAAAAgAAHAAABAAEAgAEDAAICAAILAAAAgAAHAAABAAEAgAADAAABAAAAgAMCAAPAgAJF4DcAAQ4AAACAAQMAAQIAAACAAgsAAACAAQcAAwIAAACAA8UAAEPAgAGBzQBAAIABDwADwIAAg+wAAQPAgABADQBBEwADwgAHQ8CAAoPsAAYABACD3wABQwCAA9gAQEEPAAAAgAIDAAPWAAaED8+AApAAA8CAAUPNAEBAAIAAHwAAAQABAIAAQwAD4wBGAAwAAAEAA8CABEEKAAECAAPAgAJCAgBAAIABDQABAgACAIAACwAAAIAABwAAAQABAIACAwABAIAABQAAAQADwIAEQ+4AQkNRAAPPAIYADwAAAQADwIAEQC4AQACAAQwAAgIAAgMAAACAA8IAQEEJAAMAgAPWAAJAAIAADgAAAQAAAIADAgADwIACQCEAAQwAAACAAAMAAAEAA8CAAEAGAAABAAPAgASB2ABDzQADQCEAAACAAAoAAAEAAQCAAgMAAQCAAgIAQQgAAQIAAACAAQMAAEIAA+EACwBRAAPAgAAD2gCAQACAAAwAAAEAAgCAAQQAAQIAA8CAAkIWAAIMAAEDAAECAAAAgAINAEAAgAAHAAABAAPAgABADQBBBwADwgABQACAAEcAAdMBAACAAEUAA8CAAQAsAAAAgAAJAAABAAAAgAMCAAEAgAIhAAAAgAMKAAIEAAADAAABAAPAgASD7AAHARYAAQIAAACAAAMAAAEAAgCAADc

In [145]:
# List the contents of /tmp/test-output.
!ls /tmp/test-output

output-2023-06-01_13-29-29_worker-1_0.json
