In [7]:
import argparse

import ray
from ray import tune
from ray.rllib.agents import ppo
from ray.rllib.agents import ddpg
from ray.rllib.agents import a3c



from ray.tune import grid_search
from my_env import ContentCaching
import pickle
import time
import numpy as np
import os
import pandas as pd




def ret_lst(cpt):
    string1 =  'data/listfile_evol'+str(cpt)+'.data' #_evol'+ , _pos'+
    with open(string1, 'rb') as filehandle:
    # read the data as binary data stream
        lst = pickle.load(filehandle)
    return lst

def ret_nei(cpt):
    string2 = 'data/nei_tab_pos'+str(cpt)+'.data'
    with open(string2, 'rb') as filehandle:
        # read the data as binary data stream
        nei_tab = pickle.load(filehandle)
    return nei_tab

class customExperimentClass():

    def __init__(self,ttl_var, cpt, variable, stop_iters=2, stop_timesteps=990000000, stop_reward=0.00001):#

        self.env = ContentCaching#gym.make("ContentCaching-v0")
        self.config_train = {
                        "env": ContentCaching,
                        "env_config": {
                        "ttl_var": ttl_var,
                        "variable": variable,#[8,8,8,4],
                        "nei_tab": ret_nei(cpt),
                        "lst_tab": ret_lst(cpt),
                        },
                        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),

                        "model": {
                            # By default, the MODEL_DEFAULTS dict above will be used.

                            # Change individual keys in that dict by overriding them, e.g.
                            "fcnet_hiddens": grid_search( [[64, 64, 64]]),
                            "fcnet_activation": grid_search(["relu"]),
                            "vf_share_layers": False,#True,
                        },

                        "lr": grid_search([1e-2]),  # try different lrs
                        "num_workers": 0,  # parallelism
                        #"framework": "torch" if args.torch else "tf",
        }

        
        self.config_test = {
                        "env": ContentCaching,
                        "env_config": {
                        "ttl_var": ttl_var,
                        "variable": variable,
                        "nei_tab": ret_nei(5),
                        "lst_tab": ret_lst(5),                        

                        },
                        "model": {
                            # By default, the MODEL_DEFAULTS dict above will be used.

                            # Change individual keys in that dict by overriding them, e.g.
                            "fcnet_hiddens": [64, 64, 64],
                            "fcnet_activation": "sigmoid",
                            "vf_share_layers": False,#True,
                        },

                        "lr": [1e-2],  # try different lrs
                        #"num_workers": 2,  # parallelism
                        #"framework": "torch" if args.torch else "tf",
        }
        self.save_dir = "~/ray_results"
        self.stop_criteria = {
                    "training_iteration": stop_iters,#args.stop_iters,
                    "timesteps_total": stop_timesteps,#args.c,
                    "episode_reward_mean": stop_reward#args.stop_reward,
                    }
    
    def train(self, algo):
        """
        Train an RLlib IMPALA agent using tune until any of the configured stopping criteria is met.
            See https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run
        :return: Return the path to the saved agent (checkpoint) and tune's ExperimentAnalysis object
            See https://docs.ray.io/en/latest/tune/api_docs/analysis.html#experimentanalysis-tune-experimentanalysis
        """
        if algo == "ppo":
            analysis = ray.tune.run(ppo.PPOTrainer, config=self.config_train, local_dir=self.save_dir, stop=self.stop_criteria,
                               checkpoint_at_end=True)
        if algo == "ddpg":
            analysis = ray.tune.run(ddpg.DDPGTrainer, config=self.config_train, local_dir=self.save_dir, stop=self.stop_criteria,
                                checkpoint_at_end=True)
        if algo == "a3c":
            analysis = ray.tune.run(a3c.A3CTrainer, config=self.config_train, local_dir=self.save_dir, stop=self.stop_criteria,
                                checkpoint_at_end=True)
        if algo == "td3":
            analysis = ray.tune.run(ddpg.TD3Trainer, config=self.config_train, local_dir=self.save_dir, stop=self.stop_criteria,
                                checkpoint_at_end=True)
        if algo == "appo":
            analysis = ray.tune.run(ppo.APPOTrainer, config=self.config_train, local_dir=self.save_dir, stop=self.stop_criteria,
                                checkpoint_at_end=True)

        lr = analysis.get_best_config(metric='episode_reward_mean', mode="max")["lr"] 
        fc_hid = analysis.get_best_config(metric='episode_reward_mean', mode="max")["model"]["fcnet_hiddens"] 
        fc_act = analysis.get_best_config(metric='episode_reward_mean', mode="max")["model"]["fcnet_activation"] 

        # list of lists: one list per checkpoint; each checkpoint list contains 1st the path, 2nd the metric value
        checkpoints = analysis.get_trial_checkpoints_paths(trial=analysis.get_best_trial('episode_reward_mean', mode = 'max'),
                                                           metric='episode_reward_mean')
        # retriev the checkpoint path; we only have a single checkpoint, so take the first one

        df = analysis.results_df

        all_dataframes = analysis.trial_dataframes
        print("all_dataframes ===== : ", all_dataframes)
        print("type all_dataframes ===== : ", type(all_dataframes))

        print("--------------------------------------")
        trials = analysis.trials
        print("trials ===== : ", trials)
        print("trials[0] ===== : ", trials[0])
        print("len trials ===== : ", len(trials))

        print("type trials ===== : ", type(trials))
        #df = analysis.dataframe(metric="episode_reward_mean", mode="max")
        #print("df ===== : ", type(df))

        dfs = analysis.trial_dataframes

        # Plot by epoch
        ax = None  # This plots everything on the same plot
        for d in dfs.values():
            ax = d.episode_reward_mean.plot(ax=ax, legend=False)
        plot.show()


        #dff = pd.DataFrame(df).set_index('Index')
        #print("dff ===== : ", type(dff))


        #print("dff ===== : ", dff["trial_id"])
        #print("dff ===== : ", dff["hist_stats/episode_reward"])

        checkpoint_path = checkpoints[0][0]
        print("Checkpoint path:", checkpoint_path)
        return checkpoint_path, analysis, lr, fc_hid, fc_act

    def load(self, path):
        """
        Load a trained RLlib agent from the specified path. Call this before testing a trained agent.
        :param path: Path pointing to the agent's saved checkpoint (only used for RLlib agents)
        """
        self.agent = ppo.PPOTrainer(config=self.config)
        self.agent.restore(path)

    def test(self,algo, path, lr, fc_hid, fc_act):

        """Test trained agent for a single episode. Return the episode reward"""
        # instantiate env class
        unused_shared = []
        unused_own = []
        unsatisfied_shared = []
        unsatisfied_own = []

        episode_reward = 0
        self.config_test["num_workers"] = 0
        self.config_test["lr"] = lr
        self.config_test['model']["fcnet_hiddens"] = fc_hid
        self.config_test['model']["fcnet_activation"] = fc_act

        if algo == "ppo":
            self.agent = ppo.PPOTrainer(config=self.config_test)
        if algo == "ddpg":
            self.agent = ddpg.DDPGTrainer(config=self.config_test)
        if algo == "a3c":
            self.agent = a3c.A3CTrainer(config=self.config_test)
        if algo == "td3":
            self.agent = ddpg.TD3Trainer(config=self.config_test)
        if algo == "appo":
            self.agent = ppo.APPOTrainer(config=self.config_test)

        self.agent.restore(path)
        env = self.agent.workers.local_worker().env

     
        obs = env.reset()
        done = False

        while not done:
            action = self.agent.compute_action(obs)
            obs, reward, done, info = env.step(action)
            episode_reward += reward

            unused_shared.append(info["unused_shared"])
            unused_own.append(info["unused_own"])
            unsatisfied_shared.append(info["unsatisfied_shared"])
            unsatisfied_own.append(info["unsatisfied_own"])

        return episode_reward, unused_shared, unused_own, unsatisfied_shared, unsatisfied_own



if __name__ == "__main__":


    parser = argparse.ArgumentParser()

    parser.add_argument("--stop-iters", type=int, default= 2)#50)
    parser.add_argument("--stop-timesteps", type=int, default=90000000)
    parser.add_argument("--stop-reward", type=float, default=0.001)
    parser.add_argument("--ttl_var", type=float, default=3)
    parser.add_argument("--cpt", type=float, default=1)
    parser.add_argument("--algo", type=str, default="ppo")   


    ray.shutdown()
    ray.init(num_cpus=3)#num_cpus=2, num_gpus=0)

    args = parser.parse_args()
    # Class instance
    exper = customExperimentClass(args.ttl_var, args.cpt, [8,8,8,4], args.stop_iters) # ttl_var, cpt, variable

    # Train and save for 2 iterations
    checkpoint_path, results, lr, fc_hid, fc_act = exper.train(args.algo)
    
    print("------------------------------------------------------------------------------------")
    print("------------------------------------------------------------------------------------")
    print("------------------------------------------------------------------------------------")
    
    # Load saved
    #exper.load(checkpoint_path)
    # Test loaded
    """
    reward, unused_shared ,unused_own, unsatisfied_shared, unsatisfied_own  = exper.test(args.algo,checkpoint_path, lr, fc_hid, fc_act)
   
    print(" info[unused_shared] = ", unused_shared )
    print(" info[unused_own] = ", unused_own )
    print(" info[unsatisfied_shared] = ", unsatisfied_shared )
    print(" info[unsatisfied_own] = ", unsatisfied_own )
    print(" reward = ", reward )
    """
    
    

"""

 

    config=dict(
        extra_config,
        **{
            "env": "BreakoutNoFrameskip-v4"
            if args.use_vision_network else "CartPole-v0",
            # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
            "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
            "callbacks": {
                "on_train_result": check_has_custom_metric,
            },
            "model": {
                "custom_model": "keras_q_model"
                if args.run == "DQN" else "keras_model"
            },
            "framework": "tf",
        })

"""

2021-05-18 16:46:14,708	INFO services.py:1269 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
usage: ipykernel_launcher.py [-h] [--stop-iters STOP_ITERS]
                             [--stop-timesteps STOP_TIMESTEPS]
                             [--stop-reward STOP_REWARD] [--ttl_var TTL_VAR]
                             [--cpt CPT] [--algo ALGO]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/exo-info/.local/share/jupyter/runtime/kernel-19576497-46ae-41ec-b128-c6aa204285b7.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [5]:
!python customclass.py

Instructions for updating:
non-resource variables are not supported in the long term
2021-05-18 16:42:58,951	INFO services.py:1269 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8266[39m[22m
== Status ==
Memory usage on this node: 14.2/31.1 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/3 CPUs, 0/0 GPUs, 0.0/15.64 GiB heap, 0.0/7.82 GiB objects
Result logdir: /home/exo-info/ray_results/PPO_2021-05-18_16-43-00
Number of trials: 1/1 (1 RUNNING)
+--------------------------------+----------+-------+------+--------------------------+-----------------------+
| Trial name                     | status   | loc   |   lr | model/fcnet_activation   | model/fcnet_hiddens   |
|--------------------------------+----------+-------+------+--------------------------+-----------------------|
| PPO_ContentCaching_b99f2_00000 | RUNNING  |       | 0.01 | relu                     | [64, 64, 64]          |
+--------------------------------+----------+-------+------+--------------------

== Status ==
Memory usage on this node: 14.4/31.1 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/3 CPUs, 0/0 GPUs, 0.0/15.64 GiB heap, 0.0/7.82 GiB objects
Result logdir: /home/exo-info/ray_results/PPO_2021-05-18_16-43-00
Number of trials: 1/1 (1 RUNNING)
+--------------------------------+----------+--------------------+------+--------------------------+-----------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------+
| Trial name                     | status   | loc                |   lr | model/fcnet_activation   | model/fcnet_hiddens   |   iter |   total time (s) |   ts |   reward |   episode_reward_max |   episode_reward_min |   episode_len_mean |
|--------------------------------+----------+--------------------+------+--------------------------+-----------------------+--------+------------------+------+----------+----------------------+----------------------+--------------------|
| PPO_ContentCa

In [6]:
dfs = analysis.trial_dataframe

NameError: name 'analysis' is not defined