Merge branch 'aistairc:main' into main

aramoto99 · Jan 29, 2024 · 2b91899 · 2b91899
2 parents 361d042 + 353a4e0
commit 2b91899
Show file tree

Hide file tree

Showing 110 changed files with 1,186 additions and 1,150 deletions.
diff --git a/aiaccel/__init__.py b/aiaccel/__init__.py
@@ -11,7 +11,6 @@
     tensorboard,
     util,
     workspace,
-    wrapper_tools,
 )
 from aiaccel.util.aiaccel import Run
 
@@ -21,7 +20,6 @@
     "module",
     "parameter",
     "workspace",
-    "wrapper_tools",
     "abci",
     "cli",
     "optimizer",

diff --git a/aiaccel/abci/__init__.py b/aiaccel/abci/__init__.py
@@ -1,9 +1,7 @@
-from aiaccel.abci.batch import create_abci_batch_file
 from aiaccel.abci.qstat import parse_job_list, parse_qstat
 from aiaccel.abci.qsub import create_qsub_command
 
 __all__ = [
-    "create_abci_batch_file",
     "parse_job_list",
     "parse_qstat",
     "create_qsub_command",

diff --git a/aiaccel/abci/batch.py b/aiaccel/abci/batch.py
diff --git a/aiaccel/abci/qsub.py b/aiaccel/abci/qsub.py
@@ -5,7 +5,7 @@
 from omegaconf.dictconfig import DictConfig
 from omegaconf.listconfig import ListConfig
 
-from aiaccel.common import dict_output
+from aiaccel.common import dict_stderr, dict_stdout
 
 """ Example of stat
 stat = {
@@ -36,7 +36,16 @@ def create_qsub_command(config: DictConfig, runner_file: Path) -> list[str]:
     path = Path(config.generic.workspace).resolve()
     job_execution_options = config.ABCI.job_execution_options
 
-    command = ["qsub", "-g", f"{config.ABCI.group}", "-j", "y", "-o", f"{path / dict_output}", str(runner_file)]
+    command = [
+        "qsub",
+        "-g",
+        f"{config.ABCI.group}",
+        "-o",
+        f"{path / dict_stdout}",
+        "-e",
+        f"{path / dict_stderr}",
+        str(runner_file),
+    ]
 
     #
     # additional option

diff --git a/aiaccel/cli/csv_writer.py b/aiaccel/cli/csv_writer.py
@@ -32,7 +32,7 @@ class CsvWriter:
     def __init__(self, config: DictConfig):
         self.config = config
         self.workspace = Workspace(self.config.generic.workspace)
-        self.fp = self.workspace.retults_csv_file
+        self.fp = self.workspace.result_csv_file
         self.trialid = TrialId(self.config)
         self.storage = Storage(self.workspace.storage_file_path)
         self.lock_file = {"result_txt": str(self.workspace.lock / "result_txt")}
@@ -49,7 +49,14 @@ def _get_zero_padding_trial_id(self, trial_id: int) -> str:
         return self.trialid.zero_padding_any_trial_id(trial_id)
 
     def create(self) -> None:
-        """Creates repoprt."""
+        """Creates report.
+
+        Args:
+            None
+
+        Returns:
+            None
+        """
         data = []
         header = []
 

diff --git a/aiaccel/cli/plot.py b/aiaccel/cli/plot.py
@@ -26,7 +26,6 @@ def __init__(self, config: DictConfig):
         self.workspace = Workspace(config.generic.workspace)
         self.storage = Storage(self.workspace.storage_file_path)
         self.goals = [item.value for item in config.optimize.goal]
-
         self.cplt = EasyVisualizer()
 
     def plot(self) -> None:

diff --git a/aiaccel/cli/set_result.py b/aiaccel/cli/set_result.py
@@ -15,7 +15,6 @@ def write_results_to_database(
     storage_file_path: str | Path,
     trial_id: int,
     objective: list[str | float | int] | None,
-    error: str,
     returncode: int | None,
     start_time: str | None = None,
     end_time: str | None = None,
@@ -27,8 +26,6 @@ def write_results_to_database(
     storage.result.set_any_trial_objective(trial_id, objective)
     if returncode is not None:
         storage.returncode.set_any_trial_returncode(trial_id, returncode)
-    if error != "":
-        storage.error.set_any_trial_error(trial_id, error)
     if start_time is not None:
         storage.timestamp.set_any_trial_start_time(trial_id, start_time)
     if end_time is not None:
@@ -42,7 +39,6 @@ def main() -> None:
     parser.add_argument("--storage_file_path", type=str, required=True)
     parser.add_argument("--trial_id", type=int, required=True)
     parser.add_argument("--objective", nargs="+", type=str_or_float_or_int, default=None)
-    parser.add_argument("--error", type=str, default="")
     parser.add_argument("--returncode", type=int, default=None)
 
     args = parser.parse_known_args()[0]
@@ -61,33 +57,17 @@ def main() -> None:
         "trial_id",
         "config",
         "objective",
-        "error",
         "returncode",
     ]
 
     for key in delete_keys:
         if key in xs.keys():
             del xs[key]
 
-    contents = {
-        "trial_id": args.trial_id,
-        "result": args.objective,
-        "parameters": xs,
-        "returncode": args.returncode,
-        "error": args.error,
-    }
-
-    if args.error == "":
-        del contents["error"]
-
-    # print(contents)
-
-    # create_yaml(args.file, contents)
     write_results_to_database(
         storage_file_path=args.storage_file_path,
         trial_id=args.trial_id,
         objective=args.objective,
-        error=args.error,
         returncode=args.returncode,
     )
 

diff --git a/aiaccel/cli/start.py b/aiaccel/cli/start.py
@@ -109,6 +109,10 @@ def main() -> None:  # pragma: no cover
             if not manager.is_error_free():
                 break
             if int((time.time() - time_s)) % 10 == 0:
+                returncodes = storage.returncode.get_all_trial_returncode()
+                if any(item != 0 for item in returncodes):
+                    logger.error("Some trials are failed.")
+                    break
                 num_ready, num_running, num_finished = storage.get_num_running_ready_finished()
                 available_pool_size = manager.get_available_pool_size(num_ready, num_running, num_finished)
                 now = datetime.now()

diff --git a/aiaccel/cli/view.py b/aiaccel/cli/view.py
@@ -18,7 +18,6 @@ class Viewer:
         config (Config): Config object.
 
     Attributes:
-        config_path (Path): Path to the config file.
         workspace (Workspace): Workspace object.
         storage (Storage): Storage object.
     """
@@ -45,10 +44,11 @@ def view(self) -> None:
             job = self.storage.jobstate.get_any_trial_jobstate(trial_id)
             result = self.storage.result.get_any_trial_objective(trial_id)
 
-            error = str(self.storage.error.get_any_trial_error(trial_id))
-            if error == "None":
-                error = ""
-            else:
+            error_file = self.workspace.get_error_output_file(trial_id)
+            error = ""
+            if error_file.exists():
+                with open(error_file, "r") as f:
+                    error = f.read()
                 if len(error) > 4:
                     error = error[0:3] + "..."
 

diff --git a/aiaccel/common.py b/aiaccel/common.py
@@ -2,50 +2,24 @@
 
 Example: ::
 
-    from aiaccel.common import alive_optimizer
+    from aiaccel.common import dict_lock
 
 """
 
-alive_optimizer = "optimizer.yml"
-alive_manager = "manager.yml"
-
-class_optimizer = "Optimizer"
-class_manager = "Manager"
-
-dict_work = "work_aiaccel"
-dict_alive = "alive"
-dict_pid = "pid"
-dict_ready = "ready"
-dict_running = "running"
-dict_finished = "finished"
-dict_hp = "hp"
-dict_hp_ready = "hp/ready"
-dict_hp_running = "hp/running"
-dict_hp_finished = "hp/finished"
-dict_srialize = "serialize"
 dict_lock = "lock"
 dict_log = "log"
 dict_error = "error"
-dict_output = "abci_output"
-dict_jobstate = "jobstate"
-dict_result = "result"
+dict_stdout = "abci_stdout"
+dict_stderr = "abci_stderr"
 dict_runner = "runner"
-dict_timestamp = "timestamp"
-dict_storage = "storage"
 dict_tensorboard = "tensorboard"
 dict_mpi = "mpi"
 dict_rank_log = "rank_log"
 
-extension_hp = "hp"
-extension_pickle = "pickle"
-extension_resource = "res"
-extension_result = "result"
-
-file_configspace = "configspace"
-file_final_result = "final_result.result"
-file_hyperparameter = "hyperparameter.json"
-file_numpy_random = "numpy_random"
-file_numpy_random_extension = "npy"
+file_final_result = "final_result.yaml"
+file_best_result = "best_result.yaml"
+file_result_csv = "result.csv"
+file_storage = "storage.db"
 
 file_hp_count = "count.txt"
 file_hp_count_lock = "count.lock"
@@ -57,11 +31,6 @@
 goal_maximize = "maximize"
 goal_minimize = "minimize"
 
-key_module_type = "module_type"
-key_path = "path"
-key_pid = "pid"
-key_project_name = "project_name"
-
 resource_type_local = "local"
 resource_type_abci = "abci"
 resource_type_mpi = "mpi"