# TACC Ensemble Simulation

In [1]:
import logging
import os
from pathlib import Path

from taccjm import EnsembleTACCSimulation

In [2]:
import importlib

importlib.reload(EnsembleTACCSimulation)

<module 'taccjm.EnsembleTACCSimulation' from '/home/jovyan/work/repos/taccjm/src/taccjm/EnsembleTACCSimulation.py'>

In [3]:
tasks = [
    {
        "cores": 1,
        "workdir": f"runs/run_{n}",
        "pre": f"mkdir another-dir",
        "cdir": f"another-dir",
        "cmnd": f"touch foo_{n}.txt; sleep {n}; echo DONE > foo_{n}.txt",
        "post": f"cp another-dir/foo_{n}.txt ../../outputs/foo_{n}.txt",
    }
    for n in range(10)
]
tasks[0], tasks[-1]

({'cores': 1,
  'workdir': 'runs/run_0',
  'pre': 'mkdir another-dir',
  'cdir': 'another-dir',
  'cmnd': 'touch foo_0.txt; sleep 0; echo DONE > foo_0.txt',
  'post': 'cp another-dir/foo_0.txt ../../outputs/foo_0.txt'},
 {'cores': 1,
  'workdir': 'runs/run_9',
  'pre': 'mkdir another-dir',
  'cdir': 'another-dir',
  'cmnd': 'touch foo_9.txt; sleep 9; echo DONE > foo_9.txt',
  'post': 'cp another-dir/foo_9.txt ../../outputs/foo_9.txt'})

In [4]:
sim = EnsembleTACCSimulation.EnsembleTACCSimulation(
    system="ls6", name="test", log_config={"level": logging.INFO}
)

{"asctime": "2023-02-14 18:07:05,751", "name": "taccjm.TACCSimulation", "levelname": "INFO", "message": "Logger taccjm.TACCSimulation initialized", "config": {"output": "<ipykernel.iostream.OutStream object at 0x7f7695553850>", "fmt": "json", "level": 20}}
{"asctime": "2023-02-14 18:07:05,753", "name": "taccjm.TACCSimulation", "levelname": "INFO", "message": "Running from inherited class at /home/jovyan/work/repos/taccjm/src/taccjm/EnsembleTACCSimulation.py"}
{"asctime": "2023-02-14 18:07:05,754", "name": "taccjm.TACCClient", "levelname": "INFO", "message": "Logger taccjm.TACCClient initialized", "config": {"output": "<ipykernel.iostream.OutStream object at 0x7f7695553850>", "fmt": "json", "level": 20}}
{"asctime": "2023-02-14 18:07:05,755", "name": "taccjm.TACCClient", "levelname": "INFO", "message": "Not running on TACC. Starting SSH session."}
{"asctime": "2023-02-14 18:07:05,756", "name": "taccjm.TACCClient", "levelname": "INFO", "message": "Looking for ssh_connection taccjm-ls6"}


In [5]:
slurm_config = {"node_cound": 1, "processors_per_node": 5}
job_config = sim.run(
    args={"tasks": tasks},
    slurm_config=slurm_config,
    stage=True,
    python_setup=True,
    run=True,
    remora=True,
)

{"asctime": "2023-02-14 18:07:10,033", "name": "taccjm.TACCSimulation", "levelname": "INFO", "message": "Not in execution environment. Setting up job..."}
{"asctime": "2023-02-14 18:07:10,035", "name": "taccjm.TACCSimulation", "levelname": "INFO", "message": "Starting simulation set-up.", "inputs": {"args": {"tasks": [{"cores": 1, "workdir": "runs/run_0", "pre": "mkdir another-dir", "cdir": "another-dir", "cmnd": "touch foo_0.txt; sleep 0; echo DONE > foo_0.txt", "post": "cp another-dir/foo_0.txt ../../outputs/foo_0.txt"}, {"cores": 1, "workdir": "runs/run_1", "pre": "mkdir another-dir", "cdir": "another-dir", "cmnd": "touch foo_1.txt; sleep 1; echo DONE > foo_1.txt", "post": "cp another-dir/foo_1.txt ../../outputs/foo_1.txt"}, {"cores": 1, "workdir": "runs/run_2", "pre": "mkdir another-dir", "cdir": "another-dir", "cmnd": "touch foo_2.txt; sleep 2; echo DONE > foo_2.txt", "post": "cp another-dir/foo_2.txt ../../outputs/foo_2.txt"}, {"cores": 1, "workdir": "runs/run_3", "pre": "mkdir a

In [6]:
job_config

{'name': 'test',
 'job_id': 'test_20230214_180737h_snak84',
 'job_dir': '/scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_180737h_snak84',
 'slurm': {'allocation': 'ADCIRC',
  'node_count': 1,
  'processors_per_node': 5,
  'max_run_time': 0.2,
  'queue': 'development',
  'dependencies': [],
  'node_cound': 1},
 'args': {'tasks': [{'cores': 1,
    'workdir': 'runs/run_0',
    'pre': 'mkdir another-dir',
    'cdir': 'another-dir',
    'cmnd': 'touch foo_0.txt; sleep 0; echo DONE > foo_0.txt',
    'post': 'cp another-dir/foo_0.txt ../../outputs/foo_0.txt'},
   {'cores': 1,
    'workdir': 'runs/run_1',
    'pre': 'mkdir another-dir',
    'cdir': 'another-dir',
    'cmnd': 'touch foo_1.txt; sleep 1; echo DONE > foo_1.txt',
    'post': 'cp another-dir/foo_1.txt ../../outputs/foo_1.txt'},
   {'cores': 1,
    'workdir': 'runs/run_2',
    'pre': 'mkdir another-dir',
    'cdir': 'another-dir',
    'cmnd': 'touch foo_2.txt; sleep 2; echo DONE > foo_2.txt',
    'post': 'cp another-dir/foo_2.txt

In [11]:
sim.client.showq()



[]

In [12]:
sim.client.list_files("", job_id=job_config["job_id"], recurse=True)

[{'filename': 'runs',
  'st_atime': 1676419798,
  'st_gid': 800588,
  'st_mode': 16832,
  'st_mtime': 1676419798,
  'st_size': 10,
  'st_uid': 856065,
  'ls_str': 'drwx------   1 856065   800588         10 14 Feb 18:09 runs'},
 {'filename': 'job.json',
  'st_atime': 1676419668,
  'st_gid': 800588,
  'st_mode': 33152,
  'st_mtime': 1676419677,
  'st_size': 2697,
  'st_uid': 856065,
  'ls_str': '-rw-------   1 856065   800588       2697 14 Feb 18:07 job.json'},
 {'filename': 'outputs',
  'st_atime': 1676419798,
  'st_gid': 800588,
  'st_mode': 16832,
  'st_mtime': 1676419798,
  'st_size': 0,
  'st_uid': 856065,
  'ls_str': 'drwx------   1 856065   800588          0 14 Feb 18:09 outputs'},
 {'filename': 'submit_script.sh',
  'st_atime': 1676419663,
  'st_gid': 800588,
  'st_mode': 33152,
  'st_mtime': 1676419660,
  'st_size': 915,
  'st_uid': 856065,
  'ls_str': '-rw-------   1 856065   800588        915 14 Feb 18:07 submit_script.sh'},
 {'filename': 'test-log',
  'st_atime': 1676419798,


In [10]:
saved_config = sim.client.read("job.json", job_id=job_config["job_id"])
saved_config

{'name': 'test',
 'job_id': 'test_20230214_180737h_snak84',
 'job_dir': '/scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_180737h_snak84',
 'slurm': {'allocation': 'ADCIRC',
  'node_count': 1,
  'processors_per_node': 5,
  'max_run_time': 0.2,
  'queue': 'development',
  'dependencies': [],
  'node_cound': 1},
 'args': {'tasks': [{'cores': 1,
    'workdir': 'runs/run_0',
    'pre': 'mkdir another-dir',
    'cdir': 'another-dir',
    'cmnd': 'touch foo_0.txt; sleep 0; echo DONE > foo_0.txt',
    'post': 'cp another-dir/foo_0.txt ../../outputs/foo_0.txt'},
   {'cores': 1,
    'workdir': 'runs/run_1',
    'pre': 'mkdir another-dir',
    'cdir': 'another-dir',
    'cmnd': 'touch foo_1.txt; sleep 1; echo DONE > foo_1.txt',
    'post': 'cp another-dir/foo_1.txt ../../outputs/foo_1.txt'},
   {'cores': 1,
    'workdir': 'runs/run_2',
    'pre': 'mkdir another-dir',
    'cdir': 'another-dir',
    'cmnd': 'touch foo_2.txt; sleep 2; echo DONE > foo_2.txt',
    'post': 'cp another-dir/foo_2.txt

In [18]:
err_file = sim.client.read(
    "test_20230214_174009f1lxc8i0.e702406", job_id=job_config["job_id"]
)

In [19]:
print(err_file)

cat: '/proc/fs/lustre/mdc/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: /proc/sys/lnet/stats: No such file or directory
cat: '/proc/fs/lustre/mdc/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: /proc/sys/lnet/stats: No such file or directory
cat: '/proc/fs/lustre/mdc/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: /proc/sys/lnet/stats: No such file or directory
Traceback (most recent call last):
  File "/scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_174009f1lxc8i0/./test.py", line 127, in <module>
    simulation.run()
  File "/work/06307/clos21/ls6/mambaforge/envs/taccjm/lib/python3.11/s

In [15]:
sim.client.list_files("task-queue", job_id=job_config["job_id"], recurse=True)

[{'filename': 'tq_log',
  'st_atime': 1676419266,
  'st_gid': 800588,
  'st_mode': 33152,
  'st_mtime': 1676419266,
  'st_size': 13644,
  'st_uid': 856065,
  'ls_str': '-rw-------   1 856065   800588      13644 14 Feb 18:01 tq_log'}]

In [13]:
sim.client.download("", "data", job_id=job_config["job_id"])

'data'

## Local Test

In [4]:
# Move TACC System


def mock_tacc():
    system = "ls6"
    os.environ["HOSTNAME"] = f"login2.{system}.tacc.utexas.edu"
    os.environ["SCRATCH"] = str(Path.cwd() / "data/test-scratch")
    os.environ["HOME"] = str(Path.cwd() / "data/test-home")
    os.environ["WORK"] = str(Path.cwd() / "data/test-work")


def unmock_tacc():
    os.environ["HOSTNAME"] = ""
    os.environ["SCRATCH"] = ""
    os.environ["HOME"] = ""
    os.environ["WORK"] = ""

In [5]:
import importlib

importlib.reload(EnsembleTACCSimulation)

<module 'taccjm.EnsembleTACCSimulation' from '/home/jovyan/work/repos/taccjm/src/taccjm/EnsembleTACCSimulation.py'>

In [6]:
mock_tacc()
sim = EnsembleTACCSimulation.EnsembleTACCSimulation(
    system="ls6", name="test", log_config={"level": logging.INFO}
)
unmock_tacc()

{"asctime": "2023-02-14 13:18:27,294", "name": "taccjm.TACCSimulation", "levelname": "INFO", "message": "Logger taccjm.TACCSimulation initialized", "config": {"output": "<ipykernel.iostream.OutStream object at 0x7fd23221b850>", "fmt": "json", "level": 20}}
{"asctime": "2023-02-14 13:18:27,296", "name": "taccjm.TACCSimulation", "levelname": "INFO", "message": "Running from inherited class at /home/jovyan/work/repos/taccjm/src/taccjm/EnsembleTACCSimulation.py"}
{"asctime": "2023-02-14 13:18:27,297", "name": "taccjm.TACCClient", "levelname": "INFO", "message": "Logger taccjm.TACCClient initialized", "config": {"output": "<ipykernel.iostream.OutStream object at 0x7fd23221b850>", "fmt": "json", "level": 20}}
{"asctime": "2023-02-14 13:18:27,298", "name": "taccjm.TACCClient", "levelname": "INFO", "message": "Not running on TACC. Starting SSH session."}
{"asctime": "2023-02-14 13:18:27,299", "name": "taccjm.TACCClient", "levelname": "INFO", "message": "Looking for ssh_connection taccjm-ls6"}


In [7]:
sim.class_name

'EnsembleTACCSimulation'

In [10]:
job_config = sim.run(
    args={},
    slurm_config=slurm_config,
    stage=True,
    python_setup=True,
    run=False,
    remora=True,
)

{"asctime": "2023-02-14 13:18:47,536", "name": "taccjm.TACCSimulation", "levelname": "INFO", "message": "Not in execution environment. Setting up job..."}
{"asctime": "2023-02-14 13:18:47,539", "name": "taccjm.TACCSimulation", "levelname": "INFO", "message": "Starting simulation set-up.", "inputs": {"args": {}, "slurm_config": {"node_cound": 1, "processors_per_node": 4}, "stage": true, "run": false}}
{"asctime": "2023-02-14 13:18:47,540", "name": "taccjm.TACCSimulation", "levelname": "INFO", "message": "Setting up python execution environment"}
{"asctime": "2023-02-14 13:18:57,950", "name": "taccjm.TACCClient", "levelname": "INFO", "message": "Processed command 2573", "cmnd_config": {"id": 2573, "cmd": "mamba --help", "ts": "2023-02-14T13:18:57.947675", "status": "COMPLETE", "stdout": "usage: mamba [-h] [-V] command ...\n\nconda is a tool for managing and deploying applications, environments and packages.\n\nOptions:\n\npositional arguments:\n  command\n    clean        Remove unused p

In [11]:
job_config

{'name': 'test',
 'job_id': 'test_20230214_131949kfsqwjjz',
 'job_dir': '/scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_131949kfsqwjjz',
 'slurm': {'allocation': 'ADCIRC',
  'node_count': 1,
  'processors_per_node': 4,
  'max_run_time': 0.5,
  'queue': 'development',
  'dependencies': [],
  'node_cound': 1},
 'args': {'tasks': [],
  'task_max_runtime': 0.1,
  'max_runtime': 0.5,
  'summary_interval': 60},
 'submit_script': '/scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_131949kfsqwjjz/submit_script.sh',
 'sim_script': '/scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_131949kfsqwjjz/test.py'}

In [12]:
sim.client.download("", "data", job_id=job_config["job_id"])

'data'

In [13]:
sim.client.submit_job(job_config["job_id"])

{"asctime": "2023-02-14 13:22:40,711", "name": "taccjm.TACCClient", "levelname": "INFO", "message": "Processed command 2593", "cmnd_config": {"id": 2593, "cmd": "cd /scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_131949kfsqwjjz && sbatch /scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_131949kfsqwjjz/submit_script.sh", "ts": "2023-02-14T13:22:40.706228", "status": "COMPLETE", "stdout": "\n-----------------------------------------------------------------\n          Welcome to the Lonestar6 Supercomputer          \n-----------------------------------------------------------------\n\nNo rese", "stderr": "", "history": [{"ts": "2023-02-14T13:22:39.493995", "status": "STARTED"}], "rt": 1.0, "rc": 0}}


{'name': 'test',
 'job_id': 'test_20230214_131949kfsqwjjz',
 'job_dir': '/scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_131949kfsqwjjz',
 'slurm': {'allocation': 'ADCIRC',
  'node_count': 1,
  'processors_per_node': 4,
  'max_run_time': 0.5,
  'queue': 'development',
  'dependencies': [],
  'node_cound': 1},
 'args': {'tasks': [],
  'task_max_runtime': 0.1,
  'max_runtime': 0.5,
  'summary_interval': 60},
 'submit_script': '/scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_131949kfsqwjjz/submit_script.sh',
 'sim_script': '/scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_131949kfsqwjjz/test.py',
 'slurm_id': '701781'}

In [14]:
sim.client.showq()



[{'job_id': '701781',
  'job_name': 'test_20230',
  'username': 'clos21',
  'state': 'Running',
  'nodes': '1',
  'remaining': '1',
  'start_time': '0:29:47'}]

In [16]:
sim.client.list_files("", job_id=job_config["job_id"], recurse=True)

[{'filename': 'submit_script.sh',
  'st_atime': 1676402395,
  'st_gid': 800588,
  'st_mode': 33152,
  'st_mtime': 1676402391,
  'st_size': 915,
  'st_uid': 856065,
  'ls_str': '-rw-------   1 856065   800588        915 14 Feb 13:19 submit_script.sh'},
 {'filename': 'remora_701781',
  'st_atime': 1676402562,
  'st_gid': 800588,
  'st_mode': 16832,
  'st_mtime': 1676402609,
  'st_size': 34,
  'st_uid': 856065,
  'ls_str': 'drwx------   1 856065   800588         34 14 Feb 13:23 remora_701781'},
 {'filename': 'test_20230214_131949kfsqwjjz.o701781',
  'st_atime': 1676402563,
  'st_gid': 800588,
  'st_mode': 33152,
  'st_mtime': 1676402609,
  'st_size': 177,
  'st_uid': 856065,
  'ls_str': '-rw-------   1 856065   800588        177 14 Feb 13:23 test_20230214_131949kfsqwjjz.o701781'},
 {'filename': 'test.py',
  'st_atime': 1676402395,
  'st_gid': 800588,
  'st_mode': 33216,
  'st_mtime': 1676402391,
  'st_size': 3907,
  'st_uid': 856065,
  'ls_str': '-rwx------   1 856065   800588       3907 

In [18]:
err_file = sim.client.read(
    "test_20230214_131949kfsqwjjz.e701781", job_id=job_config["job_id"]
)

In [19]:
print(err_file)

cat: '/proc/fs/lustre/mdc/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: /proc/sys/lnet/stats: No such file or directory
cat: '/proc/fs/lustre/mdc/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: /proc/sys/lnet/stats: No such file or directory
cat: '/proc/fs/lustre/mdc/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: '/proc/fs/lustre/llite/work*/stats': No such file or directory
cat: /proc/sys/lnet/stats: No such file or directory
Traceback (most recent call last):
  File "/scratch/06307/clos21/taccjm-ls6/jobs/test_20230214_131949kfsqwjjz/./test.py", line 123, in <module>
    simulation = EnsembleTACCSimulation(name='test',
                 ^^^^^^^^^^^^^^^^^^^^^