In [16]:
import os
import sys

# Walk up from the notebook's working directory until a path ending in 'ardt'
# is found, then make that directory importable.
dir = os.path.abspath('')
while not dir.endswith('ardt'):
    parent = os.path.dirname(dir)
    if parent == dir:
        # os.path.dirname() of the filesystem root returns the root itself, so
        # without this check the loop would spin forever when the notebook is
        # started outside of an '...ardt' directory tree.
        raise RuntimeError(
            f"Could not find a parent directory ending in 'ardt' above {os.path.abspath('')!r}"
        )
    dir = parent
if dir not in sys.path:
    sys.path.append(dir)

In [17]:
import io
import time

import paramiko

from utils.helpers import find_root_dir

from private_keys import KNUCKLES_PASSWORD, CS_PRIVATE_KEY, GITHUB_PRIVATE_KEY

In [18]:
# Local side: used further down as the directory prefix of the files that are
# uploaded over SFTP. NOTE(review): presumably the local ardt project root --
# confirm against utils.helpers.find_root_dir.
ARDT_DIR = find_root_dir()

# Remote side: account home on the cluster and the directory holding the
# qsub run scripts inside the remote checkout (note the trailing slash).
HOME_DIR = "/home/amarques"
SCRIPT_PATH = HOME_DIR + "/action-robust-decision-transformer/cluster-scripts/cscluster-run-scripts/"

In [19]:
# Short host name of the cluster node to connect to; the connection cell
# interpolates it as '<name>.cs.ucl.ac.uk'. The trailing comment lists the
# names the author has used.
CLUSTER_TO_TRY = 'beaker'  # beaker, vic, pryor

## Connecting to chosen cluster

In [20]:
# Private key
# NOTE(review): `pkey` is loaded here but neither connect() call in this cell
# passes it -- both authenticate with KNUCKLES_PASSWORD. It is kept so that any
# other cell relying on the name keeps working.
# NOTE(review): the '=' padding presumably works around a key string whose
# length is not a multiple of four -- TODO confirm it is still required.
k = CS_PRIVATE_KEY.strip()
padding = len(k) % 4
if padding > 0:
    k += "=" * (4 - padding)
private_key_file = io.StringIO(k)
pkey = paramiko.RSAKey.from_private_key(private_key_file)

jump_host = 'knuckles.cs.ucl.ac.uk'
cluster_host = f'{CLUSTER_TO_TRY}.cs.ucl.ac.uk'

# NOTE(review): AutoAddPolicy accepts host keys that are not already known
# without any verification; consider loading known host keys instead.
knuckles = paramiko.SSHClient()
knuckles.set_missing_host_key_policy(paramiko.AutoAddPolicy())
cluster = paramiko.SSHClient()
cluster.set_missing_host_key_policy(paramiko.AutoAddPolicy())

try:
    # Establish SSH connection to the first server (knuckles)
    knuckles.connect(jump_host, port=22, username='amarques', password=KNUCKLES_PASSWORD)

    # Use this connection to establish a second SSH connection to the second server
    # Paramiko Transport is used here for nested ssh
    transport = knuckles.get_transport()
    dest_addr = (cluster_host, 22)  # host and port of the cluster node
    local_addr = (jump_host, 22)  # host and port reported as the channel's origin
    jump = transport.open_channel("direct-tcpip", dest_addr, local_addr)

    cluster.connect(cluster_host, port=22, sock=jump, username='amarques', password=KNUCKLES_PASSWORD)
except BaseException:
    # Do not leave the jump-host session open when the tunnel or the second
    # login fails (or the cell is interrupted); close both and re-raise.
    cluster.close()
    knuckles.close()
    raise

## Set up environment, code, files

In [None]:
def sftp_upload_dir(sftp, localdir, remotedir):
    """Recursively upload ``localdir`` into ``remotedir`` over SFTP.

    The last component of ``localdir`` is recreated below ``remotedir``; for
    example uploading ``/a/b/data`` to ``/home/u`` produces ``/home/u/data/...``.

    Args:
        sftp: Object exposing ``mkdir(path)`` and ``put(localpath, remotepath)``.
        localdir: Local directory to upload. A trailing path separator is
            tolerated.
        remotedir: Existing remote directory that receives the upload.

    Unlike a plain ``os.chdir`` based walk, the process working directory is
    left untouched, so relative paths used elsewhere in the notebook keep
    resolving the same way after the call.
    """
    import posixpath  # remote paths are built with '/' regardless of the local OS

    local_root = os.path.abspath(localdir)  # also drops a trailing separator
    local_parent = os.path.dirname(local_root)

    for dirpath, _dirnames, filenames in os.walk(local_root):
        # Path of the current directory relative to the parent of localdir,
        # i.e. starting with localdir's own name.
        relative = os.path.relpath(dirpath, local_parent)
        remote_path = posixpath.join(remotedir, *relative.split(os.sep))
        try:
            sftp.mkdir(remote_path)
        except OSError as e:
            # Best effort: report the failure and still try to upload the
            # files of this directory, as the directory may already exist.
            print("Exception:", e)
        for filename in filenames:
            sftp.put(os.path.join(dirpath, filename), posixpath.join(remote_path, filename))

In [None]:
# Shell prelude that loads Python 3.9.5 and CUDA 11.2 and puts git-lfs on PATH.
# It is flattened to a single line so it can be prefixed to other remote
# commands with `&&` (the author notes it has to be repeated every time).
_core_command_lines = """
                source /share/apps/source_files/python/python-3.9.5.source &&
                source /share/apps/source_files/cuda/cuda-11.2.source &&
                export PATH=$PATH:/share/apps/git-lfs-2.11.0/bin/
                """.strip().split("\n")
core_commands = " ".join(_core_command_lines)

In [None]:
# Preview only: this expression is not assigned to anything, so the cell just
# displays the flattened setup command line (the same three commands as
# `core_commands`, written without the indentation padding).
f"""
source /share/apps/source_files/python/python-3.9.5.source &&
source /share/apps/source_files/cuda/cuda-11.2.source &&
export PATH=$PATH:/share/apps/git-lfs-2.11.0/bin/
""".strip().replace("\n", " ")

In [None]:
# Run the setup prelude on its own and wait for it to finish; the cell's
# output is the remote exit status (0 means every command succeeded).
bootstrap_streams = cluster.exec_command(core_commands)
stdin, stdout, stderr = bootstrap_streams
bootstrap_channel = stdout.channel
bootstrap_channel.recv_exit_status()

In [None]:
# Replace any previous checkout with a fresh clone and switch to the
# experiments branch. The clone destination is passed explicitly so that it
# always matches the directory removed by `rm -rf` and entered by `cd`,
# instead of depending on the directory the SSH session happens to start in.
# NOTE(review): the access token is embedded in the clone URL, which git keeps
# as the remote URL of the checkout on the cluster -- consider a credential
# helper or a deploy key.
repo_dir = f"{HOME_DIR}/action-robust-decision-transformer"
clone_commands = f"""
    rm -rf {repo_dir} &&
    git clone https://{GITHUB_PRIVATE_KEY}@github.com/afonsosamarques/action-robust-decision-transformer.git {repo_dir} &&
    cd {repo_dir} &&
    git pull &&
    git checkout afonso-experiments
""".strip().replace("\n", " ")

stdin, stdout, stderr = cluster.exec_command(clone_commands)
# Block until the remote command finishes; the cell displays its exit status.
stdout.channel.recv_exit_status()

In [None]:
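# # NOTE: one-off setup, not needed on every run -- the `mkdir {HOME_DIR}/envs` step below fails if that directory already exists.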
# stdin, stdout, stderr = cluster.exec_command(f"""
#                                              {core_commands} &&
#                                              mkdir {HOME_DIR}/envs &&
#                                              python3 -m venv {HOME_DIR}/envs/ardt-env &&
#                                              source {HOME_DIR}/envs/ardt-env/bin/activate &&
#                                              pip install --upgrade pip --user && 
#                                              pip install -r {HOME_DIR}/action-robust-decision-transformer/requirements.txt --user
#                                              """.strip().replace("\n", " "))
# stdout.channel.recv_exit_status()

In [None]:
# Install the pinned torch build inside the ardt-env virtualenv, after loading
# the Python/CUDA modules via the shared prelude.
# NOTE(review): `--user` is combined with an activated virtualenv here; pip
# normally refuses user installs inside a venv -- confirm this works on the
# cluster. Also confirm that a fixed 2.0.1 release is really meant to be
# resolved with `--pre` against the nightly cu114 index.
torch_install_lines = f"""
                                             {core_commands} &&
                                             source {HOME_DIR}/envs/ardt-env/bin/activate &&
                                             pip install --pre torch==2.0.1 -f https://download.pytorch.org/whl/nightly/cu114/torch_nightly.html --user
                                             """.strip().split("\n")
torch_install_command = " ".join(torch_install_lines)

stdin, stdout, stderr = cluster.exec_command(torch_install_command)
# Wait for pip to finish; the cell displays the remote exit status.
stdout.channel.recv_exit_status()


In [None]:
# Create the output directories the experiments write to inside the remote
# checkout. A single `mkdir -p` is used instead of a chain of
# `mkdir a && mkdir b && ...`: with the chain, the first directory that
# already exists makes mkdir fail and none of the following ones are created,
# so the cell could not be re-run safely.
remote_ardt_dir = f"{HOME_DIR}/action-robust-decision-transformer/codebase/ardt"
output_dir_names = (
    "eval-outputs",
    "eval-outputs-pipeline",
    "eval-outputs-test",
    "agents",
    "agents-pipeline",
    "agents-test",
    "wandb",
    "wandb-json",
)
mkdir_targets = " ".join(f"{remote_ardt_dir}/{name}" for name in output_dir_names)

stdin, stdout, stderr = cluster.exec_command(f"mkdir -p {mkdir_targets}")
# Wait for completion; the cell displays the remote exit status.
stdout.channel.recv_exit_status()

In [None]:
# Open an SFTP session on the cluster connection and copy the local
# access_tokens.py into both remote packages that read it.
sftp = cluster.open_sftp()

local_tokens_file = f'{ARDT_DIR}/access_tokens.py'
for package_name in ("ardt", "evaluation_protocol"):
    remote_tokens_file = f"{HOME_DIR}/action-robust-decision-transformer/codebase/{package_name}/access_tokens.py"
    sftp.put(local_tokens_file, remote_tokens_file)
    # Short pause after each transfer, as in the original workflow.
    time.sleep(2)

In [None]:
# # NOTE: not needed on every run -- only when {HOME_DIR}/datasets-to-push on the cluster has to be replaced with the local copy.
# stdin, stdout, stderr = cluster.exec_command(f'rm -rf {HOME_DIR}/datasets-to-push') 
# stdout.channel.recv_exit_status()

# sftp_upload_dir(sftp, f'{ARDT_DIR}/datasets-to-push', f"{HOME_DIR}")
# time.sleep(2)

In [None]:
# Copy the datasets staged in the remote home directory into the checkout,
# then rename the copied folder to `datasets`.
staged_datasets = f'{HOME_DIR}/datasets-to-push'
remote_ardt = f'{HOME_DIR}/action-robust-decision-transformer/codebase/ardt'

copy_command = f'cp -r {staged_datasets} {remote_ardt}'
rename_command = f'mv {remote_ardt}/datasets-to-push {remote_ardt}/datasets'

stdin, stdout, stderr = cluster.exec_command(copy_command)
stdout.channel.recv_exit_status()

stdin, stdout, stderr = cluster.exec_command(rename_command)
# The cell displays the exit status of the rename step.
stdout.channel.recv_exit_status()

In [None]:
# Preview only: displays the `mv` command string used to rename the copied
# datasets folder; nothing is executed or assigned by this cell.
f'mv {HOME_DIR}/action-robust-decision-transformer/codebase/ardt/datasets-to-push {HOME_DIR}/action-robust-decision-transformer/codebase/ardt/datasets'

## Run script and close connections

In [None]:
# Submit run_experiment_1.sh ... run_experiment_<N_EXPERIMENTS>.sh with qsub.
# The commands are generated instead of being written out by hand, so changing
# the number of experiments is a one-line edit. They are still chained with
# `&&`, i.e. submission stops at the first qsub call that fails.
N_EXPERIMENTS = 30

# SCRIPT_PATH already ends with '/', so strip it to avoid '//' in the paths.
script_dir = SCRIPT_PATH.rstrip("/")
submit_command = " && ".join(
    f"qsub {script_dir}/run_experiment_{experiment}.sh"
    for experiment in range(1, N_EXPERIMENTS + 1)
)

stdin, stdout, stderr = cluster.exec_command(submit_command)
# Wait for all submissions; the cell displays the remote exit status.
stdout.channel.recv_exit_status()

In [22]:
# Give the cluster a moment before tearing down, then close the sessions from
# the innermost (SFTP) to the outermost (jump host).
time.sleep(10)
for connection in (sftp, cluster, knuckles):
    connection.close()