def translate_job(request_body, user):
    """Convert a job-submission request body into a Slurm REST API payload.

    Args:
        request_body: Mapping of sbatch-style option names (e.g. ``job-name``,
            ``mem``, ``nodes``) to their values. The special key ``command``
            carries the script text. The mapping is not modified.
        user: Name of the user the job runs as; the job's working directory
            is always set to ``/home/<user>``, replacing any ``chdir`` value
            supplied in ``request_body``.

    Returns:
        A dict with a ``job`` entry holding the job properties and, when
        ``command`` was supplied, a top-level ``script`` entry.
    """
    # sbatch-style option names -> property names used in the Slurm payload.
    # Options without an entry here are passed through under their own name.
    job_properties_dict = {
        "job-name": "name",
        "chdir": "current_working_directory",
        "mem": "memory_per_node",
    }

    translated_job = {
        "job": {
            "environment": {
                "PATH": "/bin:/usr/bin/:/usr/local/bin/:/opt/slurm/bin/"
            },
            "name": "",
        }
    }

    # Work on a shallow copy so the caller's dict is left untouched; key order
    # is preserved, which keeps the serialized payload stable.
    job_fields = dict(request_body)
    job_fields["chdir"] = f"/home/{user}"

    for key, value in job_fields.items():
        if key == "command":
            translated_job["script"] = value
        else:
            translated_job["job"][job_properties_dict.get(key, key)] = value

    return translated_job


def _slurm_error_text(payload):
    """Join the ``error`` messages of a decoded Slurm response with newlines.

    Returns an empty string when ``payload`` is not a dict or carries no
    usable ``errors`` entries.
    """
    if not isinstance(payload, dict):
        return ""

    messages = []
    for entry in payload.get("errors") or []:
        message = entry.get("error") if isinstance(entry, dict) else entry
        if message:
            messages.append(str(message))
    return "\n".join(messages)


def post_job_slurm_api(body, user, token, ip):
    """Submit a job to the Slurm REST API running on ``ip``.

    Args:
        body: Job request body, translated with ``translate_job``.
        user: Value sent in the ``X-SLURM-USER-NAME`` header.
        token: Value sent in the ``X-SLURM-USER-TOKEN`` header.
        ip: Host name or address of the Slurm REST endpoint.

    Returns:
        A dict with ``errors`` (newline-separated error messages, empty when
        the submission reported none), ``status_code`` and ``reason``.
    """
    body_data = json.dumps(translate_job(body, user))

    url = f"https://{ip}/slurm/v0.0.36/job/submit"
    headers = {
        "Content-Type": "application/json",
        "X-SLURM-USER-NAME": user,
        "X-SLURM-USER-TOKEN": token,
    }

    # NOTE(review): verify=False disables TLS certificate validation while the
    # user token is sent in a header. Presumably the endpoint uses a
    # self-signed certificate -- confirm, and prefer pinning a CA bundle.
    resp = requests.post(url=url, data=body_data, headers=headers, verify=False)
    print("POST /slurm/v0.0.36/job/submit", resp.status_code, resp.reason, '\n')

    # The body may not be JSON at all (e.g. an error page); treat that as
    # "no structured errors" instead of raising.
    try:
        payload = resp.json()
    except ValueError:
        payload = None
    errors_text = _slurm_error_text(payload)

    # A failed request without any structured error would otherwise be
    # reported with an empty "errors" string, which callers read as success.
    status_code = resp.status_code
    if not errors_text and isinstance(status_code, int) and status_code >= 400:
        errors_text = f"{status_code} {resp.reason}"

    return {"errors": errors_text, "status_code": status_code, "reason": resp.reason}


def submit_job_script():
    """Handle a script-submission request by forwarding it to the Slurm API.

    Reads ``cluster_name``, ``instance_id``, ``region`` and ``user`` (default
    ``ec2-user``) from the query string and the job from the JSON body, looks
    up the instance's public DNS name and the ``slurm_token_<cluster_name>``
    secret, then posts the job.

    Returns:
        The dict produced by ``post_job_slurm_api``, or a
        ``({"message": ...}, 400)`` tuple when the request is incomplete.
    """
    body = request.json
    cluster_name = request.args.get("cluster_name")
    instance_id = request.args.get("instance_id")
    region = request.args.get("region")
    user = request.args.get("user", "ec2-user")

    # Reject incomplete requests up front rather than failing with a
    # TypeError while building the secret id or translating the job.
    required = (("cluster_name", cluster_name), ("instance_id", instance_id))
    missing = [name for name, value in required if not value]
    if missing:
        return {"message": f"Missing required query parameter(s): {', '.join(missing)}"}, 400
    if not isinstance(body, dict):
        return {"message": "Request body must be a JSON object describing the job"}, 400

    ec2 = boto3.resource("ec2", region_name=region)
    instance = ec2.Instance(instance_id)
    ip = instance.public_dns_name

    # NOTE(review): unlike the EC2 resource above, this client is created
    # without region_name -- confirm the secret lives in the default region.
    client = boto3.client("secretsmanager")
    jwt_token = client.get_secret_value(SecretId="slurm_token_" + cluster_name)["SecretString"]

    return post_job_slurm_api(body, user, jwt_token, ip)
post_job_slurm_api( + {'job-name': 'test'}, + 'test-user', 'slurm-token123', '123.456.7.8' + ) + + assert ret.get('errors') == 'test\ntesting' + assert ret.get('status_code') == 400 + assert ret.get('reason') == 'BAD REQUEST' + + +def test_translate_job(): + """ + Given a function that translates a job + When a dict of job data is inputted + It should format the job to be accepted by the Slurm API + """ + request_body = {'job-name': 'test', 'nodes': 1, 'command': 'test command'} + + translation = translate_job(request_body, 'test-user') + + assert translation == { + "job": { + "name": "test", + "current_working_directory": "/home/test-user", + "nodes": 1, + "environment": { + "PATH": "/bin:/usr/bin/:/usr/local/bin/:/opt/slurm/bin/" + } + }, + "script": 'test command' + } diff --git a/app.py b/app.py index 4a63599a..5a2807f8 100644 --- a/app.py +++ b/app.py @@ -42,6 +42,7 @@ scontrol_job, set_user_role, submit_job, + submit_job_script ) @@ -168,6 +169,11 @@ def submit_job_(): def sacct_(): return sacct() + @app.route("/manager/submit_job_script", methods=["POST"]) + @authenticated() + def submit_job_script_(): + return submit_job_script() + @app.route("/manager/scontrol_job") @authenticated() def scontrol_job_(): diff --git a/frontend/locales/en/strings.json b/frontend/locales/en/strings.json index 68d73c09..704e4be3 100644 --- a/frontend/locales/en/strings.json +++ b/frontend/locales/en/strings.json @@ -365,9 +365,62 @@ } }, "JobSubmitDialog": { - "requiredMemory": { + "header": "Submit Job", + "cancel": "Cancel", + "submit": "Submit", + "job-name": { + "header": "Job Name", + "description": "Please choose an identifier for this job.", + "placeholder": "job-name" + }, + "chdir": { + "header": "Working Directory", + "description": "Please choose a working directory for the job [optional]" + }, + "nodes": { + "header": "Nodes", + "description": "Number of nodes for job [optional]" + }, + "ntasks": { + "header": "Number of tasks", + "description": "Number of tasks for 
a job [optional]" + }, + "mem": { "header": "Required memory", "description": "Real memory required per node, in MB. A memory size specification of zero is treated as a special case and grants the job access to all of the memory on each node. [optional]" + }, + "jobTypeCommand": { + "header": "Command", + "description": "The command to run as a part of this job.", + "radioGroup": "Run a command" + }, + "jobTypeFile": { + "header": "Script Path", + "description": "Path to the script to run", + "radioGroup": "Run a script on the head node" + }, + "jobTypeScript": { + "radioGroup": "Enter sbatch script manually", + "radioGroupDisabledDescription": "Requires Slurm REST API" + }, + "queue": { + "header": "Queue", + "description": "Queue where the job will run." + }, + "costEstimate": { + "header": "Cost estimate", + "alertHeader": "Experimental!", + "alertContent": "This provides a basic cost estimate based on the expected job run-time, the number of nodes and their instance type. Actual costs will vary based on node uptime, storage, and other factors. 
/**
 * Submit a job through the `manager/submit_job` endpoint.
 *
 * Resolves with the `message` field of the response ("" when absent).
 * On a failed request that carries a response, a notification is raised
 * and the error is re-thrown to the caller.
 */
async function SubmitJob(instanceId: string, user: string, job: Job): Promise<string> {
  const region = getState(['app', 'selectedRegion']) || getState(['aws', 'region']);
  // Query values are encoded so that special characters cannot break the URL.
  const url = `manager/submit_job?instance_id=${encodeURIComponent(instanceId)}&user=${encodeURIComponent(user || 'ec2-user')}&region=${encodeURIComponent(region)}`
  try {
    const { data } = await request('post', url, job);
    return data?.message || "";
  } catch (error: any) {
    if(error.response) {
      notify(`Error: ${error.response.data?.message ?? error.message}`, 'error');
    }
    throw error;
  }
}

/**
 * Submit a job script through the `manager/submit_job_script` endpoint.
 *
 * Resolves with the `errors` field of the response ("" when absent).
 * On a failed request that carries a response, a notification is raised
 * and the error is re-thrown to the caller.
 */
async function SubmitJobScript(clusterName: string, instanceId: string, user: string, job: Job): Promise<string> {
  const region = getState(['app', 'selectedRegion']) || getState(['aws', 'region']);
  // Query values are encoded so that special characters cannot break the URL.
  const url = `manager/submit_job_script?cluster_name=${encodeURIComponent(clusterName)}&instance_id=${encodeURIComponent(instanceId)}&user=${encodeURIComponent(user || 'ec2-user')}&region=${encodeURIComponent(region)}`
  try {
    const { data } = await request('post', url, job);
    return data?.errors || "";
  } catch (error: any) {
    if(error.response) {
      notify(`Error: ${error.response.data?.message ?? error.message}`, 'error');
    }
    throw error;
  }
}
import * as React from 'react'; import { useTranslation } from 'react-i18next'; -import { useState, setState, getState, clearState } from '../../store' -import { findFirst, clusterDefaultUser } from '../../util' -import { SubmitJob, PriceEstimate } from '../../model' +import { useState, setState, getState, clearState } from '../../store'; +import { findFirst, clusterDefaultUser } from '../../util'; +import { getScripts } from './util'; +import { SubmitJob, SubmitJobScript, PriceEstimate } from '../../model'; +import ConfigView from '../../components/ConfigView'; +import { Job } from '../../types/jobs'; +import FileUploadButton from '../../components/FileChooser'; // UI Elements import { @@ -25,31 +29,31 @@ import { Header, Input, Modal, + RadioGroup, Select, SpaceBetween, - Toggle, } from "@awsui/components-react"; const submitPath = ['app', 'clusters', 'jobSubmit']; +const jobPath = [...submitPath, 'job']; function itemToOption([value, title]: [string, string]) { return {label: title, value: value} } function QueueSelect() { + const { t } = useTranslation(); const clusterName = getState(['app', 'clusters', 'selected']); const clusterPath = ['clusters', 'index', clusterName]; const queues = getState([...clusterPath, 'config', 'Scheduling', 'SlurmQueues']) || [] - const jobPath = [...submitPath, 'job']; let partition = useState([...jobPath, 'partition']); let queuesOptions = [["[ANY]", "[ANY]"], ...queues.map((q: any) => [q.Name, q.Name])] return <> - {/* @ts-expect-error TS(2322) FIXME: Type '"h4"' is not assignable to type 'Variant | u... Remove this comment to see the full error message */} -
Queue
+
{t("JobSubmitDialog.queue.header")}
- + - {costEstimate && Estimated job cost: ${costEstimate.toFixed(2)}} - {costEstimate &&
Price ($/h) * Time (h) * NodeCount => {priceEstimate} * {jobRuntime} * {nodes}
} + {costEstimate && {t("JobSubmitDialog.costEstimate.estimatedCost")} ${costEstimate.toFixed(2)}} + {costEstimate &&
{t("JobSubmitDialog.costEstimate.formula")} => {priceEstimate} * {jobRuntime} * {nodes}
} } +interface JobFieldProps { + header: string; + description: string; + placeholder: string; + property: string; + disabled?: boolean; +} + +function JobField({ + header, + description, + placeholder, + property, + disabled=false +}: JobFieldProps) { + + const setStateIfNotEmpty = React.useCallback((value: string) => { + !value || value === '' ? clearState([...jobPath, property]) : setState([...jobPath, property], value); + }, [property]) + + return <> + +
+ {header} +
+ + setStateIfNotEmpty(detail.value)} + value={useState([...jobPath, property])} + placeholder={placeholder} + disabled={disabled} + /> + +
+ +} + export default function JobSubmitDialog({ submitCallback }: any) { + const { t } = useTranslation(); const open = useState([...submitPath, 'dialog']); - const error = useState([...submitPath, 'error']); - const jobPath = [...submitPath, 'job']; + const error = useState([...submitPath, 'error']) || ""; const clusterName = getState(['app', 'clusters', 'selected']); + const clusterPath = ['clusters', 'index', clusterName]; + const cluster = getState(clusterPath); + const headNode = getState([...clusterPath, 'headNode']); - const job = useState(jobPath); + const job: Job = useState(jobPath); const submitting = useState([...submitPath, 'pending']); - let jobName = useState([...jobPath, 'job-name']); - let chdir = useState([...jobPath, 'chdir']); - let nodes = useState([...jobPath, 'nodes']); - let ntasks = useState([...jobPath, 'ntasks']); - let mem = useState([...jobPath, 'mem']); - let command = useState([...jobPath, 'command']); - let wrap = useState([...jobPath, 'wrap']) || false; + const jobType: string = useState([...submitPath, 'job-entry']); + const jobCommand: string = useState([...jobPath, 'command']); let isMemBasedSchedulingEnabled = useState( - ['clusters', 'index', clusterName, 'config', 'Scheduling', 'SlurmSettings', 'EnableMemoryBasedScheduling'] + [...clusterPath, 'config', 'Scheduling', 'SlurmSettings', 'EnableMemoryBasedScheduling'] ) || false; - const submitJob = () => { - const clusterPath = ['clusters', 'index', clusterName]; - const cluster = getState(clusterPath); + let isSlurmApiEnabled = getScripts( + useState([...clusterPath, 'config', 'HeadNode', 'CustomActions']) + ).includes('slurm-rest-api'); + + const isScriptSelected = jobType === 'script'; + + const jobTypeSelect = (entryType: string) => { + setState([...submitPath, 'job-entry'], entryType); + clearState([...jobPath, 'command']); + entryType === 'command' ? 
setState([...jobPath, 'wrap'], true) : clearState([...jobPath, 'wrap']); + if(entryType === 'script') clearState([...jobPath, 'chdir']); + } + + const scriptSelectDescription = isSlurmApiEnabled ? "" : t("JobSubmitDialog.jobTypeScript.radioGroupDisabledDescription"); + + const jobTypeHeader = { + 'command': t("JobSubmitDialog.jobTypeCommand.header"), + 'file': t("JobSubmitDialog.jobTypeFile.header") + }[jobType] + + const jobTypeDescription = { + 'command': t("JobSubmitDialog.jobTypeCommand.description"), + 'file': t("JobSubmitDialog.jobTypeFile.description") + }[jobType] + + const jobTypePlaceholder = { + 'command': "sleep 30", + 'file': "/home/ec2-user/myscript.sbatch", + 'script': jobCommand || "#!/bin/bash\n" + }[jobType] + + const submitJob = React.useCallback(async () => { let user = clusterDefaultUser(cluster); - const headNode = getState([...clusterPath, 'headNode']); const success_callback = () => { + clearState([...submitPath, 'error']); setState([...submitPath, 'dialog'], false); setState([...submitPath, 'pending'], false); submitCallback && submitCallback(); } - const failure_callback = (message: any) => { - setState([...submitPath, 'error'], message) + const failure_callback = (message: string) => { + setState([...submitPath, 'error'], message); // TODO: doesn't support translation setState([...submitPath, 'pending'], false); } setState([...submitPath, 'pending'], true); - SubmitJob(headNode.instanceId, user, job, success_callback, failure_callback) - }; + const errorMessage = isScriptSelected ? + await SubmitJobScript(clusterName, headNode.instanceId, user, job) + : + await SubmitJob(headNode.instanceId, user, job); + errorMessage === "" ? success_callback() : failure_callback(errorMessage); + errorMessage === "" ? 
success_callback() : failure_callback(errorMessage); + }, [cluster, clusterName, headNode.instanceId, isScriptSelected, job, submitCallback]); + + const submitJobIfEntered = React.useCallback(() => { + const entry = (getState([...jobPath, 'command']) || '').replace(/\n| /gm, ''); + if(entry === '#!/bin/bash' || entry === '') { + setState([...submitPath, 'error'], t("JobSubmitDialog.errors.emptyJob")); + } else { + submitJob(); + } + }, [submitJob]) const cancel = () => { - setState([...submitPath, 'dialog'], false) + setState([...submitPath, 'dialog'], false); + clearState([...submitPath, 'error']); }; - const enableWrap = (enable: any) => { - setState([...jobPath, 'wrap'], enable); - } - return ( - - + + } - header="Submit Job"> + header={t("JobSubmitDialog.header")}> - -
- Job Name -
- - {setState([...jobPath, 'job-name'], detail.value);}} - value={jobName} - placeholder="job-name" - /> - -
+ + - -
- Working Directory -
- - {setState([...jobPath, 'chdir'], detail.value);}} - value={chdir} - placeholder="/home/ec2-user/" - /> - -
+ - -
- Nodes -
- - {setState([...jobPath, 'nodes'], detail.value);}} - value={nodes} - inputMode='numeric' - placeholder="0" - /> - -
+ - -
- Number of tasks -
- - {setState([...jobPath, 'ntasks'], detail.value);}} - value={ntasks} - inputMode='numeric' - placeholder="0" - /> - -
+ - { - isMemBasedSchedulingEnabled && -
- {t("JobSubmitDialog.requiredMemory.header")} -
- - {setState([...jobPath, 'mem'], detail.value);}} - value={mem} - inputMode='numeric' - placeholder="0" - /> - -
+ { isMemBasedSchedulingEnabled && + } - enableWrap(!wrap)}>Run a Command (instead of script) - - -
- {wrap ? "Command" : "Script Path"} -
- - {setState([...jobPath, 'command'], detail.value);}} - value={command} - placeholder={ wrap ? "sleep 30" : "/home/ec2-user/myscript.sbatch"} - /> - -
+ {jobTypeSelect(detail.value);}} + value={jobType} + items={[ + {value: "command", label: t("JobSubmitDialog.jobTypeCommand.radioGroup")}, + {value: "file", label: t("JobSubmitDialog.jobTypeFile.radioGroup")}, + {value: "script", label: t("JobSubmitDialog.jobTypeScript.radioGroup"), disabled: !isSlurmApiEnabled, description: scriptSelectDescription} + ]} + />
- + + { isScriptSelected ? + <> + setState([...jobPath, 'command'], data)}/> + { + setState([...jobPath, 'command'], detail.value); + }}> + + + : + + } + +
-
{(error || "").split('\n').map((line: any, i: any) =>
{line}
)}
+
{error.split('\n').map((line: any, i: any) =>
{line}
)}
); } diff --git a/frontend/src/old-pages/Clusters/Scheduling.tsx b/frontend/src/old-pages/Clusters/Scheduling.tsx index b6de7fbf..e3ab1eed 100644 --- a/frontend/src/old-pages/Clusters/Scheduling.tsx +++ b/frontend/src/old-pages/Clusters/Scheduling.tsx @@ -189,6 +189,8 @@ export default function ClusterScheduling() { const jobs: JobSummary[] = useState(['clusters', 'index', clusterName, 'jobs']); const defaultRegion = useState(['aws', 'region']); const region = useState(['app', 'selectedRegion']) || defaultRegion; + const jobInfoPath = ['app', 'clusters', 'jobInfo'] + const jobSubmitPath = ['app', 'clusters', 'jobSubmit'] function isSsmPolicy(p: any) { return p.hasOwnProperty('Policy') && p.Policy === ssmPolicy(region); @@ -207,7 +209,7 @@ export default function ClusterScheduling() { }, []) const selectJobCallback = (jobInfo: any) => { - setState(['app', 'clusters', 'jobInfo', 'data'], jobInfo); + setState([...jobInfoPath, 'data'], jobInfo); } const selectJob = (jobId: string) => { @@ -217,8 +219,8 @@ export default function ClusterScheduling() { const cluster = getState(clusterPath); let user = clusterDefaultUser(cluster); const headNode = getState([...clusterPath, 'headNode']); - clearState(['app', 'clusters', 'jobInfo', 'data']); - headNode && setState(['app', 'clusters', 'jobInfo', 'dialog'], true); + clearState([...jobInfoPath, 'data']); + headNode && setState([...jobInfoPath, 'dialog'], true); headNode && JobInfo(headNode.instanceId, user, jobId, selectJobCallback); } } @@ -247,11 +249,18 @@ export default function ClusterScheduling() { } ); + const showJobSubmitDialog = () => { + setState([...jobSubmitPath, 'dialog'], true); + if (!getState([...jobSubmitPath, 'job-entry'])) { + setState([...jobSubmitPath, 'job-entry'], 'command'); + setState([...jobSubmitPath, 'job', 'wrap'], true); + } + } return - {ssmEnabled && } + {ssmEnabled && } {clusterMinor > 0 && ssmEnabled && (jobs ?