In [2]:
import htcondor
import classad

In [None]:
def submit_job(executable, arguments, requirements=None):
    """Submit one job to the local HTCondor schedd and return its cluster id.

    The job is described with ``htcondor.Submit`` so that submit-language
    keywords (``request_memory = 1GB``, ``requirements = <expr>``, ...) are
    parsed the same way ``condor_submit`` parses them. Building a raw
    ``classad.ClassAd`` from Python strings would store those values as
    string literals; in particular ``Requirements`` would be a string
    rather than an expression.

    Parameters
    ----------
    executable : str
        Path of the program to run.
    arguments : str
        Command-line arguments passed to the executable.
    requirements : str, optional
        ClassAd expression restricting which machines may run the job,
        e.g. ``'(Machine != "bad-node.example.org")'``. Omitted when falsy.

    Returns
    -------
    int
        The cluster id assigned to the submitted job.
    """
    submit_description = htcondor.Submit({
        "executable": executable,
        "arguments": arguments,
        "output": "job_output.txt",
        "error": "job_error.txt",
        "log": "job_log.txt",
        "request_cpus": "1",
        "request_memory": "1GB",
        "request_disk": "1GB",
    })
    if requirements:
        submit_description["requirements"] = requirements

    schedd = htcondor.Schedd()
    # Schedd.submit() with a Submit object manages its own transaction and
    # returns a SubmitResult; the explicit transaction API is deprecated.
    submit_result = schedd.submit(submit_description, count=1)
    return submit_result.cluster()

In [None]:
# Held jobs awaiting resubmission: "ClusterId.ProcId" -> host the job last ran on.
failed_jobs = dict()
# Hosts on which at least one held job last ran; resubmissions avoid these.
failed_nodes = set()

In [None]:
def resubmit_failed_jobs(executable="executable", arguments="arguments"):
    """Resubmit every job recorded in ``failed_jobs``, avoiding ``failed_nodes``.

    Parameters
    ----------
    executable : str, optional
        Program to submit for each failed job. Defaults to the placeholder
        ``"executable"``.
    arguments : str, optional
        Arguments for the executable. Defaults to the placeholder
        ``"arguments"``.

    Notes
    -----
    Reads the module-level ``failed_jobs`` and ``failed_nodes`` but does not
    modify either; the caller decides when entries are removed.
    """
    # LastRemoteHost values typically look like "slot1@host" while the
    # Machine attribute is the bare host name, so drop any slot prefix before
    # comparing. Sorting keeps the generated expression deterministic.
    avoided_hosts = sorted({node.split("@")[-1] for node in failed_nodes})
    # An empty host list yields "", which becomes None (no extra requirements).
    requirements = " && ".join(f'(Machine != "{host}")' for host in avoided_hosts) or None
    avoided_text = ", ".join(avoided_hosts)

    # Iterate over a snapshot so callers may prune failed_jobs concurrently
    # with resubmission without invalidating the loop.
    for job_id in list(failed_jobs):
        submit_job(executable, arguments, requirements)
        if requirements:
            print(f"Resubmitted job {job_id} avoiding all failed nodes: {avoided_text}")
        else:
            print(f"Resubmitted job {job_id} without additional requirements")

In [None]:
# Scan the schedd queue, report removed/completed jobs, and resubmit held jobs
# away from the hosts they last ran on.
JOB_STATUS_REMOVED = 3
JOB_STATUS_COMPLETED = 4
JOB_STATUS_HELD = 5

# The schedd handle must be created in this cell: the ones inside the helper
# functions are locals and are not visible at notebook scope.
schedd = htcondor.Schedd()
queued_jobs = schedd.query(projection=["ClusterId", "ProcId", "JobStatus", "LastRemoteHost"])

for job in queued_jobs:
    job_id = f"{job['ClusterId']}.{job['ProcId']}"
    status = job.get('JobStatus')
    last_remote_host = job.get('LastRemoteHost', None)

    if status == JOB_STATUS_REMOVED:
        print(f"Job {job_id} was removed.")
        continue

    if status == JOB_STATUS_COMPLETED:
        print(f"Job {job_id} completed successfully.")
        continue

    if status == JOB_STATUS_HELD:
        if not last_remote_host:
            # Held before it ever ran: there is no host to blame.
            print(f"Job {job_id} is held but has no LastRemoteHost; skipping resubmission.")
            continue
        # LastRemoteHost is typically "slot@host"; keep only the host so it
        # can be compared against the Machine attribute.
        node = last_remote_host.split("@")[-1]
        failed_nodes.add(node)
        failed_jobs[job_id] = node
        print(f"Job {job_id} failed on node {node}. Will avoid this node for future submissions.")

# Resubmit once, after the whole queue has been scanned, so that every
# resubmission avoids every failed node found in this pass.
# NOTE(review): the original held jobs stay in the queue; confirm whether
# they should be removed after resubmission.
if failed_jobs:
    resubmit_failed_jobs()
    failed_jobs.clear()  # Everything recorded in this pass has been resubmitted