# First steps in workflow construction with pyiron

Author: Jörg Neugebauer  
Date: Jan 14, 2023

In [1]:
import numpy as np
import matplotlib.pylab as plt

from pyiron_base import Project

## Add a simple Hello World python module to pyiron

### Define your function

In [2]:
def my_python_function(name):
    """Build the greeting for ``name``.

    Args:
        name: object interpolated into the greeting via f-string formatting.

    Returns:
        str: the text ``'Hello '`` followed by the formatted ``name``.
    """
    greeting = f'Hello {name}'
    return greeting

### Write pyiron wrapper around your function

In [3]:
from pyiron_base import PythonTemplateJob

class HelloWorld(PythonTemplateJob):
    """Minimal pyiron job that wraps ``my_python_function``.

    Input:
        ``input.name``: value handed to ``my_python_function``; it is
        initialised to ``None`` and has to be set before the job is run.

    Output:
        ``output.greetings``: the value returned by ``my_python_function``.
    """

    def __init__(self, project, job_name):
        """Hello World module"""
        super().__init__(project, job_name)

        # No usable default: validate_ready_to_run() rejects a job whose
        # name is still None.
        self.input.name = None
        # self.executable = my_python_function

    # Allow writing of the input file
    def _check_if_input_should_be_written(self):
        """Always return True."""
        return True

    # Check for valid input
    def validate_ready_to_run(self):
        """Ensure that the input name has been provided.

        Raises:
            ValueError: if ``self.input.name`` is still ``None``.
        """
        if self.input.name is None:
            # The message points at the attribute the user actually has to set.
            raise ValueError("Provide name (job.input.name=...)")

    # This function is executed
    def run_static(self):
        """Call ``my_python_function`` on the input name and store the result."""
        out = my_python_function(self.input.name)

        self.output.greetings = out

        # Mark the job as finished, write it to HDF5 and update the database
        # entry of this job with whatever self._runtime() returns
        # (presumably timing information -- TODO confirm).
        self.status.finished = True
        self.to_hdf()
        self.project.db.item_update(self._runtime(), self.job_id)
        #self.compress()

### Create a project

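Note: To avoid cluttering the large central database, one can work with a local (lightweight) database instead; the corresponding call `pr.switch_to_local_database()` is currently commented out in the cell below.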

In [4]:
pr = Project('Hello')
# Start from an empty project. remove_jobs(silently=True) is used in place of
# remove_jobs_silently(), which appears to be deprecated (the original cell
# output echoes that call like a warning -- confirm against the installed version).
pr.remove_jobs(silently=True)
# pr.switch_to_local_database()

  pr.remove_jobs_silently()


### Create a job

In [5]:
# Create a HelloWorld job named 'job' in the project and set its input.
job = pr.create_job(
    job_type=HelloWorld,
    job_name='job',
)
job.input.name = 'CM'

#### Inspect input

In [6]:
job.input

### Run the job

Note: If a job with the same name already exists in the project, it will be loaded rather than run again. For testing purposes, we force the removal of an existing job here so that it is run again.

In [7]:
job.run(delete_existing_job=True)

The job job was saved and received the ID: 18758578


In [8]:
pr.job_table()

Unnamed: 0,id,status,chemicalformula,job,subjob,projectpath,project,timestart,timestop,totalcputime,computer,hamilton,hamversion,parentid,masterid
0,18758578,finished,,job,/job,/cmmc/u/,neugebau/myjupyternotebooks/pyiron/Developments/workflow/Hello/,2023-01-16 16:54:45.342632,2023-01-16 16:54:45.442127,0.0,neugebau@cmti001#1,HelloWorld,0.4,,


### Analyse the job

In [9]:
job['input/generic_dict']

{'restart_file_list': [],
 'restart_file_dict': {},
 'exclude_nodes_hdf': [],
 'exclude_groups_hdf': []}

In [10]:
job.storage.input

In [11]:
job.status

'finished'

In [12]:
job

{'groups': ['input', 'storage'], 'nodes': ['HDF_VERSION', 'NAME', 'OBJECT', 'TYPE', 'VERSION', 'job_id', 'server', 'status']}

Q: Why storage and not output? Input appears to be in both *input* and *storage*/*input*!

TODO: Unify the output of job and job.storage (also make the pyiron_object output JSON-like, or beyond)

In [13]:
job.storage

### Analyse project

In [14]:
pr.job_table()

Unnamed: 0,id,status,chemicalformula,job,subjob,projectpath,project,timestart,timestop,totalcputime,computer,hamilton,hamversion,parentid,masterid
0,18758578,finished,,job,/job,/cmmc/u/,neugebau/myjupyternotebooks/pyiron/Developments/workflow/Hello/,2023-01-16 16:54:45.342632,2023-01-16 16:54:45.442127,0.0,neugebau@cmti001#1,HelloWorld,0.4,,


## Upscale to HPC

### Create job and set input

In [15]:
# Create a second HelloWorld job, to be submitted to the queuing system.
job = pr.create_job(
    job_type=HelloWorld,
    job_name='job_hpc',
)
job.input.name = 'CM'

### List available queues

In [16]:
# Use the public `server` attribute, as the following cells do for queue and cores.
job.server.queue_list

['cm', 'cmfe', 'cmti', 's_cmfe', 'p_cmfe', 'cmti_large']

### Set queue and number of cores

In [17]:
job.server.queue = 's_cmfe'
job.server.cores = 1

### Run job on queue

In [18]:
job.run()

The job job_hpc was saved and received the ID: 18758579
Queue system id:  4714964


In [23]:
pr.job_table()

Unnamed: 0,id,status,chemicalformula,job,subjob,projectpath,project,timestart,timestop,totalcputime,computer,hamilton,hamversion,parentid,masterid
0,18758578,finished,,job,/job,/cmmc/u/,neugebau/myjupyternotebooks/pyiron/Developments/workflow/Hello/,2023-01-16 16:54:45.342632,2023-01-16 16:54:45.442127,0.0,neugebau@cmti001#1,HelloWorld,0.4,,
1,18758579,submitted,,job_hpc,/job_hpc,/cmmc/u/,neugebau/myjupyternotebooks/pyiron/Developments/workflow/Hello/,2023-01-16 16:54:46.616886,NaT,,neugebau@cmti001#1#s_cmfe,HelloWorld,0.4,,


### Debug

#### List files in working directory (contains all input, output, errors, etc.)

In [24]:
job.list_files()

 'time.out',
 'pyiron.log',
 'run_queue.sh',
 'error.out']

#### Print error message

TODO: 
- Provide pretty print without extra code. Could we even provide the links to inspect the relevant modules?
- Make it also work on the queuing system

In [25]:
# job['error.out'] yields the lines of the file; join them into one text block.
print(''.join(job['error.out']))

Traceback (most recent call last):
  File "/u/system/SLES12/soft/pyiron/dev/anaconda3/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/u/system/SLES12/soft/pyiron/dev/anaconda3/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/u/system/SLES12/soft/pyiron/dev/anaconda3/lib/python3.8/site-packages/pyiron_base/cli/__main__.py", line 3, in <module>
    main()
  File "/u/system/SLES12/soft/pyiron/dev/anaconda3/lib/python3.8/site-packages/pyiron_base/cli/__init__.py", line 63, in main
    args.cli(args)
  File "/u/system/SLES12/soft/pyiron/dev/anaconda3/lib/python3.8/site-packages/pyiron_base/cli/wrapper.py", line 31, in main
    job_wrapper_function(
  File "/u/system/SLES12/soft/pyiron/dev/anaconda3/lib/python3.8/site-packages/pyiron_base/jobs/job/wrapper.py", line 148, in job_wrapper_function
    job = JobWrapper(
  File "/u/system/SLES12/soft/pyiron/dev/anaconda3/lib/python3.8/site-packages

## Provide an easy way to register new job types to pyiron

**TODO**
- Autocompletion does not work (not even for job)
- Provide an easy way to register new job types so that the following construction will work

In [22]:
pr.create.job.HelloWorld

AttributeError: no job class named 'HelloWorld' defined

## Run a simple workflow using the HelloWorld job type

### Run a loop

#### Define a list of names  

In [26]:
name_lst = ['pyiron', 'CM', 'MPIE']

#### Loop over all names

Note: 
- Make sure that each job has a unique name 
- A good and intuitive choice is to connect the name to the variable you change

In [27]:
# Create and run one HelloWorld job per entry of name_lst.
for name in name_lst:
    # The job name embeds the loop variable, so each job gets a distinct name.
    job = pr.create_job(job_type=HelloWorld, job_name=f'job_{name}')
    job.input.name = name
    job.run()

The job job_pyiron was saved and received the ID: 18758593
The job job_CM was saved and received the ID: 18758594
The job job_MPIE was saved and received the ID: 18758595


#### Inspect the project

In [28]:
pr.job_table()

Unnamed: 0,id,status,chemicalformula,job,subjob,projectpath,project,timestart,timestop,totalcputime,computer,hamilton,hamversion,parentid,masterid
0,18758578,finished,,job,/job,/cmmc/u/,neugebau/myjupyternotebooks/pyiron/Developments/workflow/Hello/,2023-01-16 16:54:45.342632,2023-01-16 16:54:45.442127,0.0,neugebau@cmti001#1,HelloWorld,0.4,,
1,18758579,submitted,,job_hpc,/job_hpc,/cmmc/u/,neugebau/myjupyternotebooks/pyiron/Developments/workflow/Hello/,2023-01-16 16:54:46.616886,NaT,,neugebau@cmti001#1#s_cmfe,HelloWorld,0.4,,
2,18758593,finished,,job_pyiron,/job_pyiron,/cmmc/u/,neugebau/myjupyternotebooks/pyiron/Developments/workflow/Hello/,2023-01-16 16:59:59.111716,2023-01-16 16:59:59.227211,0.0,neugebau@cmti001#1,HelloWorld,0.4,,
3,18758594,finished,,job_CM,/job_CM,/cmmc/u/,neugebau/myjupyternotebooks/pyiron/Developments/workflow/Hello/,2023-01-16 16:59:59.326965,2023-01-16 16:59:59.457135,0.0,neugebau@cmti001#1,HelloWorld,0.4,,
4,18758595,finished,,job_MPIE,/job_MPIE,/cmmc/u/,neugebau/myjupyternotebooks/pyiron/Developments/workflow/Hello/,2023-01-16 16:59:59.539439,2023-01-16 16:59:59.680603,0.0,neugebau@cmti001#1,HelloWorld,0.4,,


Note: Since we did not create a new project, the previous jobs are included as well.

### Analyze the jobs using pyiron_tables

#### Define functions to extract the relevant info from job

In [29]:
def get_job_name(job):
    """Return the ``name`` attribute of ``job``."""
    job_name = job.name
    return job_name

def get_name(job):
    """Return the ``'name'`` entry stored under ``'storage/input'`` of ``job``."""
    stored_input = job['storage/input']
    return stored_input['name']

**TODO**
- should work (pyiron native formulation)     

      job['input/name']    

#### Create table

In [30]:
# Create the table job (the original assigned `table` twice in one statement).
table = pr.create_table(delete_existing_job=True)

# Register the extractor functions under the keys 'job_name' and 'name'.
table.add['job_name'] = get_job_name
table.add['name'] = get_name

In [31]:
table.run(delete_existing_job=True)

The job table was saved and received the ID: 18758596


Loading and filtering jobs:   0%|          | 0/6 [00:00<?, ?it/s]

Processing jobs:   0%|          | 0/4 [00:00<?, ?it/s]

#### Get pandas dataframe

TODO: 
- Make nice repr for table (i.e., table should provide pandas like output)

In [33]:
table

{'groups': ['input', 'output'], 'nodes': ['HDF_VERSION', 'NAME', 'TYPE', 'VERSION', 'job_id', 'server', 'status']}

In [32]:
table.get_dataframe()

Unnamed: 0,job_id,job_name,name
0,18758578,job,CM
1,18758593,job_pyiron,pyiron
2,18758594,job_CM,CM
3,18758595,job_MPIE,MPIE


TODO: 
- Make the following 'pyironic', i.e., show the content as in *job.input*

In [34]:
job['input']

{'groups': [], 'nodes': ['generic_dict']}

In [35]:
job.input

## Test limits of pyiron data container

#### Store objects

In [36]:
from pyiron_atomistics import Project as ProjectAtomistic



In [37]:
pr_atomistic = ProjectAtomistic('test2')

Al = pr_atomistic.create.structure.bulk('Al')

In [38]:
# Store a structure object (instead of a string) in the job input.
job = pr.create_job(
    job_type=HelloWorld,
    job_name='job_struct',
)
# job.input.name = 'CM'
job.input.name = Al

In [39]:
job.input.name

Al: [0. 0. 0.]
pbc: [ True  True  True]
cell: 
Cell([[0.0, 2.025, 2.025], [2.025, 0.0, 2.025], [2.025, 2.025, 0.0]])

In [40]:
job.run()

The job job_struct was saved and received the ID: 18758599


In [42]:
job = pr.load('job_struct')
job.input.name

Al: [0. 0. 0.]
pbc: [ True  True  True]
cell: 
Cell([[0.0, 2.025, 2.025], [2.025, 0.0, 2.025], [2.025, 2.025, 0.0]])

In [44]:
job.output.greetings

'Hello job_struct'

#### Include links (e.g. to other pyiron objects such as jobs)

In [None]:
# NOTE(review): this cell has no execution count and pr.load() is called
# without a job specifier -- presumably a sketch of the intended feature
# rather than runnable code; confirm before executing.
job.input.ref_job = pr.load()

TODO: 
- add a property to the pyiron object that allows it to be loaded
- this could be e.g. the job ID + database identifier
- e.g.

      job.identifier  # or
      job.get_link()  # or
      job.uri         # uniform resource identifier
      
      pr.load(object.uri) # -> object

## Relation to ironflow

### Define a HelloWorld node in ironflow

In [45]:
# import sys
# sys.path.insert(0, '/cmmc/u/neugebau/git_projects/ironflow2/ironflow')
# sys.path

In [46]:
from ironflow.node_tools import Node, NodeInputBP, NodeOutputBP, dtypes, input_widgets


class HellowWorld(Node):
    """ironflow node that turns a name into a greeting.

    Input port ``name`` (string) is mapped to the output port ``greetings``.

    NOTE(review): the spelling ``HellowWorld`` is kept unchanged; renaming it
    to ``HelloWorld`` would shadow the pyiron job class of that name defined
    earlier in this notebook.
    """

    title = "HelloWorldNode"
    init_inputs = [
        # A string port gets a string default (the original used the integer 1).
        NodeInputBP(dtype=dtypes.String(default=""), label="name")
    ]
    init_outputs = [
        # NOTE(review): `storage_priority` may be a proposed rather than an
        # existing NodeOutputBP argument -- confirm against ironflow.
        NodeOutputBP(label="greetings", storage_priority=10)
    ]
    color = 'cyan'

    def node_function(self, name):
        """Return the greeting for ``name`` under the ``greetings`` output label."""
        return {'greetings': f'Hello {name}'}


# Register the class defined above (the original passed the undefined name My_Node).
# NOTE(review): `gui` is not defined anywhere in this notebook; presumably an
# ironflow GUI instance has to be created first -- confirm.
gui.register_node(HellowWorld)

ModuleNotFoundError: No module named 'ironflow.node_tools'

Notes:
- Close similarity to pyiron job
- input, output, run description
    - identical to a generic python function (o1, o2, ... = func(i1, i2, ...))
    - pyiron syntax appears a bit easier and more intuitive
- dtype definition not yet in pyiron 
     - will be introduced with ontology
     - needed mainly to select suitable gui for input
- project and name have no equivalent in ironflow nodes
    - script/macronode could be used to provide this info and to inherit it to all nodes running in this script
    - when running the same script twice with different input a new name/project is required
        - script becomes the new job
        - node names (for logging/hdf5, not the name of the node) could be automatically constructed
        - names could be semantically enriched by automatically using the name and value of the changed parameter(s)
        - node output should contain a storage label (priority, opt in/out for specific parameters)
            - give users the option to enable storage policies
- being able to register nodes is a key feature of ironflow
     - should be extended to url/uri (store node in database that provides a link to extract/load node)
     - could be a pyironObject
- TODO: make pyironObjects native nodes

## General notes

- Presently the project contains only the info regarding the jobs
- Workflow info is only available in the jupyter notebook
    - This can be modified, deleted etc.!
    - Project, jobs have no info regarding their creating notebook, python script etc.
- ironflow provides the necessary infrastructure    