# Using MLRun with MpiJobs (Horovod)

In [1]:
from mlrun import new_function
from mlrun.platforms import mount_v3io

In [2]:
# Training-script file names. Each starts with a slash so it can be appended
# directly to base_dir to form a full path.
CPU_HOROVOD_FILE = "/horovod_train_cats_n_dogs-cpu.py"
GPU_HOROVOD_FILE = "/horovod_train_cats_n_dogs.py"

# Name for the Horovod job (not referenced by the cells visible in this notebook section).
HOROVOD_JOB_NAME = 'horovod-cats-n-dogs'

# Root directory used as the prefix for the training script and dataset paths.
base_dir = "/User/horovod"

## Create an Mpi Job Runner
Set the job image, command, and args, then add a v3io (Iguazio) volume mount.

In [11]:
# Container image for the job (CPU Horovod build, per its name).
image = 'zilbermanor/horovod_cpu:0.2'

# Build the MpiJob function:
#   command - 'mpijob://' followed by the full path of the CPU training script
#   args    - dataset directory and base directory, passed to the script
#   mode    - 'noctx'; NOTE(review): presumably runs the script without an
#             mlrun context wrapper - confirm against the mlrun docs
fn = new_function(
    command='mpijob://' + (base_dir + CPU_HOROVOD_FILE),
    args=[base_dir + '/cats_and_dogs_filtered', base_dir],
    image=image,
    mode='noctx',
)

# Attach the v3io volume mount to the function.
fn.apply(mount_v3io())

## Initiate a new job

In [12]:
# Submit the function as a run named 'ml' and keep the returned object in `run`.
# In the recorded output below, this creates an MpiJob and the call returns
# while the run status is still 'created'.
run = fn.run(name='ml')

[mlrun] 2019-09-16 05:18:53,947 starting run ml uid=31adb1ac3b7544f59dc10db3ac5617a7
[mlrun] 2019-09-16 05:18:53,957 using in-cluster config.
[mlrun] 2019-09-16 05:18:53,971 MpiJob ml-d3381f46 created
[mlrun] 2019-09-16 05:18:53,971 use runner.watch(ml-d3381f46) to see logs
[mlrun] 2019-09-16 05:18:57,985 MpiJob ml-d3381f46 state=Active
[mlrun] 2019-09-16 05:18:57,997 MpiJob ml-d3381f46 launcher pod ml-d3381f46-launcher-2lng2 state Pending


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...5617a7,0,,created,ml,kind=mpijobowner=iguaziomlrun/job=ml-d3381f46,,,,


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid 31adb1ac3b7544f59dc10db3ac5617a7 
[mlrun] 2019-09-16 05:18:58,009 run executed, status=created


## List Running MpiJobs and their Pods

In [13]:
# List MpiJobs and keep the result in `jobs`. With show=True the recorded
# output is a status/name/start/end table (presumably printed by this flag -
# confirm against the mlrun docs).
jobs = fn.list_jobs(show=True)

status     name                 start                 end
Succeeded  horovod-cats-n-dogs  2019-09-04T17:06:39Z  2019-09-04T17:46:32Z
Failed     ml-15fdf246          2019-09-16T05:12:51Z  
Succeeded  ml-1765d58d          2019-09-12T13:50:50Z  2019-09-12T13:50:54Z
Succeeded  ml-1a485165          2019-09-12T13:07:49Z  2019-09-12T13:07:53Z
           ml-1e40907a                                
Succeeded  ml-8bb61559          2019-09-12T13:43:44Z  2019-09-12T13:43:48Z
Succeeded  ml-9c2dcc02          2019-09-12T13:41:55Z  2019-09-12T13:41:59Z
           ml-c1cf67a2                                
Succeeded  ml-c7e33e46          2019-09-12T13:58:46Z  2019-09-12T13:58:50Z
Active     ml-d3381f46          2019-09-16T05:18:57Z  
Succeeded  ml-fa82c521          2019-09-12T13:46:54Z  2019-09-12T13:46:58Z


## Watch Job logs

In [None]:
# Follow the logs of one MpiJob by name.
# NOTE(review): the name is hard-coded from the recorded run ("MpiJob
# ml-d3381f46 created"); it looks run-specific, so replace it with the name
# printed by your own run before executing this cell.
fn.watch('ml-d3381f46')

[mlrun] 2019-09-16 05:19:21,998 watching pod ml-d3381f46-launcher-2lng2, status = Running

+ POD_NAME=ml-d3381f46-worker-0
+ shift
+ /opt/kube/kubectl exec ml-d3381f46-worker-0 -- /bin/sh -c     PATH=/usr/local/bin:$PATH ; export PATH ; LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; DYLD_LIBRARY_PATH=/usr/local/lib:$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ;   /usr/local/bin/orted -mca ess "env" -mca ess_base_jobid "449904640" -mca ess_base_vpid 1 -mca ess_base_num_procs "2" -mca orte_node_regex "ml-d[4:3381]f46-launcher-2lng2,ml-d[4:3381]f46-worker-0@0(2)" -mca orte_hnp_uri "449904640.0;tcp://10.233.92.124:54214" -mca plm "rsh" --tree-spawn -mca orte_parent_uri "449904640.0;tcp://10.233.92.124:54214" -mca plm_rsh_agent "/etc/mpi/kubexec.sh" -mca orte_default_hostfile "/etc/mpi/hostfile" -mca pmix "^s1,s2,cray,isolated"
Using TensorFlow backend.
2019-09-16 05:19:03.852000: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instruction

## Delete a Job

In [None]:
# Delete an MpiJob by name.
# NOTE(review): 'mpij-77afc965' is hard-coded and does not appear in the job
# listing recorded earlier in this notebook - replace it with the name of an
# existing job before executing this cell.
fn.delete_job('mpij-77afc965')