# mpi manager - granularity plugin


- *Steps*
    1. Import Scanflow and check the local environment
    2. Develop scanflow application (mpi workloads)
    3. Build scanflow application
    4. Deploy scanflow environment (namespace, agents)
    5. ****[RUN MPI Workloads]****
        1. mpi job (volcano job)
    6. Clean environment

## Steps
### Step1: Import Scanflow and check the local environment
1. import scanflow
    - For defining and building scanflow application, we need to import ScanflowClient
    - For deploying scanflow application, we need to import ScanflowDeployerClient
2. check local environment
    - For deploying scanflow application
        - If the user runs the notebook locally and has permission to submit objects to Kubernetes, "SCANFLOW_SERVER_URI" does not need to be configured.
        - If the user runs the notebook inside a Kubernetes pod, or the local user does not have permission to connect to Kubernetes, "SCANFLOW_SERVER_URI" must be configured.
    - For saving deliverables, we need to set the URL of Scanflow-tracker in "SCANFLOW_TRACKER_URI" and the URL of Scanflow-local-tracker in "SCANFLOW_TRACKER_LOCAL_URI"
    - If Scanflow-tracker uses S3 artifact storage, we need to set the S3 URL in "MLFLOW_S3_ENDPOINT_URL", the username in "AWS_ACCESS_KEY_ID" and the password in "AWS_SECRET_ACCESS_KEY"

In [1]:
import sys
import os
sys.path.insert(0,'../..')

import scanflow
from scanflow.client import ScanflowClient
from scanflow.client import ScanflowDeployerClient

In [2]:
# Print the connection settings this notebook will use, read through
# scanflow's env helper.
from scanflow.tools import env
print(env.get_env("SCANFLOW_SERVER_URI"))
print(env.get_env("SCANFLOW_TRACKER_URI"))
# The local tracker URI is not printed in this run.
#print(env.get_env("SCANFLOW_TRACKER_LOCAL_URI"))
print(env.get_env("MLFLOW_S3_ENDPOINT_URL"))
print(env.get_env("AWS_ACCESS_KEY_ID"))
# NOTE(review): this echoes the S3 secret key in clear text into the saved
# notebook output -- consider masking it before sharing the notebook.
print(env.get_env("AWS_SECRET_ACCESS_KEY"))

http://172.30.0.50:46666
http://172.30.0.50:46667
http://172.30.0.50:43447
admin
admin123


### Step2: Develop scanflow application

  1. develop component (mainfile.yaml)
  2. define scanflow workflows (MPIWorkload, Workflow)
  3. define agents to supervise the workflows
  4. define scanflow application
  ```bash
     Application
        - List: Workflow
                - MPIWorkload
  ```

#### 2.1. Develop scanflow workflows

In [3]:
# Application folder and identity.
# NOTE(review): scanflow_path is an absolute path on the author's machine;
# change it to your own scanflow checkout before running.
scanflow_path = "/gpfs/bsc_home/xpliu/pv/jupyterhubpeini/scanflow"
app_dir = os.path.join(scanflow_path, "examples/mpi/dataengineer")
# app_name and team_name are reused when the application is defined and
# when workflows are run or deleted later in this notebook.
app_name = "mpi"
team_name = "dataengineer"

# Scanflow client used to define and build the application.
client = ScanflowClient(
              # If "SCANFLOW_SERVER_URI" is set in the environment, the
              # server URI does not need to be passed explicitly:
              #scanflow_server_uri="http://172.30.0.50:46666",
              verbose=True)

In [4]:
# MPI workload: the hpccfft job described by hpccfft.yaml, with the
# granularity plugin enabled.
# NOTE(review): nTasks/nNodes presumably mean 16 MPI tasks spread over 4
# nodes, and characteristic='memory' presumably marks the workload as
# memory-bound -- confirm against the ScanflowMPIWorkload documentation.
mpi1 = client.ScanflowMPIWorkload(name='hpccfft',
                                  mainfile='hpccfft.yaml',
                                  plugins=['granularity'],
                                  characteristic='memory',
                                  nTasks=16,
                                  nNodes=4,)
# Workflow of type 'mpi' containing the single workload defined above.
workflowhpccfft = client.ScanflowWorkflow(type='mpi',
                                       name='hpccfft',
                                       nodes=[mpi1],
                                       output_dir = "/home")

In [5]:
# #mpi workloads
# mpi2 = client.ScanflowMPIWorkload(name='hpccdgemm',
#                                   mainfile='hpccdgemm.yaml',
#                                   plugins=['granularity'],
#                                   characteristic='cpu',
#                                   nTasks=16,
#                                   nNodes=4,)
# #workflow
# workflowhpccdgemm = client.ScanflowWorkflow(type='mpi',
#                                        name='hpccdgemm',
#                                        nodes=[mpi2],
#                                        output_dir = "/home")

In [6]:
# Planner agent that is attached to the application below; it is created
# from the 'planner' template with the Dockerfile named 'Dockerfile_fortest'.
planner = client.ScanflowAgent(name='planner',
                              template='planner',
                              dockerfile='Dockerfile_fortest')

#### 2.3. Define scanflow application

In [7]:
# Assemble the application from the values defined above: its name, folder
# and team, the hpccfft workflow, and the planner agent.
app = client.ScanflowApplication(app_name = app_name,
                                 app_dir = app_dir,
                                 team_name = team_name,
                                 workflows=[workflowhpccfft],
                                 agents=[planner])

In [8]:
# Convert the application to a dict; in the recorded run this call also logs
# the workflow and agent definitions, where the workload 'body' is still None.
dic = app.to_dict()

03-Mar-22 17:30:33 -  INFO - workflow hpccfft: {'name': 'hpccfft', 'nodes': [{'name': 'hpccfft', 'node_type': 'mpi', 'mainfile': 'hpccfft.yaml', 'plugins': ['granularity'], 'characteristic': 'memory', 'nTasks': 16, 'nNodes': 4, 'nCpuPerTask': 1, 'masterName': 'mpimaster', 'workerName': 'mpiworker', 'oversubscribe': False, 'body': None}], 'edges': None, 'type': 'mpi', 'resources': None, 'affinity': None, 'kedaSpec': None, 'hpaSpec': None, 'output_dir': '/home'}
03-Mar-22 17:30:33 -  INFO - Scanflowagent-planner: {'name': 'planner', 'template': 'planner', 'sensors': None, 'dockerfile': 'Dockerfile_fortest', 'image': None}


  
### Step3: Build scanflow application (local)
   
  1. build images for Executor -> save to image registry



In [9]:
# Build the application; the recorded log shows the planner agent image being
# built. NOTE(review): trackerPort=46672 is presumably the port assigned to
# this application's tracker -- confirm.
build_app = client.build_ScanflowApplication(app = app, trackerPort=46672)

03-Mar-22 17:30:36 -  INFO - Building image 172.30.0.49:5000/planner-agent


In [10]:
# Inspect the built application: in the recorded output the workload 'body'
# now holds a Volcano Job manifest (apiVersion batch.volcano.sh/v1alpha1).
build_app.to_dict()

03-Mar-22 17:30:38 -  INFO - workflow hpccfft: {'name': 'hpccfft', 'nodes': [{'name': 'hpccfft', 'node_type': 'mpi', 'mainfile': 'hpccfft.yaml', 'plugins': ['granularity'], 'characteristic': 'memory', 'nTasks': 16, 'nNodes': 4, 'nCpuPerTask': 1, 'masterName': 'mpimaster', 'workerName': 'mpiworker', 'oversubscribe': False, 'body': {'apiVersion': 'batch.volcano.sh/v1alpha1', 'kind': 'Job', 'metadata': {'name': 'hpccfft'}, 'spec': {'schedulerName': 'volcano', 'plugins': {'ssh': [], 'svc': []}, 'tasks': [{'replicas': 1, 'name': 'mpimaster', 'policies': [{'event': 'TaskCompleted', 'action': 'CompleteJob'}], 'template': {'spec': {'containers': [{'command': ['/bin/sh', '-c', 'MPI_HOST="/etc/volcano/mpi/HOSTFILE";\necho `cat ${MPI_HOST}`\nmkdir -p /var/run/sshd; /usr/sbin/sshd;\ncp /opt/results/hpccinf.txt /home;\nsleep 60;\nmpirun --allow-run-as-root --hostfile ${MPI_HOST} --nooversubscribe --mca mpi_yield_when_idle 0 --mca btl ^openib,uct --mca pml ucx --display-devel-map --display-allocatio

{'app_name': 'mpi',
 'app_dir': '/gpfs/bsc_home/xpliu/pv/jupyterhubpeini/scanflow/examples/mpi/dataengineer',
 'team_name': 'dataengineer',
 'workflows': [{'name': 'hpccfft',
   'nodes': [{'name': 'hpccfft',
     'node_type': 'mpi',
     'mainfile': 'hpccfft.yaml',
     'plugins': ['granularity'],
     'characteristic': 'memory',
     'nTasks': 16,
     'nNodes': 4,
     'nCpuPerTask': 1,
     'masterName': 'mpimaster',
     'workerName': 'mpiworker',
     'oversubscribe': False,
     'body': {'apiVersion': 'batch.volcano.sh/v1alpha1',
      'kind': 'Job',
      'metadata': {'name': 'hpccfft'},
      'spec': {'schedulerName': 'volcano',
       'plugins': {'ssh': [], 'svc': []},
       'tasks': [{'replicas': 1,
         'name': 'mpimaster',
         'policies': [{'event': 'TaskCompleted', 'action': 'CompleteJob'}],
         'template': {'spec': {'containers': [{'command': ['/bin/sh',
              '-c',
              'MPI_HOST="/etc/volcano/mpi/HOSTFILE";\necho `cat ${MPI_HOST}`\nmkdir 

### Step4: Deploy scanflow environment (local/incluster)
  
  1. Create k8s environment
        - create namespace
        - create RBAC, secret, configmap, PV, PVC
        
  2. Deploy scanflow-local-tracker (deployment, service)

       

In [15]:
# Deployer client for a local user with the volcano deployer; the recorded
# log shows the Kubernetes configuration being loaded from k8s_config_file.
# NOTE(review): the kubeconfig path is specific to the author's machine.
deployerClient = ScanflowDeployerClient(user_type="local",
                                        deployer="volcano",
                                        k8s_config_file="/gpfs/bsc_home/xpliu/.kube/config")

03-Mar-22 17:31:45 -  INFO - loading kubernetes configuration from /gpfs/bsc_home/xpliu/.kube/config
03-Mar-22 17:31:45 -  INFO - found local kubernetes configuration


In [14]:
# Create the environment for the built application; the recorded log shows
# the namespace, role binding, S3 secret and configmaps being created.
# Top-level await works here because the cell runs inside a notebook.
await deployerClient.create_environment(app=build_app)

03-Mar-22 16:41:45 -  INFO - [++]Creating env
03-Mar-22 16:41:45 -  INFO - [++]Creating namespace "scanflow-mpi-dataengineer"
03-Mar-22 16:41:45 -  INFO - create_namespace true
03-Mar-22 16:41:45 -  INFO - [++]Creating Role for 'default service account'
03-Mar-22 16:41:45 -  INFO - create_rolebinding info
03-Mar-22 16:41:45 -  INFO - [++]Creating s3 secret {'AWS_ACCESS_KEY_ID': 'admin', 'AWS_SECRET_ACCESS_KEY': 'admin123', 'MLFLOW_S3_ENDPOINT_URL': 'http://minio.minio-system.svc.cluster.local:9000', 'AWS_ENDPOINT_URL': 'http://minio.minio-system.svc.cluster.local:9000'}
03-Mar-22 16:41:45 -  INFO - create_secret true
03-Mar-22 16:41:45 -  INFO - [++]Creating tracker configmap {'TRACKER_STORAGE': 'postgresql://scanflow:scanflow123@postgresql-service.postgresql.svc.cluster.local/scanflow-mpi-dataengineer', 'TRACKER_ARTIFACT': 's3://scanflow/scanflow-mpi-dataengineer'}
03-Mar-22 16:41:45 -  INFO - create_configmap true
03-Mar-22 16:41:45 -  INFO - [++]Creating client configmap {'SCANFLOW_

### Step5: ****[RUN MPI Workloads]****

In [11]:
# Second deployer client with user_type="autoconfig", pointed at the
# autoconfig server's /sensors endpoint; it is used below to run the workflow.
# NOTE(review): the server URI and kubeconfig path are specific to the
# author's setup.
deployerClient1 = ScanflowDeployerClient(user_type="autoconfig",
                                        deployer="volcano",
                                        scanflow_autoconfig_server_uri = "http://172.30.0.50:35212/sensors",
                                        k8s_config_file="/gpfs/bsc_home/xpliu/.kube/config")

In [12]:
# Run the first (and only) workflow of the built application through the
# autoconfig client; the call returned True in the recorded run.
await deployerClient1.run_autoconfig_workflow(app_name='mpi', 
                                             team_name='dataengineer',
                                             workflow = build_app.workflows[0])

03-Mar-22 17:30:46 -  INFO - workflow hpccfft: {'name': 'hpccfft', 'nodes': [{'name': 'hpccfft', 'node_type': 'mpi', 'mainfile': 'hpccfft.yaml', 'plugins': ['granularity'], 'characteristic': 'memory', 'nTasks': 16, 'nNodes': 4, 'nCpuPerTask': 1, 'masterName': 'mpimaster', 'workerName': 'mpiworker', 'oversubscribe': False, 'body': {'apiVersion': 'batch.volcano.sh/v1alpha1', 'kind': 'Job', 'metadata': {'name': 'hpccfft'}, 'spec': {'schedulerName': 'volcano', 'plugins': {'ssh': [], 'svc': []}, 'tasks': [{'replicas': 1, 'name': 'mpimaster', 'policies': [{'event': 'TaskCompleted', 'action': 'CompleteJob'}], 'template': {'spec': {'containers': [{'command': ['/bin/sh', '-c', 'MPI_HOST="/etc/volcano/mpi/HOSTFILE";\necho `cat ${MPI_HOST}`\nmkdir -p /var/run/sshd; /usr/sbin/sshd;\ncp /opt/results/hpccinf.txt /home;\nsleep 60;\nmpirun --allow-run-as-root --hostfile ${MPI_HOST} --nooversubscribe --mca mpi_yield_when_idle 0 --mca btl ^openib,uct --mca pml ucx --display-devel-map --display-allocatio

True

In [16]:
# Delete the workflow again using the local deployer client; the recorded log
# shows its PVC and PV being deleted, and the call returned True.
await deployerClient.delete_workflow(app_name='mpi', 
                                  team_name='dataengineer',
                                  workflow = build_app.workflows[0])

03-Mar-22 17:31:49 -  INFO - delete_pvc true
03-Mar-22 17:31:49 -  INFO - delete_pv true


True


  
### Step6: Clean scanflow environment
  
  1. delete environment
  

In [12]:
# Tear down the environment of the built application. In the recorded log the
# agent, tracker, configmaps, secret, role binding and namespace are removed;
# the planner's delete_deployment/delete_service steps log errors in that run.
await deployerClient.clean_environment(app=build_app)

03-Mar-22 16:12:29 -  INFO - [++] Stopping agent: [planner].
03-Mar-22 16:12:29 -  ERROR - delete_deployment error
03-Mar-22 16:12:29 -  ERROR - delete_service error
03-Mar-22 16:12:29 -  INFO - [++] Stopping tracker: [scanflow-tracker].
03-Mar-22 16:12:29 -  INFO - delete_deployment true
03-Mar-22 16:12:29 -  INFO - delete_service true
03-Mar-22 16:12:29 -  INFO - [++]Delete tracker configmap scanflow-tracker-env
03-Mar-22 16:12:29 -  INFO - delete_configmap true
03-Mar-22 16:12:29 -  INFO - [++]Delete client configmap scanflow-client-env
03-Mar-22 16:12:29 -  INFO - delete_configmap true
03-Mar-22 16:12:29 -  INFO - [++]Delete s3 secret scanflow-secret
03-Mar-22 16:12:29 -  INFO - delete_secret true
03-Mar-22 16:12:29 -  INFO - [++]Delete rolebinding default-admin
03-Mar-22 16:12:29 -  INFO - delete_rolebinding info
03-Mar-22 16:12:29 -  INFO - [++]Delete namespace "scanflow-mpi-dataengineer"
03-Mar-22 16:12:29 -  INFO - delete_namespace true
03-Mar-22 16:12:29 -  INFO - delete_pvc t