# CloudEdge DataEngineer (Inference Stage)

**Inference Scenarios**

| scenarios | reference app | framework | model/dataset |
| ---- | ---- | ---- | ---- |
| batch-inference-workflow | [scenarios/job-pipeline](https://github.com/peiniliu/inference/tree/dev/vision/classification_and_detection/scenarios/job-pipeline) | tensorflow | resnet/dumy |

## Architecture

Make sure the environment variables listed in the next cell are set in your session with the proper values. All of them are mandatory except the following, which are only needed in the cases described:
- `DOCKER_REGISTRY`: if you plan to push the images to a private registry
- `DOCKER_TAG`: if you don't want to leave the default `latest` tag
- `DOCKER_REGISTRY_USERNAME`: if your private registry requires authentication
- `DOCKER_REGISTRY_PASSWORD`: if your private registry requires authentication

In [None]:
# Only for debug purposes, don't leave them enabled in the repository!!!
# %env WORKDIR=/root/cloudskin/data-connector
# %env REACTIVE_MIGRATION_DATAENGINEER_APP_DIR=examples/cloudedge-reactive-migration/dataengineer
# %env SCANFLOW_SERVER_URI=http://10.0.26.8:32766
# %env SCANFLOW_TRACKER_URI=http://10.0.26.8:32766
# %env MLFLOW_S3_ENDPOINT_URL=http://10.0.26.8:32910
# %env AWS_ACCESS_KEY_ID=admin
# %env AWS_SECRET_ACCESS_KEY=scanflow123
# %env DOCKER_REGISTRY=registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry
# If you use invalid characters for a tag, Scanflow will replace them with '-'
# %env DOCKER_TAG=feat/reactive-migration
# %env DOCKER_REGISTRY_USERNAME=data-connector
# %env DOCKER_REGISTRY_PASSWORD=fake-password

In [2]:
import sys
import os
sys.path.insert(0,'../..')

from scanflow.client import ScanflowClient
from scanflow.client import ScanflowTrackerClient
from scanflow.client import ScanflowDeployerClient

In [None]:
from scanflow.tools import env

# Echo the effective configuration so a missing or wrong variable is spotted
# before any Scanflow object is created. One value is printed per line, in
# the order listed below.
# AWS_SECRET_ACCESS_KEY is masked: cell outputs are stored inside the notebook
# file, so printing the raw secret would leak it to anyone who gets the file.
for env_var_name in (
    "SCANFLOW_SERVER_URI",
    "SCANFLOW_TRACKER_URI",
    "MLFLOW_S3_ENDPOINT_URL",
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
    "DOCKER_REGISTRY",
    "DOCKER_TAG",
):
    env_var_value = env.get_env(env_var_name)
    if env_var_name == "AWS_SECRET_ACCESS_KEY" and env_var_value:
        # Still tells the reader whether the secret is set, without showing it.
        env_var_value = "********"
    print(env_var_value)

In [None]:
# App folder: WORKDIR joined with REACTIVE_MIGRATION_DATAENGINEER_APP_DIR.
# Must point to the folder including all 'dataengineer' and 'datascience'
# folders for cloudedge-reactive-migration, located in
# examples/cloudedge-reactive-migration.
# NOTE(review): the commented-out sample value for
# REACTIVE_MIGRATION_DATAENGINEER_APP_DIR in the debug cell ends in
# '/dataengineer' -- confirm which folder level is actually expected.
workdir = env.get_env('WORKDIR')
relative_app_dir = env.get_env('REACTIVE_MIGRATION_DATAENGINEER_APP_DIR')
app_dir = os.path.join(workdir, relative_app_dir)
print(app_dir)

# Identity shared by every Scanflow object created in this notebook
app_name, team_name = "cloudedge-reactive-migration", "dataengineer"

# Initialize the Scanflow Client.
# No server URI is passed explicitly; per the original note, that is not
# needed when "SCANFLOW_SERVER_URI" is defined in the environment.
client = ScanflowClient(registry=env.get_env("DOCKER_REGISTRY"), verbose=True)

## Batch-inference-graph for prediction

### Predictor

In [5]:
# Predictor pipeline. Planned stages:
#   1. Data retrieval from Prometheus (the only executor defined so far)
#   2. Data pre-processing
#   3. QoS Predictor
data_retrieval_parameters = {
    'app_name': app_name,
    'team_name': team_name,
    # Config file already included in the Docker image
    'promcsv_config': "/app/data-retrieval/promql_queries.json",
}
executor1 = client.ScanflowExecutor(
    name="data-retrieval",
    mainfile="data-retrieval.py",
    dockerfile="Dockerfile_data_retrieval_no_buildkit",
    parameters=data_retrieval_parameters,
)

# Stage dependencies
# TODO: define them once other stages have been developed
workflow_nodes = [executor1]
workflow_edges = []

# Predictor workflow: batch-inference-reactive-graph
# TODO: add missing executors and dependencies
# The cron expression below reads as "every 5 minutes" in standard cron syntax.
workflow1 = client.ScanflowWorkflow(
    name="batch-inference-reactive-graph",
    nodes=workflow_nodes,
    edges=workflow_edges,
    type="batch",
    cron="*/5 * * * *",
    output_dir="/workflow",
)

### Planner

In [6]:
# Planner agent: a single custom sensor, `reactive_watch_qos`, fired by an
# interval trigger configured with minutes=5.
trigger = client.ScanflowAgentSensor_IntervalTrigger(minutes=5)

# Extra keyword arguments handed to the sensor.
# NOTE(review): 300 is presumably seconds, matching the 5-minute trigger --
# confirm against the `reactive_watch_qos` implementation.
sensor_kwargs = {'frequency': 300}

sensor = client.ScanflowAgentSensor(
    name="reactive_watch_qos",
    isCustom=True,
    func_name="reactive_watch_qos",
    trigger=trigger,
    kwargs=sensor_kwargs,
)

planner = client.ScanflowAgent(name="planner", template="planner", sensors=[sensor])

### Compose the Scanflow Application

In [7]:
# Bundle the predictor workflow and the planner agent into one Scanflow
# application, identified by the app/team names and rooted at app_dir.
application_spec = dict(
    app_name=app_name,
    app_dir=app_dir,
    team_name=team_name,
    workflows=[workflow1],
    agents=[planner],
)
app = client.ScanflowApplication(**application_spec)

### DEBUG: show application config

In [8]:
#app.to_dict()

### Build the Scanflow Application
- This step builds the Docker images for all the Scanflow executors and uploads them to the container registry (currently hardcoded in the `scanflow` module)

In [None]:
# Scanflow Tracker port handed to the build step
tracker_port = 32766

# Build the application defined above; the result is kept in `build_app`.
build_app = client.build_ScanflowApplication(app=app, trackerPort=tracker_port)

### DEBUG: show built application config

In [10]:
#build_app.to_dict()