# CloudEdge DataScience Team (Training)

## Architecture

Make sure to set these environment variables in your session with the proper values. All of them are mandatory except:
- `DOCKER_REGISTRY`: if you plan to push the images to a private registry
- `DOCKER_TAG`: if you don't want to leave the default `latest` tag
- `DOCKER_REGISTRY_USERNAME`: if your private registry requires authentication
- `DOCKER_REGISTRY_PASSWORD`: if your private registry requires authentication

In [1]:
# Only for debug purposes, don't leave them enable in the repository!!!
%env WORKDIR=/home/jolivera/Documents/CloudSkin/Scanflow/data-connector
%env KUBECONFIG_PATH=/home/jolivera/.kube/config_ncloud_socat
%env PROACTIVE_MIGRATION_DATASCIENCE_APP_DIR=examples/cloudedge-proactive-migration/datascience
%env SCANFLOW_SERVER_URI=http://84.88.189.179:32767
%env SCANFLOW_TRACKER_URI=http://84.88.189.179:32766
%env MLFLOW_S3_ENDPOINT_URL=http://84.88.189.179:32645
# PostgreSQL URI with credentials
%env SCANFLOW_TRACKER_STORAGE=postgresql://postgres:scanflow123@scanflow-postgres.scanflow-server.svc.cluster.local/scanflow-cloudedge-datascience
# MinIO API endpoint, not console!
%env AWS_ACCESS_KEY_ID=scanflow
%env AWS_SECRET_ACCESS_KEY=scanflow123
%env DOCKER_REGISTRY=registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry
# If you use invalid characters for a tag, Scanflow will replace them with '-'
%env DOCKER_TAG=feat/proactive-migration
%env DOCKER_REGISTRY_USERNAME=cloudskin-scanflow-builds
%env DOCKER_REGISTRY_PASSWORD=ii6c4bvSp58yzhckoyBA
%env SCANFLOW_APP_NAME=cloudedge-proactive-migration-experiment
%env SCANFLOW_TEAM_NAME=datascience
# NEARBYONE CONTROLLER VARIABLES
%env NBY_SERVICE_NAME=dlstreamer-pipeline-server
%env NBY_ORGANIZATION_ID=abcd-1234-uvxyz-9876
%env NBY_ENV_NAME=nearbyone.innovationlab
%env NBY_ENV_EMAIL=fake.username@example.com
%env NBY_ENV_PASSWORD=fake-password
# This is to avoid CI pipelines to deploy anything
%env LOCAL_DEPLOY=1

env: WORKDIR=/home/jolivera/Documents/CloudSkin/Scanflow/data-connector
env: KUBECONFIG_PATH=/home/jolivera/.kube/config_ncloud_socat
env: PROACTIVE_MIGRATION_DATASCIENCE_APP_DIR=examples/cloudedge-proactive-migration/datascience
env: SCANFLOW_SERVER_URI=http://84.88.189.179:32767
env: SCANFLOW_TRACKER_URI=http://84.88.189.179:32766
env: MLFLOW_S3_ENDPOINT_URL=http://84.88.189.179:32645
env: SCANFLOW_TRACKER_STORAGE=postgresql://postgres:scanflow123@scanflow-postgres.scanflow-server.svc.cluster.local/scanflow-cloudedge-datascience
env: AWS_ACCESS_KEY_ID=scanflow
env: AWS_SECRET_ACCESS_KEY=scanflow123
env: DOCKER_REGISTRY=registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry
env: DOCKER_TAG=feat/proactive-migration
env: DOCKER_REGISTRY_USERNAME=cloudskin-scanflow-builds
env: DOCKER_REGISTRY_PASSWORD=ii6c4bvSp58yzhckoyBA
env: SCANFLOW_APP_NAME=cloudedge-proactive-migration-experiment
env: SCANFLOW_TEAM_NAME=datascience
env: NBY_SERVICE_NAME=dlstreamer-pipeline

## Pre-run cleanup

Make sure that the experiment isn't already running by removing its namespace

In [2]:
import sys
import os

# Make sure "scanflow" path is added in available module paths
sys.path.insert(0,'../..')

Let's define some useful Kubernetes client functions:

In [3]:
from kubernetes import client
from kubernetes import config
from kubernetes.client.rest import ApiException
from kubernetes.stream import stream
from scanflow.tools import env
import tarfile
import os
from time import time, sleep
import yaml
import io
import pathlib


def delete_namespace_and_wait(client: client.CoreV1Api = None, namespace:str = None, timeout:int = 300):
    """
    Deletes a namespace and waits until its deletion is fully terminated.

    Parameters:
    - client: client.CoreV1Api - A Kubernetes API client; locally initialized if not provided
    - namespace: str - The name of the namespace to delete
    - timeout: int - Time to wait in seconds before giving up (default: 300)
    """
    if not client:
        client = client.CoreV1Api()
    
    try:
        # Delete the namespace
        client.delete_namespace(name=namespace)
        # Wait for the namespace to be completely deleted
        start_time = time()
        while True:
            try:
                # Try fetching the namespace, if it's still there
                response = client.read_namespace(name=namespace)
                print(f"Namespace '{namespace}' is still being deleted...")
            except ApiException as e:
                if e.status == 404:
                    # Namespace is deleted, exit loop
                    print(f"Namespace '{namespace}' has been successfully deleted.")
                    break
                else:
                    print(f"Error occurred: {e}")
                    raise
            # Check if timeout is reached
            if time() - start_time > timeout:
                print(f"Timeout reached: Namespace '{namespace}' still exists after {timeout} seconds.")
                break

            # Wait for some time before checking again
            sleep(5)
    
    except ApiException as e:
        print(f"Failed to delete namespace '{namespace}': {e}")


def deploy_pod_and_wait_for_completion(client: client.CoreV1Api = None, yaml_file: str = None, namespace: str = "default", timeout: int = 300) -> None:
    """
    Deploy a pod from a YAML manifest and wait until it reaches the Completed state.

    Parameters:
    - client: client.CoreV1Api - Kubernetes API client; locally initialized if not provided
    - namespace: str - Kubernetes namespace (default: "default")
    - timeout: int - Time to wait in seconds before giving up (default: 300)
    """

    if not client:
        client = client.CoreV1Api()

    # Load the YAML file
    if not yaml_file:
        print(f"Missing YAML file! Please make sure to provide a valid YAML file path")
        sys.exit(1)
    
    with open(yaml_file, 'r') as f:
        pod_manifest = yaml.safe_load(f)

    # Extract pod name from the manifest
    pod_name = pod_manifest['metadata']['name']

    try:
        # Create the pod
        print(f"Creating pod: {pod_name} in namespace: {namespace}")
        client.create_namespaced_pod(namespace=namespace, body=pod_manifest)

        # Wait for the pod to reach Completed state
        start_time = time()
        while True:
            try:
                # Get the pod's current status
                pod = client.read_namespaced_pod(name=pod_name, namespace=namespace)
                pod_phase = pod.status.phase
                print(f"Pod '{pod_name}' is currently in phase: {pod_phase}")
                
                if pod_phase == "Succeeded":
                    print(f"Pod '{pod_name}' has completed successfully (Succeeded).")
                    break
                elif pod_phase == "Failed":
                    print(f"Pod '{pod_name}' has failed.")
                    break
                
            except ApiException as e:
                print(f"Error fetching pod status: {e}")
                raise

            # Check if timeout is reached
            if time() - start_time > timeout:
                print(f"Timeout reached: Pod '{pod_name}' is not in Completed state after {timeout} seconds.")
                break

            # Wait for a few seconds before checking again
            sleep(5)

    except ApiException as e:
        print(f"Failed to create pod '{pod_name}': {e}")


def deploy_pod_and_wait(client: client.CoreV1Api = None, yaml_file: str = None, namespace: str = "default", timeout: int = 300) -> None:
    """
    Deploys a pod using a YAML manifest and waits until its state is 'Running'.

    Parameters:
    - client: client.CoreV1Api - Kubernetes API client; locally initialized if not provided
    - yaml_file: str - Path to the YAML file containing the pod manifest.
    - namespace: str - Kubernetes namespace (default: 'default').
    - timeout: int - Time to wait in seconds before timing out (default: 300).
    """

    # Load YAML file
    with open(yaml_file, 'r') as f:
        pod_manifest = yaml.safe_load(f)

    # Extract pod name from manifest
    pod_name = pod_manifest['metadata']['name']
    
    try:
        # Create the pod
        print(f"Creating pod: {pod_name} in namespace: {namespace}")
        client.create_namespaced_pod(namespace=namespace, body=pod_manifest)

        # Wait for the pod to reach Running state
        start_time = time()
        while True:
            try:
                # Get the pod's current status
                pod = client.read_namespaced_pod(name=pod_name, namespace=namespace)
                pod_phase = pod.status.phase
                print(f"Pod '{pod_name}' is currently in phase: {pod_phase}")
                
                if pod_phase == "Running":
                    print(f"Pod '{pod_name}' is now in Running state.")
                    break
                elif pod_phase == "Failed":
                    print(f"Pod '{pod_name}' has failed to start.")
                    break
                
            except ApiException as e:
                print(f"Error fetching pod status: {e}")
                raise

            # Check if timeout is reached
            if time() - start_time > timeout:
                print(f"Timeout reached: Pod '{pod_name}' is not in Running state after {timeout} seconds.")
                break

            # Wait for a few seconds before checking again
            sleep(5)

    except ApiException as e:
        print(f"Failed to create pod '{pod_name}': {e}")


def check_if_object_exists_and_ready(
    client: client.CoreV1Api = None, # Kubernetes API client; locally initialized if not provided
    object_type: str = "namespace",  # Type of Kubernetes object: 'namespace' or 'persistentVolumeClaim'
    name: str = "default",  # Name of the Kubernetes object (namespace or PVC)
    namespace: str = None  # Namespace where the object is located (only for PVC)
) -> bool:
    """
    Checks if a Kubernetes object (namespace or persistentVolumeClaim) exists and is ready.

    Parameters:
    - object_type: str - Type of Kubernetes object ('namespace' or 'persistentVolumeClaim').
    - name: str - Name of the Kubernetes object.
    - namespace: str - Namespace where the object is located (only relevant for PVCs).

    Returns:
    - bool: True if the object exists and is ready, False otherwise.
    """
    

    # Initialize API clients
    if not client:
        client = client.CoreV1Api()

    try:
        if object_type == "namespace":
            # Check if the namespace exists
            print(f"Checking if namespace '{name}' exists...")
            namespace_obj = client.read_namespace(name=name)
            if namespace_obj.status.phase == "Active":
                print(f"Namespace '{name}' exists and is Active.")
                return True
            else:
                print(f"Namespace '{name}' is not Active.")
                return False

        elif object_type == "persistentVolumeClaim":
            if namespace is None:
                raise ValueError("Namespace must be specified for persistentVolumeClaim check.")

            # Check if the PVC exists and is bound
            print(f"Checking if persistentVolumeClaim '{name}' exists in namespace '{namespace}'...")
            pvc_obj = client.read_namespaced_persistent_volume_claim(name=name, namespace=namespace)
            if pvc_obj.status.phase == "Bound":
                print(f"PersistentVolumeClaim '{name}' is Bound and ready.")
                return True
            else:
                print(f"PersistentVolumeClaim '{name}' is not in Bound state.")
                return False

        else:
            raise ValueError(f"Unsupported object type '{object_type}'. Use 'namespace' or 'persistentVolumeClaim'.")

    except ApiException as e:
        if e.status == 404:
            print(f"{object_type.capitalize()} '{name}' not found.")
        else:
            print(f"Error fetching {object_type} status: {e}")
        return False


def copy_local_path_to_pod(client: client.CoreV1Api, namespace: str, pod_name: str, local_path: pathlib.Path, dest_path: str, exclude_paths: list = []):
    """
    Transfer the content of a local path to the desired pod
    """
    import re, os

    # Define the pattern to exclude undesired paths to transfer
    pattern = '.*(?:% s)' % '|'.join(exclude_paths)

    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode='w:tar') as tar:
        tar.add(
            local_path,
            arcname=pathlib.Path(dest_path).joinpath(local_path.name),
            filter=lambda x: None if re.match(pattern, x.name) else x
        )
    
    commands = [buf.getvalue()]

    # Copying file
    exec_command = ['tar', 'xvf', '-', '-C', '/']
    resp = stream(client.connect_get_namespaced_pod_exec, pod_name, namespace,
                         command=exec_command,
                         stderr=True, stdin=True,
                         stdout=True, tty=False,
                         _preload_content=False)

    while resp.is_open():
        resp.update(timeout=1)
        if resp.peek_stdout():
            print(f"STDOUT: {resp.read_stdout()}")
        if resp.peek_stderr():
            print(f"STDERR: {resp.read_stderr()}")
        if commands:
            c = commands.pop(0)
            resp.write_stdin(c)
        else:
            break
    resp.close()

Remove the experiment namespace if it exists in the Kubernetes cluster:
- Wait for its proper termination before proceeding

In [4]:
# Initialize kube config and client
# DEBUG: show the kubeconfig path due to GitHub CI issues
print(env.get_env("KUBECONFIG_PATH"))
try:
    config.load_kube_config(config_file=env.get_env("KUBECONFIG_PATH"))
except Exception as e:
    raise Exception(f"Something wrong with kubeconfig at {env.get_env('KUBECONFIG_PATH'): {e}}")

kube_client = client.CoreV1Api()

# Look for all available namespaces
namespaces = kube_client.list_namespace()
# Compose the expected namespace that Scanflow creates based on app_name and team_name
environment_namespace = f"scanflow-{env.get_env('SCANFLOW_APP_NAME')}-{env.get_env('SCANFLOW_TEAM_NAME')}"

# Remove the namespace if it exists
for namespace in namespaces.items:
    if environment_namespace == namespace.metadata.name:
        delete_namespace_and_wait(client=kube_client, namespace=environment_namespace)

/home/jolivera/.kube/config_ncloud_socat
Namespace 'scanflow-cloudedge-proactive-migration-experiment-datascience' is still being deleted...




Namespace 'scanflow-cloudedge-proactive-migration-experiment-datascience' is still being deleted...




Namespace 'scanflow-cloudedge-proactive-migration-experiment-datascience' is still being deleted...




Namespace 'scanflow-cloudedge-proactive-migration-experiment-datascience' is still being deleted...




Namespace 'scanflow-cloudedge-proactive-migration-experiment-datascience' is still being deleted...




Namespace 'scanflow-cloudedge-proactive-migration-experiment-datascience' is still being deleted...




Namespace 'scanflow-cloudedge-proactive-migration-experiment-datascience' is still being deleted...




Namespace 'scanflow-cloudedge-proactive-migration-experiment-datascience' is still being deleted...
Namespace 'scanflow-cloudedge-proactive-migration-experiment-datascience' has been successfully deleted.




Remove any experiment's pre-built docker image as they prevent fresh builds if the `repository:tag` is found

In [5]:
import docker
docker_client=docker.DockerClient()

In [6]:
# Also remove any pre-built docker image
import docker

repository_prefix = f"{env.get_env('DOCKER_REGISTRY')}/{env.get_env('SCANFLOW_APP_NAME')}-{env.get_env('SCANFLOW_TEAM_NAME')}"

docker_client = docker.DockerClient()

# - First remove any unused container
print("Purging containers...")
docker_client.containers.prune()

# - Then prune any image that matches the repository_prefix
print(f"Purging docker tags starting with {repository_prefix}...")
for docker_image in docker_client.images.list():
    print(docker_image)
    for tag in docker_image.tags:
        print(f"Image: {docker_image} with tag {tag}")
        if tag.startswith(repository_prefix):
            print(f"Removing {docker_image} with tag {tag}.")
            docker_client.images.remove(tag)
            break
        else:
            print("No images to remove.")

Purging containers...
Purging docker tags starting with registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry/cloudedge-proactive-migration-experiment-datascience...
<Image: 'registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry/cloudedge-proactive-migration-experiment-datascience-proactive-training-datascience-modeling:feat-proactive-migration'>
Image: <Image: 'registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry/cloudedge-proactive-migration-experiment-datascience-proactive-training-datascience-modeling:feat-proactive-migration'> with tag registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry/cloudedge-proactive-migration-experiment-datascience-proactive-training-datascience-modeling:feat-proactive-migration
Removing <Image: 'registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry/cloudedge-proactive-migration-experiment-datascience-proactive-train

## ScanflowClient initialization

In [7]:
import scanflow
from scanflow.client import ScanflowClient
# from scanflow.client import ScanflowTrackerClient
from scanflow.client import ScanflowDeployerClient

### Debug: available environment variables

In [8]:
print(env.get_env("SCANFLOW_SERVER_URI"))
print(env.get_env("SCANFLOW_TRACKER_URI"))
#print(env.get_env("SCANFLOW_TRACKER_LOCAL_URI"))
print(env.get_env("MLFLOW_S3_ENDPOINT_URL"))
print(env.get_env("AWS_ACCESS_KEY_ID"))
print(env.get_env("AWS_SECRET_ACCESS_KEY"))
print(env.get_env("DOCKER_REGISTRY"))
print(env.get_env("DOCKER_TAG"))

http://84.88.189.179:32767
http://84.88.189.179:32766
http://84.88.189.179:32645
scanflow
scanflow123
registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry
feat/proactive-migration


Initialize the ScanflowClient

In [9]:
# App folder - Must point to the folder includeing all 'dataengineer' and 'datascience' folders
# for cloudedge-reactive-migration, allocated in examples/cloudedge-reactive-migration
app_dir = os.path.join(env.get_env('WORKDIR'), env.get_env('PROACTIVE_MIGRATION_DATASCIENCE_APP_DIR'))
print(app_dir)
app_name = env.get_env("SCANFLOW_APP_NAME")
team_name = env.get_env("SCANFLOW_TEAM_NAME")

# Initialize the Scanflow Client
scanflow_client = ScanflowClient(
    #if you defined "SCANFLOW_SERVER_URI", you dont need to provide this
    registry=env.get_env("DOCKER_REGISTRY"),
    verbose=True,
    docker_network_mode="host"
)

/home/jolivera/Documents/CloudSkin/Scanflow/data-connector/examples/cloudedge-proactive-migration/datascience


## Data Science graph for proactive training.

### Trainer

In [10]:
# # App folder
# scanflow_path = "/home/rocky/k8s_resources/data-connector"
# app_dir = os.path.join(scanflow_path, "examples/cloudedge/datascience")
# app_name = "cloudedge"
# team_name = "datascience"

# # scanflow client
# client = ScanflowClient(
#              #if you defined "SCANFLOW_SERVER_URI", you dont need to provide this
#              #scanflow_server_uri="http://172.30.0.50:46666",
#              verbose=True)

In [11]:
# DUMY: now the modelling simply save a resnet model

# executor1 = client.ScanflowExecutor(name='load-data', 
#                       mainfile='main.py',
#                       parameters={
#                         '--config': '/home/rocky/k8s_resources/data-connector/examples/cloudedge/datascience/workflows/load-data/config_prep.json',
#                                   },
#                       requirements='requirements.txt',
#                       dockerfile='dockerfile')

# Predictor stages
# - Executor 1: Data retrieval from Prometheus
# - Executor 2: Data pre-processing + QoS Predictor

# Define common variables for the Application stages
output_dir = "/workflow"
csv_root_path = os.path.join(output_dir, f"{app_name}-{team_name}")

executor3 = scanflow_client.ScanflowExecutor(name='modeling', 
                    mainfile='run_longExp.py',
                    dockerfile='Dockerfile_mod_no_buildkit',
                    image_pull_policy="Always",
                    parameters={
                        'random_seed': 42,
                        'is_training': 1,
                        'freq':'t',
                        'root_path': './data/',
                        'data_path': 'df.csv',
                        'model_id': 'PatchMixer',
                        'model': 'PatchMixer',
                        'data': 'custom',
                        'features': 'MS',
                        'target': 'PredictionTimeTS',
                        'seq_len': 10,
                        'pred_len': 3,
                        'label_len': 0,
                        'enc_in': 10,
                        'e_layers': 1,
                        'd_model': 256,
                        'dropout': 0.2,
                        'head_dropout': 0,
                        'patch_len': 16,
                        'stride': 8,
                        'des': 'Exp',
                        'train_epochs': 15,
                        'patience': 5,
                        'loss_flag': 2,
                        'use_gpu': False,
                        'itr': 1,
                        'batch_size': 256,
                        'learning_rate': 0.001,
                        'mlflow_loader': True,
                        'app_name':app_name,
                        'team_name':team_name
                    }
                  )

# dependency1 = client.ScanflowDependency(dependee='preprocessing',
#                                     depender='modeling')


##workflow1 
workflow1 = scanflow_client.ScanflowWorkflow(name='proactive-training-datascience', 
                     nodes=[executor3],
                    #  edges=[dependency1],
                    type = "batch",
                    output_dir = output_dir,
                    # cron="*/5 * * * *",
                    image_pull_secrets=["cloudskin-registry"] # Required for Workflow templates
)

              

### Compose the Scanflow Application

In [12]:
app = scanflow_client.ScanflowApplication(app_name = app_name,
                                 app_dir = app_dir,
                                 team_name = team_name,
                                 workflows=[workflow1])

### DEBUG: show application config

In [13]:
app.to_dict()

09-Dec-24 09:13:04 -  INFO - workflow proactive-training-datascience: {'name': 'proactive-training-datascience', 'nodes': [{'name': 'modeling', 'node_type': 'executor', 'mainfile': 'run_longExp.py', 'parameters': {'random_seed': 42, 'is_training': 1, 'freq': 't', 'root_path': './data/', 'data_path': 'df.csv', 'model_id': 'PatchMixer', 'model': 'PatchMixer', 'data': 'custom', 'features': 'MS', 'target': 'PredictionTimeTS', 'seq_len': 10, 'pred_len': 3, 'label_len': 0, 'enc_in': 10, 'e_layers': 1, 'd_model': 256, 'dropout': 0.2, 'head_dropout': 0, 'patch_len': 16, 'stride': 8, 'des': 'Exp', 'train_epochs': 15, 'patience': 5, 'loss_flag': 2, 'use_gpu': False, 'itr': 1, 'batch_size': 256, 'learning_rate': 0.001, 'mlflow_loader': True, 'app_name': 'cloudedge-proactive-migration-experiment', 'team_name': 'datascience'}, 'requirements': None, 'dockerfile': 'Dockerfile_mod_no_buildkit', 'base_image': None, 'env': None, 'image': None, 'timeout': None, 'resources': None, 'affinity': None, 'image

{'app_name': 'cloudedge-proactive-migration-experiment',
 'app_dir': '/home/jolivera/Documents/CloudSkin/Scanflow/data-connector/examples/cloudedge-proactive-migration/datascience',
 'team_name': 'datascience',
 'workflows': [{'name': 'proactive-training-datascience',
   'nodes': [{'name': 'modeling',
     'node_type': 'executor',
     'mainfile': 'run_longExp.py',
     'parameters': {'random_seed': 42,
      'is_training': 1,
      'freq': 't',
      'root_path': './data/',
      'data_path': 'df.csv',
      'model_id': 'PatchMixer',
      'model': 'PatchMixer',
      'data': 'custom',
      'features': 'MS',
      'target': 'PredictionTimeTS',
      'seq_len': 10,
      'pred_len': 3,
      'label_len': 0,
      'enc_in': 10,
      'e_layers': 1,
      'd_model': 256,
      'dropout': 0.2,
      'head_dropout': 0,
      'patch_len': 16,
      'stride': 8,
      'des': 'Exp',
      'train_epochs': 15,
      'patience': 5,
      'loss_flag': 2,
      'use_gpu': False,
      'itr': 1,
 

### Build the Scanflow Application
- This step builds the Docker images for all the Scanflow executors and uploads them to the container registry (currently hardcoded in the `scanflow` module)

In [14]:
# Define the Scanflow Tracker Port (32767)
build_app = scanflow_client.build_ScanflowApplication(
    app=app,
    trackerPort=32760, # Change this port to avoid conflict with any svc already using it.
    image_pull_secret="cloudskin-registry" # Required when deploying to Kubernetes (created during deployment)
)

09-Dec-24 09:13:04 -  INFO - Building image registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry/cloudedge-proactive-migration-experiment-datascience-proactive-training-datascience-modeling
09-Dec-24 09:13:04 -  INFO - [+] Image [registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry/cloudedge-proactive-migration-experiment-datascience-proactive-training-datascience-modeling] not found in repository. Building a new one.
09-Dec-24 09:13:04 -  INFO - dockerfile for using /home/jolivera/Documents/CloudSkin/Scanflow/data-connector/examples/cloudedge-proactive-migration/datascience/workflows/modeling/Dockerfile_mod_no_buildkit from /home/jolivera/Documents/CloudSkin/Scanflow/data-connector/examples/cloudedge-proactive-migration/datascience/workflows
09-Dec-24 09:16:36 -  INFO - [+] Image [modeling] was built successfully. image_tag ['registry.gitlab.bsc.es/datacentric-computing/cloudskin-project/cloudskin-registry/cloudedge-proactive-m

### DEBUG: show built application config

In [15]:
#build_app.to_dict()

### Create a ScanflowDeployerClient

This client creates the required environment for Scanflow to run the pipelines in a Kubernetes cluster based on the built application. It can:

- Create an environment for the Scanflow application within its own namespace
- Deploy a local Scanflow Tracker
- Run the application as an Argo Workflow

In [16]:
# Initialize the deployer client
if env.get_env("LOCAL_DEPLOY"):
    deployer_client = ScanflowDeployerClient(
        user_type="local",
        deployer="argo",
        k8s_config_file=env.get_env("KUBECONFIG_PATH")
    )

09-Dec-24 09:19:33 -  INFO - loading kubernetes configuration from /home/jolivera/.kube/config_ncloud_socat
09-Dec-24 09:19:33 -  INFO - found local kubernetes configuration


### Deploy the ScanflowEnvironment
This creates:
- A namespace for the application
- A Deployment for the local scanflow tracker
- A Deployment for all the agents (in this case there's only the planner)
  - Planner doesn't include right now the `scanflow` module, so it must be copied inside the planner's PVC so the container finds it in the `/scanflow/scanflow/scanflow` path

Go to your Kubernetes cluster and check that both tracker and planner pods are Running without errors in the `scanflow-cloudedge-reactive-migration-dataengineer`.

In [17]:
# Compose a custom ScanflowEnvironment
from scanflow.deployer.env import ScanflowEnvironment
data_sci_env = ScanflowEnvironment()
data_sci_env.namespace=f"scanflow-{build_app.app_name}-{build_app.team_name}"
# TRACKER STORAGE MUST BE ALREADY DEPLOYED IN ITS OWN NAMESPACE (i.e: "scanflow-server")
# - "scanflow" db must already exist in postgresql
# - "scanflow" bucket must already exist in MinIO
#data_sci_env.tracker_config.TRACKER_STORAGE = f"postgresql://postgres:scanflow123@postgresql.scanflow-server/scanflow"
data_sci_env.tracker_config.TRACKER_STORAGE = env.get_env("SCANFLOW_TRACKER_STORAGE")
data_sci_env.tracker_config.TRACKER_ARTIFACT = f"s3://scanflow/{data_sci_env.namespace}"
# CLIENT CONFIG: REPLACE WITH CURRENTLY DEPLOYED SERVICES IN "scanflow-server" namespace
data_sci_env.client_config.SCANFLOW_TRACKER_LOCAL_URI = env.get_env("SCANFLOW_TRACKER_URI")
data_sci_env.client_config.SCANFLOW_TRACKER_URI = env.get_env("SCANFLOW_TRACKER_URI")
data_sci_env.client_config.SCANFLOW_SERVER_URI = env.get_env("SCANFLOW_SERVER_URI")
# MINIO MUST BE ALREADY DEPLOYED IN ITS OWN NAMESPACE (i.e: "scanflow-server")
data_sci_env.secret.AWS_ACCESS_KEY_ID = env.get_env("AWS_ACCESS_KEY_ID")
data_sci_env.secret.AWS_SECRET_ACCESS_KEY = env.get_env("AWS_SECRET_ACCESS_KEY")
data_sci_env.secret.MLFLOW_S3_ENDPOINT_URL = env.get_env("MLFLOW_S3_ENDPOINT_URL")
data_sci_env.secret.AWS_ENDPOINT_URL = env.get_env("AWS_ENDPOINT_URL")
# NEW: configure image pull secret
data_sci_env.image_pull_secret.name = "cloudskin-registry"
data_sci_env.image_pull_secret.registry = env.get_env("DOCKER_REGISTRY")
data_sci_env.image_pull_secret.username = env.get_env("DOCKER_REGISTRY_USERNAME")
data_sci_env.image_pull_secret.password = env.get_env("DOCKER_REGISTRY_PASSWORD")
data_sci_env.image_pull_secret.email = "cloudskin-project@bsc.es"

In [18]:
# Create the application environment
if env.get_env("LOCAL_DEPLOY"):
    await deployer_client.create_environment(
        app=build_app,
        scanflowEnv=data_sci_env
    )

09-Dec-24 09:19:33 -  INFO - [++]Creating env
09-Dec-24 09:19:33 -  INFO - [++]Creating namespace "scanflow-cloudedge-proactive-migration-experiment-datascience"
09-Dec-24 09:19:33 -  INFO - create_namespace true
09-Dec-24 09:19:33 -  INFO - [++]Creating Role for 'default service account'


09-Dec-24 09:19:33 -  INFO - create_rolebinding info
09-Dec-24 09:19:33 -  INFO - [++]Creating s3 secret {'AWS_ACCESS_KEY_ID': 'scanflow', 'AWS_SECRET_ACCESS_KEY': 'scanflow123', 'MLFLOW_S3_ENDPOINT_URL': 'http://84.88.189.179:32645', 'AWS_ENDPOINT_URL': None}
09-Dec-24 09:19:33 -  INFO - create_secret true
09-Dec-24 09:19:33 -  INFO - [++]Creating tracker configmap {'TRACKER_STORAGE': 'postgresql://postgres:scanflow123@scanflow-postgres.scanflow-server.svc.cluster.local/scanflow-cloudedge-datascience', 'TRACKER_ARTIFACT': 's3://scanflow/scanflow-cloudedge-proactive-migration-experiment-datascience'}
09-Dec-24 09:19:33 -  INFO - create_configmap true
09-Dec-24 09:19:33 -  INFO - [++]Creating client configmap {'SCANFLOW_TRACKER_URI': 'http://84.88.189.179:32766', 'SCANFLOW_SERVER_URI': 'http://84.88.189.179:32767', 'SCANFLOW_TRACKER_LOCAL_URI': 'http://84.88.189.179:32766'}
09-Dec-24 09:19:33 -  INFO - create_configmap true
09-Dec-24 09:19:33 -  INFO - create_pvc true
09-Dec-24 09:19:33

### Manual task: copy `scanflow` module
This step copies this repository version of `scanflow` module inside the environment's PersistentVolumeClaim. The environment creation is done with asynchronous API calls, so we must ensure that both the `namespace` and the `persistentVolumeClaim` are already available before proceeding.

In [19]:
# Steps:
# - Local variables:
debug_pod_yaml = os.path.join(env.get_env("WORKDIR"), "tutorials", "cloudedge-proactive-migration", "debug_pod_datascience.yaml")
persistent_volume_claim = f"scanflow-{environment_namespace}"
scanflow_folder = pathlib.Path(os.path.join(env.get_env("WORKDIR"), "scanflow"))

# - Check that the persistentVolumeClaim is properly Bound
while not check_if_object_exists_and_ready(
    client=kube_client,
    object_type="persistentVolumeClaim",
    name=persistent_volume_claim,
    namespace=environment_namespace
):
    # Wait 2 seconds for the next check
    sleep(2)

# - Deploy a Pod in the environment namespace that mounts the environment's persistentVolumeClaim.
#   For now we'll provide a YAML file with the expected name of the PVC, but in the future
#   this should be provided either by the ScanflowDeployClient or a Kubernetes API call
deploy_pod_and_wait(
    client=kube_client,
    yaml_file=debug_pod_yaml,
    namespace=environment_namespace
)

# - Once the pod is Running, proceed to compress the `scanflow` folder onto a tar file; then send it to the Pod
#   and uncompress it at the destination path
copy_local_path_to_pod(
    client=kube_client,
    namespace=environment_namespace,
    pod_name="cloudedge-debug-pod",
    local_path=scanflow_folder,
    dest_path="/scanflow/scanflow",
    exclude_paths=["__pycache__"]
)

# - We can leave the Pod running for debugging purposes



Checking if persistentVolumeClaim 'scanflow-scanflow-cloudedge-proactive-migration-experiment-datascience' exists in namespace 'scanflow-cloudedge-proactive-migration-experiment-datascience'...
PersistentVolumeClaim 'scanflow-scanflow-cloudedge-proactive-migration-experiment-datascience' is Bound and ready.
Creating pod: cloudedge-debug-pod in namespace: scanflow-cloudedge-proactive-migration-experiment-datascience
Pod 'cloudedge-debug-pod' is currently in phase: Pending




Pod 'cloudedge-debug-pod' is currently in phase: Running
Pod 'cloudedge-debug-pod' is now in Running state.


## Run Workflow to test
This composes an Argo CronWorkflow for the application and submits it to the Argo Workflows engine:
- Pre-requisites: Argo Workflows must be set to use the `default` service account when no `serviceAccount` is provided in the template

In [20]:
if env.get_env("LOCAL_DEPLOY"):
    await deployer_client.run_app(app=build_app)
    # DEBUG - TODO: if using external config files, automate their copy inside the workflow PVC instead of doing it manually
    # - Copy Promcsv config file so it is available within the container in the /workflow/promql_queries.json path

09-Dec-24 09:19:42 -  INFO - [++] Running workflow: [proactive-training-datascience].
09-Dec-24 09:19:42 -  INFO - [+] output dir /workflow
09-Dec-24 09:19:42 -  INFO - [+] Create proactive-training-datascience output PVC
09-Dec-24 09:19:43 -  INFO - create_pvc true
09-Dec-24 09:19:43 -  INFO - output dir created
09-Dec-24 09:19:43 -  INFO - env for executor {'AWS_ACCESS_KEY_ID': 'scanflow', 'AWS_SECRET_ACCESS_KEY': 'scanflow123', 'MLFLOW_S3_ENDPOINT_URL': 'http://84.88.189.179:32645', 'AWS_ENDPOINT_URL': None, 'SCANFLOW_TRACKER_URI': 'http://84.88.189.179:32766', 'SCANFLOW_SERVER_URI': 'http://84.88.189.179:32767', 'SCANFLOW_TRACKER_LOCAL_URI': 'http://84.88.189.179:32766'}
09-Dec-24 09:19:43 -  INFO - [+] Building workflow: [proactive-training-datascience:modeling].
09-Dec-24 09:19:43 -  INFO -  parameters: ['--random_seed', '42', '--is_training', '1', '--freq', 't', '--root_path', './data/', '--data_path', 'df.csv', '--model_id', 'PatchMixer', '--model', 'PatchMixer', '--data', 'cus

## Clean-up

In [21]:
# if env.get_env("LOCAL_DEPLOY"):
#     await deployer_client.delete_app(app=build_app)

### Remove Scanflow environment

In [22]:
# if env.get_env("LOCAL_DEPLOY"):
#     await deployer_client.clean_environment(app=build_app, scanflow_env=data_eng_env)

## MLFlow debug cell

In [49]:
if env.get_env("LOCAL_DEPLOY"):
    import mlflow
    from scanflow.client import ScanflowTrackerClient

    client = ScanflowTrackerClient(scanflow_tracker_local_uri=env.get_env("SCANFLOW_TRACKER_URI"))
    mlflow.set_tracking_uri(client.get_tracker_uri(True))
    # Retrieve the Application experiment
    
    # reactive_experiment = mlflow.get_experiment_by_name(app_name)
    proactive_experiment = mlflow.get_experiment_by_name("PatchMixer")
    experiment_id = proactive_experiment.experiment_id
    print(experiment_id)

    # Retrieve filtered experiment runs by run_name, ordered by descending end time --> First entry will be the most recent
    # runs_df = mlflow.search_runs([experiment_id], filter_string=f"run_name='{team_name}'", order_by=["end_time DESC"])
    # run_id = runs_df.loc[[0]]['run_id'][0]
    # print(run_id)

    # Delete experiment
    #mlflow.delete_experiment(experiment_id=str(experiment_id))



In [24]:
# trackerClient = ScanflowTrackerClient(
#                         scanflow_tracker_local_uri="http://84.88.189.179:32765",
#                         verbose=True)

In [25]:
# trackerClient.save_app_model(app_name=app_name,
#                             team_name=team_name,
#                             model_name="model_LSTM_a6050a48-03a9-4d02-927a-878317624b52.pt",
#                             model_version="1")
# trackerClient.save_app_model(app_name=app_name,
#                             team_name=team_name,
#                             model_name="model_LSTM_c2313dca-f464-4fe3-96c6-cdbe530fe89e.pt",
#                             model_version="1")



# # Saving scalers in order to normalize/denormalize for inference. These could potentially be artifacts if automated properly.
# trackerClient.save_app_model(app_name=app_name,
#                             team_name=team_name,
#                             model_name="LSTM_scaler_x_a6050a48-03a9-4d02-927a-878317624b52",
#                             model_version="1")
# trackerClient.save_app_model(app_name=app_name,
#                             team_name=team_name,
#                             model_name="LSTM_scaler_y_a6050a48-03a9-4d02-927a-878317624b52",
#                             model_version="1")

# trackerClient.save_app_model(app_name=app_name,
#                             team_name=team_name,
#                             model_name="LSTM_scaler_x_c2313dca-f464-4fe3-96c6-cdbe530fe89e",
#                             model_version="1")
# trackerClient.save_app_model(app_name=app_name,
#                             team_name=team_name,
#                             model_name="LSTM_scaler_y_c2313dca-f464-4fe3-96c6-cdbe530fe89e",
#                             model_version="1")
