## Notebook 2 - Operationalize Machine Learning Workflow using Hybrid ML Pipelines

#### This notebook creates a hybrid Kubeflow Pipeline that runs distributed training using either PyTorch Training Operators on Kubernetes or the Amazon SageMaker service, based on a conditional statement.



In [12]:
# Import necessary libraries 

import kfp
from kfp import components
from kubeflow.training.utils import utils

from kfp import dsl
from kfp import compiler
import yaml
import json
from kubeflow.training import PyTorchJobClient
import time
import boto3
import kfp.components as comp



In [13]:
# Initialize global variables shared by the cells below

# Namespace returned by the Kubeflow training SDK helper; default for the pipeline's `namespace` parameter
user_namespace = utils.get_default_target_namespace()

# Relative directory used below as the local root for the dataset download and the script archive
# (presumably the mount location of the EFS-backed volume claim of the same name — confirm)
efs_mount_point='efs-sc-claim'

# Container image used as the default `train_image` of the pipeline
aws_dlc_sagemaker_train_image='763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.8.0-gpu-py3'

# Container image used as the default `serving_image` of the pipeline
# NOTE(review): tagged 1.6.0 while the training image is tagged 1.8.0 — confirm the mismatch is intended
aws_dlc_sagemaker_inference_image='763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:1.6.0-gpu-py3'


# Create SageMaker session and default bucket

In [14]:
# Import SageMaker specific libraries
import sagemaker
import boto3
import random, string

# boto3 session created without explicit arguments
sess = boto3.Session()
# SageMaker API client pinned to the us-west-2 region
sm= sess.client('sagemaker',region_name='us-west-2')

# Use role as shared in WorkShop Steps 
# NOTE(review): this ARN is account-specific and hardcoded; replace it with the role of your own account
role = 'arn:aws:iam::913278749917:role/sagemakerrole' 
# SageMaker SDK session built on the boto3 session above; used for the default bucket and for uploads
sagemaker_session = sagemaker.Session(boto_session=sess)
# S3 key prefix for uploaded datasets (assigned again with the same value in the next cell)
dataset_folder = 'datasets'

Found credentials in shared credentials file: ~/.aws/credentials


In [15]:
# SageMaker default bucket
bucket_name = sagemaker_session.default_bucket()
# S3 key prefixes and local dataset name
job_folder      = 'jobs'
dataset_folder  = 'datasets'
local_dataset = 'cifar10'
# Job name suffixed with the current UTC timestamp (including day of year, %j)
# NOTE(review): not referenced by the cells visible in this notebook — confirm it is still needed
pytorchjob_name   = f'pytorch-dist-gpu-{time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime())}'

# Display the bucket name as the cell output
bucket_name

'sagemaker-us-west-2-913278749917'

# Download the CIFAR-10 dataset

In [5]:
import torchvision

# Directory that receives the CIFAR-10 training split and is then mirrored to S3
cifar10_local_dir = efs_mount_point+'/cifar10-dataset'

# Download the training split into that directory
cifar10_dataset = torchvision.datasets.CIFAR10(
    root=cifar10_local_dir,
    train=True,
    download=True,
)

# Upload the downloaded directory to the session's default bucket under the datasets prefix
datasets = sagemaker_session.upload_data(
    path=cifar10_local_dir,
    key_prefix=f'{dataset_folder}/cifar10-dataset',
)

#datasets

Files already downloaded and verified


In [16]:
# Get the Kubernetes PVC claim id for the provisioned EFS from Kubeflow Volumes on the dashboard, e.g. efs-sc-claim
# The command keeps the third column of the `kubectl get pvc` row that matches "efs-sc-claim".
# Later cells pass that value to `kubectl describe pv` and use it as the EFS directory name.

pvc_claim_id=!(kubectl get pvc --no-headers=true | awk '/efs-sc-claim/{print$3}' )

#pvc_claim_id[0]

In [17]:
# Look up the EFS file system id behind the persistent volume captured in `pvc_claim_id`.
# kubectl is called through subprocess with the volume name as an explicit argument:
# interpolating the captured list via `$pvc_claim_id` into a `!` shell line that also
# contains awk's `{...}` block does not reliably hand the volume name to kubectl.
import subprocess

if not pvc_claim_id:
    raise RuntimeError(
        "No persistent volume found for the 'efs-sc-claim' claim; check `kubectl get pvc`."
    )

pv_description = subprocess.check_output(
    ['kubectl', 'describe', 'pv', pvc_claim_id[0]],
    universal_newlines=True,
)

# Same extraction as `awk '/VolumeHandle/{print $2}' | cut -d':' -f1`:
# second whitespace-separated field of each VolumeHandle line, cut at the first ':'
efs_fs_id = [
    line.split()[1].split(':')[0]
    for line in pv_description.splitlines()
    if 'VolumeHandle' in line and len(line.split()) > 1
]

#efs_fs_id[0]

In [18]:
# Get Subnet Id and Security Group for the EFS Mounted on your VPC Subnet

client = boto3.client('efs')

# Fail early with a readable message instead of a bare IndexError further down
if not efs_fs_id:
    raise RuntimeError("No EFS file system id was found; re-run the previous cell and check its output.")

# Previous cell pre-populates File System Id. Please log in to AWS console and go to EFS service home page to verify it
file_system_id=efs_fs_id[0]
# Directory inside the file system that holds the dataset; used as the SageMaker channel's DirectoryPath
file_system_dir_path="/"+pvc_claim_id[0]+"/cifar10-dataset"

efs_mount_target_resp = client.describe_mount_targets(
    MaxItems=123,
    FileSystemId=file_system_id
)

mount_targets = efs_mount_target_resp['MountTargets']
if not mount_targets:
    raise RuntimeError(f"EFS file system {file_system_id} has no mount targets; create one before running the pipeline.")

# The first mount target supplies both the subnet and the security group passed to the SageMaker job
first_mount_target = mount_targets[0]
subnet_id = first_mount_target['SubnetId']

efs_mount_target_sg = client.describe_mount_target_security_groups(
    MountTargetId=first_mount_target['MountTargetId']
)

security_groups = efs_mount_target_sg['SecurityGroups']
if not security_groups:
    raise RuntimeError(f"Mount target {first_mount_target['MountTargetId']} has no security groups attached.")

security_group_id=security_groups[0]

In [19]:
# Load the SageMaker components for Kubeflow Pipelines.
# Every URL points at the same pinned commit of the kubeflow/pipelines repository.

sagemaker_hpo_op = components.load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/hyperparameter_tuning/component.yaml'
)
sagemaker_train_op = components.load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/train/component.yaml'
)
sagemaker_model_op = components.load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/model/component.yaml'
)
sagemaker_deploy_op = components.load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/deploy/component.yaml'
)

In [20]:
# Upload the training script to the S3 bucket to be accessed by SageMaker training job

# Archive the contents of the efs_mount_point directory, skipping the dataset folder and notebook files
!tar cvfz sourcedir.tar.gz --exclude='./cifar10-dataset' --exclude='./*ipynb*' -C $efs_mount_point .
# Upload the archive under the training-scripts prefix; the returned S3 URI is later passed
# to the SageMaker training step as the `sagemaker_submit_directory` hyperparameter
source_s3 = sagemaker_session.upload_data(path='sourcedir.tar.gz', key_prefix='training-scripts')
print('\nUploaded to S3 location:')
print(source_s3)

./
./mnist.py
./cifar10-distributed-gpu-final-Copy1.py
./cifar10-distributed-gpu-final.py
./model.pth

Uploaded to S3 location:
s3://sagemaker-us-west-2-913278749917/training-scripts/sourcedir.tar.gz


In [23]:
# Read the PyTorch Operator master and worker replica specs from their YAML files

def _read_yaml_spec(spec_path):
    """Parse the YAML file at `spec_path` with the safe loader and return the result."""
    with open(spec_path, 'r') as spec_stream:
        return yaml.safe_load(spec_stream)


master_spec_loaded = _read_yaml_spec("pipeline_yaml_specifications/pipeline_master_spec.yml")
worker_spec_loaded = _read_yaml_spec("pipeline_yaml_specifications/pipeline_worker_spec.yml")

In [24]:
# Load the PyTorch Training Operator component from a local component definition file.
# The resulting factory is called in the pipeline's Kubernetes branch.

pytorch_job_op = components.load_component_from_file('pipeline_components/pytorch_component.yaml')

In [28]:
# Define a function to check if the runtime passed to a pipeline is sagemaker or kubernetes
def check_the_condition(training_runtime: str="sagemaker") -> str:
    """Normalize the requested training runtime.

    Args:
        training_runtime: Requested runtime name; matching is case-insensitive.

    Returns:
        "sagemaker" or "kubernetes" when the input matches one of them,
        otherwise an empty string.
    """
    runtime = training_runtime.lower()
    return runtime if runtime in ("sagemaker", "kubernetes") else ""

In [29]:
# Create Python function-based components for Kubeflow Pipeline
# Wraps check_the_condition so the pipeline can run it as its first step and branch on its output

check_condition_op = comp.func_to_container_op(check_the_condition)

# Create hybrid Kubeflow Pipeline

In [34]:
# Create a job name for tracking the Kubernetes PyTorchJob custom resource or the SageMaker training job.
# The UTC timestamp suffix is computed once, when this cell runs, and becomes the pipeline's default job name.
pytorch_distributed_jobname=f'pytorch-cnn-dist-job-{time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime())}'

# Create Hybrid Pipeline using Kubeflow PyTorch Training Operators and Amazon SageMaker Service
@dsl.pipeline(name="PyTorch Training pipeline", description="Sample training job test")
def pytorch_cnn_pipeline(region='us-west-2',
                           training_input_mode='File',
                           namespace=user_namespace,
                           train_image=aws_dlc_sagemaker_train_image,
                           serving_image=aws_dlc_sagemaker_inference_image,
                           volume_size='54',
                           max_run_time='86400',
                           learning_rate='0.01',
                           pytorch_backend='gloo',
                           training_job_name=pytorch_distributed_jobname, 
                           instance_type='ml.g4dn.12xlarge',
                           network_isolation='False',
                           traffic_encryption='False',
                           spot_instance='False',
                           training_runtime='kubernetes', # Define either sagemaker or kubernetes
                           channels='[ \
                            { \
                                "ChannelName": "train", \
                                "DataSource": { \
                                    "FileSystemDataSource": { \
                                        "FileSystemId": "'+file_system_id+'", \
                                        "FileSystemType": "EFS", \
                                        "FileSystemAccessMode": "ro", \
                                        "DirectoryPath": "'+file_system_dir_path+'" \
                                    } \
                                }, \
                                "CompressionType": "None", \
                                "RecordWrapperType": "None" \
                            } \
                         ]'
                        ):
    """Hybrid training pipeline that runs on Kubernetes or on SageMaker.

    The first step normalizes `training_runtime`; exactly one of the two
    conditional branches runs when it resolves to "kubernetes" or "sagemaker",
    and neither runs for any other value.

    Args:
        region: AWS region for the SageMaker training job.
        training_input_mode: Input mode passed to the SageMaker training step.
        namespace: Kubernetes namespace in which the PyTorchJob is created.
        train_image: Container image for the SageMaker training job.
        serving_image: Not referenced by the steps of this pipeline.
        volume_size: Volume size passed to the SageMaker training step.
        max_run_time: Maximum run time passed to the SageMaker training step.
        learning_rate: Value of the "lr" hyperparameter.
        pytorch_backend: Value of the "backend" hyperparameter.
        training_job_name: Name of the PyTorchJob or SageMaker training job.
        instance_type: Instance type for the SageMaker training job.
        network_isolation: Passed through to the SageMaker training step.
        traffic_encryption: Passed through to the SageMaker training step.
        spot_instance: Not referenced by the steps of this pipeline.
        training_runtime: Either "sagemaker" or "kubernetes" (case-insensitive).
        channels: JSON string describing the SageMaker input channel.
    """

    # Step to evaluate the condition. You can enter any logic here. For demonstration the
    # component only normalizes the requested runtime to "sagemaker", "kubernetes" or "".
    condition_result = check_condition_op(training_runtime)

    # Step to run training on Kubernetes using PyTorch Training Operators.
    # Executed only when the runtime check returns "kubernetes".
    with dsl.Condition(condition_result.output == 'kubernetes', name="PyTorch_Comp"):
        train_task = pytorch_job_op(
            name=training_job_name, 
            namespace=namespace, # Use the pipeline parameter (defaults to user_namespace) so a run can override it
            master_spec=json.dumps(master_spec_loaded), # Please refer file at pipeline_yaml_specifications/pipeline_master_spec.yml
            worker_spec=json.dumps(worker_spec_loaded), # Please refer file at pipeline_yaml_specifications/pipeline_worker_spec.yml
            delete_after_done=False
        ).after(condition_result)

    # Step to run training on SageMaker using SageMaker Components for Pipeline.
    # Executed only when the runtime check returns "sagemaker".
    with dsl.Condition(condition_result.output == 'sagemaker', name="SageMaker_Comp"):
        training = sagemaker_train_op(
            region=region,
            image=train_image,
            job_name=training_job_name,
            training_input_mode=training_input_mode,
            hyperparameters='{ \
                "backend": "'+str(pytorch_backend)+'", \
                "batch-size": "64", \
                "epochs": "3", \
                "lr": "'+str(learning_rate)+'", \
                "model-type": "custom", \
                "sagemaker_container_log_level": "20", \
                "sagemaker_program": "cifar10-distributed-gpu-final.py", \
                "sagemaker_region": "'+str(region)+'", \
                "sagemaker_submit_directory": "'+source_s3+'" \
            }',
            channels=channels,
            instance_type=instance_type,
            instance_count=1,
            volume_size=volume_size,
            max_run_time=max_run_time,
            model_artifact_path=f's3://{bucket_name}/jobs',
            network_isolation=network_isolation,
            traffic_encryption=traffic_encryption,
            role=role,
            vpc_subnets=subnet_id,
            vpc_security_group_ids=security_group_id
        ).after(condition_result)

    # Disable the pipeline cache for both training branches so that neither
    # training step is served from a previous run's cached result
    for training_step in (train_task, training):
        training_step.execution_options.caching_strategy.max_cache_staleness = "P0D"

# Compile the pipeline

In [35]:
# Compile the pipeline function into pytorch_cnn_pipeline_new.yaml; the run cell below submits that file
kfp.compiler.Compiler().compile(pytorch_cnn_pipeline, "pytorch_cnn_pipeline_new.yaml")

# Execute the Pipeline using Kubeflow Pipeline Client

In [36]:
# Kubeflow Pipelines client
# NOTE: this rebinds `client`, which an earlier cell assigned to the boto3 EFS client
client = kfp.Client()

# Create the experiment named "kubeflow"; its id groups the run below
experiment = client.create_experiment(name="kubeflow")

# Submit the compiled pipeline file as a run named "pytorch_cnn_pipeline"
my_run = client.run_pipeline(experiment.id, "pytorch_cnn_pipeline", "pytorch_cnn_pipeline_new.yaml")

# CleanUp

In [85]:
# Delete all previously submitted pipelines through this command. You can run in notebook as well on kubernetes cli 

!kubectl get pods --no-headers=true  | awk '/pytorch-training-pipeline/{print $1}' | xargs  kubectl delete pod

pod "pytorch-training-pipeline-hrz8x-4103682115" deleted


In [86]:
# Delete all previously submitted PyTorchJobs through this command. You can run in notebook as well on kubernetes cli 

!kubectl get pytorchjob --no-headers=true -A | awk '/pytorch-cnn-dist/{print $2}' | xargs  kubectl delete pytorchjob  -n aws-hybrid-training-ns              

No resources found
error: resource(s) were provided, but no name was specified
