# PyTorch Inference using KServe on AWS Kubeflow Distribution


In this example, we deploy a trained PyTorch CNN image classification model to predict CIFAR-10 images by running an InferenceService with the TorchServe runtime, which is the default installed serving runtime for PyTorch models.

We will create the KServe InferenceService YAML and pipeline, and run the inference, all from this notebook.

The KServe/TorchServe integration expects the following model store layout in storage, containing the TorchServe Model Archive and the model configuration:

```
├── config
│   └── config.properties
└── model-store
    └── cifar.mar
```


TorchServe provides a utility to package all the model artifacts into a single TorchServe Model Archive (MAR) file. After the model artifacts are packaged into a MAR file, you upload it to the `model-store` directory under the model storage path, which is S3 in this example.

In [1]:
# Restart Kernel after you have installed the utility

# Install kserve python sdk to create and run InferenceService yaml
#!pip install kserve==0.7.0

# Install torch-model-archiver, a TorchServe utility to package all the model artifacts into a single TorchServe Model Archive Files (MAR). 
#!pip install torch-model-archiver

Defaulting to user installation because normal site-packages is not writeable
Collecting kserve==0.7.0
  Downloading kserve-0.7.0-py3-none-any.whl (284 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.9/284.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting azure-core==1.17.0
  Downloading azure_core-1.17.0-py2.py3-none-any.whl (165 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.8/165.8 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting google-cloud-storage==1.41.1
  Downloading google_cloud_storage-1.41.1-py2.py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
Collecting grpcio==1.38.1
  Downloading grpcio-1.38.1-cp38-cp38-manylinux2014_x86_64.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting cacheto

In [2]:
import argparse
import json
import logging
import os
import sys
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision
from torchvision import datasets, transforms
import json
#import subprocess
from model import Net

from kubernetes import client 
from kserve import KServeClient
from kserve import constants
from kserve import utils
from kserve import V1beta1InferenceService
from kserve import V1beta1InferenceServiceSpec
from kserve import V1beta1PredictorSpec
from kserve import V1beta1SKLearnSpec
from kserve import V1beta1TorchServeSpec
from kserve import V1beta1TorchServeSpec
from kubernetes.client import models as k8s

import kfp
from kfp import components
from kubeflow.training.utils import utils
from kfp import dsl
from kfp import compiler

# Notebook-wide logger that writes DEBUG-and-above records to stdout so they
# show up in the cell output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Attach the stdout handler only once: logging.getLogger returns the same
# logger object every time, so re-running this cell would otherwise stack up
# handlers and print every log line multiple times.
_has_stdout_handler = any(
    isinstance(handler, logging.StreamHandler)
    and getattr(handler, "stream", None) is sys.stdout
    for handler in logger.handlers
)
if not _has_stdout_handler:
    logger.addHandler(logging.StreamHandler(sys.stdout))

In [3]:
# Instantiate the network class `Net` imported from the local `model` module.
# NOTE(review): no trained weights are loaded in this cell — confirm whether
# `model` is actually needed by the rest of the notebook.
model = Net()

## Download the trained model from S3

In [8]:
# S3 bucket that holds the trained model; must be the same bucket name that we
# used in notebook 1_submit_pytorchdist_k8s.ipynb
s3_bucket_name='kserve-model-20230526045744'
# Object key of the serialized model inside the bucket
model_name='model-kserve.pth'
# Download the serialized model from S3 into the local kserve_inference directory
!aws s3 cp s3://{s3_bucket_name}/{model_name} ./kserve_inference

download: s3://kserve-model-20230526045744/model-kserve.pth to kserve_inference/model-kserve.pth


## Create model store required by KServe/TorchServe integration

In [10]:
# Generate the MAR file with torch-model-archiver
!$HOME/.local/bin/torch-model-archiver -f --model-name cifar --version 1.0 --model-file ./kserve_inference/model.py --serialized-file ./kserve_inference/model-kserve.pth --handler  ./kserve_inference/model_handler.py & mv cifar.mar ./kserve_inference/

In [None]:
# Clean up the model_layout package if it is already there
#!rm -r model_layout

In [11]:
#Create a model store layout required by the PyTorch Serve
!mkdir -p kserve_inference/model_layout/config 
!mkdir -p kserve_inference/model_layout/model-store

In [15]:
# Copy the model archive into the model-store directory of the layout
!cp kserve_inference/cifar.mar kserve_inference/model_layout/model-store/
# Copy the config.properties file into the config directory of the layout
!cp kserve_inference/config.properties kserve_inference/model_layout/config/

In [16]:
# Upload the contents of the local model_layout directory (config/ and
# model-store/) recursively to the root of the S3 bucket
!aws s3 cp ./kserve_inference/model_layout s3://{s3_bucket_name}/ --recursive

upload: kserve_inference/model_layout/config/config.properties to s3://kserve-model-20230526045744/config/config.properties
upload: kserve_inference/model_layout/model-store/cifar.mar to s3://kserve-model-20230526045744/model-store/cifar.mar


In [17]:
# Resolve the namespace in which the InferenceService will be created.
# NOTE(review): `utils` is imported twice in the imports cell (from kserve, then
# from kubeflow.training.utils); the later import is the one bound here —
# confirm that this is the intended module.
namespace = utils.get_default_target_namespace()
# Show the resolved namespace as the cell output
namespace

'kubeflow-user-example-com'

## Create InferenceService yaml

In [22]:
# Resource requirements for the predictor: request 100 millicores of CPU.
resource_req = k8s.V1ResourceRequirements(requests=dict(cpu='100m'))

# Predictor specification: a TorchServe (pytorch) predictor that reads the
# model store from the root of the S3 bucket and runs under the "aws-sa"
# service account (presumably the account with S3 access — confirm).
predictor_spec = V1beta1PredictorSpec(
    service_account_name="aws-sa",
    pytorch=V1beta1TorchServeSpec(
        storage_uri=f's3://{s3_bucket_name}',
        resources=resource_req,
    ),
)
    
#predictor_spec

In [23]:
# Name of the InferenceService and the KServe API version it is declared with
inference_endpoint = 'image-classify'
kserve_version = 'v1beta1'
api_version = f"{constants.KSERVE_GROUP}/{kserve_version}"

# Assemble the InferenceService object: metadata (name, namespace and the
# 'sidecar.istio.io/inject' annotation set to 'false') plus a spec that wraps
# the predictor defined in the previous cell.
isvc = V1beta1InferenceService(
    api_version=api_version,
    kind=constants.KSERVE_KIND,
    metadata=client.V1ObjectMeta(
        name=inference_endpoint,
        namespace=namespace,
        annotations={'sidecar.istio.io/inject': 'false'},
    ),
    spec=V1beta1InferenceServiceSpec(predictor=predictor_spec),
)

#isvc

In [24]:
#Create KServe Inference EndPoint
KServe = KServeClient()
KServe.create(isvc)

{'apiVersion': 'serving.kserve.io/v1beta1',
 'kind': 'InferenceService',
 'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'},
  'creationTimestamp': '2023-05-26T05:32:12Z',
  'generation': 1,
  'labels': {'serviceEnvelope': 'kserve'},
  'managedFields': [{'apiVersion': 'serving.kserve.io/v1beta1',
    'fieldsType': 'FieldsV1',
    'fieldsV1': {'f:metadata': {'f:annotations': {'.': {},
       'f:sidecar.istio.io/inject': {}}},
     'f:spec': {'.': {},
      'f:predictor': {'.': {},
       'f:pytorch': {'.': {},
        'f:resources': {'.': {}, 'f:requests': {'.': {}, 'f:cpu': {}}},
        'f:storageUri': {}},
       'f:serviceAccountName': {}}}},
    'manager': 'OpenAPI-Generator',
    'operation': 'Update',
    'time': '2023-05-26T05:32:10Z'}],
  'name': 'image-classify',
  'namespace': 'kubeflow-user-example-com',
  'resourceVersion': '61244274',
  'uid': 'db2d2d20-9396-476e-aba9-b627b997794c'},
 'spec': {'predictor': {'model': {'modelFormat': {'name': 'pytorch'},
    '

In [21]:
#Delete the KServe Inference EndPoint 
#name="image-classify"
#KServe.delete(name, namespace=namespace)

{'apiVersion': 'serving.kserve.io/v1beta1',
 'kind': 'InferenceService',
 'metadata': {'creationTimestamp': '2023-05-26T02:49:11Z',
  'deletionGracePeriodSeconds': 0,
  'deletionTimestamp': '2023-05-26T05:32:00Z',
  'finalizers': ['inferenceservice.finalizers'],
  'generation': 2,
  'labels': {'serviceEnvelope': 'kserve'},
  'managedFields': [{'apiVersion': 'serving.kserve.io/v1beta1',
    'fieldsType': 'FieldsV1',
    'fieldsV1': {'f:spec': {'.': {},
      'f:predictor': {'.': {},
       'f:canaryTrafficPercent': {},
       'f:pytorch': {'.': {}, 'f:name': {}, 'f:storageUri': {}},
       'f:serviceAccountName': {},
       'f:timeout': {}}}},
    'manager': 'OpenAPI-Generator',
    'operation': 'Update',
    'time': '2023-05-26T02:49:09Z'},
   {'apiVersion': 'serving.kserve.io/v1beta1',
    'fieldsType': 'FieldsV1',
    'fieldsV1': {'f:metadata': {'f:finalizers': {'.': {},
       'v:"inferenceservice.finalizers"': {}}}},
    'manager': 'manager',
    'operation': 'Update',
    'time': 

In [None]:
#KServe.get(inference_endpoint, namespace=namespace)

In [None]:
# Fetch the InferenceService object for the endpoint created above.
# NOTE(review): in the KServe SDK `timeout_seconds` appears to be honored only
# together with watch=True — confirm whether watch=True was intended here.
KServe.get(inference_endpoint, namespace=namespace, timeout_seconds=120)