# PyTorch Inference using KServe on AWS Kubeflow Distribution


In this example, we deploy a trained PyTorch CNN image classification model to classify CIFAR-10 images by running an InferenceService with the TorchServe runtime, which is the default installed serving runtime for PyTorch models.

We will create the KServe InferenceService YAML and pipeline, and run the inference, all from this notebook.

The KServe/TorchServe integration expects the following model store layout on the storage, containing the TorchServe Model Archive and the model configuration.


TorchServe provides a utility to package all the model artifacts into a single TorchServe Model Archive (MAR) file. After the model artifacts are packaged into a MAR file, you upload it to the model-store directory under the model storage path, which is S3 in this example.

In [None]:
# Restart Kernel after you have installed the utility

# Install kserve python sdk to create and run InferenceService yaml
#!pip install kserve==0.7.0

# Install torch-model-archiver, a TorchServe utility to package all the model artifacts into a single TorchServe Model Archive Files (MAR). 
#!pip install torch-model-archiver

In [None]:
import argparse
import json
import logging
import os
import sys
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision
from torchvision import datasets, transforms
import json
#import subprocess
from model import Net

from kubernetes import client 
from kserve import KServeClient
from kserve import constants
from kserve import utils
from kserve import V1beta1InferenceService
from kserve import V1beta1InferenceServiceSpec
from kserve import V1beta1PredictorSpec
from kserve import V1beta1SKLearnSpec
from kserve import V1beta1TorchServeSpec
from kserve import V1beta1TorchServeSpec
from kubernetes.client import models as k8s

import kfp
from kfp import components
from kubeflow.training.utils import utils
from kfp import dsl
from kfp import compiler

# Module-level logger that sends DEBUG-and-above records to stdout so they
# show up in the notebook output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object on every call, so
# re-running this cell would otherwise attach one more handler each time and
# print every log line multiple times.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(sys.stdout))

In [None]:
# Instantiate the network class imported from the local `model` module.
# This only builds the architecture; the trained weights are loaded into it
# by the load_state_dict call in a later cell.
model = Net()

In [None]:
# Read the checkpoint that the training step saved from a DataParallel-wrapped model.
def _keep_storage(storage, location):
    """map_location hook: hand back the deserialized storage unchanged,
    ignoring the device tag recorded in the checkpoint."""
    return storage


checkpoint = torch.load('./model.pth', map_location=_keep_storage)

## Load the model 
We saved the model using nn.DataParallel, which stores the model under `module`, so we won't be able to load it without DataParallel. Below we create a new ordered dict without the `module.` prefix and load it back.

In [None]:
# Build a new OrderedDict whose keys do not carry the `module.` prefix
from collections import OrderedDict

# Prefix that appears on every key when the model was saved while wrapped
# in nn.DataParallel.
module_prefix = 'module.'

new_state_dict = OrderedDict()
for k, v in checkpoint.items():
    # Strip the prefix only when it is actually present. Unconditionally
    # dropping the first 7 characters would corrupt the keys of a checkpoint
    # that was saved without DataParallel.
    name = k[len(module_prefix):] if k.startswith(module_prefix) else k
    new_state_dict[name] = v

# Load the renamed parameters into the plain (non-DataParallel) model
model.load_state_dict(new_state_dict)

In [None]:
# Save the state dict without the `module.` key prefix. The archiver cell
# below picks this file up through its --serialized-file argument.
torch.save(new_state_dict, "./model_new.pth")

## Create model store required by KServe/TorchServe integration

In [None]:
# Generate the MAR file with torch-model-archiver.
# Inputs: the model definition (model.py), the re-saved weights
# (model_new.pth) and the custom handler (model_handler.py).
# The copy step in a later cell expects the result to be ./cifar.mar.
!$HOME/.local/bin/torch-model-archiver -f --model-name cifar --version 1.0 --model-file ./model.py --serialized-file ./model_new.pth --handler  ./model_handler.py

In [None]:
#Clean up the model_layout package if it is already there
#!rm -r model_layout

In [None]:
# Create the model store layout required by TorchServe:
#   model_layout/config       - receives config.properties
#   model_layout/model-store  - receives the .mar archive
!mkdir -p model_layout/config 
!mkdir -p model_layout/model-store

In [None]:
%%bash

# Write the TorchServe configuration file into the model store layout.
# The model_snapshot entry registers model "cifar", version 1.0, backed by
# cifar.mar - the same name/version passed to torch-model-archiver above.
# The heredoc body is written verbatim (it contains no shell variables).
cat <<EOF > model_layout/config/config.properties
inference_address=http://0.0.0.0:8085
management_address=http://0.0.0.0:8085
metrics_address=http://0.0.0.0:8082
grpc_inference_port=7070
grpc_management_port=7071
enable_metrics_api=true
metrics_format=prometheus
number_of_netty_threads=4
job_queue_size=10
enable_envvars_config=true
install_py_dep_per_model=true
model_store=/mnt/models/model-store
model_snapshot={"name":"startup.cfg","modelCount":1,"models":{"cifar":{"1.0":{"defaultVersion":true,"marName":"cifar.mar","minWorkers":1,"maxWorkers":5,"batchSize":1,"maxBatchDelay":10,"responseTimeout":120}}}}
EOF

In [None]:
# Copy the model archive produced by torch-model-archiver into the
# model-store directory of the layout.
!cp cifar.mar model_layout/model-store/

In [None]:
import boto3
import time
from datetime import datetime

# Second-resolution timestamp used as a suffix so the bucket name differs
# from run to run.
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
# Keep the original (misspelled) name bound in case another cell refers to it.
timsestamp = timestamp

region='us-west-2'
s3 = boto3.client('s3', region_name=region)
location = {'LocationConstraint': region}

bucketname='kserve-model-layout-'+timestamp
# S3 rejects an explicit LocationConstraint of 'us-east-1' (it is the default
# region), so only pass CreateBucketConfiguration for other regions. This
# keeps the cell working if `region` above is changed.
if region == 'us-east-1':
    s3.create_bucket(Bucket=bucketname)
else:
    s3.create_bucket(Bucket=bucketname,CreateBucketConfiguration=location)
# Echo the generated name as the cell output; later cells build the
# s3:// URI from it.
bucketname

In [None]:
# Upload the contents of ./model_layout (the config and model-store
# directories) recursively to the root of the S3 bucket created above.
!aws s3 cp ./model_layout s3://{bucketname}/ --recursive

In [None]:
# Resolve the Kubernetes namespace that the InferenceService metadata uses.
# NOTE(review): `utils` is imported twice in the import cell (first from
# kserve, then from kubeflow.training.utils); the later import wins, so this
# call goes to the kubeflow.training helper - confirm that is intended.
namespace = utils.get_default_target_namespace()
namespace

## Create InferenceService yaml

In [None]:
# CPU request for the predictor container.
resource_req = k8s.V1ResourceRequirements(requests={'cpu': '100m'})

# TorchServe runtime settings: the model store is read from the S3 bucket
# created earlier in the notebook.
torchserve_spec = V1beta1TorchServeSpec(
    storage_uri=f's3://{bucketname}',
    resources=resource_req,
)

# Predictor runs under the "aws-sa" service account
# (presumably the one granting S3 access - verify against the cluster setup).
predictor_spec = V1beta1PredictorSpec(
    service_account_name="aws-sa",
    pytorch=torchserve_spec,
)

#predictor_spec

In [None]:
# Name of the InferenceService and the KServe API version it is created under.
inference_endpoint = 'image-classify'
kserve_version = 'v1beta1'
api_version = '/'.join([constants.KSERVE_GROUP, kserve_version])

# Object metadata; the annotation turns off Istio sidecar injection.
isvc_metadata = client.V1ObjectMeta(
    name=inference_endpoint,
    namespace=namespace,
    annotations={'sidecar.istio.io/inject': 'false'},
)

# The service consists of a single predictor component.
isvc_spec = V1beta1InferenceServiceSpec(predictor=predictor_spec)

isvc = V1beta1InferenceService(
    api_version=api_version,
    kind=constants.KSERVE_KIND,
    metadata=isvc_metadata,
    spec=isvc_spec,
)

#isvc

In [None]:
# Create the KServe inference endpoint by submitting the InferenceService
# object (`isvc`) built in the previous cell. The client instance is kept in
# `KServe` so later cells can query or delete the service.
KServe = KServeClient()
KServe.create(isvc)

In [None]:
#Delete the KServe Inference EndPoint 
#name="image-classify"
#KServe.delete(name, namespace=namespace)

In [None]:
#KServe.get(inference_endpoint, namespace=namespace)

In [None]:
# Watch the InferenceService status for up to 120 seconds while it comes up.
# timeout_seconds only takes effect in watch mode; without watch=True the
# call returns the current object immediately and the timeout is ignored.
KServe.get(inference_endpoint, namespace=namespace, watch=True, timeout_seconds=120)