# Deploy Whisper model on AWS Inferentia2 with manual porting
Manual porting is another option when using AWS Neuron chips. It typically requires a good understanding of the model architecture, so that the model can be split and its individual components compiled.

First, install the dependencies and download the test file. You can skip this step if you have already executed the [01_Whisper_gpu](01_Whisper_gpu.ipynb) notebook.

In [None]:
%%capture
!pip install -U transformers==4.36.2 datasets==2.18.0 soundfile==0.12.1 librosa==0.10.1 sagemaker
!wget --no-check-certificate https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac

Remember to restart the kernel after installing the dependencies.

In [None]:
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFaceModel

# Region used for the SageMaker session and for the container image URIs below.
inf_region = 'us-east-2'

# Build the SageMaker session on top of a boto3 session pinned to that region.
boto_session = boto3.Session(region_name=inf_region)
session = sagemaker.Session(boto_session=boto_session)

# If the execution role cannot be resolved (ValueError), look up the role
# named 'sagemaker_execution_role' in IAM and use its ARN instead.
try:
    role = sagemaker.get_execution_role(sagemaker_session=session)
except ValueError:
    role_description = boto3.client('iam').get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']
print(role)

# Default bucket of the session; used later as the output location of the job.
inf_bucket = session.default_bucket()
print(inf_bucket)

## Compile model to neuron
Before we deploy the model, we need to compile it so it can run on Neuron devices. To do that, we will use a training job on Amazon SageMaker that runs the compilation script and exports the compiled model to S3.

Unlike the Optimum Neuron example [02_Whisper_optimum_neuron](02_Whisper_optimum_neuron.ipynb), here we have to be specific about how we want to compile the model, instead of relying on Optimum Neuron to do it for us.

Let's start by creating the `src` directory, where we put the requirements.txt file for the compilation job and the compilation script.

In [None]:
!mkdir -p src

In [None]:
%%writefile src/requirements.txt
--extra-index-url https://pip.repos.neuron.amazonaws.com
transformers==4.36.2
datasets==2.18.0 
soundfile==0.12.1 
librosa==0.10.1

In [None]:
%%writefile src/compile.py
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
import os
import io
import logging
import argparse
import torch
import types
import torch_neuronx
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch.nn.functional as F
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions,BaseModelOutput

os.environ['NEURON_RT_NUM_CORES'] = '1'

# Configure logging: INFO level, with timestamp, logger name and level in every record.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

def load_model(model_id):
    """Load the Whisper processor and model identified by ``model_id``.

    The model is loaded with ``torchscript=True``. Returns a
    ``(model, processor)`` tuple.
    """
    logger.info(f"Loading model: {model_id}")
    whisper_processor = WhisperProcessor.from_pretrained(model_id)
    whisper_model = WhisperForConditionalGeneration.from_pretrained(
        model_id,
        torchscript=True,
    )
    cfg = whisper_model.config
    logger.info(f"Model loaded - encoder dim: {cfg.num_mel_bins}, decoder dim: {cfg.d_model}")
    return whisper_model, whisper_processor


def enc_f(self, input_features, attention_mask, **kwargs):
    """Replacement ``forward`` for the encoder module.

    Uses the compiled ``forward_neuron`` callable when one is attached to the
    module, otherwise the saved original ``forward_``. The result is wrapped
    in a ``BaseModelOutput``. Extra keyword arguments are accepted and ignored.
    """
    use_compiled = hasattr(self, 'forward_neuron')
    if use_compiled:
        result = self.forward_neuron(input_features, attention_mask)
    else:
        result = self.forward_(input_features, attention_mask, return_dict=True)
    return BaseModelOutput(**result)


def dec_f(self, input_ids, attention_mask=None, encoder_hidden_states=None, **kwargs):
    """Replacement ``forward`` for the decoder module, working on a fixed length.

    ``input_ids`` is right-padded to ``self.max_length`` with the tokenizer's
    pad token id, the decoder is run (compiled ``forward_neuron`` if attached,
    otherwise the saved original ``forward_``), and the outputs are cut back
    to the original sequence length.

    Reads the module-level names ``processor`` and ``output_attentions``,
    which are assigned in the ``__main__`` section of this script.

    Raises:
        Exception: if ``input_ids`` is longer than ``self.max_length``.
    """
    out = None
    # When only two tensors are supplied positionally, the second one arrives
    # as `attention_mask`; treat it as the encoder hidden states instead.
    if not attention_mask is None and encoder_hidden_states is None:
        encoder_hidden_states, attention_mask = attention_mask,encoder_hidden_states
    inp = [input_ids, encoder_hidden_states]

    if inp[0].shape[1] > self.max_length:
        logger.error(f"Decoded sequence length {inp[0].shape[1]} exceeds max {self.max_length}")
        raise Exception(f"The decoded sequence is not supported. Max: {self.max_length}")
    # Right-pad the token ids up to the fixed decoder length.
    pad_size = torch.as_tensor(self.max_length - inp[0].shape[1])
    inp[0] = F.pad(inp[0], (0, pad_size), "constant", processor.tokenizer.pad_token_id)

    if hasattr(self, 'forward_neuron'):
        out = self.forward_neuron(*inp)
    else:
        out = self.forward_(input_ids=inp[0], encoder_hidden_states=inp[1], return_dict=True, use_cache=False, output_attentions=output_attentions)
    # Drop the positions that were only added as padding.
    out['last_hidden_state'] = out['last_hidden_state'][:, :input_ids.shape[1], :]
    # Attention maps (when present) are cropped to the original length,
    # averaged over axis 2 (kept as a size-1 axis) and stacked into one tensor.
    if not out.get('attentions') is None:
        out['attentions'] = torch.stack([torch.mean(o[:, :, :input_ids.shape[1], :input_ids.shape[1]], axis=2, keepdim=True) for o in out['attentions']])
    if not out.get('cross_attentions') is None:
        out['cross_attentions'] = torch.stack([torch.mean(o[:, :, :input_ids.shape[1], :], axis=2, keepdim=True) for o in out['cross_attentions']])
    return BaseModelOutputWithPastAndCrossAttentions(**out)


def proj_out_f(self, inp):
    """Replacement ``forward`` for the output projection, working on a fixed length.

    ``inp`` is padded along dimension 1 up to ``self.max_length``, passed
    through the compiled ``forward_neuron`` (if attached) or the saved
    original ``forward_``, and the result is cut back to the original length.

    Reads the module-level name ``processor`` for the padding value.

    Raises:
        Exception: if ``inp.shape[1]`` is larger than ``self.max_length``.
    """
    pad_size = torch.as_tensor(self.max_length - inp.shape[1], device=inp.device)
    if inp.shape[1] > self.max_length:
        logger.error(f"Input sequence length {inp.shape[1]} exceeds max {self.max_length}")
        raise Exception(f"The decoded sequence is not supported. Max: {self.max_length}")
    # Pad only dimension 1; the padded positions are sliced off again below,
    # so the fill value does not reach the caller.
    x = F.pad(inp, (0,0,0,pad_size), "constant", processor.tokenizer.pad_token_id)

    if hasattr(self, 'forward_neuron'):
        out = self.forward_neuron(x)
    else:
        out = self.forward_(x)
    # Keep only the positions that belong to the original input.
    out = out[:, :inp.shape[1], :]
    return out


def update_forward_methods(model, max_dec_len=35):
    """Patch the encoder, decoder and output projection with the wrapper forwards.

    The original ``forward`` of each module is kept once under ``forward_``
    (an existing ``forward_`` is never overwritten), then ``forward`` is
    rebound to ``enc_f`` / ``dec_f`` / ``proj_out_f``. The decoder and the
    projection also receive ``max_length = max_dec_len``.

    Returns the same (mutated) ``model``.
    """
    logger.info("Updating forward methods for encoder, decoder, and projection output")

    patches = (
        (model.model.encoder, enc_f),
        (model.model.decoder, dec_f),
        (model.proj_out, proj_out_f),
    )

    # Preserve the original forwards first, so repeated calls stay safe.
    for module, _ in patches:
        if not hasattr(module, 'forward_'):
            module.forward_ = module.forward

    # Bind the wrapper functions as instance methods.
    for module, wrapper in patches:
        module.forward = types.MethodType(wrapper, module)

    # Only the decoder and the projection work with a fixed sequence length.
    for module in (model.model.decoder, model.proj_out):
        module.max_length = max_dec_len
    logger.info(f"Forward methods updated - max_length set to {max_dec_len}")

    return model


def trace_encoder(model, model_dir):
    """Trace the encoder with ``torch_neuronx`` and save it as ``enc.neuron``.

    Example inputs are all-zero tensors: float32 features of shape
    ``[1, num_mel_bins, 3000]`` and an int64 tensor of shape
    ``[1, num_mel_bins]``. The artifact is written into ``model_dir``.
    """
    import gc

    logger.info("Starting encoder tracing")
    encoder = model.model.encoder
    n_mels = model.config.num_mel_bins
    example_inputs = (
        torch.zeros([1, n_mels, 3000], dtype=torch.float32),
        torch.zeros([1, n_mels], dtype=torch.int64),
    )

    # The patched forward prefers forward_neuron when present, so remove any
    # previously attached one before tracing.
    if hasattr(encoder, 'forward_neuron'):
        del encoder.forward_neuron

    traced_encoder = torch_neuronx.trace(
        encoder,
        example_inputs,
        compiler_args='--model-type=transformer --auto-cast=all --auto-cast-type=bf16',
        compiler_workdir='./enc_dir',
        inline_weights_to_neff=False,
    )

    save_path = "/".join((model_dir, "enc.neuron"))
    traced_encoder.save(save_path)
    logger.info(f"Encoder traced and saved to {save_path}")

    # Release the example inputs and the traced module before the next trace.
    del example_inputs, traced_encoder
    gc.collect()


def trace_decoder(model, model_dir, max_dec_len):
    """Trace the decoder with ``torch_neuronx`` and save it as ``dec.neuron``.

    Example inputs are all-zero tensors: int64 token ids of shape
    ``[1, max_dec_len]`` and float32 hidden states of shape
    ``[1, 1500, d_model]``. The artifact is written into ``model_dir``.
    """
    import gc

    logger.info("Starting decoder tracing")
    decoder = model.model.decoder
    hidden_size = model.config.d_model
    example_inputs = (
        torch.zeros([1, max_dec_len], dtype=torch.int64),
        torch.zeros([1, 1500, hidden_size], dtype=torch.float32),
    )

    # The patched forward prefers forward_neuron when present, so remove any
    # previously attached one before tracing.
    if hasattr(decoder, 'forward_neuron'):
        del decoder.forward_neuron

    traced_decoder = torch_neuronx.trace(
        decoder,
        example_inputs,
        compiler_args='--model-type=transformer --auto-cast=all --auto-cast-type=bf16',
        compiler_workdir='./dec_dir',
        inline_weights_to_neff=True,
    )

    save_path = "/".join((model_dir, "dec.neuron"))
    traced_decoder.save(save_path)
    logger.info(f"Decoder traced and saved to {save_path}")

    # Release the example inputs and the traced module before the next trace.
    del example_inputs, traced_decoder
    gc.collect()


def proj_output(model, model_dir, max_dec_len):
    """Trace the output projection with ``torch_neuronx`` and save it as ``proj.neuron``.

    The example input is an all-zero float32 tensor of shape
    ``[1, max_dec_len, d_model]``. The artifact is written into ``model_dir``.
    """
    logger.info("Starting projection output tracing")
    projection = model.proj_out
    hidden_size = model.config.d_model
    example_input = torch.zeros([1, max_dec_len, hidden_size], dtype=torch.float32)

    # The patched forward prefers forward_neuron when present, so remove any
    # previously attached one before tracing.
    if hasattr(projection, 'forward_neuron'):
        del projection.forward_neuron

    traced_projection = torch_neuronx.trace(
        projection,
        example_input,
        compiler_args='--model-type=transformer --auto-cast=all --auto-cast-type=bf16',
        compiler_workdir='./proj_out_dir',
        inline_weights_to_neff=True,
    )

    save_path = "/".join((model_dir, "proj.neuron"))
    traced_projection.save(save_path)
    logger.info(f"Projection output traced and saved to {save_path}")

if __name__ == "__main__":
    # Entry point of the compilation job: parse the hyperparameters, load the
    # model, patch its forwards and trace encoder, decoder and projection.
    parser = argparse.ArgumentParser()

    # NOTE: batch_size is accepted for interface compatibility, but the trace
    # functions in this script always use example inputs with batch size 1.
    parser.add_argument("--batch_size", type=int, default=1, help="Number of samples processed in each batch during training or inference")
    parser.add_argument("--max_dec_len", type=int, default=448, help="Maximum sequence length for input data")
    parser.add_argument("--hf_token", type=str, default=None, help="Which is used for authentication with Hugging Face's model hub")
    parser.add_argument("--model_id", type=str, default="openai/whisper-large-v3", help="Specifies the id for the pre-trained model to be used")
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--output_attentions", action="store_true", help="Enable output attentions")

    # Unknown arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    # Log the arguments with the Hugging Face token masked, so the secret
    # never ends up in the job logs.
    loggable_args = argparse.Namespace(**vars(args))
    if loggable_args.hf_token:
        loggable_args.hf_token = "***"
    logger.info(f"Starting model compilation with args: {loggable_args}")

    if args.hf_token:
        # `login` was referenced without ever being imported, which raised a
        # NameError as soon as a token was supplied. Import it where it is used.
        from huggingface_hub import login

        logger.info("HF token provided - logging in to Hugging Face")
        login(token=args.hf_token)

    try:
        # `processor` and `output_attentions` are assigned at module level on
        # purpose: the patched forward functions read them as globals.
        model, processor = load_model(model_id=args.model_id)

        model = update_forward_methods(model,
                                       max_dec_len=args.max_dec_len)

        output_attentions = args.output_attentions

        trace_encoder(model, args.model_dir)
        trace_decoder(model, args.model_dir, args.max_dec_len)
        proj_output(model, args.model_dir, args.max_dec_len)

        logger.info("Model compilation completed successfully")
    except Exception as e:
        # Log with traceback, then re-raise so the job is reported as failed.
        logger.error(f"Model compilation failed: {str(e)}", exc_info=True)
        raise

Define the training job and run it. We are using a trn1.2xlarge instance because compilation requires more memory than running the model. We will use AWS Inferentia2 later to deploy the already compiled model.

In [None]:
import json
import logging
from sagemaker.pytorch import PyTorch

HF_TOKEN = ""
tp_degree = 1
max_dec_len = 35  #--> This was chosen for our audio sample of 13s
model_id = "openai/whisper-large-v3"

# Hyperparameters are handed to compile.py as command-line arguments.
hyperparameters = dict(
    max_dec_len=max_dec_len,
    batch_size=1,
    model_id=model_id,
)

# Forward the Hugging Face token only when one has actually been filled in.
if len(HF_TOKEN or "") > 3:
    hyperparameters["hf_token"] = HF_TOKEN

training_image_uri = f"763104351884.dkr.ecr.{inf_region}.amazonaws.com/pytorch-training-neuronx:2.7.0-neuronx-py310-sdk2.25.0-ubuntu22.04"

estimator = PyTorch(
    # Script and directory with the compilation code
    entry_point="compile.py",
    source_dir="src",
    hyperparameters=hyperparameters,
    # Where and how the job runs
    image_uri=training_image_uri,
    instance_type='ml.trn1.2xlarge',
    instance_count=1,
    env={'NEURON_RT_NUM_CORES': str(tp_degree)},
    # volume_size = 512,
    role=role,
    sagemaker_session=session,
    # Output and logging
    output_path=f"s3://{inf_bucket}/output",
    disable_output_compression=True,
    disable_profiler=True,
    container_log_level=logging.DEBUG,
)

Compilation takes around 18 minutes, but it only needs to be done once.

In [None]:
# Launch the compilation job; wait=True is passed, so the cell is expected to block until the job ends.
estimator.fit(wait=True)

## Deploy compiled model to AWS Inferentia2


After compilation, we have to push the inference script and the requirements file to the same S3 location where the model files are.
The final structure will be as follows:

```bash
s3://bucket/output/job-name/output/model/
├── enc.neuron          # Compiled encoder model
├── dec.neuron          # Compiled decoder model
├── proj.neuron         # Compiled projection model
├── model.pt            # Dummy PyTorch model (for container validation)
└── code/
    ├── inference.py    # Custom inference handler
    └── requirements.txt # Python dependencies
```
**Note: The container runs a validation check on the model files by looking for files with a `.pt` or `.pth` extension. Because we have several compiled artifacts, we saved them with a `.neuron` extension and created a dummy `model.pt` just to pass the validation step. Ultimately, what is loaded and served are our `.neuron` files, which are the compiled versions of our model.**

Let's define the inference.py file below

In [None]:
%%writefile src/inference.py
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

import os
import io
import librosa
import logging
import torch
import types
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline, WhisperConfig
import torch.nn.functional as F
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutput

LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

# Module-level state filled in by model_fn and read by the patched forwards
processor = None
output_attentions = False


def enc_f(self, input_features, attention_mask, **kwargs):
    """Replacement ``forward`` for the encoder module.

    Calls the attached ``forward_neuron`` when the module has one, otherwise
    the original ``forward_``; the result is returned as a
    ``BaseModelOutput``. Extra keyword arguments are accepted and ignored.
    """
    if not hasattr(self, 'forward_neuron'):
        return BaseModelOutput(**self.forward_(input_features, attention_mask, return_dict=True))
    return BaseModelOutput(**self.forward_neuron(input_features, attention_mask))


def dec_f(self, input_ids, attention_mask=None, encoder_hidden_states=None, **kwargs):
    """Replacement ``forward`` for the decoder module, working on a fixed length.

    ``input_ids`` is right-padded to ``self.max_length`` with the tokenizer's
    pad token id, the decoder is run (``forward_neuron`` if attached,
    otherwise ``forward_``), and the outputs are cut back to the original
    sequence length. Uses the module-level ``processor`` and
    ``output_attentions``.

    Raises:
        Exception: if ``input_ids`` is longer than ``self.max_length``.
    """
    global processor, output_attentions

    # A second positional tensor arrives as `attention_mask`; treat it as the
    # encoder hidden states when those were not given explicitly.
    if encoder_hidden_states is None and attention_mask is not None:
        encoder_hidden_states, attention_mask = attention_mask, encoder_hidden_states

    seq_len = input_ids.shape[1]
    if seq_len > self.max_length:
        raise Exception(f"Sequence length {seq_len} exceeds max {self.max_length}")

    # Right-pad the token ids up to the fixed decoder length.
    pad_size = torch.as_tensor(self.max_length - seq_len)
    padded_ids = F.pad(input_ids, (0, pad_size), "constant", processor.tokenizer.pad_token_id)

    if hasattr(self, 'forward_neuron'):
        out = self.forward_neuron(padded_ids, encoder_hidden_states)
    else:
        out = self.forward_(input_ids=padded_ids, encoder_hidden_states=encoder_hidden_states, return_dict=True, use_cache=False, output_attentions=output_attentions)

    # Drop the positions that were only added as padding.
    out['last_hidden_state'] = out['last_hidden_state'][:, :seq_len, :]

    # Attention maps (when present) are cropped to the original length,
    # averaged over axis 2 (kept as a size-1 axis) and stacked into one tensor.
    self_attn = out.get('attentions')
    if self_attn is not None:
        out['attentions'] = torch.stack(
            [torch.mean(a[:, :, :seq_len, :seq_len], axis=2, keepdim=True) for a in self_attn]
        )
    cross_attn = out.get('cross_attentions')
    if cross_attn is not None:
        out['cross_attentions'] = torch.stack(
            [torch.mean(a[:, :, :seq_len, :], axis=2, keepdim=True) for a in cross_attn]
        )
    return BaseModelOutputWithPastAndCrossAttentions(**out)

def proj_out_f(self, inp):
    """Replacement ``forward`` for the output projection, working on a fixed length.

    ``inp`` is padded along dimension 1 up to ``self.max_length`` (only when
    it is shorter), passed through ``forward_neuron`` if attached or
    ``forward_`` otherwise, and the result is cut back to the original length.

    Raises:
        Exception: if ``inp.shape[1]`` is larger than ``self.max_length``.
    """
    global processor
    seq_len = inp.shape[1]
    if seq_len > self.max_length:
        raise Exception(f"Sequence length {seq_len} exceeds max {self.max_length}")

    # Pad only when the input is shorter than the fixed length.
    missing = self.max_length - seq_len
    x = F.pad(inp, (0, 0, 0, missing), "constant", processor.tokenizer.pad_token_id) if missing > 0 else inp

    project = self.forward_neuron if hasattr(self, 'forward_neuron') else self.forward_
    # Keep only the positions that belong to the original input.
    return project(x)[:, :seq_len, :]


def model_fn(model_dir, context=None):
    """Build the speech-recognition pipeline served by the endpoint.

    Settings are read from the environment: ``MODEL_ID``, ``MAX_DEC_LEN`` and
    ``OUTPUT_ATTENTIONS``. The compiled artifacts ``enc.neuron``,
    ``dec.neuron`` and ``proj.neuron`` are loaded from ``model_dir`` and
    attached to the model as ``forward_neuron``. The module-level
    ``processor`` and ``output_attentions`` are set as a side effect.

    Returns the ``automatic-speech-recognition`` pipeline.
    """
    global processor, output_attentions

    env = os.environ
    model_id = env.get("MODEL_ID", "openai/whisper-large-v3")
    max_dec_len = int(env.get("MAX_DEC_LEN", "448"))
    output_attentions = env.get("OUTPUT_ATTENTIONS", "false").lower() == "true"

    # Only the configuration is fetched; no pretrained weights are loaded here.
    config = WhisperConfig.from_pretrained(model_id)

    # Build the module structure on the meta device, then materialize it on
    # CPU with to_empty().
    with torch.device('meta'):
        model = WhisperForConditionalGeneration(config)
    model.to_empty(device='cpu')

    components = (
        (model.model.encoder, "enc.neuron", enc_f),
        (model.model.decoder, "dec.neuron", dec_f),
        (model.proj_out, "proj.neuron", proj_out_f),
    )

    # Attach the compiled artifacts ...
    for module, artifact, _ in components:
        module.forward_neuron = torch.jit.load(os.path.join(model_dir, artifact))

    # ... and route each module's forward through its wrapper.
    for module, _, wrapper in components:
        module.forward = types.MethodType(wrapper, module)

    # The decoder and the projection pad their inputs to this fixed length.
    for module in (model.model.decoder, model.proj_out):
        module.max_length = max_dec_len

    # Load processor last
    processor = WhisperProcessor.from_pretrained(model_id)

    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        chunk_length_s=30,
    )


def input_fn(input_data, content_type, context=None):
    """Decode the request body into an audio array.

    Only ``audio/x-audio`` is accepted; the bytes are loaded with librosa at
    ``sr=16000`` and the resulting array is returned.

    Raises:
        Exception: for any other content type.
    """
    logger.info(f"Processing input with content_type: {content_type}")
    if content_type != 'audio/x-audio':
        raise Exception(f"Unsupported mime type: {content_type}. Supported: audio/x-audio")

    audio_array, sr = librosa.load(io.BytesIO(input_data), sr=16000)
    logger.info(f"Audio loaded - sample rate: {sr}, shape: {audio_array.shape}")
    return audio_array


def predict_fn(audio_array, asr_pipeline, context=None):
    """Run the pipeline on ``audio_array`` and return ``{"transcription": text}``."""
    logger.info(f"Starting inference on audio with shape: {audio_array.shape}")
    text = asr_pipeline(audio_array)["text"]
    logger.info(f"Inference completed - transcription length: {len(text)}")
    return {"transcription": text}

Create the dummy `model.pt` file

In [None]:
import torch

# Script a 1x1 linear layer and write it to model.pt as a placeholder TorchScript file
dummy_model = torch.jit.script(torch.nn.Linear(1, 1))
dummy_model.save('model.pt')

Now we can upload the files. The S3 location can be extracted from the `estimator.model_data` variable

In [None]:
import subprocess

# Extract S3 URI from model_data dict
model_s3_uri = estimator.model_data['S3DataSource']['S3Uri']

# Make sure the prefix ends with exactly one "/" before object names are
# appended, whether or not the URI already carries a trailing slash.
model_s3_prefix = model_s3_uri.rstrip('/') + '/'

# Upload inference code to model/code/ and the dummy model next to the
# compiled artifacts. check=True turns a failed upload into an exception
# instead of letting the notebook continue with a missing file.
subprocess.run(['aws', 's3', 'cp', 'src/', f'{model_s3_prefix}code/', '--recursive'], check=True)
subprocess.run(['aws', 's3', 'cp', 'model.pt', f'{model_s3_prefix}model.pt'], check=True)


Now let's deploy the model.

In [None]:
from sagemaker.model import Model

# Environment of the inference container; model_fn in inference.py reads
# MODEL_ID and MAX_DEC_LEN from it.
inference_env = {
    'MODEL_ID': model_id,
    'MAX_DEC_LEN': str(max_dec_len),
    'NEURON_RT_NUM_CORES': str(tp_degree),
    'SAGEMAKER_MODEL_SERVER_WORKERS': '1',
}

inference_image_uri = f"763104351884.dkr.ecr.{inf_region}.amazonaws.com/pytorch-inference-neuronx:2.7.0-neuronx-py310-sdk2.25.0-ubuntu22.04"

model = Model(
    image_uri=inference_image_uri,
    model_data=estimator.model_data,
    entry_point='inference.py',
    source_dir='src',
    env=inference_env,
    role=role,
    sagemaker_session=session,
)

# NOTE(review): this sets a private SDK attribute; presumably it marks the
# artifacts as already compiled - confirm against the SDK version in use.
model._is_compiled_model = True

Deployment can take roughly 8-10 minutes.

In [None]:
instance_type = "ml.inf2.xlarge"

# It takes some time to download all the artifacts and load the model,
# hence the generous timeouts (in seconds).
deploy_timeouts = {
    "model_data_download_timeout": 3600,
    "container_startup_health_check_timeout": 1800,
}

predictor = model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    **deploy_timeouts,
)

Play the audio file we are going to transcribe

In [None]:
import IPython.display as ipd
import librosa

# Load and play: decode the sample clip into a waveform and its sampling rate.
audio, sr = librosa.load("mlk.flac")
# Left as the last expression of the cell, presumably so the notebook renders the audio player.
ipd.Audio(audio, rate=sr)

Configure the serializer and deserializer for input and output. The input is an audio file and the output is its transcription.

In [None]:
from sagemaker.serializers import DataSerializer
from sagemaker.deserializers import JSONDeserializer	
from sagemaker.predictor import Predictor

# Client for the deployed endpoint: raw audio bytes go in with the
# 'audio/x-audio' content type, the response is parsed as JSON.
predictor = Predictor(
    endpoint_name=model.endpoint_name,
    sagemaker_session=session,
    serializer=DataSerializer(content_type='audio/x-audio'),
    deserializer=JSONDeserializer(),
)

In [None]:
# Read the raw bytes of the sample clip and send them to the endpoint.
with open("mlk.flac", "rb") as audio_file:
    data = audio_file.read()

output = predictor.predict(data)
output

Measure the average transcription time

In [None]:
import time

# Number of requests to average over.
iters = 10

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()
for _ in range(iters):
    predictor.predict(data)
end = time.perf_counter()

# Mean wall-clock seconds per request (includes the network round trip).
transcription_time = (end - start) / iters
print(f"average transcription time is: {transcription_time}")

## Cost performance calculation

In [None]:
# Length of the sample clip; used in the cost-per-second calculation further down.
duration = librosa.get_duration(path="mlk.flac")
print(f"Audio duration: {duration}")

At the moment, `ml.inf2.xlarge` is not supported by the pricing API, so we set the price per hour manually according to the [AWS Pricing Calculator](https://aws.amazon.com/sagemaker/ai/pricing/)

In [None]:
# Hourly instance price, entered by hand.
price=0.99 # USD/hour in us-east-2

In [None]:
# 3600.0/transcription_time is the number of requests that fit in one hour;
# multiplied by `duration` it gives the amount of audio transcribed per hour.
# Dividing the hourly price by that amount yields the cost per unit of audio.
price_to_transcribe_1_sec = price / (3600.0/transcription_time*duration)
print(f"Cost to transcribe 1 second of audio using Whisper on {instance_type}: {price_to_transcribe_1_sec} USD")

## Clean up resources

In [None]:
# Remove the model first, then the endpoint.
# NOTE(review): presumably delete_model() has to run while the endpoint still
# exists so the model can be looked up through it - confirm before reordering.
predictor.delete_model()
predictor.delete_endpoint()