# 音声認識モデル ReazonSpeech を SageMaker の推論エンドポイントにデプロイする

本チュートリアルでは、音声認識モデルである [ReazonSpeech](https://research.reazon.jp/projects/ReazonSpeech/index.html) を Amazon SageMaker 上にデプロイする流れを体験してみます。

## 準備

### モジュールのインポート、定数の設定、boto3 クライアントの設定、ロールの取得

In [None]:
import sagemaker
from typing import Final
import boto3
import os
from time import sleep
from huggingface_hub import hf_hub_download
# AWS service clients used throughout the notebook.
sm_client: Final = boto3.client('sagemaker')
smr_client: Final = boto3.client('sagemaker-runtime')
s3_client: Final = boto3.client('s3')
ecr_client: Final = boto3.client('ecr')

# Waiter used later to wait for the endpoint to come into service.
endpoint_inservice_waiter: Final = sm_client.get_waiter('endpoint_in_service')

# Execution role, region and default bucket as resolved by the SageMaker SDK.
sagemaker_session: Final = sagemaker.Session()
role: Final[str] = sagemaker.get_execution_role()
region: Final[str] = sagemaker_session.boto_region_name
bucket: Final[str] = sagemaker_session.default_bucket()

## モデルの作成

### モデルと推論コードを保存するディレクトリの作成

In [None]:
# Directory that collects the model file and the inference code before packaging.
model_dir: Final[str] = 'model'
# Remove any existing directory so a re-run starts from a clean state.
!if [ -d ./{model_dir} ]; then rm -rf ./{model_dir}/;fi
# Create the directory together with its code/ subdirectory.
!mkdir -p ./{model_dir}/code

### モデルのダウンロード

In [None]:
# Download the checkpoint at a pinned revision into model_dir.
hf_hub_download(
    repo_id='reazon-research/reazonspeech-nemo-v2',
    filename='reazonspeech-nemo-v2.nemo',
    revision='33693408be76b7cba9fd4a7546a0a8772430211b',
    local_dir=model_dir,
)

### 推論コードの作成

In [None]:
%%writefile ./{model_dir}/code/inference.py
import json
import logging
import sys
import os
import io
from reazonspeech.nemo.asr import transcribe, audio_from_numpy, TranscribeConfig
import torch
import soundfile as sf


# Module-level logger for this inference script; its threshold is set to DEBUG.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Transcription settings reused for every prediction; verbose is turned off.
transcribe_config = TranscribeConfig(verbose=False)

def parse_transcribe_result(transcribe_result):
    """Flatten a transcription result into a plain dict.

    Args:
        transcribe_result: Object exposing ``text``, ``subwords`` (items with
            ``seconds``, ``token_id`` and ``token``) and ``segments`` (items
            with ``start_seconds``, ``end_seconds`` and ``text``).

    Returns:
        A dict with the keys ``"text"``, ``"subwords"`` and ``"segments"``;
        the latter two are lists of dicts, one per subword / segment.
    """
    return {
        "text": transcribe_result.text,
        "subwords": [
            {"seconds": sw.seconds, "token_id": sw.token_id, "text": sw.token}
            for sw in transcribe_result.subwords
        ],
        "segments": [
            {"start_seconds": seg.start_seconds, "end_seconds": seg.end_seconds, "text": seg.text}
            for seg in transcribe_result.segments
        ],
    }

def model_fn(model_dir):
    """Restore the ReazonSpeech NeMo checkpoint found in ``model_dir``.

    Args:
        model_dir: Directory containing ``reazonspeech-nemo-v2.nemo``.

    Returns:
        The restored ``EncDecRNNTBPEModel``, loaded with ``map_location`` set
        to the module-level ``device``.
    """
    # Imported inside the function, so NeMo is only imported when model_fn runs.
    from nemo.collections.asr.models import EncDecRNNTBPEModel

    checkpoint_path = os.path.join(model_dir, 'reazonspeech-nemo-v2.nemo')
    return EncDecRNNTBPEModel.restore_from(restore_path=checkpoint_path, map_location=device)

def input_fn(request_body, request_content_type):
    """Decode an audio request payload into samples and a sampling rate.

    Args:
        request_body: Raw bytes of an audio file readable by ``soundfile``.
        request_content_type: Content type of the request; it must start with
            ``audio/`` (compared case-insensitively).

    Returns:
        A dict with ``"array"`` (the samples returned by ``sf.read``) and
        ``"sr"`` (the sampling rate returned by ``sf.read``).

    Raises:
        ValueError: If the content type is missing or is not an ``audio/`` type.
    """
    # The content type may be None or empty when the caller sends no header;
    # normalise it so that case is rejected with ValueError rather than
    # failing on ``.startswith`` with AttributeError.
    content_type = (request_content_type or '').strip().lower()
    # Require the slash so look-alikes such as "audiobook/x" are not accepted.
    if not content_type.startswith('audio/'):
        raise ValueError('Content type: audio/ is only accepted.')

    audio_array, sampling_rate = sf.read(io.BytesIO(request_body))
    return {"array": audio_array, "sr": sampling_rate}

def predict_fn(input_object, model):
    """Transcribe the decoded audio with the loaded model.

    Args:
        input_object: Dict with ``"array"`` (samples) and ``"sr"`` (sampling
            rate), as returned by ``input_fn``.
        model: Model object returned by ``model_fn``.

    Returns:
        The value returned by ``transcribe`` for this audio.
    """
    samples, sampling_rate = input_object["array"], input_object["sr"]
    return transcribe(model, audio_from_numpy(samples, sampling_rate), transcribe_config)

def output_fn(predictions, content_type):
    """Serialize the transcription result as a JSON string.

    Args:
        predictions: Result returned by ``predict_fn``.
        content_type: Requested response content type; it is not consulted,
            the output is always JSON.

    Returns:
        JSON text in which non-ASCII characters are left unescaped.
    """
    payload = parse_transcribe_result(predictions)
    return json.dumps(payload, ensure_ascii=False)

### モデルと推論コードを `model.tar.gz` に固める

In [None]:
# Archive from inside the model directory so paths in the archive are relative to it.
%cd {model_dir}
!tar zcvf model.tar.gz ./*
# Move the archive up one level and return to the starting directory.
!mv model.tar.gz ../
%cd ..

### `model.tar.gz` を S3 にアップロード

In [None]:
# Show the default S3 bucket of the SageMaker session.
print(sagemaker.Session().default_bucket())

In [None]:
# Upload the packaged model under the key prefix and keep the returned S3 URI.
s3_key_prefix: Final[str] = 'reazonspeech'
model_s3_uri: Final[str] = sagemaker.session.Session().upload_data(
    path='./model.tar.gz',
    key_prefix=s3_key_prefix,
)
print(model_s3_uri)

## 推論用コンテナの作成

### ECR リポジトリを作成

In [None]:
repo_name: Final[str] = 'sagemaker-reazonspeech'
# Re-running this cell must not fail just because the repository is already
# there: try to create it, and look it up instead when it already exists.
try:
    _repository = ecr_client.create_repository(repositoryName=repo_name)['repository']
except ecr_client.exceptions.RepositoryAlreadyExistsException:
    _repository = ecr_client.describe_repositories(repositoryNames=[repo_name])['repositories'][0]
repo_uri: Final[str] = _repository['repositoryUri']

### `Dockerfile` の作成

In [None]:
%%writefile ./Dockerfile
# Base image: the pytorch-inference 2.3.0 GPU SageMaker image. The region
# placeholder in the registry hostname is substituted by a later notebook cell.
ARG BASE_IMAGE="763104351884.dkr.ecr.BASE_IMAGE_REGION.amazonaws.com/pytorch-inference:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker"
FROM ${BASE_IMAGE}
# Pinned versions of the packages installed below.
ARG REAZONSPEECH_RELEASE="2.0.0"
ARG HUGGINGFACE_HUB_RELEASE="0.23.2"
# System-level ffmpeg.
RUN apt update && apt install -y ffmpeg
# NOTE(review): Cython is installed before ReazonSpeech, presumably as a build-time requirement; confirm.
RUN pip install Cython
# Fetch the tagged ReazonSpeech source archive and install its nemo-asr package from the extracted tree.
RUN wget https://github.com/reazon-research/ReazonSpeech/archive/refs/tags/v${REAZONSPEECH_RELEASE}.tar.gz && \
    tar -zxvf v${REAZONSPEECH_RELEASE}.tar.gz && \
    pip install ReazonSpeech-${REAZONSPEECH_RELEASE}/pkg/nemo-asr
# Install huggingface-hub at the pinned version.
RUN pip install huggingface-hub==${HUGGINGFACE_HUB_RELEASE}

In [None]:
# Substitute the region placeholder in the Dockerfile with the current region.
!sed -i "s/BASE_IMAGE_REGION/{region}/" Dockerfile

### コンテナイメージのビルド

In [None]:
# Full image reference in the form "<repository URI>:<tag>".
image_tag: Final[str] = 'gpu'
image_uri: Final[str] = f'{repo_uri}:{image_tag}'

In [None]:
# When SAGEMAKER_SPACE_NAME is set, docker build is given "--network sagemaker";
# otherwise no extra options are passed.
docker_options: Final[str] = (
    '--network sagemaker' if os.environ.get('SAGEMAKER_SPACE_NAME') else ''
)

In [None]:
# Log in to the registry that hosts the base image referenced in the Dockerfile.
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin 763104351884.dkr.ecr.{region}.amazonaws.com
# Build from the Dockerfile in the current directory and tag the result with image_uri.
!docker build {docker_options} -t {image_uri} .

### コンテナイメージを ECR リポジトリにプッシュ

In [None]:

# Log in to the registry of the repository created earlier, then push the built image to it.
!aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {repo_uri}
!docker push {image_uri}

## AWS SDK for Python でモデルをデプロイしてリアルタイム推論

### 推論エンドポイントのデプロイ

In [None]:
# Resource names; the endpoint config and endpoint names are derived from the model name.
model_name: Final[str] = 'ReazonSpeech'
endpoint_config_name: Final[str] = f'{model_name}EndpointConfig'
endpoint_name: Final[str] = f'{model_name}Endpoint'

In [None]:
# Create the Model: the custom container image plus the model archive in S3.
response = sm_client.create_model(
    ModelName=model_name,
    PrimaryContainer={
        'Image': image_uri,
        'ModelDataUrl': model_s3_uri,
    },
    ExecutionRoleArn=role,
)

# Create the EndpointConfig: a single variant with one ml.p3.2xlarge instance.
response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            'VariantName': 'AllTrafic',
            'ModelName': model_name,
            'InitialInstanceCount': 1,
            'InstanceType': 'ml.p3.2xlarge',
        },
    ],
)

# Create the Endpoint from that config.
response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)

# Wait until the Endpoint is in service, polling every 5 seconds.
# MaxAttempts is set explicitly: lowering Delay alone keeps the waiter's
# default attempt count and therefore shrinks the total wait time, which can
# raise WaiterError while the endpoint is still being created.
# 720 attempts x 5 s allows up to 60 minutes.
endpoint_inservice_waiter.wait(
    EndpointName=endpoint_name,
    WaiterConfig={'Delay': 5, 'MaxAttempts': 720},
)

### 推論の実行

事前に任意の WAV ファイルをダウンロードして、下記コード内の `file_name` にファイルパスを指定してください。

In [None]:
file_name = 'path/your-audio-data.wav'

# Read the whole file up front so it is closed before the network call.
with open(file_name, 'rb') as audio_file:
    audio_bytes = audio_file.read()

# The bytes are sent as-is; no second copy of the payload is made.
response = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='audio/wav',
    Accept='application/json',
    Body=audio_bytes,
)
# The response body is a stream; read it fully and decode the JSON text.
predictions = response['Body'].read().decode('utf-8')
print(predictions)

## ECR リポジトリ, Model, EndpointConfig, Endpoint を削除

In [None]:
# Tear down in the reverse order of creation: endpoint, endpoint config, model.
sm_client.delete_endpoint(EndpointName=endpoint_name)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm_client.delete_model(ModelName=model_name)
# force=True is passed because the repository still holds the image pushed earlier.
ecr_client.delete_repository(repositoryName=repo_name, force=True)

In [None]:
# NOTE(review): presumably a short pause to let the deletions above settle; confirm it is needed.
sleep(5)