### 1. 安装依赖 & 变量设置

In [None]:
# Image: PyTorch 2.0.0 Python 3.10 CPU Optimized
# Kernel: Python3

In [None]:
!pip install huggingface-hub -Uqq
!pip install --upgrade sagemaker -Uqq

In [None]:
# Quietly install/upgrade `datasets` and `urlparse` from the Tsinghua PyPI mirror.
!pip install -Uqq datasets urlparse -i https://pypi.tuna.tsinghua.edu.cn/simple

In [None]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

# Execution role used for the endpoint.
role = sagemaker.get_execution_role()
# SageMaker session used for interacting with the different AWS APIs.
sess = sagemaker.session.Session()
# Default bucket of the session; houses the model artifact.
bucket = sess.default_bucket()

# Read the region through the public `boto_region_name` attribute rather than
# the private `_region_name` field, which is an implementation detail.
region = sess.boto_region_name
account_id = sess.account_id()

# Pin the low-level clients to the session's region so every client in the
# notebook talks to the same region.
s3_client = boto3.client("s3", region_name=region)
sm_client = boto3.client("sagemaker", region_name=region)
smr_client = boto3.client("sagemaker-runtime", region_name=region)

In [None]:
from pathlib import Path

# Key prefix under the bucket where the model tarball is uploaded.
s3_code_prefix = "aigc-asr-models"

# Local staging directory for the model files; created if it does not exist yet.
local_model_path = Path("./whisper-model")
local_model_path.mkdir(exist_ok=True)

### 2. 模型部署准备（entrypoint脚本，容器镜像，服务配置）

In [None]:
# Repository name and tag of the Hugging Face PyTorch inference image; kept in
# one place so the global and China URIs cannot drift apart.
hf_inference_image = (
    "huggingface-pytorch-inference:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04"
)

# The AWS China regions need a different registry account and the
# `.amazonaws.com.cn` domain; every other region uses the default registry.
if region in ('cn-north-1', 'cn-northwest-1'):
    ecr_registry = f"727897471807.dkr.ecr.{region}.amazonaws.com.cn"
else:
    ecr_registry = f"763104351884.dkr.ecr.{region}.amazonaws.com"

inference_image_uri = f"{ecr_registry}/{hf_inference_image}"

print(f"Image going to be used is ---- > {inference_image_uri}")

In [None]:
# Create ./code (no error if it already exists); inference.py and
# requirements.txt are written into it by the following cells.
!mkdir -p code

In [None]:
%%writefile ./code/inference.py
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Task name handed to transformers.pipeline()
task = "automatic-speech-recognition"

# Run on the first CUDA device in half precision when CUDA is available,
# otherwise fall back to CPU in single precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

def model_fn(model_dir):
    """Load the speech model from ``model_dir`` and wrap it in an ASR pipeline.

    NOTE(review): presumably called by the SageMaker Hugging Face inference
    toolkit when the container starts -- confirm against the toolkit docs.

    Args:
        model_dir: Directory passed to ``from_pretrained`` for both the model
            and the processor.

    Returns:
        A ``transformers`` pipeline for the module-level ``task``, configured
        with ``return_timestamps=True`` and placed on ``device``.

    Raises:
        Exception: Any error raised while loading is printed and re-raised.
    """
    try:
        print(f"Loading model: {model_dir}")
        # Weights are read from safetensors files with reduced CPU memory use.
        speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_dir,
            torch_dtype=torch_dtype,
            low_cpu_mem_usage=True,
            use_safetensors=True,
        )
        speech_model.to(device)
        print(f"Model loaded on device: {device}")

        # The processor supplies the tokenizer and the feature extractor.
        speech_processor = AutoProcessor.from_pretrained(model_dir)
        print("Processor loaded")

        pipeline_options = dict(
            model=speech_model,
            tokenizer=speech_processor.tokenizer,
            feature_extractor=speech_processor.feature_extractor,
            return_timestamps=True,
            torch_dtype=torch_dtype,
            device=device,
        )
        recognizer = pipeline(task, **pipeline_options)
        print("Pipeline created")
    except Exception as load_error:
        # Print the failure, then propagate it unchanged to the caller.
        print(f"An error occurred: {load_error}")
        raise
    else:
        return recognizer

#### 执行下面这个cell，在requirements.txt中添加国内的pip镜像

In [None]:
%%writefile ./code/requirements.txt
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.38.0
accelerate==0.26.1

In [None]:
# 1. First install the required library (modelscope) from the Tsinghua PyPI mirror
!pip install -U modelscope -i https://pypi.tuna.tsinghua.edu.cn/simple

In [None]:
# 2. Download the model files
from modelscope import snapshot_download

model_id = "openai-mirror/whisper-large-v3-turbo"

# Download into the `local_model_path` directory created in the setup cell.
# The name is deliberately not rebound to a plain string here, so it stays a
# pathlib.Path for the rest of the notebook. Markdown files and git metadata
# are excluded from the download.
snapshot_download(
    model_id=model_id,
    local_dir=str(local_model_path),
    ignore_patterns=["*.md", ".git*"]
)

In [None]:
# 3. Package the model files: archive the contents of the model directory
#    (not the directory itself) into model.tar.gz
!tar -czf model.tar.gz -C {local_model_path} .

# 4. Check the size of the packaged archive
!ls -lh model.tar.gz

In [None]:
# Upload the tarball to the bucket under `s3_code_prefix`; the value returned
# by upload_data is kept as the model location for the next section.
model_uri = sess.upload_data(
    path="model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {model_uri}")

### 3. 创建模型 & 创建endpoint

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

# SageMaker model name, suffixed with the account id.
model_name = f"whisper-large-v3-{account_id}"

# Model definition: uploaded weights tarball, custom entry point taken from
# ./code, and the inference image chosen earlier.
whisper_hf_model = HuggingFaceModel(
    name=model_name,
    role=role,
    image_uri=inference_image_uri,
    model_data=model_uri,
    source_dir="./code",
    entry_point="inference.py",
)

In [None]:
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Endpoint name, prefixed with the account id.
endpoint_name = f'{account_id}-whisper-real-time-endpoint'

# Deploy the model to a single ml.g4dn.xlarge instance and keep the returned
# predictor for the test and cleanup sections.
real_time_predictor = whisper_hf_model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.g4dn.xlarge",
    initial_instance_count=1,
)

### 4. 模型测试

In [None]:
from sagemaker.serializers import DataSerializer

# Send the request body as raw bytes tagged with the 'audio/x-audio' content type.
real_time_predictor.serializer = DataSerializer(content_type='audio/x-audio')

# Make sure the input file below exists before running this cell.
sample_audio_path = "./cosyvoice/happy.wav"
with open(sample_audio_path, "rb") as f:
    data = f.read()

# Last expression of the cell: the prediction is displayed as the cell output.
real_time_predictor.predict(data)

In [33]:
import boto3

# Same request as the previous cell, issued through the low-level
# sagemaker-runtime client instead of the SDK predictor.
sagemaker_client = boto3.client("sagemaker-runtime", region_name=region)

# Read the whole audio file into memory.
with open("./cosyvoice/happy.wav", "rb") as f:
    data = f.read()

resp = sagemaker_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='audio/x-audio',
    Body=data,
)
# The response body is a stream; read it fully and decode it as text.
print(resp["Body"].read().decode("utf8"))

{"text":"希望你以后能够做得比我还好哟","chunks":[{"timestamp":[0.0,3.14],"text":"希望你以后能够做得比我还好哟"}]}


### 5. 清理模型端点

In [None]:
# Delete the model first: the predictor resolves its model names through the
# endpoint configuration, which delete_endpoint() removes by default. With the
# calls in the opposite order, delete_model() can fail on that lookup and
# leave the model behind.
real_time_predictor.delete_model()
real_time_predictor.delete_endpoint()