# Korean ASR with Riva

Refer to: https://docs.nvidia.com/deeplearning/riva/user-guide/docs/asr/asr-customizing.html

## Download Korean ASR models from NGC

In [None]:
# Download the Korean acoustic models from NGC.
# Prerequisites:
#     1) The NGC CLI is installed.
#     2) You have write permission on the directory the models are downloaded into.
#     3) The NGC CLI should be run from the directory where the models are to be stored (i.e., ./models/korean).
#     4) The NGC CLI may also be run outside the container (i.e., on the local workstation). In that case, make sure the download directory is mounted into the container so the models are accessible inside it.

# Conformer-CTC
import numpy as np  # not used by the download commands in this cell
!ngc registry model download-version "nvidia/tao/speechtotext_ko_kr_conformer:deployable_v1.0"
# Citrinet-1024
!ngc registry model download-version "nvidia/tao/speechtotext_ko_kr_citrinet:deployable_v1.0"

In [None]:
# Download the Korean N-gram language model (used by the decoder) from NGC.
!ngc registry model download-version "nvidia/tao/speechtotext_ko_kr_lm:deployable_v1.0"

In [None]:
# Download the Korean punctuation model from NGC (optional).
!ngc registry model download-version "nvidia/tao/punctuationcapitalization_ko_kr_bert_base:deployable_v1.1"

In [None]:
!apt-get install -y libsndfile1-dev
!pip install librosa
!pip install datasets[audio]
!pip install openai
!pip install -U openai-whisper
!pip install triton client
!pip install tritonclient
!pip install nvidia-pyindex
!pip install nvidia-eff
!pip install nemo2riva


In [None]:
# Install the ALSA/PortAudio system packages first, then PyAudio via pip.
!apt-get install -y libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 python3-pyaudio
!pip install pyaudio
# If the installation fails, see: https://stackoverflow.com/questions/59006083/how-to-install-portaudio-on-pi-properly

In [None]:
# NOTE(review): `!cd` runs in a temporary subshell, so this line does not change
# the notebook kernel's working directory. Use `%cd ../..` if a persistent change
# is intended, and re-check the relative paths used by later cells if you do.
!cd ../..

## Build and deploy

Let's deploy Korean Citrinet-1024 as an example. The detailed pipeline configurations are specified in https://docs.nvidia.com/deeplearning/riva/user-guide/docs/asr/asr-pipeline-configuration.html.
1. Launch Riva Servicemaker **at your workstation**.
    - `./scripts/build_deploy/riva_servicemaker.sh`
        - `docker run --init -it --rm --gpus all -v riva-model-repo:/data -v E:\기본연구_Data\riva_quickstart_v2.8.1\riva_demo:/servicemaker-dev --name riva-service-maker nvcr.io/nvidia/riva/riva-speech:2.8.1-servicemaker /bin/bash`
2. Build rmir file **inside the servicemaker container**.
    - `cd /servicemaker-dev`
    - `./scripts/build_deploy/korean_models/riva_asr_citrinet_kr_build.sh`
    - The script runs a command like the following:
        ```sh
        riva-build speech-recognition \
            /servicemaker-dev/<rmir_filename>:<encryption_key> \
            /servicemaker-dev/<riva_filename>:<encryption_key> \
            --name=<pipeline_name> \
            --decoder_type=flashlight \
            --decoding_language_model_binary=<KENLM_binary_filename> \
            --decoding_vocab=<decoder_vocab_file>
            ...
        ```
3. Deploy models **inside the servicemaker container**.
    - `./scripts/build_deploy/korean_models/riva_asr_citrinet_kr_deploy.sh`

You can also deploy Conformer-CTC for Korean with the same procedure using those scripts:
- `./scripts/build_deploy/korean_models/riva_asr_conformer_kr_build.sh`
- `./scripts/build_deploy/korean_models/riva_asr_conformer_kr_deploy.sh`

For reference, **building and deploying each model takes more than 30 minutes (Citrinet-1024 takes considerably longer).** After the deployment is done, restart the Riva server on your local workstation.
```bash
bash resources/riva_quickstart_v2.8.1/riva_stop.sh
bash resources/riva_quickstart_v2.8.1/riva_start.sh
```

## Check whether your model is successfully deployed using Triton APIs

In [None]:
import grpc
from tritonclient.grpc import service_pb2, service_pb2_grpc

# The host port mapped to Triton's gRPC port 8001 has to be looked up with
# `docker ps` and updated here each time it changes.
# trt_channel = grpc.insecure_channel("riva-speech:57876")
trt_channel = grpc.insecure_channel("localhost:8001")
grpc_stub = service_pb2_grpc.GRPCInferenceServiceStub(trt_channel)

# Liveness probe: print the server's reply, or the error if the call fails.
try:
    request = service_pb2.ServerLiveRequest()
    response = grpc_stub.ServerLive(request)
    print("server {}".format(response))
except Exception as ex:
    print(ex)

In [None]:
# Ask Triton for its model repository index and list every entry it reports.
request = service_pb2.RepositoryIndexRequest()
response = grpc_stub.RepositoryIndex(request)

model_entries = list(response.models)
print("num models: {}\n".format(len(response.models)))
print(model_entries)

## Offline test

In [None]:
import IPython.display as ipd
import io, os
import librosa
import riva.client
from tqdm.auto import tqdm
import json

# Server address and data root. The defaults reproduce the original setup; set
# the RIVA_URI / NEMO_DIR environment variables to run against another server
# or data location without editing the notebook.
RIVA_URI = os.environ.get("RIVA_URI", "localhost:50051")
nemo_dir = os.environ.get("NEMO_DIR", "E:/기본연구_Data")

# Create Riva clients and connect to the Riva Speech API server.
auth = riva.client.Auth(uri=RIVA_URI)

# Service clients used by the cells below.
riva_asr = riva.client.ASRService(auth)
riva_nlp = riva.client.NLPService(auth)
# riva_tts = riva.client.SpeechSynthesisService(auth)

In [None]:

# Transcribe every utterance listed in the manifest with Riva offline ASR.
# `preds` holds the best hypothesis per utterance and `refs` the reference
# text; both lists stay index-aligned.
preds = []
refs = []
# The manifest is read as UTF-8 explicitly so the (Korean) reference text does
# not depend on the platform's default encoding.
# NOTE(review): assumes the manifest was written as UTF-8 -- confirm.
with open(f"{nemo_dir}/Validation_RealAudio/Manifests/test_merged.json", 'r', encoding='utf-8') as manifest_file:
    for line in tqdm(manifest_file):
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the manifest).
            continue
        sample = json.loads(line)

        # Only the native sampling rate is needed here, so query it directly
        # instead of decoding the whole waveform.
        sr = librosa.get_samplerate(sample['audio_filepath'])
        with io.open(sample['audio_filepath'], 'rb') as fh:
            content = fh.read()

        offline_config = riva.client.RecognitionConfig(
            encoding=riva.client.AudioEncoding.LINEAR_PCM,   # Supports LINEAR_PCM, FLAC, MULAW and ALAW audio encodings
            sample_rate_hertz=sr,                            # Audio will be resampled if necessary
            max_alternatives=1,                              # How many top-N hypotheses to return
            enable_automatic_punctuation=False,              # Automatic punctuation is turned off
            audio_channel_count=1,                           # Mono channel
            verbatim_transcripts=True,
            # If several deployed models can fulfil the request, one is selected
            # at random, so the model is named explicitly.
            # model="jbs_quartznet_kr_E500",
            model="woojin-conformer-ko-KR-asr-streaming",
            # model="woojin-citrinet-1024-ko-KR-asr-streaming",
        )
        response = riva_asr.offline_recognize(content, offline_config)

        # A response without results/alternatives is scored as an empty hypothesis.
        try:
            asr_best_transcript = response.results[0].alternatives[0].transcript
        except IndexError:
            asr_best_transcript = ""
        preds.append(asr_best_transcript)
        refs.append(sample['text'])


In [None]:
# Transcribe a single file and inspect the full response message.
# NOTE: this reuses `offline_config` as left behind by the manifest loop cell,
# so that cell must have been run first.
path = "../../Validation_RealAudio/audio/DOC/20220805_1_DOC_058.wav"
with io.open(path, 'rb') as fh:
    content = fh.read()
# A bare expression is only rendered when it is the last statement of a cell,
# so the audio player is displayed explicitly.
ipd.display(ipd.Audio(path))

response = riva_asr.offline_recognize(content, offline_config)
# A response without results/alternatives yields an empty transcript instead of an IndexError.
try:
    asr_best_transcript = response.results[0].alternatives[0].transcript
except IndexError:
    asr_best_transcript = ""
print("ASR Transcript:", asr_best_transcript)

print("\n\nFull Response Message:")
print(response)

In [None]:
import jiwer
import pandas as pd
from whisper.normalizers import BasicTextNormalizer

# Score the Riva hypotheses (`preds`) against the references (`refs`).
# Both sides are passed through the same text normalizer before WER/CER are
# computed; raw and normalized text are saved to CSV for inspection.
normalizer = BasicTextNormalizer()
data = pd.DataFrame(dict(hypothesis=preds, reference=refs))
data["hypothesis_clean"] = [normalizer(text) for text in preds]
data["reference_clean"] = [normalizer(text) for text in refs]
data.to_csv('result-conformer.csv')

# Convert the normalized columns to plain lists once and reuse them for both metrics.
references_clean = list(data["reference_clean"])
hypotheses_clean = list(data["hypothesis_clean"])
wer = jiwer.wer(references_clean, hypotheses_clean)
cer = jiwer.cer(references_clean, hypotheses_clean)
print(f"WER: {wer * 100:.3f} %")
print(f"CER: {cer * 100:.3f} %")

In [None]:
# Optional: run the punctuation model on the most recent ASR transcript.
model_name = "woojin-punctuation-KR"
response = riva_nlp.transform_text(input_strings=asr_best_transcript, model_name=model_name)

print("Transformed results are:")
print("\n".join(response.text))

## Whisper Benchmark
https://github.com/openai/whisper

In [None]:
# Load the "Bingsu/zeroth-korean" dataset with the `datasets` library.
# NOTE: the Whisper benchmark cells that follow read the local manifest, not
# `dataset`; the zeroth-korean section further down loads this dataset again.
from datasets import load_dataset
dataset = load_dataset("Bingsu/zeroth-korean")

In [None]:
import openai  # not referenced by the code in this cell
import whisper
# Load the "large" Whisper checkpoint and fix the decoding language to Korean.
model = whisper.load_model("large")
options = whisper.DecodingOptions(language="ko")

In [None]:
# Transcribe every utterance in the manifest with Whisper.
# `preds_w` holds the decoded text and `refs_w` the reference text, index-aligned.
preds_w = []
refs_w = []
# The manifest is read as UTF-8 explicitly so the (Korean) reference text does
# not depend on the platform's default encoding.
# NOTE(review): assumes the manifest was written as UTF-8 -- confirm.
with open(f"{nemo_dir}/Validation_RealAudio/Manifests/test_merged.json", 'r', encoding='utf-8') as manifest_file:
    for line in tqdm(manifest_file):
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the manifest).
            continue
        sample = json.loads(line)

        audio = whisper.load_audio(sample['audio_filepath'])
        # pad_or_trim fits the audio to Whisper's fixed-length input window,
        # so recordings longer than that window are truncated.
        audio = whisper.pad_or_trim(audio)
        # Use the mel-bin count the loaded checkpoint expects; the default (80)
        # does not match "large" checkpoints that use 128 mel bins.
        mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
        result = whisper.decode(model, mel, options)
        preds_w.append(result.text)
        refs_w.append(sample['text'])


In [None]:
import jiwer
import pandas as pd
from whisper.normalizers import BasicTextNormalizer

# Score the Whisper hypotheses (`preds_w`) against the references (`refs_w`).
# Both sides are passed through the same text normalizer before WER/CER are
# computed; raw and normalized text are saved to CSV for inspection.
normalizer = BasicTextNormalizer()
data = pd.DataFrame(dict(hypothesis=preds_w, reference=refs_w))
data["hypothesis_clean"] = [normalizer(text) for text in preds_w]
data["reference_clean"] = [normalizer(text) for text in refs_w]
data.to_csv('result-whisper-large.csv')

# Convert the normalized columns to plain lists once and reuse them for both metrics.
references_clean = list(data["reference_clean"])
hypotheses_clean = list(data["hypothesis_clean"])
wer = jiwer.wer(references_clean, hypotheses_clean)
cer = jiwer.cer(references_clean, hypotheses_clean)
print(f"WER: {wer * 100:.3f} %")
print(f"CER: {cer * 100:.3f} %")

### E100 model

In [None]:
import nemo
import nemo.collections.asr as nemo_asr

# Transcribe the manifest with a fine-tuned NeMo CTC checkpoint.
# `refs_j` holds the reference text and `audiofiles` the matching audio paths;
# `preds_j` is produced by a single batched `transcribe` call at the end.
refs_j = []
audiofiles = []

# Checkpoint under test (epoch 100). Alternative checkpoints, kept for reference:
#model_j = nemo_asr.models.EncDecCTCModel.restore_from(f"{nemo_dir}/ASR_Nemo_Results/A6000/Model-ko-epoch 500-00-59-29-January-from-RealAudio.nemo")
#model_j = nemo_asr.models.EncDecCTCModel.restore_from(f"{nemo_dir}/ASR_Nemo_Results/A6000/Model-ko-epoch 150-06-40-07-January-from-RealAudio.nemo")
model_j = nemo_asr.models.EncDecCTCModel.restore_from(f"{nemo_dir}/ASR_Nemo_Results/A6000/Model-ko-epoch 100-04-12-06-January-from-RealAudio.nemo")

# The manifest is read as UTF-8 explicitly so the (Korean) reference text does
# not depend on the platform's default encoding.
# NOTE(review): assumes the manifest was written as UTF-8 -- confirm.
with open(f"{nemo_dir}/Validation_RealAudio/Manifests/test_merged.json", 'r', encoding='utf-8') as manifest_file:
    for line in tqdm(manifest_file):
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at the end of the manifest).
            continue
        sample = json.loads(line)
        audiofiles.append(sample['audio_filepath'])
        refs_j.append(sample['text'])

preds_j = model_j.transcribe(paths2audio_files=audiofiles)


In [None]:
import jiwer
import pandas as pd
from whisper.normalizers import BasicTextNormalizer

# Score the fine-tuned model's hypotheses (`preds_j`) against the references (`refs_j`).
# Both sides are passed through the same text normalizer before WER/CER are
# computed; raw and normalized text are saved to CSV for inspection.
normalizer = BasicTextNormalizer()
data = pd.DataFrame(dict(hypothesis=preds_j, reference=refs_j))
data["hypothesis_clean"] = [normalizer(text) for text in preds_j]
data["reference_clean"] = [normalizer(text) for text in refs_j]
data.to_csv('result-finetuned-E100.csv')

# Convert the normalized columns to plain lists once and reuse them for both metrics.
references_clean = list(data["reference_clean"])
hypotheses_clean = list(data["hypothesis_clean"])
wer = jiwer.wer(references_clean, hypotheses_clean)
cer = jiwer.cer(references_clean, hypotheses_clean)
print(f"WER: {wer * 100:.3f} %")
print(f"CER: {cer * 100:.3f} %")

## zeroth-korean


In [None]:
# Load the "Bingsu/zeroth-korean" dataset and show the first training example.
from datasets import load_dataset
dataset = load_dataset("Bingsu/zeroth-korean")
dataset["train"][0]

In [None]:
import whisper
# Load the "large" Whisper checkpoint and fix the decoding language to Korean.
# NOTE: `options` is not used by the zeroth-korean loop below, which passes
# `language="ko"` to `whisper.transcribe` directly.
model = whisper.load_model("large")
options = whisper.DecodingOptions(language="ko")

In [None]:
import numpy as np

# Run Whisper over the zeroth-korean test split.
# `preds_z` keeps the full result returned by `whisper.transcribe` for each
# example (the evaluation cell reads its 'text' entry); `refs_z` keeps the
# reference text.
preds_z = []
refs_z = []

for example in tqdm(dataset['test']):
    # Cast the waveform to float32 before handing it to Whisper.
    waveform = np.float32(example['audio']['array'])
    preds_z.append(whisper.transcribe(model, waveform, language="ko"))
    refs_z.append(example['text'])


In [None]:
import jiwer
import pandas as pd
from whisper.normalizers import BasicTextNormalizer

# Score Whisper on the zeroth-korean test split.
# Each entry of `preds_z` is a transcription result; only its 'text' field is scored.
preds_z2 = [result['text'] for result in preds_z]

# Both sides are passed through the same text normalizer before WER/CER are
# computed; raw and normalized text are saved to CSV for inspection.
normalizer = BasicTextNormalizer()
data = pd.DataFrame(dict(hypothesis=preds_z2, reference=refs_z))
data["hypothesis_clean"] = [normalizer(text) for text in preds_z2]
data["reference_clean"] = [normalizer(text) for text in refs_z]
data.to_csv('result-whisper-zeroth-testset.csv')

# Convert the normalized columns to plain lists once and reuse them for both metrics.
references_clean = list(data["reference_clean"])
hypotheses_clean = list(data["hypothesis_clean"])
wer = jiwer.wer(references_clean, hypotheses_clean)
cer = jiwer.cer(references_clean, hypotheses_clean)
print(f"WER: {wer * 100:.3f} %")
print(f"CER: {cer * 100:.3f} %")