**Installing Malaya-Speech Library**

In [None]:
!pip install malaya-speech -q

Additional Tensorflow Dependencies


*   requires tensorflow-addons version 0.17.1 for tensorflow == 2.8, 2.9, 2.10 & python == 3.7, 3.8, 3.9, 3.10
*   additional info regarding requirements: https://github.com/tensorflow/addons



In [None]:
!pip install tensorflow-addons==0.17.1 -q

Requirement check: TensorFlow >= 1.15.0 and PyTorch >= 1.10. The next two cells print the installed versions.

In [None]:
# Print the installed PyTorch version so it can be compared with the stated minimum.
import torch
print(torch.__version__)

In [None]:
# Print the installed TensorFlow version so it can be compared with the stated minimum.
import tensorflow as tf
print(tf.__version__)

**Model Import**

In [None]:
import malaya_speech
import numpy as np
from malaya_speech import Pipeline

----------------------------------------------------------------------------------------------------------------------------------------------------------------

**RNNT Models**

List available transducers: 

In [None]:
# Display the transducer models that malaya_speech.stt reports as available.
malaya_speech.stt.available_transducer()

Initialize Transducer:

In [None]:
# Load the 'conformer-stack-2mixed' transducer; `model` is reassigned later in the HuggingFace section.
model = malaya_speech.stt.deep_transducer('conformer-stack-2mixed')

In [None]:
# Mixed-language clip. The following cell also assigns y and sr, so run only the cell for the
# intended run (under "Run All" the later cell's audio is the one that gets decoded).
y, sr = malaya_speech.load('drive/MyDrive/ASR-Malaya-Speech/audio-data/mixed/unifiport-min6.wav') #FOR MIXED RUN

In [None]:
# Malay clip. Assigns the same y and sr as the preceding cell, replacing the mixed clip
# if both cells are run.
y, sr = malaya_speech.load('drive/MyDrive/ASR-Malaya-Speech/audio-data/malay/makanan.wav') #FOR MALAY RUN

Greedy Decoder:

In [None]:
%%time

# Greedy-decode a batch containing the single waveform y; the result is kept in `text`
# (used later to build the WER hypothesis) and printed.
text = model.greedy_decoder([y])
print(text)

----------------------------------------------------------------------------------------------------------------------------------------------------------------

**RNNT + Huggingface Wav2Vec2 XLS-R**

In [None]:
# Display the HuggingFace models that malaya_speech.stt reports as available.
malaya_speech.stt.available_huggingface()

In [None]:
# Rebinds `model`: from here on it refers to the HuggingFace checkpoint, not the transducer loaded earlier.
model = malaya_speech.stt.huggingface(model = 'mesolitica/wav2vec2-xls-r-300m-mixed')

In [None]:
# Mixed-language clip. The following cell also assigns y and sr, so run only the cell for the
# intended run (under "Run All" the later cell's audio is the one that gets decoded).
y, sr = malaya_speech.load('drive/MyDrive/ASR-Malaya-Speech/audio-data/mixed/unifiport-min6.wav') #FOR MIXED RUN

In [None]:
# Malay clip. Assigns the same y and sr as the preceding cell, replacing the mixed clip
# if both cells are run.
y, sr = malaya_speech.load('drive/MyDrive/ASR-Malaya-Speech/audio-data/malay/makanan.wav') #FOR MALAY RUN

Greedy Decoder

In [None]:
%%time

# Greedy-decode a batch containing the single waveform y with the current `model`.
# This overwrites the `text` produced by the transducer section.
text = model.greedy_decoder([y])
print(text)

----------------------------------------------------------------------------------------------------------------------------------------------------------------

WER Calculation

In [None]:
def wer(reference, hypothesis):
    """
    Compute the word error rate (WER), in percent, between two transcripts.

    Both strings are split on whitespace and the word-level edit distance
    (substitutions, insertions, deletions, each costing 1) is computed with
    dynamic programming.

    Parameters
    ----------
    reference : str
        Ground-truth transcript.
    hypothesis : str
        Transcript to evaluate.

    Returns
    -------
    float
        Edit distance divided by the number of reference words, times 100.
        The value exceeds 100 when the edit distance is larger than the
        reference length (e.g. many inserted words).

    Raises
    ------
    ZeroDivisionError
        If ``reference`` contains no words; WER is undefined in that case.
    """
    r = reference.split()
    h = hypothesis.split()
    if not r:
        # Same exception type as the bare division would raise, but with a clear message.
        raise ZeroDivisionError('reference contains no words; WER is undefined')

    # int64 instead of uint8: an 8-bit matrix cannot hold distances above 255,
    # which corrupted the result for transcripts longer than 255 words.
    d = np.zeros((len(r) + 1, len(h) + 1), dtype=np.int64)
    d[0, :] = np.arange(len(h) + 1)  # turning an empty reference prefix into h[:j] takes j insertions
    d[:, 0] = np.arange(len(r) + 1)  # turning r[:i] into an empty hypothesis takes i deletions

    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            if r[i - 1] == h[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                substitute = d[i - 1][j - 1] + 1
                insert = d[i][j - 1] + 1
                delete = d[i - 1][j] + 1
                d[i][j] = min(substitute, insert, delete)
    return float(d[len(r)][len(h)]) / len(r) * 100


In [None]:
# Open the mixed-run reference transcript for reading; the handle is read in a later cell.
# The following cell rebinds `transcript`, so run only the cell for the intended run.
transcript =  open('drive/MyDrive/ASR-Malaya-Speech/transcripts/mixed/unifiport-min6.txt', 'r') #FOR MIXED RUN

In [None]:
# Open the Malay-run reference transcript for reading; the handle is read in a later cell.
# Rebinds `transcript`, replacing the handle from the preceding cell if both cells are run.
transcript =  open('drive/MyDrive/ASR-Malaya-Speech/transcripts/malay/makanan.txt', 'r') #FOR MALAY RUN

In [None]:
# Read the whole reference transcript and close the file handle afterwards.
# Re-running this cell without re-opening the file now fails loudly (closed file)
# instead of silently yielding an empty reference from a handle already at EOF.
with transcript:
    reference = transcript.read()

In [None]:
type(text) # inspect the type of the decoder output before building the hypothesis string

In [None]:
# Convert each decoded item to str and join them with single spaces into one hypothesis string.
hypothesis = ' '.join(str(item) for item in text)

In [None]:
# Word error rate of the hypothesis against the reference, in percent.
wer(reference, hypothesis)

----------------------------------------------------------------------------------------------------------------------------------------------------------------

**Voice Activity Detection**

In [None]:
!pip install webrtcvad -q

In [None]:
# Mixed-language clip, the same file the speech-to-text sections use for the mixed run.
# This cell previously loaded the Malay file despite being labelled as the mixed run.
y, sr = malaya_speech.load('drive/MyDrive/ASR-Malaya-Speech/audio-data/mixed/unifiport-min6.wav') #FOR MIXED RUN
len(y), sr

In [None]:
# Malay clip. Assigns the same y and sr as the preceding cell, so run only the cell for the intended run.
y, sr = malaya_speech.load('drive/MyDrive/ASR-Malaya-Speech/audio-data/malay/makanan.wav') #FOR MALAY RUN
# Show the number of samples and the sample rate.
len(y), sr

In [None]:
# Integer version of the waveform (presumably integer PCM as WebRTC VAD expects — confirm against the library).
y_int = malaya_speech.astype.float_to_int(y)
# The 20th percentile of the absolute amplitude is passed as the WebRTC VAD's minimum_amplitude.
vad = malaya_speech.vad.webrtc(minimum_amplitude = int(np.quantile(np.abs(y_int), 0.2)))

Available VAD Models

In [None]:
# Display the deep VAD models that malaya_speech.vad reports as available.
malaya_speech.vad.available_model()

Loading Deep Model

In [None]:
def deep_model(model: str = 'marblenet-factor1', quantized: bool = False, **kwargs):
    """
    Load VAD model.

    NOTE(review): this cell contains only the signature and this docstring, so
    calling the notebook-level ``deep_model`` does nothing and returns None.
    It looks like a reference copy of ``malaya_speech.vad.deep_model``, which is
    what the notebook actually calls — confirm against the library.

    Parameters
    ----------
    model : str, optional (default='marblenet-factor1')
        Model architecture supported. Allowed values:

        * ``'vggvox-v1'`` - finetuned VGGVox V1.
        * ``'vggvox-v2'`` - finetuned VGGVox V2.
        * ``'speakernet'`` - finetuned SpeakerNet.
        * ``'marblenet-factor1'`` - Pretrained MarbleNet * factor 1.
        * ``'marblenet-factor3'`` - Pretrained MarbleNet * factor 3.
        * ``'marblenet-factor5'`` - Pretrained MarbleNet * factor 5.

    quantized : bool, optional (default=False)
        if True, will load 8-bit quantized model.
        A quantized model is not necessarily faster; it depends entirely on the machine.

    Returns
    -------
    result : malaya_speech.supervised.classification.load function
    """

In [None]:
# Load the 'marblenet-factor3' VAD model through the library function.
model_factor3 = malaya_speech.vad.deep_model(model = 'marblenet-factor3')

Generate Frames 


*   splits the audio into fixed-duration frames (chunks) that are fed to the VAD



In [None]:
def frames(
    audio,
    frame_duration_ms: int = 30,
    sample_rate: int = 16000,
    append_ending_trail: bool = True,
):
    """
    Generates audio frames from PCM audio data.
    Takes the desired frame duration in milliseconds, the PCM data, and the sample rate.

    NOTE(review): this cell contains only the signature and this docstring, so
    calling the notebook-level ``frames`` does nothing and returns None. The
    notebook calls ``malaya_speech.utils.generator.frames`` instead, and a later
    cell rebinds the name ``frames`` to a list of frames.

    Parameters
    ----------

    audio: np.array / list
    frame_duration_ms: int, optional (default=30)
    sample_rate: int, optional (default=16000)
    append_ending_trail: bool, optional (default=True)
        if True, will append the last trail; this last trail might not be the same length as `frame_duration_ms`.

    Returns
    -------
    result: List[malaya_speech.model.frame.FRAME]
    """

Batching Pipeline


*   utilizes parallel processing of GPU and CPU to speed up processing



In [None]:
# Build the VAD pipeline: group inputs into batches of 5, run model_factor3.predict
# on each batch, then flatten the per-batch outputs.
p = Pipeline()
pipeline = p.batching(5).foreach_map(model_factor3.predict).flatten()
# Optional: uncomment to render the pipeline graph.
#p.visualize()

In [None]:
%%time

# Frame the integer waveform with a frame duration argument of 30 at sample rate sr.
# NOTE(review): frames_int is not referenced again in the cells shown.
frames_int = list(malaya_speech.utils.generator.frames(y_int, 30, sr))

In [None]:
%%time

# Frame the float waveform with a frame duration argument of 50 at sample rate sr.
# This rebinds the name `frames` (previously the function defined in an earlier cell) to a list.
frames = list(malaya_speech.utils.generator.frames(y, 50, sr))
# Push all frames through the batching pipeline.
result = p.emit(frames)
# Pair each frame with the pipeline output at the same position under the 'flatten' key.
frames_deep_batch = [(frame, result['flatten'][no]) for no, frame in enumerate(frames)]

In [None]:
%%time

# Call the VAD model once per frame (no batching) and pair each frame with its output;
# timed for comparison with the batched pipeline run.
frames_deep_factor3 = [(frame, model_factor3(frame)) for frame in frames]

In [None]:
%%time

# Run the same frames through the pipeline again and show the keys of the returned result.
result = p.emit(frames)
result.keys()

In [None]:
# Visualize the VAD results: passes the waveform, the (frame, output) pairs from the
# per-frame marblenet-factor3 run, and the sample rate.
malaya_speech.extra.visualization.visualize_vad(y, frames_deep_factor3, sr)