In [1]:
# !pip install datasets
# !pip install transformers
# !pip install torchaudio
# !pip install jiwer
# !pip install transformers[torch]
# !pip install accelerate

# Description
Project is trained using python 3.9.13 and pytorch 2.1 cuda version. It works mainly on the version of transformers(4.17.0) and datasets(1.18.3). Updated or Down grade version may affect the project execution steps and encounter errors.

Dataset is acquired from librispeech
https://www.openslr.org/80/

In [49]:
import transformers
transformers.__version__

'4.17.0'

In [50]:
import datasets
datasets.__version__

'1.18.3'

In [1]:
import torch
torch.cuda.is_available()

True

In [1]:
from datasets import Dataset, Audio
import pandas as pd
import torch
import numpy as np
import torchaudio

In [2]:
df = pd.read_csv("./my_mm_female/line_index.tsv", sep="\t", header=None)

In [3]:
df.rename(columns = {0:'filename',
                       1:'sentence'},
            inplace = True)

In [4]:
main_folder = "./my_mm_female/"
def add_filepath(filename):
    return main_folder + filename + ".wav"

In [5]:
import re


chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(text):
    return re.sub(chars_to_ignore_regex, '', str(text)).lower()

In [6]:
df['path'] = df['filename'].apply(add_filepath)
df['sentence'] = df['sentence'].apply(remove_special_characters)

# Data Preparation

In [7]:
dataset = Dataset.from_pandas(df)

In [8]:
dataset

Dataset({
    features: ['filename', 'sentence', 'path'],
    num_rows: 2530
})

In [10]:
# Define the split ratios
train_ratio = 0.8  # 80% for training
validation_ratio = 0.1  # 10% for validation
test_ratio = 0.1  # 10% for testing

# Calculate the number of samples for each split
total_samples = len(dataset)
train_size = int(train_ratio * total_samples)
validation_size = int(validation_ratio * total_samples)
test_size = int(test_ratio * total_samples)

# Split the dataset
train_dataset = dataset.select(range(train_size))
validation_dataset = dataset.select(range(train_size, train_size + validation_size))
test_dataset = dataset.select(range(train_size + validation_size, total_samples))



In [11]:
train_dataset

Dataset({
    features: ['filename', 'sentence', 'path'],
    num_rows: 2024
})

In [11]:
validation_dataset

Dataset({
    features: ['filename', 'sentence', 'path'],
    num_rows: 253
})

In [12]:
test_dataset

Dataset({
    features: ['filename', 'sentence', 'path'],
    num_rows: 253
})

In [13]:
test_dataset[0]

{'filename': 'bur_7543_6474935046',
 'sentence': 'ကိုမင်းလူ ကြောင့် သူ့ နာရေး က အလွဲတွေ နှင့် ပျော်စရာကြီး ဖြစ်နေသည်',
 'path': './my_mm_female/bur_7543_6474935046.wav'}

# Visualization of Data

In [14]:
from IPython.display import Audio as aud, display
display(aud(train_dataset['path'][0]))

In [15]:
import torchaudio
sp, rate = torchaudio.load(train_dataset['path'][0])
print(rate)
display(aud(sp, rate=rate))

48000


In [12]:
train_dataset['sentence'][0]

'ပြီးတော့ တရုတ် နဲ့လည်း ချစ်ကြည်ရင်းနှီးတဲ့ ဆက်ဆံရေး ရှိတယ်'

# Creating Tokenizer

In [13]:
def extract_all_chars(batch):
  all_text = " ".join(batch["sentence"])
  vocab = list(set(all_text))
  print([vocab])
  return {"vocab": [vocab], "all_text": [all_text]}

In [18]:
vocab_train = train_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=train_dataset.column_names)
vocab_val = validation_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=test_dataset.column_names)
vocab_test = test_dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=test_dataset.column_names)



  0%|          | 0/1 [00:00<?, ?ba/s]

[['ဧ', 'ဇ', 'ူ', 'ရ', 'ဤ', 'ဈ', 'ဉ', '၏', 'ပ', 'ဘ', 'ါ', 'ဍ', 'ဿ', 'ဏ', 'ဓ', 'ဲ', 'တ', 'လ', 'ဆ', 'က', 'ဠ', 'န', 'င', 'ဝ', '၍', '၎', 'ည', 'သ', 'ံ', 'ဒ', 'ဩ', 'ဋ', 'ဖ', 'ဗ', 'ထ', '်', 'ဦ', '္', 'ေ', 'ိ', '့', 'ယ', 'ာ', 'ု', ' ', 'ဃ', 'ဥ', 'ှ', 'ြ', 'ီ', 'ဌ', '၌', 'ွ', 'ဂ', 'ခ', 'စ', 'ျ', 'း', 'ဟ', 'အ', 'မ']]


  0%|          | 0/1 [00:00<?, ?ba/s]

[['ဧ', 'ဇ', 'ူ', 'ရ', 'ဈ', '၏', 'ပ', 'ဘ', 'ါ', 'ဍ', 'ဿ', 'ဏ', 'ဓ', 'ဲ', 'တ', 'ဟ', 'လ', 'ဆ', 'က', 'န', 'င', 'ဝ', '၎', '၍', 'ည', 'သ', 'ံ', 'ဒ', 'ဋ', 'ဖ', 'ဗ', 'ထ', '်', 'ဦ', '္', 'ေ', 'ိ', '့', 'ယ', 'ု', 'ာ', ' ', 'ဃ', 'ဥ', 'ှ', 'ြ', 'ီ', 'ဌ', '၌', 'ွ', 'ဂ', 'ခ', 'စ', 'ျ', 'း', 'ဉ', 'အ', 'မ']]


  0%|          | 0/1 [00:00<?, ?ba/s]

[['ဧ', 'ဇ', 'ူ', 'ရ', 'ဉ', '၏', 'ပ', 'ဘ', 'ါ', 'ဏ', 'ဓ', 'ဲ', 'တ', 'လ', 'ဆ', 'က', 'ဠ', 'န', 'င', 'ဝ', 'ဩ', '၍', 'သ', 'ည', 'ံ', 'ဒ', 'ဋ', 'ဖ', 'ဗ', 'ထ', '်', 'ဦ', '္', 'ေ', 'ိ', '့', 'ယ', 'ု', 'ာ', ' ', 'ဃ', 'ဥ', 'ှ', 'ြ', 'ီ', 'ဌ', '၌', 'ွ', 'ဂ', 'ခ', 'စ', 'ျ', 'း', 'ဟ', 'အ', 'မ']]


In [19]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_val["vocab"][0]) | set(vocab_test["vocab"][0]))
#vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [20]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'ဧ': 0,
 'ဇ': 1,
 'ူ': 2,
 'ရ': 3,
 'ပ': 4,
 'ဘ': 5,
 'ဓ': 6,
 'တ': 7,
 'ဲ': 8,
 'လ': 9,
 'ဆ': 10,
 'က': 11,
 '၎': 12,
 'ဩ': 13,
 '၍': 14,
 'ဒ': 15,
 'ဖ': 16,
 'ထ': 17,
 '္': 18,
 'ေ': 19,
 'ိ': 20,
 '့': 21,
 'ု': 22,
 ' ': 23,
 'ဃ': 24,
 'ှ': 25,
 'ြ': 26,
 'ွ': 27,
 'ဂ': 28,
 'ျ': 29,
 'း': 30,
 'ဉ': 31,
 'အ': 32,
 'ဤ': 33,
 'ဈ': 34,
 '၏': 35,
 'ါ': 36,
 'ဍ': 37,
 'ဿ': 38,
 'ဏ': 39,
 'ဠ': 40,
 'န': 41,
 'င': 42,
 'ဝ': 43,
 'ည': 44,
 'သ': 45,
 'ံ': 46,
 'ဋ': 47,
 'ဗ': 48,
 '်': 49,
 'ဦ': 50,
 'ယ': 51,
 'ာ': 52,
 'ဥ': 53,
 'ီ': 54,
 'ဌ': 55,
 '၌': 56,
 'ခ': 57,
 'စ': 58,
 'ဟ': 59,
 'မ': 60}

In [21]:
# vocab_dict["|"] = vocab_dict[" "]
# del vocab_dict[" "]

In [2]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

63

In [3]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [14]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token=" ")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [15]:
processor.save_pretrained("./wav2vec2_burmese")

# Prepare Dataset
### converting speech to numerical representation

In [16]:
import torchaudio

def speech_file_to_array(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    
    # Create a new dictionary under the "audio" key
    batch["audio"] = {
        "array": speech_array[0].numpy().tolist(),  # Convert the NumPy array to a list
        "path": batch["path"],
        "sampling_rate": sampling_rate
    }
    
    batch["target_text"] = batch["sentence"]
    return batch

In [17]:
# Preprocess the train and test datasets
train_dataset = train_dataset.map(speech_file_to_array)
validation_dataset = validation_dataset.map(speech_file_to_array)
test_dataset = test_dataset.map(speech_file_to_array)

# from datasets import Audio

# # Cast the 'speech' column to the Audio type
# train_dataset = train_dataset.cast_column("speech", Audio(sampling_rate=16_000))



0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [18]:
train_dataset.column_names

['filename', 'sentence', 'path', 'audio', 'target_text']

In [19]:
train_dataset = train_dataset.cast_column("path", Audio(sampling_rate=16_000))
validation_dataset = validation_dataset.cast_column("path", Audio(sampling_rate=16_000))
test_dataset = test_dataset.cast_column("path", Audio(sampling_rate=16_000))

In [20]:
train_dataset[0]["path"]

{'path': './my_mm_female/bur_7865_1250917969.wav',
 'array': array([ 5.0581372e-07, -9.2042376e-07,  1.3887200e-06, ...,
         2.0313350e-05, -7.1072980e-05,  0.0000000e+00], dtype=float32),
 'sampling_rate': 16000}

In [21]:
validation_dataset[0]["path"]

{'path': './my_mm_female/bur_5903_4145333777.wav',
 'array': array([ 6.9382843e-08, -2.6092792e-07,  4.9969321e-07, ...,
         1.7058956e-05,  2.0293417e-05,  0.0000000e+00], dtype=float32),
 'sampling_rate': 16000}

In [18]:
# from IPython.display import Audio as aud, display
# aud(data=train_dataset[0]["audio"]["array"], rate=16000)

from IPython.display import Audio as aud, display
aud(data=train_dataset[0]["path"]["array"], rate=16000)

In [22]:
# processor(train_dataset[0]["audio"]["array"], sampling_rate=train_dataset[0]["audio"]["sampling_rate"]).input_values[0]
processor(train_dataset[0]["path"]["array"], sampling_rate=train_dataset[0]["path"]["sampling_rate"]).input_values[0]

array([ 0.00078052,  0.0007608 ,  0.00079273, ...,  0.00105448,
       -0.00020948,  0.00077353], dtype=float32)

In [23]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [24]:
def prepare_dataset(batch):
    audio = batch["path"]

    # batched output is "un-batched"
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch




In [25]:
train_dataset = train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names)
validation_dataset = validation_dataset.map(prepare_dataset, remove_columns=validation_dataset.column_names)
test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [26]:
train_dataset.column_names

['input_values', 'labels']

In [27]:
train_dataset[0]['labels']

[4,
 26,
 54,
 30,
 7,
 19,
 52,
 21,
 23,
 7,
 3,
 22,
 7,
 49,
 23,
 41,
 8,
 21,
 9,
 44,
 49,
 30,
 23,
 57,
 29,
 58,
 49,
 11,
 26,
 44,
 49,
 3,
 42,
 49,
 30,
 41,
 25,
 54,
 30,
 7,
 8,
 21,
 23,
 10,
 11,
 49,
 10,
 46,
 3,
 19,
 30,
 23,
 3,
 25,
 20,
 7,
 51,
 49]

In [None]:
[4,
 26,
 54,
 30,
 7,
 19,
 52,
 21,
 23,
 7,
 3,
 22,
 7,
 49,
 23,
 41,
 8,
 21,
 9,
 44,
 49,
 30,
 23,
 57,
 29,
 58,
 49,
 11,
 26,
 44,
 49,
 3,
 42,
 49,
 30,
 41,
 25,
 54,
 30,
 7,
 8,
 21,
 23,
 10,
 11,
 49,
 10,
 46,
 3,
 19,
 30,
 23,
 3,
 25,
 20,
 7,
 51,
 49]

In [39]:
processor.decode(train_dataset[0]['labels'])

'ပြီးတော့ တရုတ် နဲ့လည်း ချစ်ကြည်ရင်းနှီးတဲ့ ဆက်ဆံရေး ရှိတယ်'

In [40]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [41]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Training & Evaluation

In [25]:
from datasets import load_metric
cer_metric = load_metric("cer")

In [26]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

In [28]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    gradient_checkpointing=True, 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_hid.weight', 'project_hid.bias', 'quantizer.weight_proj.weight', 'project_q.weight', 'project_q.bias', 'quantizer.codevectors', 'quantizer.weight_proj.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [45]:
model.freeze_feature_encoder()

In [46]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./finetuned_burmese",
  group_by_length=True,
  per_device_train_batch_size=8,
  evaluation_strategy="steps",
  num_train_epochs=30,
  fp16=True,
  gradient_checkpointing=True,
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=1000,
  save_total_limit=2,
)

In [47]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=processor.feature_extractor,
)

Using amp half precision backend


In [48]:
trainer.train()

***** Running training *****
  Num examples = 2024
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 7590


Step,Training Loss,Validation Loss,Cer
500,5.5949,3.456782,0.999769
1000,3.1994,1.590729,0.43949
1500,1.2146,0.764498,0.229061
2000,0.7569,0.607796,0.190182
2500,0.5708,0.535575,0.172127
3000,0.4733,0.535766,0.165667
3500,0.421,0.516209,0.154476
4000,0.3576,0.529679,0.148708
4500,0.2905,0.539534,0.149285
5000,0.2945,0.576897,0.146862


***** Running Evaluation *****
  Num examples = 253
  Batch size = 8
Saving model checkpoint to ./finetuned_burmese\checkpoint-500
Configuration saved in ./finetuned_burmese\checkpoint-500\config.json
Model weights saved in ./finetuned_burmese\checkpoint-500\pytorch_model.bin
Feature extractor saved in ./finetuned_burmese\checkpoint-500\preprocessor_config.json
Deleting older checkpoint [finetuned_burmese\checkpoint-7000] due to args.save_total_limit
Deleting older checkpoint [finetuned_burmese\checkpoint-7500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 253
  Batch size = 8
Saving model checkpoint to ./finetuned_burmese\checkpoint-1000
Configuration saved in ./finetuned_burmese\checkpoint-1000\config.json
Model weights saved in ./finetuned_burmese\checkpoint-1000\pytorch_model.bin
Feature extractor saved in ./finetuned_burmese\checkpoint-1000\preprocessor_config.json
Deleting older checkpoint [finetuned_burmese\checkpoint-50] due to args.save_total_lim

TrainOutput(global_step=7590, training_loss=0.9427033661853654, metrics={'train_runtime': 10396.0182, 'train_samples_per_second': 5.841, 'train_steps_per_second': 0.73, 'total_flos': 3.2923883631043743e+18, 'train_loss': 0.9427033661853654, 'epoch': 30.0})

In [28]:
from transformers import Wav2Vec2ForCTC

trained_model = Wav2Vec2ForCTC.from_pretrained("./finetuned_burmese/checkpoint-7500/").to("cuda")




In [29]:
import editdistance

def test_model(start, end):
    for i in range(start, end):

        input_dict = processor(test_dataset["input_values"][i], return_tensors="pt", sampling_rate=16_000, padding=True)
        
        logits = trained_model(input_dict.input_values.to("cuda")).logits
        
        pred_ids = torch.argmax(logits, dim=-1)[0]
    
        #removing white space
        actual = ''.join(processor.decode(test_dataset[i]["labels"]))
        prediction = ''.join(processor.decode(pred_ids))
    
        cer = editdistance.eval(actual, prediction) / len(actual)
    
        # Calculate Character Accuracy
        char_accuracy = 1 - cer
        
        print("Prediction:",prediction) 
        print("actual:", actual)
        print("cer:", cer)
        print("char accuracy:", "{:.2f}%".format(char_accuracy * 100))
        print("-----------------------------------------------------------------")



In [30]:
test_model(0, 10)

Prediction: ကိုမင်း လူကြောင်း သူ့နာရေး က အလွဲတွေ နှင် ပျော်စရာကြီး ဖြစ်နေသည်
actual: ကိုမင်းလူ ကြောင့် သူ့ နာရေး က အလွဲတွေ နှင့် ပျော်စရာကြီး ဖြစ်နေသည်
cer: 0.09090909090909091
char accuracy: 90.91%
-----------------------------------------------------------------
Prediction: သမ္မတ ရုပ်ရှင်ရုံးရှေ့ မှာ စ ဖြစ်ကြတယ် လို့ သိထား တော့ ရဲပစတ်ခန်း မှာလည်း အမှု ဖွင့်ထားလား မသိဘူး
actual: သမ္မတ ရုပ်ရှင်ရုံ ရှေ့ မှာ စ ဖြစ်ကြတယ် လို့ သိထား တော့ ရဲစခန်း မှာလည်း အမှု ဖွင့်ထားလား မသိဘူး
cer: 0.0425531914893617
char accuracy: 95.74%
-----------------------------------------------------------------
Prediction: အရားဝယ် အားလုံး ရပ်နေတယ် ဟု စား လုပ်ငန်းရှင် တော့ ထွေးမြင်း က ပြောသည်
actual: အရောင်းအဝယ် အားလုံး ရပ်နေတယ် ဟု ဆားလုပ်ငန်းရှင် ဒေါ်ဌေးမြင့် က ပြောသည်
cer: 0.2
char accuracy: 80.00%
-----------------------------------------------------------------
Prediction: အဲဒီလို လုပ်ဆောင်ဖို့ အင်ကျောလ်း က ကြိုပတ် နေတာ ဖြစ် ပြီး နည်းပညာ က ဝိုင်ဖိုင် ဖြစ်ပါတယ်
actual: အဲ့ဒီလို လုပ်ဆောင်ဖို့ အင်တဲလ် က ကြိုးပမ်းန

In [54]:
aud(data=test_dataset[0]["input_values"], rate=16000)