# Fine-tune ASR - Vietnamese

- Dataset: VIVOS: Vietnamese Speech Corpus for ASR
- Link: https://www.kaggle.com/datasets/kynthesis/vivos-vietnamese-speech-corpus-for-asr

## Import datasets

In [1]:
import os 
import pandas as pd
import pathlib

In [2]:
# Root folders of the two VIVOS splits, relative to the notebook's working directory.
train_dir, test_dir = "datasets/vivos/train", "datasets/vivos/test"

In [3]:
def load_audio_df(dir):
    gender_df = pd.read_csv(os.path.join(dir, "genders.txt"), 
                            sep=" ", 
                            header=None, 
                            names=["folder_name", "gender"])
    prompts_df = pd.read_csv(os.path.join(dir, "prompts.txt"), 
                            header=None, 
                            names=["prompts"])
    prompts_df["file_name"] = prompts_df["prompts"].apply(lambda x: x.split(" ")[0])
    prompts_df["text"] = prompts_df["prompts"].apply(lambda x: " ".join(x.split(" ")[1:]))
    prompts_df = prompts_df.drop(columns=["prompts"])
    
    file_data = []
    for root, directories, files in os.walk(dir):
        for file in files:
            if pathlib.Path(file).suffix == '.wav':
                file_path = os.path.join(root, file)
                file_name = file.split(".")[0]
                folder_name = file_name.split("_")[0]
                file_data.append({
                    "path": file_path,
                    "file_name": file_name,
                    "folder_name": folder_name,
                })
                
    file_df = pd.DataFrame(file_data)
    # join file_df and gender_df based on folder_name
    full_ds = pd.merge(file_df, gender_df, on="folder_name")
    full_ds = pd.merge(full_ds, prompts_df, on="file_name")
    return full_ds
    

In [4]:
# Load both splits with the same loader, train first and then test.
train_df, test_df = map(load_audio_df, (train_dir, test_dir))

# Preview the first rows of the training metadata.
train_df.head()

Unnamed: 0,path,file_name,folder_name,gender,text
0,datasets/vivos/train/waves/VIVOSSPK17/VIVOSSPK...,VIVOSSPK17_094,VIVOSSPK17,f,VÌ CÁI VÒNG SỐ BA QUÁ HẤP DẪN
1,datasets/vivos/train/waves/VIVOSSPK17/VIVOSSPK...,VIVOSSPK17_039,VIVOSSPK17,f,THÀNH TÍCH VÀ QUY GÁN
2,datasets/vivos/train/waves/VIVOSSPK17/VIVOSSPK...,VIVOSSPK17_178,VIVOSSPK17,f,TẤT CẢ NHỮNG THỨ XUNG QUANH BẠN
3,datasets/vivos/train/waves/VIVOSSPK17/VIVOSSPK...,VIVOSSPK17_022,VIVOSSPK17,f,LÀ SẼ XỬ LÝ CƯƠNG QUYẾT NHỮNG SAI PHẠM
4,datasets/vivos/train/waves/VIVOSSPK17/VIVOSSPK...,VIVOSSPK17_052,VIVOSSPK17,f,CUỘC THI NÀY CÓ THỂ TỔ CHỨC TRONG CẢ NƯỚC


In [5]:
# (rows, columns) of the train and test frames, in that order.
tuple(frame.shape for frame in (train_df, test_df))

((11660, 5), (760, 5))

### Convert to Dataset

In [6]:
# Wrap each pandas split in a Hugging Face Dataset and group them in one DatasetDict.
from datasets import Dataset, DatasetDict

ds = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (("train", train_df), ("test", test_df))
})
ds

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['path', 'file_name', 'folder_name', 'gender', 'text'],
        num_rows: 11660
    })
    test: Dataset({
        features: ['path', 'file_name', 'folder_name', 'gender', 'text'],
        num_rows: 760
    })
})

## EDA