In [1]:
# Directory holding the WAV segments listed below.
root = '../dataset/wav/_d_system_1/'

# Each entry pairs a segment number with its transcript, so a clip and its
# text cannot drift out of alignment when entries are added or removed.
_segments = [
    (5, '와이가 들어왔을 때 엑스가 일이고 와이가 영이거나'),
    (10, '같을 때 일을 출력하는 거예요. 그래서'),
    (15, '엑스 와이 그다음에 엑스 익스클루시브 오아 와이'),
    (20, '그래서 익스클루시브 오아를 어떻게 구현하느냐 엔드오아로'),
    (24, '여기까지 따라온 사람 손 들어보세요'),
    (25, '잘들 따라오고 있죠? 아직까지는 어려운 거 없어요'),
    (30, '집중 안하고 있죠 아침 월요일 아침이라 로그인만 해놓고 자는 거'),
    (40, '얘 둘도 낸드가 돼요 근데 문제는 이쪽 파시 포션이죠'),
    (46, '력은 뭐가 돼요 우리 저 손들고 학생 있는 학생 손 내리고'),
    (48, '이승주 학생도 아직 손 들고 있는데 손 내리고 김용재 김은솔 이선주'),
    (49, '돌아왔으면 손 들어보세요 정신차렸으면 손 손 들어보세요'),
]

# File names (relative to `root`) and transcripts, in matching order.
audio_list = [f'segment_{number}.wav' for number, _ in _segments]
text_list = [transcript for _, transcript in _segments]

In [2]:
import librosa
import numpy as np
from datasets import Dataset, DatasetDict, Features, Array2D, Value
from transformers import WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer

In [3]:
# Checkpoint identifier handed to the Whisper `from_pretrained` loaders.
model_name_or_path = "ymlee/ML_project_voice2text_largev2_1epoch"

# Task and target language, given both as a full name and as an abbreviation.
task, language, language_abbr = "transcribe", "Korean", "ko"

In [4]:
# Load the three preprocessing components from the same checkpoint.
# The feature extractor takes no language/task arguments here.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)

# The tokenizer is given the language abbreviation ...
tokenizer = WhisperTokenizer.from_pretrained(
    model_name_or_path,
    language=language_abbr,
    task=task,
)

# ... while the processor is given the full language name.
processor = WhisperProcessor.from_pretrained(
    model_name_or_path,
    language=language,
    task=task,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def prepare_dataset(batch):
    """Attach model inputs and label ids to a single example.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["text"]``, then stores the results under
    ``batch["input_features"]`` and ``batch["labels"]``.

    No resampling happens in this function: the sampling rate stored with the
    audio is forwarded unchanged to the module-level ``feature_extractor``.

    Returns the same ``batch`` object, updated in place.
    """
    sample_audio = batch["audio"]

    # Log-Mel input features; only the first (and only) item is kept.
    extracted = feature_extractor(
        sample_audio["array"],
        sampling_rate=sample_audio["sampling_rate"],
    )
    batch["input_features"] = extracted.input_features[0]

    # Target transcript encoded as label ids by the module-level tokenizer.
    encoded = tokenizer(batch["text"])
    batch["labels"] = encoded.input_ids

    return batch

In [9]:
import pyarrow as pa
import os

# zip() stops at the shorter input, so a missing file name or transcript would
# silently drop examples. Fail loudly instead.
if len(audio_list) != len(text_list):
    raise ValueError(
        f"audio_list has {len(audio_list)} entries but text_list has "
        f"{len(text_list)}; every audio file needs exactly one transcript"
    )

# One record per (audio file, transcript) pair, in list order.
concat_data = []

for audio, text_data in zip(audio_list, text_list):
    audio_path = os.path.join(root, audio)
    # sr=16000 asks librosa to return the waveform at 16 kHz.
    audio_array, sampling_rate = librosa.load(audio_path, sr=16000)
    audio_data = {
        'path': None,
        'array': audio_array,
        'sampling_rate': sampling_rate,
    }
    data = {
        'audio': audio_data,
        'text': text_data,
    }
    concat_data.append(data)

def set_numpy(sample):
    """Convert ``sample['audio']['array']`` to a float32 NumPy array.

    The sample is updated in place and also returned. Returning it matters
    when this function is passed to ``Dataset.map``: a function that returns
    ``None`` leaves the dataset unchanged, so the conversion would be lost.

    Args:
        sample: mapping with an ``'audio'`` mapping that holds an ``'array'``
            entry convertible by ``np.array``.

    Returns:
        The same ``sample`` object, with the array replaced.
    """
    sample['audio']['array'] = np.array(sample['audio']['array'], dtype=np.float32)
    return sample

# Build one Dataset from the records, then split it into train and test
# (20% held out, fixed seed so the split is reproducible).
full_dataset = Dataset.from_list(concat_data)
dataset = full_dataset.train_test_split(test_size=0.2, seed=42)

# Re-wrap both splits and have them return NumPy-formatted values.
dataset_dict = DatasetDict(
    {split: dataset[split] for split in ('train', 'test')}
).with_format("np")

In [10]:
# Raw inputs that are dropped once features and labels have been computed.
columns_to_remove = ['audio', 'text']

# Featurize every split first, then drop the raw columns as a second step.
# (An earlier variant passed remove_columns and num_proc=4 directly to map.)
mapped_dict = dataset_dict.map(prepare_dataset)
converted_dict = mapped_dict.remove_columns(columns_to_remove)

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [11]:
# Write the processed splits to disk so they can be reloaded later.
save_dir = "../custom_dataset"
converted_dict.save_to_disk(save_dir)

Saving the dataset (0/1 shards):   0%|          | 0/8 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3 [00:00<?, ? examples/s]

### datasets 확인

In [14]:
from datasets import load_from_disk

# Read the saved dataset back from disk and display its summary.
new_dataset_load = "../custom_dataset"
loaded_datasets = load_from_disk(
    new_dataset_load,
)
loaded_datasets

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 8
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 3
    })
})