# 환경 구성

In [1]:
from setproctitle import setproctitle
setproctitle("ktalk_Wav2Vec2 textfile")

import os
from tqdm import tqdm

dataset_path = "/wav2vec2/s-kr/fine-tune/dataset"
kspon_path = os.path.join(dataset_path, "Broadcast-contents") # 방소, 취미 등등

In [2]:
os.environ["WANDB_DISABLED"] = "true"

In [3]:
# os.environ['WANDB_SILENT']="true"

In [4]:
import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"]= "2"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Device:', device)
print('Count of using GPUs:', torch.cuda.device_count()) 

Device: cuda
Count of using GPUs: 1


# KsponSpeech Vocab 생성

In [22]:
from setproctitle import setproctitle
setproctitle("aihub_Wav2Vec2 Vocab_mario")

from tqdm import tqdm
from datasets import Dataset, ClassLabel
from IPython.display import display, HTML
from glob import glob
from kspon_preprocess import special_filter, bracket_filter

import re
import librosa
import random
import numpy as np
import pandas as pd
import os


def show_random_elements(dataset, num_examples=15):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    
    df = pd.DataFrame(dataset[:num_examples])
    display(HTML(df.to_html()))

    
def _read_txt_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
    # with open(file_path, 'r', encoding='cp949') as f:
        text = f.read()
        
    return text


text_list = list()

In [23]:
import matplotlib.pyplot as plt

num_attention_heads=8
sr=8000
batch_size=1
extension='.mp3'

In [24]:
audio_list = list()
durations = 0
max_sec = 10.0
min_sec = 2.0

# kspon_wavs = glob(os.path.join(kspon_path, '**', '*.wav'), recursive=True)
# kspon_wavs = glob(os.path.join(kspon_path, '3.일상안부','dialog_01','001', '*.wav'), recursive=True)
kspon_wavs = glob(os.path.join('*.mp3'), recursive=True)

# print(kspon_wavs)
random.seed(44)
random.shuffle(kspon_wavs)

remove_re = '[a-zA-Z0-9%]'

for file in tqdm(kspon_wavs):
    duration = librosa.get_duration(filename=file, sr=sr)
    if (min_sec <= duration) and (max_sec >= duration):
        text_path = file.replace(extension, ".txt")
        text = _read_txt_file(text_path)
        text = special_filter(bracket_filter(text))
        if re.findall(remove_re, text) == []:
            text_list.append(text)
            audio_list.append(file)
            durations += duration
    if durations >= 500*60*60:
        break

len(text_list)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 971/971 [00:00<00:00, 1739.65it/s]


965

In [25]:
text_dict = {"text": text_list}

vocab_timit = Dataset.from_dict(text_dict)
print(len(vocab_timit))
print(vocab_timit)

show_random_elements(vocab_timit)

965
Dataset({
    features: ['text'],
    num_rows: 965
})


Unnamed: 0,text
0,내가 아는 한의원 소개해 줄까요
1,대신 결과는 책임져야 해
2,오늘은 나도 한 이만 원 정도 복권을 사볼까
3,저 장난감은 얼마인가요
4,어머님 낼 아침에 오신데
5,갈 때 꼭 영수증 챙겨서 가야 한다
6,이 크래커 위에 참치 올려서 먹으면 맛있다
7,안녕하세요 청년 피자입니다 무엇을 드릴까요
8,오늘은 해가 쨍쨍해서 정말 기분이 좋은데요
9,좀 믿고 먹일만한 간식 어디 없을까


In [26]:
vocab_timit.to_csv('/wav2vec2/s-kr/fine-tune/dataset/aihub_results/mario_gec_testdata.tsv')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

56071

In [27]:
dada=pd.read_csv('/wav2vec2/s-kr/fine-tune/dataset/aihub_results/mario_gec_testdata.tsv')

dada

Unnamed: 0.1,Unnamed: 0,text
0,0,내가 아는 한의원 소개해 줄까요
1,1,대신 결과는 책임져야 해
2,2,오늘은 나도 한 이만 원 정도 복권을 사볼까
3,3,저 장난감은 얼마인가요
4,4,어머님 낼 아침에 오신데
...,...,...
960,960,요즘에 안성탕면이 유행이라면서 나도 안성탕면 좋아하는데 어디서 파는 거지
961,961,진짜 오랜만에 만난 거 같네요
962,962,절대 안 되니까 면허증 보여주세요
963,963,그 보험 상담사는 나의 심기를 건드려서 나는 그 보험을 들지 않았어


In [None]:
# 여기까지만 진행

In [18]:
def extract_all_chars(batch):
    all_text = " ".join(batch["text"])
    vocab = list(set(all_text))
    
    return {"vocab": [vocab], "all_text": [all_text]}

vocabs = extract_all_chars(vocab_timit)
vocab_list = list(set(vocabs["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}

In [19]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

1999

In [20]:
import json
import os

vocab_path = os.path.join(dataset_path, 'broadcast_vocab.json')
with open(vocab_path, 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

# Processor 생성

In [21]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

tokenizer = Wav2Vec2CTCTokenizer(vocab_path, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=sr, padding_value=0.0, do_normalize=True, return_attention_mask=False)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: PreTrainedTokenizer(name_or_path='', vocab_size=1999, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]'})

# 학습 데이터셋 구축

In [22]:
from setproctitle import setproctitle
setproctitle("aihub_Wav2Vec2 Dataset")

from datasets import Dataset
from transformers import Wav2Vec2Processor
from kspon_preprocess import special_filter, bracket_filter, del_noise  # 특수 기호 제거하는 전처리 코드 함수
from tqdm import tqdm

import numpy as np
import os
import soundfile as sf
import librosa
import re

input_values_list = list()
input_length_list = list()
labels_list = list()
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'
remove_re = '[a-zA-Z0-9%]'

def remove_special_characters(text: str) -> str:
    text = special_filter(bracket_filter(text))
    # text = re.sub(chars_to_ignore_regex, '', text)
    
    return text

In [23]:
for text, audio_path in tqdm(zip(text_list, audio_list), total=len(text_list)):
    # audio, _ = sf.read(audio_path)
    audio, _ = librosa.load(audio_path, sr=sr)
    non_silence_indices = del_noise(audio, top_db=30)  # del_noise 함수를 통해 노이즈 제거
    audio = np.concatenate([audio[start:end] for start, end in non_silence_indices])
    if audio.ndim > 1:
        audio = np.delete(audio, 1, axis=1)
        audio = audio.reshape(-1)
    input_value = processor(audio, sampling_rate=sr).input_values[0]
    input_values_list.append(input_value)
    input_length_list.append(len(input_value))
    text = remove_special_characters(text)
    with processor.as_target_processor():
        labels_list.append(processor(text).input_ids)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 376583/376583 [16:20<00:00, 384.11it/s]


In [24]:
print(len(input_values_list), len(input_length_list), len(labels_list))

print(input_values_list[0], input_values_list[0].shape)
print(input_length_list[0])
print(labels_list[0])

376583 376583 376583
[-0.01602839 -0.01658881 -0.00341888 ...  0.07335903  0.06299122
  0.04645874] (50688,)
50688
[111, 256, 1616, 1936, 1742, 256, 378, 1359, 256, 1620, 955, 256, 582, 1393, 256, 1350, 154, 256, 648, 256, 1486, 732]


In [25]:
import pandas as pd

train_rate = 0.99
train_idx = int(train_rate * len(input_values_list))

train_df = pd.DataFrame({'input_values': input_values_list[:train_idx], 'input_length': input_length_list[:train_idx], 'labels': labels_list[:train_idx]})
test_df = pd.DataFrame({'input_values': input_values_list[train_idx:], 'input_length': input_length_list[train_idx:], 'labels': labels_list[train_idx:]})

print(len(train_df))
print(len(test_df))

372817
3766


In [None]:
train_timit = Dataset.from_pandas(train_df)
test_timit = Dataset.from_pandas(test_df)

print(len(train_timit))
print(train_timit)

# 데이터 확인

In [None]:
# import IPython.display as ipd
# import numpy as np
# import random

# rand_int = random.randint(0, len(train_timit))

# ipd.Audio(data=np.asarray(train_timit[rand_int]["input_values"]), rate=16000)

# Train 준비

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
from datasets import load_metric
cer_metric = load_metric("cer")
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    # wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

In [None]:
from transformers import Wav2Vec2Model, Wav2Vec2ForCTC, Wav2Vec2Config

configuration = Wav2Vec2Config(num_attention_heads=num_attention_heads)
model = Wav2Vec2Model(configuration)

model = Wav2Vec2ForCTC.from_pretrained(
    # "facebook/wav2vec2-large-xlsr-53",
    "/wav2vec2/s-kr/fine-tune/dataset/results/checkpoint-175000", # 새로만든 보캡 옮겨야함.
    # gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    configuration,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size = len(processor.tokenizer)
)

# print(model.config)
# print("=" * 100)
# print(model)

In [None]:
model.freeze_feature_encoder()

In [None]:
from transformers import TrainingArguments

output_dir = os.path.join(dataset_path, "aihub_results")

number = 1000
training_args = TrainingArguments(
  output_dir=output_dir,           
  group_by_length=True,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  evaluation_strategy="steps",
  num_train_epochs=30,
  fp16=True,
  gradient_checkpointing=True,
  save_steps=number,
  eval_steps=number,
  logging_steps=number,
  learning_rate=1e-4,
  log_on_each_node=True,
  weight_decay=0.005,
  warmup_steps=number,
  eval_accumulation_steps=1,
  save_total_limit=5,
  load_best_model_at_end=True,

)

# Train 시작

In [None]:
from transformers import Trainer, EarlyStoppingCallback
from setproctitle import setproctitle
setproctitle("wav2vec2-south-ft-aihub-mario")

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_timit,
    eval_dataset=test_timit,
    tokenizer=processor.feature_extractor,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=1)]
)

In [None]:
try:
    trainer.train()
except Exception as e:
    print(e)
finally:
    for obj in trainer.state.log_history:
        print(obj)
        
    trainer.save_model(output_dir)