In [1]:
# %%capture
# !pip install pydub
# !pip install librosa
# !pip install ipywidgets
# !pip install soundfile
# !pip install datasets -U
# !pip install jiwer -U
# !pip install torch==2.0.1 torchvision torchaudio
# !pip install transformers -U
# !pip install transformers[torch]
# !pip install accelerate -U
# !pip install huggingface_hub
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager
# %%capture --no-capture

In [20]:
from huggingface_hub import interpreter_login
# Authenticate against the Hugging Face Hub for this session.
# NOTE(review): the training arguments further down set push_to_hub=False, so
# this login is presumably only needed for private/gated assets — confirm.
interpreter_login()

In [6]:
from transformers import (
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
)

# CTC tokenizer built from the local vocabulary file; a plain space is used as
# the word delimiter token.
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token=" ",
)

# Feature extractor configured for single-channel 16 kHz input, normalised,
# and returning an attention mask alongside the padded values.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# Bundle both pieces so audio and text preprocessing share one entry point.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [7]:
from transformers import Wav2Vec2ForCTC
from transformers import Wav2Vec2Processor
import torch.nn as nn
# Used later as the Trainer's output_dir, so checkpoints are written to a local
# directory tree with this relative path.
repo_name = "hongseongpil/wav2vec2-vocals"

In [10]:
from datasets import load_dataset , Dataset
# Load the train/test manifests as plain-text datasets (one example per line,
# exposed under the "text" column). prepare_dataset later splits each line on
# the first whitespace into an audio file name and its transcript.
dataset = load_dataset("text", data_files={"train": ["./dataset/train/train.txt"], "test": "./dataset/test/test.txt"})

In [11]:
import soundfile as sf
from Tokenize_Kor import decompose_tokens
import os
import re
def prepare_dataset(batch, processor, type):
    """Turn one manifest line into model inputs and CTC label ids.

    Args:
        batch: A single example whose ``"text"`` field holds
            ``"<audio file name> <transcript>"`` separated by whitespace.
        processor: Object that is called on the raw audio and exposes a
            ``tokenizer`` attribute (a ``Wav2Vec2Processor`` in this notebook).
        type: Name of the split sub-directory under ``dataset/`` that contains
            ``audiofiles/`` (this notebook passes ``"train"`` or ``"test"``).
            The name shadows the builtin but is kept because callers pass it
            by keyword.

    Returns:
        The same ``batch`` mapping with ``"input_values"`` and ``"labels"``
        added.

    Raises:
        ValueError: If the line does not contain both a file name and a
            transcript.
    """
    fields = batch['text'].split(maxsplit=1)
    if len(fields) != 2:
        # Fail with the offending line instead of an opaque unpacking error.
        raise ValueError(
            f"Expected '<audio file> <transcript>' but got: {batch['text']!r}"
        )
    audiopth, text = fields

    # decompose_tokens comes from the local Tokenize_Kor module; only the first
    # element of its result is used, joined into a single label string.
    labels = "".join(decompose_tokens(text, True)[0])

    audio_input, sample_rate = sf.read(os.path.join("dataset", type, "audiofiles", audiopth))
    # NOTE(review): assumes every file is mono and already at the sampling rate
    # the feature extractor was configured with — confirm against the data.
    batch['input_values'] = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values[0]

    # Call the tokenizer directly; this is what the deprecated
    # processor.as_target_processor() context manager dispatched to.
    batch["labels"] = processor.tokenizer(labels).input_ids
    return batch

In [12]:
import os
# Preprocess both splits the same way; the split name doubles as the
# sub-directory that prepare_dataset reads audio files from.
for split_name in ("train", "test"):
    dataset[split_name] = dataset[split_name].map(
        prepare_dataset,
        fn_kwargs={"processor": processor, "type": split_name},
        remove_columns=["text"],
    )

Map:   0%|          | 0/2182 [00:00<?, ? examples/s]

2024-05-02 11:23:38.381438: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-02 11:23:38.492352: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Map:   0%|          | 0/276 [00:00<?, ? examples/s]

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 2182
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 276
    })
})

In [14]:
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the audio inputs and the CTC labels.

    Inputs and labels are padded separately because they have different
    lengths and use different padding methods.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data. Its ``pad`` method pads
            the ``input_values`` and its ``tokenizer.pad`` pads the labels.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Same as ``pad_to_multiple_of`` but applied to the ``labels``.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch.

        Args:
            features: Examples that each provide ``"input_values"`` and
                ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` tensor in which
            every padded position is set to ``-100``.
        """
        # Split inputs and labels since they have to be of different lengths
        # and need different padding methods.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad labels with the tokenizer directly; this is what the deprecated
        # processor.as_target_processor() context manager dispatched to.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

In [15]:
from datasets import load_metric
# Collator that pads each batch to its longest example (padding=True).
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
# Character error rate metric used by compute_metrics.
# NOTE(review): the cell output shows a warning raised from this call;
# load_metric is presumably deprecated in the installed `datasets` release in
# favour of the separate `evaluate` package — confirm before upgrading.
cer_metric = load_metric("cer",trust_remote_code=True)

  cer_metric = load_metric("cer",trust_remote_code=True)


In [16]:
import numpy as np
def compute_metrics(pred):
    """Compute the character error rate for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits; the arg-max over the
            last axis is taken as the predicted token id) and ``label_ids``
            (reference ids in which ``-100`` marks positions to ignore).

    Returns:
        A dict ``{"cer": <value returned by cer_metric.compute>}``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Build a new array instead of overwriting pred.label_ids in place, so the
    # caller's labels keep their -100 markers.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # We do not want to group tokens when decoding the references.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}

In [17]:
# Load the pretrained XLS-R checkpoint with a CTC head sized to this
# notebook's vocabulary. All dropout variants and layerdrop are disabled;
# only time masking (5%) is kept as regularisation.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.0,
    # NOTE(review): gradient checkpointing is also requested through
    # TrainingArguments; this config-level flag may be redundant — confirm.
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)
# Keep the feature encoder frozen during fine-tuning.
# freeze_feature_encoder() replaces the deprecated freeze_feature_extractor().
model.freeze_feature_encoder()

  return self.fget.__get__(instance, owner)()
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
from transformers import TrainingArguments
from transformers import Trainer
from transformers import Wav2Vec2ForCTC
from transformers import EarlyStoppingCallback

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    output_dir=repo_name,
    push_to_hub=False,
    # --- optimisation ---
    num_train_epochs=12,
    per_device_train_batch_size=8,  # changed: batch size setting (applies per device)
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_steps=1000,
    # --- memory / throughput ---
    fp16=True,
    gradient_checkpointing=True,
    group_by_length=True,
    # --- evaluation, logging and checkpointing share one 500-step cadence ---
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    save_steps=500,
    save_total_limit=1,
    # NOTE(review): no metric_for_best_model is given, so the Trainer's default
    # decides what "best" means here — confirm that CER is not what was intended.
    load_best_model_at_end=True,
)

# Stop once 3 consecutive evaluations fail to improve by more than 0.001.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=0.001,
    early_stopping_patience=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
    # The feature extractor is passed in the tokenizer slot.
    tokenizer=processor.feature_extractor,
)

In [48]:
# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# is displayed as the cell result.
trainer.train()



Step,Training Loss,Validation Loss,Cer
1,21.687,21.370493,0.887061
2,23.8481,21.369684,0.886327
3,21.4065,21.368078,0.883977
4,21.9429,21.365643,0.880599
5,19.3554,21.362373,0.877074
6,21.1782,21.358252,0.871934
7,23.9319,21.353514,0.860919




TrainOutput(global_step=7, training_loss=21.90713882446289, metrics={'train_runtime': 277.0867, 'train_samples_per_second': 0.361, 'train_steps_per_second': 0.025, 'total_flos': 3.141053938249344e+16, 'train_loss': 21.90713882446289, 'epoch': 1.0})

In [1]:
import torch
# Report the GPU memory currently allocated by torch. A preceding bare call
# whose result was discarded has been removed.
print(torch.cuda.memory_allocated())

0


In [2]:
# WARNING: recursively deletes the local "hongseongpil" directory, which is the
# parent of the Trainer output_dir ("hongseongpil/wav2vec2-vocals") — any saved
# checkpoints under it are removed.
!rm -r hongseongpil