In [13]:
import os
from datasets import Dataset, Audio
from sklearn.model_selection import train_test_split
tamil_text = "Tamil_text"
tamil_audio = "Tamil_wav_5_seconds"
test_size = 0.3

In [14]:
text_files = [f for f in os.listdir(tamil_text) if f.endswith(('.txt'))]
data = {
        'audio': [],
        'transcript': []
    }

for text_file in text_files:
        # Construct paths for the audio and corresponding transcript
    transcript_path = os.path.join(tamil_text, text_file)
    audio_path = os.path.join(tamil_audio, os.path.splitext(text_file)[0] + '.wav')
    # Ensure both audio and transcript exist
    if os.path.exists(transcript_path):
        with open(transcript_path, 'r') as f:
            transcript = f.read().strip()
        
        data['audio'].append(audio_path)
        data['transcript'].append(transcript)

# Create a Dataset object
dataset = Dataset.from_dict(data)

# Cast the 'audio' column to the Audio feature type
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    

In [15]:
common_voice_train, common_voice_test = dataset.train_test_split(test_size=test_size).values()

In [17]:
common_voice_train[0]

{'audio': {'path': 'Tamil_wav_5_seconds/1010345_2024.08.05_11.49.15-2024.08.05_11.50.15_12.wav',
  'array': array([ 0.3232654 ,  0.49187958,  0.46607825, ..., -0.358013  ,
         -0.37538546, -0.32246423]),
  'sampling_rate': 16000},
 'transcript': 'இல்லை தேர்வுக்கான பெரியார் ஒட்டுமொத்தமாக இணைந்து'}

In [18]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["transcript"]).lower() + " "
    return batch

In [19]:
common_voice_train = common_voice_train.map(remove_special_characters, remove_columns=["transcript"])
common_voice_test = common_voice_test.map(remove_special_characters, remove_columns=["transcript"])

Map:   0%|          | 0/2529 [00:00<?, ? examples/s]

Map:   0%|          | 0/1085 [00:00<?, ? examples/s]

In [20]:
def extract_all_chars(batch):
  all_text = " ".join(batch["text"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [21]:
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

Map:   0%|          | 0/2529 [00:00<?, ? examples/s]

Map:   0%|          | 0/2529 [00:00<?, ? examples/s]

In [22]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [24]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
#vocab_dict

In [25]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [26]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

79

In [27]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [28]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")



In [29]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [30]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [31]:
 processor.save_pretrained("5_seconds_wav2vec2-large-xlsr-tamil-demo")

[]

In [42]:
path, speech_array, sampling_rate = common_voice_train[0]["audio"]

In [45]:
common_voice_train[0]["audio"]["array"]

array([ 0.3232654 ,  0.49187958,  0.46607825, ..., -0.358013  ,
       -0.37538546, -0.32246423])

In [47]:
common_voice_train[0]["audio"]

{'path': 'Tamil_wav_5_seconds/1010345_2024.08.05_11.49.15-2024.08.05_11.50.15_12.wav',
 'array': array([ 0.3232654 ,  0.49187958,  0.46607825, ..., -0.358013  ,
        -0.37538546, -0.32246423]),
 'sampling_rate': 16000}

In [48]:
import torchaudio

def speech_file_to_array_fn(batch):
    #path, speech_array, sampling_rate = torchaudio.load(batch["audio"])
    #path, speech_array, sampling_rate = batch["audio"]
    #batch["speech"] = speech_array[0].numpy()
    #batch["sampling_rate"] = sampling_rate
    path, speech_array, sampling_rate = batch["audio"]
    batch["speech"] = batch["audio"]["array"]
    batch["sampling_rate"] = batch["audio"]["sampling_rate"]
    
    batch["target_text"] = batch["text"]
    return batch

In [49]:
common_voice_train = common_voice_train.map(speech_file_to_array_fn, remove_columns=common_voice_train.column_names)
common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names)


Map:   0%|          | 0/2529 [00:00<?, ? examples/s]

Map:   0%|          | 0/1085 [00:00<?, ? examples/s]

In [50]:
def prepare_dataset(batch):
    # check that all files have the correct sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

In [51]:
common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names, batch_size=8, num_proc=4, batched=True)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names, batch_size=8, num_proc=4, batched=True)

Map (num_proc=4):   0%|          | 0/2529 [00:00<?, ? examples/s]

2024-08-16 11:10:30.117986: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-16 11:10:31.271819: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-16 11:10:33.126453: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-16 11:10:33.490237: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-16 11:10:33.490237: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with 

Map (num_proc=4):   0%|          | 0/1085 [00:00<?, ? examples/s]

2024-08-16 11:11:01.186201: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-16 11:11:01.211262: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-16 11:11:03.216686: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-16 11:11:03.240509: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-16 11:11:05.291546: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your

In [52]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [53]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [54]:
import evaluate
wer_metric = evaluate.load("wer")


2024-08-16 11:11:54.894210: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-16 11:11:54.917990: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [55]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [56]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53", 
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True, 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
model.freeze_feature_extractor()


SyntaxError: invalid syntax (3479821678.py, line 2)

In [65]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  #output_dir="/content/gdrive/MyDrive/wav2vec2-large-xlsr-tamil-demo",
  output_dir="./5_seconds_wav2vec2-large-xlsr-Tamil-demo",
  group_by_length=True,
  per_device_train_batch_size=1,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=30,
  fp16=True,
  save_steps=400,
  eval_steps=400,
  logging_steps=400,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
)

In [66]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    tokenizer=processor.feature_extractor,
)

In [67]:
import torch
torch.cuda.empty_cache()
trainer.train()


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 3.83 GiB of which 37.50 MiB is free. Including non-PyTorch memory, this process has 3.24 GiB memory in use. Of the allocated memory 2.92 GiB is allocated by PyTorch, and 30.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)