In [1]:
# Clone the Hugging Face Transformers repository
!git clone https://github.com/huggingface/transformers.git


Cloning into 'transformers'...
remote: Enumerating objects: 240373, done.[K
remote: Counting objects: 100% (969/969), done.[K
remote: Compressing objects: 100% (450/450), done.[K
remote: Total 240373 (delta 575), reused 707 (delta 427), pack-reused 239404 (from 1)[K
Receiving objects: 100% (240373/240373), 253.48 MiB | 28.42 MiB/s, done.
Resolving deltas: 100% (175864/175864), done.


In [2]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
!nvidia-smi


CUDA available: True
Sat Nov  9 17:47:12 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              44W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                               

In [3]:
!pip install transformers




In [4]:
import transformers
print(transformers.__version__)


4.44.2


In [5]:
%%writefile custom_model.py

import torch
import torch.nn as nn
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

class Wav2Vec2ForCTCWithClassification(Wav2Vec2ForCTC):
    def __init__(self, model_name, num_classes=2):
        super().__init__(Wav2Vec2ForCTC.from_pretrained(model_name).config)

        # Load the base model
        self.wav2vec2 = Wav2Vec2ForCTC.from_pretrained(model_name)
        self.classification_head = nn.Linear(self.wav2vec2.config.hidden_size, num_classes)

    def forward(self, input_values, attention_mask=None, labels=None, class_labels=None):
        outputs = self.wav2vec2(input_values=input_values, attention_mask=attention_mask, labels=labels)

        # Extract hidden states
        hidden_states = outputs.hidden_states[-1]
        class_logits = self.classification_head(hidden_states[:, 0, :])

        classification_loss = None
        if class_labels is not None:
            classification_loss_fn = nn.CrossEntropyLoss()
            classification_loss = classification_loss_fn(class_logits, class_labels)

        return {
            "logits": outputs.logits,
            "class_logits": class_logits,
            "loss": outputs.loss,
            "classification_loss": classification_loss
        }


Writing custom_model.py


In [6]:
from custom_model import Wav2Vec2ForCTCWithClassification
from transformers import Wav2Vec2Processor

# Load the processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

# Load your custom model
model = Wav2Vec2ForCTCWithClassification("facebook/wav2vec2-base", num_classes=2)
print("Model loaded successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!


In [7]:
!pip install datasets transformers librosa


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!pip install torchaudio




In [10]:
import torch
import gc

torch.cuda.empty_cache()
gc.collect()


112

In [11]:
# Check the dataset
import os
import torch
import torchaudio
from datasets import Dataset
from transformers import Wav2Vec2Processor

# Initialize the processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

audio_folder = '/content/drive/MyDrive/FinalProject570/Data/wavdata'
transcription_folder = '/content/drive/MyDrive/FinalProject570/Data/chatdata'

# Function to load audio using torchaudio
def load_audio(file_path):
    speech_array, sampling_rate = torchaudio.load(file_path)
    speech_array = speech_array.squeeze().numpy()
    return speech_array, sampling_rate

# Function to preprocess each example
def preprocess_example(file_name):
    base_name = os.path.splitext(file_name)[0]
    wav_file_path = os.path.join(audio_folder, file_name)
    cha_file_path = os.path.join(transcription_folder, f"{base_name}.cha")

    # Load audio using torchaudio
    speech_array, sampling_rate = load_audio(wav_file_path)
    input_values = processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True).input_values[0]

    # Load transcription from .cha file
    transcription = ""
    with open(cha_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith("*") and ":" in line:
                transcription += line.split(":")[1].strip() + " "

    # Tokenize transcription
    labels = processor.tokenizer(transcription, return_tensors="pt", padding=True).input_ids[0]
    return {"input_values": input_values, "labels": labels}

# Load data into a Dataset object
def load_dataset(sample_size=1):
    data = []
    count = 0
    for wav_file in os.listdir(audio_folder):
        if wav_file.endswith('.wav'):
            try:
                example = preprocess_example(wav_file)
                data.append(example)
                count += 1
                if count >= sample_size:  # Limit the number of samples
                    break
            except Exception as e:
                print(f"Error processing file {wav_file}: {e}")

    # Convert lists to tensors with padding
    input_values = torch.nn.utils.rnn.pad_sequence(
        [d["input_values"] for d in data], batch_first=True
    )
    labels = torch.nn.utils.rnn.pad_sequence(
        [d["labels"] for d in data], batch_first=True, padding_value=-100
    )

    return Dataset.from_dict({
        "input_values": input_values,
        "labels": labels
    })

# Load a small subset of the dataset for testing
dataset = load_dataset(sample_size=1)

# Check the dataset
print(f"Number of examples in dataset: {len(dataset)}")

# Print an example from the dataset
print("Sample input values:", dataset[0]['input_values'][:10])  # Print the first 10 values for inspection
print("Sample labels:", processor.decode(dataset[0]['labels']))



Number of examples in dataset: 1
Sample input values: [0.0010717147961258888, 0.0010717147961258888, 0.0010717147961258888, 0.0010717147961258888, 0.0010717147961258888, 0.0010717147961258888, 0.0010717147961258888, 0.0010717147961258888, 0.0010717147961258888, 0.0010717147961258888]
Sample labels: <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> O<unk> <unk> <unk>'<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'<unk> <unk> <unk> <unk> I'<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> I'<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'<unk> <unk> <unk> <unk> <unk> <unk> <unk> I <unk> <unk> <unk> <un

In [12]:
import os
import torch
from transformers import TrainingArguments, Trainer
import gc

# Clear GPU cache and set environment variables for better memory management
torch.cuda.empty_cache()
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"  # Reduced from 256 to 128

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=1, # Batch size reduced to 1
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=1000,
    fp16=True,
    gradient_accumulation_steps=2,  # Reduced from 4 to 2
    dataloader_num_workers=2,
    dataloader_pin_memory=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor
)

# Explicit garbage collection to free up memory
gc.collect()
torch.cuda.empty_cache()

print("Starting training...")
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Starting training...


  return F.conv1d(


OutOfMemoryError: CUDA out of memory. Tried to allocate 35.21 GiB. GPU 0 has a total capacity of 39.56 GiB of which 15.64 GiB is free. Process 11619 has 23.91 GiB memory in use. Of the allocated memory 23.39 GiB is allocated by PyTorch, and 25.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)