In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
!pip install torch torchaudio transformers datasets jiwer --quiet

In [44]:
import os
import torchaudio
from datasets import Dataset, load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
import jiwer

# Define paths to audio and transcription files
audio_dir = "/kaggle/input/dysarthria-and-nondysarthria-speech-dataset/Dysarthria and Non Dysarthria/Dataset/Female_Non_Dysarthria/FC01/Session1/Wav"
transcription_dir = "/kaggle/input/dysarthria-and-nondysarthria-speech-dataset/Dysarthria and Non Dysarthria/Dataset/Female_Non_Dysarthria/FC01/Session1/Txt"

# Function to load audio and transcription data
def load_data(audio_dir, transcription_dir):
    audio_list = []
    transcription_list = []
    
    for file in os.listdir(audio_dir):
        if file.endswith(".wav"):
            audio_path = os.path.join(audio_dir, file)
            transcription_path = os.path.join(transcription_dir, file.replace(".wav", ".txt"))

            if os.path.exists(transcription_path):
                with open(transcription_path, "r") as f:
                    transcription = f.read().strip()

                # Load audio file
                waveform, sample_rate = torchaudio.load(audio_path)

                # Append to lists
                audio_list.append({"array": waveform.squeeze().numpy(), "sampling_rate": sample_rate})
                transcription_list.append(transcription)

    return {"audio": audio_list, "transcription": transcription_list}

# Load dataset
data = load_data(audio_dir, transcription_dir)

# Convert data to Hugging Face dataset format
dataset = Dataset.from_dict(data)


In [7]:
dataset

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 164
})

In [45]:

# Preprocess the dataset
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

def prepare_batch(batch):
    # Extract audio and transcription
    audio = batch["audio"]
    transcription = batch["transcription"]
    
    # Preprocess audio
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt", padding=True)
    
    # Preprocess transcription
    with processor.as_target_processor():
        labels = processor(transcription, return_tensors="pt").input_ids
    
    batch["input_values"] = inputs.input_values[0]
    batch["labels"] = labels[0]
    
    return batch

# Apply preprocessing to the dataset
dataset = dataset.map(prepare_batch, remove_columns=["audio", "transcription"])




Map:   0%|          | 0/164 [00:00<?, ? examples/s]



In [10]:
!pip install evaluate --quiet

In [54]:
import torch
from dataclasses import dataclass
from typing import Dict, List, Union
import numpy as np
from transformers import Wav2Vec2Processor

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: bool = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Extract input values and labels
        input_values = [feature["input_values"] for feature in features]
        labels = [feature["labels"] for feature in features]

        # Convert input_values to tensors if they're not already
        input_values = [torch.tensor(x) if not isinstance(x, torch.Tensor) else x for x in input_values]

        # Pad input values using the feature extractor
        batch = self.processor.feature_extractor.pad(
            {"input_values": input_values},
            padding=self.padding,
            return_tensors="pt",
        )

        # Create attention mask if not provided
        if "attention_mask" not in batch:
            batch["attention_mask"] = torch.ones_like(batch["input_values"])

        # Pad labels using the tokenizer
        label_features = self.processor.tokenizer.pad(
            {"input_ids": labels},
            padding=self.padding,
            return_tensors="pt",
        )

        # Replace padding with -100 to ignore loss correctly
        labels = label_features["input_ids"].masked_fill(label_features.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [55]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments

# Load your model and processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

# Create an instance of the data collator
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir="./wav2vec2-torog",
    group_by_length=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    num_train_epochs=3,
    fp16=True,
    gradient_checkpointing=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_steps=1000,
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,  # You might want to split your dataset into train and eval
    tokenizer=processor.feature_extractor,
)

# Start training
trainer.train()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss


TrainOutput(global_step=33, training_loss=4602.848011363636, metrics={'train_runtime': 5273.1317, 'train_samples_per_second': 0.093, 'train_steps_per_second': 0.006, 'total_flos': 9.868901401169894e+16, 'train_loss': 4602.848011363636, 'epoch': 3.0})

In [56]:
model

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder

In [57]:
# Save the model and processor
model.save_pretrained("./wav2vec2-torog")
processor.save_pretrained("./wav2vec2-torog")


[]

In [58]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio

# Load the model and processor
model = Wav2Vec2ForCTC.from_pretrained("./wav2vec2-torog")
processor = Wav2Vec2Processor.from_pretrained("./wav2vec2-torog")


In [67]:
audio_file = "/kaggle/input/dysarthria-and-nondysarthria-speech-dataset/Dysarthria and Non Dysarthria/Dataset/Female_Non_Dysarthria/FC01/Session1/Wav/0004.wav"  # Replace with your audio file path
waveform, sample_rate = torchaudio.load(audio_file)

# Preprocess the audio
inputs = processor(waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt", padding=True)


In [68]:
# Get the predicted logits
with torch.no_grad():
    logits = model(inputs.input_values).logits

# Get the predicted IDs
predicted_ids = torch.argmax(logits, dim=-1)

# Decode the predicted IDs to text
transcription = processor.batch_decode(predicted_ids)
print("Transcription:", transcription)


Transcription: ['EPAEPAEPA EPAEPA EPA EPA']


In [69]:
##############################

In [70]:
import shutil

# Zip the model directory
shutil.make_archive('/kaggle/working/wav2vec2-torog', 'zip', '/kaggle/working/wav2vec2-torog')


'/kaggle/working/wav2vec2-torog.zip'

In [71]:
import os

# Check if the zip file exists
zip_file_path = '/kaggle/working/wav2vec2-torog.zip'
if os.path.exists(zip_file_path):
    # Generate a download link
    print(f'Download your model zip file here: {zip_file_path}')
else:
    print("Zip file does not exist.")


Download your model zip file here: /kaggle/working/wav2vec2-torog.zip
