In [1]:
from src.utils import *

In [5]:
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import os

class CustomAudioDataset(Dataset):
    """Audio dataset that reads ``.wav`` files from a directory tree.

    Every ``.wav`` file found under ``data_dir`` becomes one sample; its label
    is the name of the directory that directly contains the file.

    Args:
        data_dir: Root directory that is walked recursively for ``.wav`` files.
        transform: Optional callable applied to the loaded waveform tensor.
        fixed_length: Optional number of samples. When set (and non-zero),
            every waveform is zero-padded or truncated along dim 1 to exactly
            this length so that samples can be stacked into a batch.
    """

    def __init__(self, data_dir, transform=None, fixed_length=None):
        self.data_dir = data_dir
        self.file_list, self.labels = self._get_file_list_and_labels()
        self.transform = transform
        self.fixed_length = fixed_length

    def _get_file_list_and_labels(self):
        """Collect ``.wav`` paths and their labels in a deterministic order.

        Returns:
            A ``(file_list, labels)`` pair of equally long lists; ``labels[i]``
            is the base name of the directory containing ``file_list[i]``.
        """
        file_list = []
        labels = []
        for root, dirs, files in os.walk(self.data_dir):
            # Sorting in place makes os.walk descend in a stable order, so
            # sample indices do not depend on the file system's listing order.
            dirs.sort()
            for file in sorted(files):
                if file.endswith(".wav"):  # Adjust file extension if needed
                    file_list.append(os.path.join(root, file))
                    labels.append(os.path.basename(root))  # Label = directory name
        return file_list, labels

    def __len__(self):
        """Return the number of audio files found."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(waveform, sample_rate, label)``."""
        file_path = self.file_list[idx]
        waveform, sample_rate = torchaudio.load(file_path)

        if self.transform:
            waveform = self.transform(waveform)

        if self.fixed_length:
            waveform = self._pad_waveform(waveform, self.fixed_length)

        label = self.labels[idx]
        return waveform, sample_rate, label

    def _pad_waveform(self, waveform, target_length):
        """Force ``waveform`` to exactly ``target_length`` entries along dim 1.

        Shorter inputs are right-padded with zeros, longer inputs are cut off
        at ``target_length``. Dim 0 is preserved as-is, so the padding matches
        however many rows (channels) the input has.

        Args:
            waveform: 2-D tensor whose dim 1 is the axis to pad/truncate.
            target_length: Desired size of dim 1.

        Returns:
            Tensor of shape ``(waveform.size(0), target_length)``.
        """
        length_diff = target_length - waveform.size(1)
        if length_diff > 0:
            # new_zeros keeps the dtype/device of the input and matches its
            # row count instead of assuming a single channel.
            padding = waveform.new_zeros((waveform.size(0), length_diff))
            waveform = torch.cat([waveform, padding], dim=1)
        elif length_diff < 0:
            # Longer clips must be cut, otherwise samples of different length
            # cannot be stacked by the default DataLoader collation.
            waveform = waveform[:, :target_length]
        return waveform

# Example usage: build the dataset and wrap it in a shuffling DataLoader.
data_dir = "data/path/train/"
transform = None  # no extra waveform transform; plug one in here if required
fixed_length = 16000  # every clip is brought to this many samples

# Dataset of labelled .wav clips found under data_dir
dataset = CustomAudioDataset(
    data_dir,
    transform=transform,
    fixed_length=fixed_length,
)

# Batched, shuffled iteration over the dataset
batch_size = 32
data_loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)




In [6]:
# Pull batches from data_loader; each batch unpacks into waveforms, sample rates and labels
for waveforms, sample_rates, labels in data_loader:
    # Placeholder for the training/validation step: just report the batch contents
    print(waveforms.shape, sample_rates, labels)

torch.Size([32, 1, 16000]) tensor([16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000,
        16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000,
        16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000,
        16000, 16000]) ('wow', 'yes', 'off', 'up', 'right', 'happy', 'nine', 'wow', 'seven', 'six', 'two', 'cat', 'marvin', 'down', 'up', 'tree', 'nine', 'dog', 'bed', 'down', 'happy', 'six', 'house', 'on', 'go', 'down', 'one', 'right', 'eight', 'bird', 'bird', 'on')
torch.Size([32, 1, 16000]) tensor([16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000,
        16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000,
        16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000, 16000,
        16000, 16000]) ('six', 'down', 'bed', 'left', 'two', 'happy', 'three', 'left', 'two', 'nine', 'right', 'bird', 'house', 'tree', 'no', 'left', 'on', 'stop', 'nine', 'dog', 'tree', 'go', 'tree', 'marvin',

KeyboardInterrupt: 

In [None]:
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))
from tensorflow.keras.optimizers import Adam
# `TFFlaxWhisperForAudioClassification` is not a name exported by transformers
# (importing it raises ImportError), and `UnivNetFeatureExtractor` was used
# below without ever being imported. The checkpoint is a Wav2Vec2 model, so
# the matching TensorFlow class is imported instead.
from transformers import TFWav2Vec2Model, UnivNetModel

# Checkpoint identifier of the pre-trained Wav2Vec2 model
model_name = "facebook/wav2vec2-base-960h"

# Load the pre-trained Wav2Vec2 model (downloads the weights on first use)
model = TFWav2Vec2Model.from_pretrained(model_name)

In [None]:
import tensorflow as tf
from transformers import AutoFeatureExtractor
from tqdm.notebook import tqdm

# Load the Wav2Vec2 feature extractor for the checkpoint. The object is a
# feature extractor rather than a text tokenizer; it keeps the name `tokenizer`
# because tokenize_dataset() looks it up under that name.
model_name = "facebook/wav2vec2-base-960h"
tokenizer = AutoFeatureExtractor.from_pretrained(model_name)

# Feature-extraction helper for batched audio
def tokenize_dataset(dataset):
    """Run the module-level ``tokenizer`` over every batch of ``dataset``.

    Args:
        dataset: Sized iterable whose items unpack into ``(audio_batch, labels)``.

    Returns:
        A list with one ``(input_values, labels)`` tuple per batch, where
        ``input_values`` is taken from the tokenizer output requested with
        ``return_tensors="tf"``.
    """
    results = []
    progress = tqdm(dataset, total=len(dataset))
    for audio_batch, labels in progress:
        encoded = tokenizer(
            audio_batch,
            return_tensors="tf",
            padding=True,
            verbose=False,
            sampling_rate=16000,
        )
        results.append((encoded.input_values, labels))
    return results

# Tokenize the train dataset
# NOTE(review): `train_dataset` is not defined in any visible cell of this
# notebook; presumably it comes from `from src.utils import *` or from an
# earlier kernel session. Confirm before relying on Restart & Run All.
tokenized_train_dataset = tokenize_dataset(train_dataset)

# Tokenize the validation dataset (same caveat applies to `validation_dataset`)
tokenized_validation_dataset = tokenize_dataset(validation_dataset)


import tensorflow as tf

# Generator adapter: replays pre-tokenized batches one at a time
def batch_generator(tokenized_batches):
    """Yield each ``(input_values, labels)`` pair of ``tokenized_batches`` in order."""
    yield from ((input_values, labels) for input_values, labels in tokenized_batches)

# Wrap the tokenized batches as tf.data datasets (train and validation).
# Both datasets declare the same element signature, so it is spelled out once.
# NOTE(review): the lambdas resolve the global list names only when a dataset
# is iterated; a later cell `del`s those lists, after which iterating these
# datasets would raise NameError.
batch_signature = (
    tf.TensorSpec(shape=(1, 32, 16000, 1), dtype=tf.float32),  # Input values
    tf.TensorSpec(shape=(32, 30), dtype=tf.int32),             # Labels
)

keras_train_dataset = tf.data.Dataset.from_generator(
    generator=lambda: batch_generator(tokenized_train_dataset),
    output_signature=batch_signature,
)

keras_validation_dataset = tf.data.Dataset.from_generator(
    generator=lambda: batch_generator(tokenized_validation_dataset),
    output_signature=batch_signature,
)



In [None]:
# Each stored item is an (input_values, labels) pair; take element 0 of every
# input_values tensor and join the results along axis 0.
X_train = tf.concat([inputs[0] for inputs, _ in tokenized_train_dataset], axis=0)
X_val = tf.concat([inputs[0] for inputs, _ in tokenized_validation_dataset], axis=0)

In [None]:
# Join the label part of every (input_values, labels) pair along axis 0.
y_train = tf.concat([batch_labels for _, batch_labels in tokenized_train_dataset], axis=0)
y_val = tf.concat([batch_labels for _, batch_labels in tokenized_validation_dataset], axis=0)

In [None]:
import gc
# Drop the per-batch lists and trigger a garbage-collection pass.
del tokenized_train_dataset, tokenized_validation_dataset
gc.collect()

In [None]:
# Convert the TensorFlow tensors to NumPy arrays.
# Guarded so the cell can be re-run: after the first run X_train / X_val are
# already ndarrays, which have no .numpy() method.
X_train = X_train.numpy() if hasattr(X_train, "numpy") else X_train
X_val = X_val.numpy() if hasattr(X_val, "numpy") else X_val

In [None]:
# Reshape each array to 2-D using the extents of its first two axes.
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1]))
X_val = X_val.reshape((X_val.shape[0], X_val.shape[1]))

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Conv1D, BatchNormalization, Dropout, Dense
from tensorflow.keras.layers import Bidirectional, LSTM, TimeDistributed


def get_SR_Model(num_classes: int):
    """Build a Conv1D + LSTM classifier over inputs of shape ``(16000, 1)``.

    Architecture: two strided Conv1D blocks (each followed by batch
    normalisation and dropout), two stacked LSTMs, and a softmax output layer.

    Args:
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``tensorflow.keras.Model``.
    """
    X_input = Input(shape=(16000, 1))

    # First convolutional block
    X = Conv1D(filters=256, kernel_size=15, strides=4)(X_input)
    X = BatchNormalization()(X)
    X = Dropout(0.2)(X)

    # Second convolutional block. It must consume X, not X_input: feeding
    # X_input here disconnected the first block from the model's output.
    X = Conv1D(filters=512, kernel_size=15, strides=4)(X)
    X = BatchNormalization()(X)
    X = Dropout(0.2)(X)

    # Recurrent stack: the first LSTM emits the full sequence, the second only
    # its final state, which feeds the classifier.
    X = LSTM(units=512, return_sequences=True)(X)
    X = LSTM(units=512, return_sequences=False)(X)
    X = Dense(num_classes, activation='softmax')(X)
    return Model(inputs=[X_input], outputs=[X])

# NOTE(review): this builds a 2-class model, while other cells in this notebook
# use 30 classes (label spec of shape (32, 30), get_wav2vec_classifier(num_classes=30)).
# Confirm the intended class count. `model` is re-assigned by a later cell.
model = get_SR_Model(2)



In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from transformers import TFWav2Vec2Model
from tensorflow.keras import Model, layers


class TFWav2Vec2Layer(layers.Layer):
    """Keras layer wrapping a non-trainable pre-trained ``TFWav2Vec2Model``.

    The ``facebook/wav2vec2-base-960h`` checkpoint is loaded when the layer is
    constructed and marked as not trainable.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        backbone = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
        backbone.trainable = False
        self.wav2vec_model = backbone

    def call(self, inputs, **kwargs):
        """Return the wrapped model's ``last_hidden_state`` for ``inputs``."""
        outputs = self.wav2vec_model(inputs)
        return outputs["last_hidden_state"]


def get_wav2vec_classifier(num_classes: int):
    """Build a classifier head on top of ``TFWav2Vec2Layer``.

    Args:
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled Keras ``Model`` mapping a variable-length float waveform
        batch to per-class probabilities.
    """
    # Wav2Vec2 consumes the feature extractor's floating-point input_values.
    # Declaring the input as int32 would make Keras cast the normalised audio
    # to integers before it reaches the backbone.
    audio_input = layers.Input(shape=(None,), dtype=tf.float32)

    # Pass input through custom Wav2Vec2 layer -> (batch, frames, hidden)
    hidden_states = TFWav2Vec2Layer()(audio_input)

    # Wav2Vec2 has no dedicated classification token, so frame 0 only reflects
    # the start of the clip; average over the frame axis to summarise it all.
    pooled = layers.GlobalAveragePooling1D()(hidden_states)

    # Classification head
    x = layers.Dense(512, activation='relu')(pooled)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(num_classes, activation='softmax')(x)

    # Define model
    model = Model(inputs=audio_input, outputs=x)

    return model


# Example usage: build a 30-class classifier. This re-binds the notebook-level
# name `model`, replacing whatever an earlier cell assigned to it.
model = get_wav2vec_classifier(num_classes=30)


In [None]:
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics

# Configure training: Adam optimiser, categorical cross-entropy loss, and
# categorical accuracy as the reported metric.
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-3),
    loss=losses.CategoricalCrossentropy(),
    metrics=[metrics.CategoricalAccuracy()],
)

In [None]:
# Train for 100 epochs in mini-batches of 64, passing (X_val, y_val) as validation data.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=64,
)