In [1]:
import pandas as pd
import torch
import torchaudio
from pathlib import Path
import transformers
from noisereduce.torchgate import TorchGate as TG

### Removing Background Noise (Torchgate)

In [2]:
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# # Create TorchGating instance
# tg = TG(sr=8000, nonstationary=True).to(device)

In [3]:
from torch.utils.data import Dataset
import os
import pandas as pd

# Read the ground-truth clip labels (one row per clip) into a DataFrame
csv_file_path = 'data/ground_truth_clip_labels.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Define a custom dataset class
class AudioDataset(Dataset):
    """Map-style dataset that pairs each labelled row with its audio clip.

    Args:
        dataframe (DataFrame): Table providing the 'Show', 'EpId', 'ClipId'
            and 'y' columns.
        root_dir (str): Folder holding the '<Show>_<EpId>_<ClipId>.wav' files.
        transform (callable, optional): Called on the sample dict; whatever it
            returns is handed back in place of the sample.
    """

    def __init__(self, dataframe, root_dir, transform=None):
        self.dataframe, self.root_dir, self.transform = dataframe, root_dir, transform

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the clip at position ``idx`` and return it with its label.

        Returns:
            dict with keys 'audio', 'sample_rate' and 'label', or the result
            of ``transform`` applied to that dict when a transform is set.
        """
        # Tensor indices are converted to plain Python values before positional lookup.
        position = idx.tolist() if torch.is_tensor(idx) else idx

        # Fetch the row once; every field below is read from it.
        row = self.dataframe.iloc[position]

        clip_file = f"{row['Show']}_{row['EpId']}_{row['ClipId']}.wav"
        waveform, sample_rate = torchaudio.load(os.path.join(self.root_dir, clip_file))

        sample = {'audio': waveform, 'sample_rate': sample_rate, 'label': row['y']}
        return self.transform(sample) if self.transform else sample

# Folder that holds the .wav clips referenced by the label table
audio_dir = 'data/clips/'

# Wrap the label table and clip folder in a dataset (no transform attached yet)
audio_dataset = AudioDataset(df, audio_dir)

# The dataset can be batched with a DataLoader, e.g.:
# data_loader = DataLoader(audio_dataset, batch_size=4, shuffle=True)

In [4]:
from transformers import AutoFeatureExtractor, WhisperForAudioClassification

model_id = "openai/whisper-tiny"

# SECURITY: a Hugging Face access token was previously hard-coded on this line.
# Anything committed with the notebook must be treated as leaked, so that token
# should be revoked. The token is now read from the HF_TOKEN environment
# variable; when it is unset, None is passed and no explicit token is supplied.
token = os.environ.get("HF_TOKEN")

# Load the feature extractor and the audio-classification model from the same checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, token=token)
model = WhisperForAudioClassification.from_pretrained(model_id, token=token)

model.safetensors:  49%|####8     | 73.4M/151M [00:00<?, ?B/s]

: 

In [None]:
from torch.utils.data import DataLoader

# Assuming you have already initialized 'audio_dataset' and 'feature_extractor' as shown in previous code

# Define the preprocessing transform as a PyTorch nn.Module
class PreprocessAudio(torch.nn.Module):
    """Convert a raw dataset sample into feature-extractor output.

    The waveform is mixed down to a single channel and resampled to
    ``required_rate`` before it is passed to the module-level
    ``feature_extractor``. The sample's label, when present, is copied onto
    the returned features so it is not lost by the transform.

    Args:
        required_rate (int): Sampling rate (Hz) passed to the feature
            extractor; audio at any other rate is resampled to it first.
    """

    def __init__(self, required_rate):
        super().__init__()
        self.required_rate = required_rate

    def forward(self, sample):
        """Preprocess one sample.

        Args:
            sample (dict): Must contain 'audio' (a tensor) and 'sample_rate';
                may contain 'label'.

        Returns:
            The feature extractor's output, with a 'label' entry added when
            the input sample carried one.
        """
        waveform, sample_rate = sample['audio'], sample['sample_rate']

        # Average over the leading (channel) axis so multi-channel clips become
        # 1-D; for a single channel this equals the former squeeze(0).
        if waveform.dim() > 1:
            waveform = waveform.mean(dim=0)

        # Previously the clip's real rate was ignored and `required_rate` was
        # declared to the extractor regardless, so audio recorded at another
        # rate was silently misinterpreted. Resample so the declared rate is true.
        if sample_rate != self.required_rate:
            waveform = torchaudio.functional.resample(
                waveform, orig_freq=sample_rate, new_freq=self.required_rate
            )

        # NOTE(review): max_length=16000 with truncation=True appears to cap each
        # clip at 16000 samples, and padding=True pads only to the longest input.
        # Confirm this matches the input length the Whisper model expects.
        features = feature_extractor(
            waveform.numpy(),
            sampling_rate=self.required_rate,
            return_tensors="pt",
            padding=True,
            max_length=16000,
            truncation=True,
        )

        # Keep the target alongside the features; the original dropped it.
        if 'label' in sample:
            features['label'] = sample['label']

        return features

# Attach the preprocessing step to the existing dataset
audio_dataset.transform = PreprocessAudio(required_rate=feature_extractor.sampling_rate)

# Identity collate_fn: each batch stays a plain list of per-sample outputs
# instead of being stacked by the default collate function.
data_loader = DataLoader(audio_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: x)

# Example loop to process batches
for batch in data_loader:
    # The Whisper feature extractor stores its output under 'input_features';
    # the previous 'input_values' key does not exist and raised KeyError.
    inputs = [sample['input_features'] for sample in batch]  # Extract processed features
    # You can continue here to feed 'inputs' to your model


: 