In [25]:
import pandas as pd
import torch
import torchaudio
from pathlib import Path
import transformers
device = torch.device("cuda")

In [26]:
from transformers import AutoFeatureExtractor, WhisperForAudioClassification
import os

token = os.getenv('HF_TOKEN')
model_id = "openai/whisper-tiny"

feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, token=token)
model = WhisperForAudioClassification.from_pretrained(model_id, token=token, num_labels=7)

Some weights of WhisperForAudioClassification were not initialized from the model checkpoint at openai/whisper-tiny and are newly initialized: ['model.classifier.bias', 'model.classifier.weight', 'model.projector.bias', 'model.projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
data_path = ""
df = pd.read_csv("data/ground_truth.csv")

In [28]:
from datasets import metric

In [29]:
sampling_rate = feature_extractor.sampling_rate
max_duration = 30 #seconds


In [30]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio
from noisereduce.torchgate import TorchGate as TG

class AudioUtil():
  # ----------------------------
  # Load an audio file. Return the signal as a tensor and the sample rate
  # ----------------------------
  @staticmethod
  def open(audio_file):
    sig, sr = torchaudio.load(audio_file)
    return sig
  
  # ----------------------------
  # Remove background noise
  # ----------------------------

  from noisereduce.torchgate import TorchGate as TG   

  @staticmethod
  def background_noise_removal(noisy_speech):
      tg = TG(sr=16000, nonstationary=True).to(device)
      noisy_speech = noisy_speech.to(device)
      enhanced_speech = tg(noisy_speech)
      return enhanced_speech
  

  @staticmethod
  def pad_trunc(aud, max_ms):
      sr = 16000  # Sample rate
      max_len = sr // 1000 * max_ms  # Convert ms to sample length
      num_rows, sig_len = aud.shape

      if sig_len > max_len:
          # Truncate the signal to the given length
          aud = aud[:, :max_len]
      elif sig_len < max_len:
          # Length of padding to add at the beginning and end of the signal
          pad_begin_len = random.randint(0, max_len - sig_len)
          pad_end_len = max_len - sig_len - pad_begin_len

          # Pad with 0s
          pad_begin = torch.zeros((num_rows, pad_begin_len), device=aud.device)
          pad_end = torch.zeros((num_rows, pad_end_len), device=aud.device)

          aud = torch.cat((pad_begin, aud, pad_end), 1)

      return (aud, sr)
  

  # ----------------------------
  # Audio PreProcessing
  # ----------------------------
  @staticmethod
  def preprocess_function(array):
     # Move tensor to CPU before processing with NumPy
     if array.is_cuda:
       array = array.cpu()

     input = feature_extractor(array,
                              sampling_rate=feature_extractor.sampling_rate,
                              max_length = int(feature_extractor.sampling_rate * max_duration),
                              truncation = True
      )
     return input
     


In [31]:
from torch.utils.data import DataLoader, Dataset
import torchaudio

# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
  def __init__(self, df, data_path):
    self.df = df
    self.data_path = str(data_path)
    # self.duration = 4000
    # self.sr = 44100
    # self.channel = 2
    # self.shift_pct = 0.4
            
  # ----------------------------
  # Number of items in dataset
  # ----------------------------
  def __len__(self):
    return len(self.df)    
    
  # ----------------------------
  # Get i'th item in dataset
  # ----------------------------
  def __getitem__(self, idx):
    # Absolute file path of the audio file - concatenate the audio directory with
    # the relative path
    audio_file = self.data_path + self.df.loc[idx, 'relative_path']
    # Get the Class ID
    class_id = self.df.loc[idx, 'classID']

    aud = AudioUtil.open(audio_file)

    rechan = AudioUtil.background_noise_removal(aud)

    dur_aud, dur_sr = AudioUtil.pad_trunc(rechan, 30000)

    aug = AudioUtil.preprocess_function(dur_aud)

    return aug, class_id

In [32]:
from torch.utils.data import random_split

myds = SoundDS(df, data_path)

# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

# Create training and validation data loaders
# train_dl = torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)
# val_dl = torch.utils.data.DataLoader(val_ds, batch_size=32, shuffle=False)

In [33]:
from transformers import TrainingArguments, Trainer

batch_size = 32

args = TrainingArguments(
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    output_dir="./"
)



In [34]:
import numpy as np

def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [35]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

In [36]:
trainer.train()

  0%|          | 0/145 [03:48<?, ?it/s]
  0%|          | 0/145 [00:00<?, ?it/s]

ValueError: axes don't match array

In [None]:
trainer.evaluate()