In [5]:
import pandas as pd
import torch
import torchaudio
from pathlib import Path
import transformers
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from transformers import AutoFeatureExtractor, WhisperForAudioClassification
import os

# Hugging Face access token, read from the environment (None when HF_TOKEN is unset).
token = os.environ.get("HF_TOKEN")
# Checkpoint that supplies both the feature extractor and the model weights.
model_id = "openai/whisper-tiny"

# The same credentials are used for both downloads.
hub_kwargs = {"token": token}
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **hub_kwargs)
model = WhisperForAudioClassification.from_pretrained(
    model_id,
    # NOTE(review): presumably the number of distinct classID values in the
    # ground-truth CSV — confirm against the data.
    num_labels=7,
    **hub_kwargs,
)

In [2]:
# Prefix for the audio clips. The trailing slash matters: SoundDS builds each
# file path by plain string concatenation with the row's 'relative_path'.
data_path = "data/clips/"
# Table describing the clips; SoundDS reads its 'relative_path' and 'classID' columns.
ground_truth_csv = "data/ground_truth.csv"
df = pd.read_csv(ground_truth_csv)

In [None]:
# Importing `metric` from `datasets` does not give a loaded metric object
# (at best it is a submodule), so `metric.compute(...)` cannot work with it.
# Bind `metric` to the accuracy metric instead.
# NOTE(review): `load_metric` was deprecated and later removed from `datasets`
# in favour of the separate `evaluate` package — confirm the installed version.
from datasets import load_metric

metric = load_metric("accuracy")

In [None]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio
from noisereduce.torchgate import TorchGate as TG

class AudioUtil():
  """Stateless helpers for loading, denoising and feature-extracting audio clips.

  The methods rely on module-level names defined in other cells: `torchaudio`,
  `TG` (noisereduce's TorchGate), `device` and `feature_extractor`.
  """

  # ----------------------------
  # Load an audio file. Return the signal as a tensor and the sample rate
  # ----------------------------
  @staticmethod
  def open(audio_file):
    """Load an audio file with ``torchaudio.load``.

    Args:
      audio_file: Location of the audio file, passed straight to ``torchaudio.load``.

    Returns:
      The ``(waveform, sample_rate)`` pair produced by ``torchaudio.load``.
      Callers must unpack it before handing the waveform to the other helpers.
    """
    waveform, sample_rate = torchaudio.load(audio_file)
    return waveform, sample_rate

  # ----------------------------
  # Remove background noise
  # ----------------------------
  @staticmethod
  def background_noise_removal(noisy_speech, sr=16000):
    """Suppress background noise with a TorchGate module.

    Args:
      noisy_speech: Waveform tensor to denoise.
        NOTE(review): TorchGate presumably expects (batch/channels, samples) — confirm.
      sr: Sample rate in Hz handed to TorchGate. Defaults to 16000.

    Returns:
      The denoised tensor, on the same device the input was on.
    """
    tg = TG(sr=sr).to(device)
    # The gate lives on `device`; move the input there so module and data agree,
    # then hand the result back on the caller's original device.
    enhanced_speech = tg(noisy_speech.to(device))
    return enhanced_speech.to(noisy_speech.device)

  # ----------------------------
  # Audio PreProcessing
  # ----------------------------
  @staticmethod
  def preprocess_function(array, max_duration=4.0):
    """Run the module-level ``feature_extractor`` on one clip.

    Args:
      array: Audio samples, assumed to already be at ``feature_extractor.sampling_rate``.
        NOTE(review): the extractor presumably wants mono samples as a 1-D
        array/list rather than a (channels, samples) tensor — confirm.
      max_duration: Maximum clip length in seconds; longer clips are truncated.
        Defaults to 4.0.

    Returns:
      Whatever ``feature_extractor`` returns for the clip.
    """
    sampling_rate = feature_extractor.sampling_rate
    # NOTE(review): Whisper checkpoints normally expect 30-second inputs; confirm
    # that the model accepts features built with this shorter max_length.
    features = feature_extractor(
        array,
        sampling_rate=sampling_rate,
        max_length=int(sampling_rate * max_duration),
        truncation=True,
    )
    return features
     


In [None]:
from torch.utils.data import DataLoader, Dataset
import torchaudio

# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
  """Map-style dataset yielding ``(features, class_id)`` for each row of a clip table.

  Relies on the module-level ``AudioUtil`` helpers, ``torchaudio`` and
  ``feature_extractor`` defined in other cells.
  """

  def __init__(self, df, data_path):
    """Store the clip table and the path prefix.

    Args:
      df: Table with a 'relative_path' and a 'classID' column, one row per clip.
      data_path: Prefix prepended to each relative path by plain string
        concatenation (no separator is inserted).
    """
    self.df = df
    self.data_path = str(data_path)

  # ----------------------------
  # Number of items in dataset
  # ----------------------------
  def __len__(self):
    """Return the number of rows in the clip table."""
    return len(self.df)

  # ----------------------------
  # Get i'th item in dataset
  # ----------------------------
  def __getitem__(self, idx):
    """Load, denoise and feature-extract the clip at position ``idx``.

    Args:
      idx: Zero-based position of the row in the clip table.

    Returns:
      ``(features, class_id)``: the output of ``AudioUtil.preprocess_function``
      and the row's 'classID' value.
    """
    # Positional lookup: samplers and random_split hand out positions
    # 0..len-1, which need not coincide with the frame's index labels.
    audio_file = self.data_path + self.df['relative_path'].iloc[idx]
    # Get the Class ID
    class_id = self.df['classID'].iloc[idx]

    # AudioUtil.open returns (waveform, sample_rate); only the waveform is
    # passed on to the processing helpers.
    sig, sr = AudioUtil.open(audio_file)

    # preprocess_function declares the audio to be at the extractor's rate,
    # so bring clips recorded at any other rate to it first.
    target_sr = feature_extractor.sampling_rate
    if sr != target_sr:
      sig = torchaudio.functional.resample(sig, sr, target_sr)

    reaud = AudioUtil.background_noise_removal(sig)
    aug = AudioUtil.preprocess_function(reaud)

    return aug, class_id

In [None]:
from torch.utils.data import random_split

myds = SoundDS(df, data_path)

# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
# Seed the split so the same clips land in each subset on every run;
# without a generator the partition changes with each kernel restart.
split_generator = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(myds, [num_train, num_val], generator=split_generator)

# Create training and validation data loaders
# (training batches are reshuffled each epoch; validation order is fixed)
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=32, shuffle=False)

In [None]:
from transformers import TrainingArguments, Trainer

# Per-device batch size shared by training and evaluation.
batch_size = 32

args = TrainingArguments(
    # Where checkpoints (saved every epoch, see save_strategy) are written.
    # Older transformers releases require this argument.
    output_dir="whisper-tiny-audio-classification",
    # NOTE(review): newer transformers releases rename this to `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    # Gradients are accumulated over 4 steps, i.e. 4 * batch_size examples
    # per optimizer update on each device.
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    # Must match a key of the dict returned by compute_metrics.
    metric_for_best_model="accuracy",
)

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Compute classification accuracy for a set of predictions.

    The accuracy is computed directly with numpy, so the function does not
    depend on a separately loaded module-level metric object.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the true class ids).

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of examples
        whose highest-scoring class equals the label.
    """
    scores = eval_pred.predictions
    # NOTE(review): predictions may arrive as a tuple of arrays when the model
    # returns several outputs; assumed here that the class scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    predictions = np.argmax(scores, axis=1)
    labels = np.asarray(eval_pred.label_ids)
    return {"accuracy": float((predictions == labels).mean())}

In [None]:
# Trainer builds its own DataLoaders (using the batch sizes in `args`), so it
# is given the Dataset splits rather than the DataLoader objects.
# NOTE(review): SoundDS yields (features, class_id) tuples, while Trainer's
# default collation presumably expects dict-like items carrying the model
# inputs and a label entry — a custom `data_collator` may be required; confirm.
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the model; the returned value is kept and shown as the cell output.
train_output = trainer.train()
train_output

In [None]:
# Evaluate on the Trainer's eval_dataset; the returned metrics are shown as the cell output.
eval_metrics = trainer.evaluate()
eval_metrics