In [18]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import io
import json
import librosa
from sklearn.model_selection import train_test_split
import librosa.display
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

In [None]:
# Load the pretrained YAMNet model from its Kaggle Models handle via TensorFlow Hub.
# NOTE(review): the name `model` is rebound to the PyTorch classifier in a later cell,
# so any cell that calls YAMNet through `model` has to run before that rebinding
# (or after this cell is re-run).
model = hub.load('https://www.kaggle.com/models/google/yamnet/TensorFlow2/yamnet/1')


In [None]:
# Load one example clip at a 16 kHz sample rate (the returned sample rate is discarded)
# and display the resulting array shape.
waveform, _ = librosa.load('Audios/vid_5.mp3', sr=16000)
waveform.shape

(267606,)

In [52]:
# Run the model, check the output.
# Calling the model on the waveform yields three tensors; the assertions only verify
# that their trailing dimensions are 521 (scores), 1024 (embeddings) and 64 (log-mel),
# leaving the leading (frame) dimension unconstrained.
scores, embeddings, log_mel_spectrogram = model(waveform)
scores.shape.assert_is_compatible_with([None, 521])
embeddings.shape.assert_is_compatible_with([None, 1024])
log_mel_spectrogram.shape.assert_is_compatible_with([None, 64])

In [53]:
# Find the name of the class with the top score when mean-aggregated across frames.
def class_names_from_csv(class_map_csv_text):
  """Parse class-map CSV text into the ordered list of display names.

  Every row must hold exactly three fields (class index, mid, display name);
  a row with a different field count raises ValueError. The first row is
  treated as the header and is not included in the result.
  """
  rows = csv.reader(io.StringIO(class_map_csv_text))
  display_names = []
  for _class_index, _mid, display_name in rows:
    display_names.append(display_name)
  return display_names[1:]  # drop the entry produced by the header row

In [54]:
# Read the class-map CSV whose path the model exposes, then print the class whose
# score, averaged over all frames, is the highest.
class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(tf.io.read_file(class_map_path).numpy().decode('utf-8'))
print(class_names[scores.numpy().mean(axis=0).argmax()])  # Prints 'Speech' for this clip (see the cell output).

Speech


In [55]:
# Display the embedding tensor's shape: one 1024-value row per frame.
embeddings.shape

TensorShape([34, 1024])

In [None]:
class AudioDataset(Dataset):
    """Non-overlapping, fixed-length segments of a feature sequence and its per-frame labels."""

    def __init__(self, data_path = None, data = None, segment_len = 1000):
        """
        Initialize the dataset with preprocessed data.

        Args:
            data_path (str, optional): Path to a preprocessed `.npz` file. Only read when
                `data` is None.
            data (mapping, optional): Already-loaded arrays. Must contain `labels`
                (shape `(total_time_steps,)`) plus either `mel_specs`
                (shape `(n_features, total_time_steps)`) or `embeddings`
                (shape `(total_time_steps, n_features)`, the layout returned by
                `preprocess_all_audios`), which is transposed on load.
            segment_len (int): Fixed length of segments (in time steps) to extract for training.

        Raises:
            ValueError: If neither `data` nor `data_path` is given, or `segment_len` < 1.
            KeyError: If the data has neither `mel_specs` nor `embeddings`, or lacks `labels`.
        """
        if segment_len < 1:
            raise ValueError(f"segment_len must be a positive integer, got {segment_len}")

        if data is None:
            if data_path is None:
                raise ValueError("Either `data` or `data_path` must be provided.")
            # Copy the arrays out so the archive's file handle is closed right away.
            with np.load(data_path) as archive:
                data = {key: archive[key] for key in archive.files}

        try:
            features = np.asarray(data['mel_specs'])
        except KeyError:
            # Embeddings are stored time-major; the rest of this class (and Conv1d)
            # expects feature-major data, i.e. (n_features, total_time_steps).
            features = np.ascontiguousarray(np.asarray(data['embeddings']).T)

        self.mel_specs = features  # Shape: (n_features, total_time_steps)
        self.labels = np.asarray(data['labels'])  # Shape: (total_time_steps,)

        self.segment_len = segment_len

        # Total number of full segments that can be extracted; a trailing partial
        # segment is dropped so every item has the same length.
        self.num_segments = self.mel_specs.shape[1] // self.segment_len

    def __len__(self):
        """
        Returns the total number of segments in the dataset.
        """
        return self.num_segments

    def __getitem__(self, idx: int):
        """
        Retrieve a fixed-length segment of features and the corresponding labels.

        Args:
            idx (int): Index of the segment to retrieve. Negative values count from the end.

        Returns:
            tuple: (feature segment of shape `(n_features, segment_len)`,
                    label segment of shape `(segment_len,)`), both float32 tensors.

        Raises:
            IndexError: If `idx` is outside `[-len(self), len(self))`. Raising here is
                what lets plain `for item in dataset` iteration terminate.
        """
        if idx < 0:
            idx += self.num_segments
        if not 0 <= idx < self.num_segments:
            raise IndexError(f"segment index out of range for dataset of length {self.num_segments}")

        start = idx * self.segment_len
        end = start + self.segment_len

        mel_segment = self.mel_specs[:, start:end]
        label_segment = self.labels[start:end]

        return torch.tensor(mel_segment, dtype=torch.float32), torch.tensor(label_segment, dtype=torch.float32)


In [None]:
class CNNSoundClassifier(nn.Module):
    """Two conv/pool stages followed by a linear head that emits one probability per time step."""

    def __init__(self, input_size, num_filters, segment_len=1000):
        """
        Args:
            input_size (int): Number of input channels (feature rows) per time step.
            num_filters (int): Output channels of the first convolution; the second uses twice as many.
            segment_len (int): Number of time steps per input segment; also the output width.
        """
        super().__init__()
        self.segment_len = segment_len
        wide_filters = num_filters * 2

        def conv_stage(in_channels, out_channels):
            # Length-preserving convolution, then halve the time axis and apply dropout.
            return [
                nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=2, stride=2),
                nn.Dropout(p=0.2),
            ]

        self.cnn = nn.Sequential(
            *conv_stage(input_size, num_filters),
            *conv_stage(num_filters, wide_filters),
        )

        # Each of the two pooling layers halves the time axis.
        pooled_len = self.segment_len // 4

        self.fc = nn.Linear(pooled_len * wide_filters, segment_len)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a `(batch, input_size, segment_len)` tensor to `(batch, segment_len)` probabilities."""
        feature_maps = self.cnn(x)  # (batch, num_filters * 2, segment_len // 4)
        flattened = feature_maps.view(feature_maps.shape[0], -1)
        logits = self.fc(flattened)  # (batch, segment_len)
        return self.sigmoid(logits)


In [None]:
def extract_timestamps(predictions, hop_length, sr, merge_threshold=2, duration_threshold=1):
    """
    Turn a per-frame prediction sequence into a list of (start, end) event times.

    A frame counts as active when its value is above 0.5. Consecutive active frames
    form an event, events separated by a gap of at most `merge_threshold` seconds
    are fused, and events shorter than `duration_threshold` seconds are discarded.

    Args:
        predictions (sequence): Per-frame scores or 0/1 flags.
        hop_length (int): Samples between consecutive frames.
        sr (int): Sample rate in samples per second.
        merge_threshold (float, optional): Largest gap (seconds) that still gets merged. Default 2.
        duration_threshold (float, optional): Shortest event (seconds) that is kept. Default 1.

    Returns:
        list: (start_seconds, end_seconds) tuples in chronological order.
    """
    def to_seconds(frame_index):
        return frame_index * hop_length / sr

    # Pass 1: collect every run of active frames as a (start, end) pair in seconds.
    raw_events = []
    run_start = None
    for frame_index, value in enumerate(predictions):
        if run_start is None:
            if value > 0.5:
                run_start = frame_index
        elif value <= 0.5:
            raw_events.append((to_seconds(run_start), to_seconds(frame_index)))
            run_start = None

    # A run still open at the end of the sequence is closed at the final frame boundary.
    if run_start is not None:
        raw_events.append((to_seconds(run_start), to_seconds(len(predictions))))

    # Pass 2: fuse an event into its predecessor unless the gap exceeds the threshold.
    fused = []
    for begin, finish in raw_events:
        if fused and not (begin - fused[-1][1] > merge_threshold):
            fused[-1] = (fused[-1][0], finish)
        else:
            fused.append((begin, finish))

    # Pass 3: keep only events that last long enough.
    return [(begin, finish) for begin, finish in fused if finish - begin >= duration_threshold]


In [None]:
# Preprocess all audio files to generate YAMNet embeddings and corresponding labels
def _label_embedding_frames(num_frames, events, frame_hop_seconds):
    """Return a 0/1 array marking every frame whose hop interval intersects a (start, end) event."""
    labels = np.zeros(num_frames)
    for start, end in events:
        first = max(int(start / frame_hop_seconds), 0)
        # Round the end up so an event shorter than one hop still marks its frame.
        last = min(int(np.ceil(end / frame_hop_seconds)), num_frames)
        if last > first:
            labels[first:last] = 1
    return labels


def preprocess_all_audios(audio_files, annotations, sr=16000, hop_length=160, root_path='Audios', save_path='preprocessed_data.npz', embedding_model=None):
    """
    Compute YAMNet embeddings and per-frame crowd-noise labels for a list of audio files.

    Args:
        audio_files (list): File names, relative to `root_path`, that are also keys of `annotations`.
        annotations (dict): Maps each file name to a dict whose `'crowd_noise'` entry is a
            list of (start_seconds, end_seconds) pairs.
        sr (int): Sample rate used when loading audio. YAMNet expects 16000; any other
            value triggers a warning.
        hop_length (int): Unused; kept so existing callers keep working. Labels are aligned
            to the embedding frames, which YAMNet emits every 0.48 s of 16 kHz audio
            (7680 samples), not to a spectrogram hop.
        root_path (str): Directory containing the audio files.
        save_path (str): Destination of the compressed `.npz` holding the result.
        embedding_model (callable, optional): Model returning (scores, embeddings, log_mel)
            for a waveform. Defaults to the notebook-level `model`, which must still be the
            YAMNet model when this function runs.

    Returns:
        dict: `{'embeddings': (total_frames, embedding_dim) array, 'labels': (total_frames,) array}`.

    Raises:
        ValueError: If `audio_files` is empty.
    """
    import warnings

    YAMNET_SAMPLE_RATE = 16000
    YAMNET_FRAME_HOP_SAMPLES = 7680  # 0.48 s at 16 kHz: spacing of consecutive embedding frames

    if len(audio_files) == 0:
        raise ValueError("audio_files is empty; nothing to preprocess.")

    if sr != YAMNET_SAMPLE_RATE:
        warnings.warn(
            f"YAMNet expects {YAMNET_SAMPLE_RATE} Hz audio but sr={sr} was requested; "
            "embeddings will be computed on time-scaled audio."
        )

    yamnet = model if embedding_model is None else embedding_model

    # The model consumes a fixed number of samples per frame, so the real-time spacing
    # of the embedding frames follows from the rate the audio was loaded at.
    frame_hop_seconds = YAMNET_FRAME_HOP_SAMPLES / sr

    embedding_chunks = []
    label_chunks = []

    for audio_file in audio_files:
        audio_path = os.path.join(root_path, audio_file)
        waveform, _ = librosa.load(audio_path, sr=sr)  # Load the audio file with specified sample rate

        # Only the embeddings are needed; scores and the log-mel spectrogram are discarded.
        _, embeddings, _ = yamnet(waveform)
        embeddings = embeddings.numpy()

        # One label per embedding frame: 1 inside an annotated crowd-noise event, else 0.
        events = annotations[audio_file]['crowd_noise']
        labels = _label_embedding_frames(embeddings.shape[0], events, frame_hop_seconds)

        embedding_chunks.append(embeddings)
        label_chunks.append(labels)

    # Concatenate once at the end instead of re-copying the growing arrays per file.
    all_embeddings = np.concatenate(embedding_chunks, axis=0)
    all_labels = np.concatenate(label_chunks)

    # Save the final arrays
    np.savez_compressed(save_path, embeddings=all_embeddings, labels=all_labels)
    print(f"Saved preprocessed data: Embeddings shape {all_embeddings.shape}, Labels length {all_labels.shape}")

    return {'embeddings': all_embeddings, 'labels': all_labels}

In [None]:
def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over `loader`, updating weights only when `optimizer` is given.

    Returns:
        tuple: (mean batch loss, element-wise accuracy); both are NaN for an empty loader.
    """
    training = optimizer is not None
    # Set the mode on every pass: the validation pass switches the model to eval mode,
    # so dropout would stay disabled for later training passes otherwise.
    model.train(training)

    total_loss = 0.0
    correct = 0
    seen = 0
    num_batches = 0

    with torch.set_grad_enabled(training):
        for mel_spec, labels in loader:
            if training:
                optimizer.zero_grad()

            outputs = model(mel_spec).squeeze()
            targets = labels.squeeze()
            loss = criterion(outputs, targets)

            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()

            # Compare against the same squeezed targets the loss was computed on.
            predictions = (outputs > 0.5).float()
            correct += (predictions == targets).sum().item()
            seen += targets.numel()
            num_batches += 1

    nan = float('nan')
    mean_loss = total_loss / num_batches if num_batches else nan
    accuracy = correct / seen if seen else nan
    return mean_loss, accuracy


def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    """
    Train and validate the model for the specified number of epochs.

    Each epoch runs one training pass (model in train mode, weights updated) followed by
    one validation pass (model in eval mode, gradients disabled), then prints the mean
    batch loss and element-wise accuracy of both. An empty loader reports NaN metrics
    instead of raising. The model is left in eval mode when the function returns.

    Args:
        model (nn.Module): The neural network model to train.
        train_loader (DataLoader): DataLoader for the training dataset.
        val_loader (DataLoader): DataLoader for the validation dataset.
        criterion (nn.Module): The loss function used for training.
        optimizer (torch.optim.Optimizer): Optimizer for updating model weights.
        epochs (int, optional): Number of training epochs. Defaults to 10.

    Returns:
        None
    """
    for epoch in range(epochs):
        avg_train_loss, train_accuracy = _run_epoch(model, train_loader, criterion, optimizer)
        avg_val_loss, val_accuracy = _run_epoch(model, val_loader, criterion)

        print(f"Epoch {epoch+1}/{epochs} -> "
            f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4%}, "
            f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4%}")

In [None]:
# Hop length: number of audio samples between consecutive feature frames.
# The features are YAMNet embeddings, emitted every 0.48 s of 16 kHz audio (7680 samples),
# so this is the value that converts frame indices to seconds (index * HOP_LENGTH / SAMPLE_RATE).
HOP_LENGTH = 7680
# Number of Mel frequency bands for Mel spectrogram
N_MELS = 64
# Sample rate of the audio (samples per second); YAMNet requires 16 kHz input
SAMPLE_RATE = 16000
# Hidden size of the LSTM layer (number of LSTM units)
# HIDDEN_SIZE = 64
# Maximum length of the input sequence (number of frames)
# NOTE(review): with one frame per 0.48 s, 1000 frames is 480 s of audio per segment;
# confirm the dataset is long enough to yield at least one full segment.
SEGMENT_LEN = 1000
# Training number of epochs
EPOCHS = 10
# Audios root directory path
DATASET_ROOT_DIRECTORY = 'Audios'

# Learning rate passed to the optimizer
LEARNING_RATE = 2e-4

In [None]:
# Load annotations from JSON file
with open("timestamp_annotations.json", "r") as f:
    annotations = json.load(f)

# Extract the audio files from the annotations
audio_files = list(annotations.keys())

# Split the dataset into training and validation sets (80% training, 20% validation)
train_files, val_files = train_test_split(audio_files, test_size=0.2, random_state=42)

# `preprocess_all_audios` has no `n_mels` parameter, so it is not passed.
# Each split gets its own cache file; with the shared default path the validation
# run would overwrite the training arrays on disk.
train_data = preprocess_all_audios(
    train_files,
    annotations,
    sr=SAMPLE_RATE,
    hop_length=HOP_LENGTH,
    root_path=DATASET_ROOT_DIRECTORY,
    save_path='preprocessed_train.npz',
)

val_data = preprocess_all_audios(
    val_files,
    annotations,
    sr=SAMPLE_RATE,
    hop_length=HOP_LENGTH,
    root_path=DATASET_ROOT_DIRECTORY,
    save_path='preprocessed_val.npz',
)

# Initialize the datasets
train_dataset = AudioDataset(
    data = train_data,
    segment_len=SEGMENT_LEN
)

val_dataset = AudioDataset(
    data = val_data,
    segment_len=SEGMENT_LEN
)

# Create DataLoader instances for both training and validation datasets
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)


In [None]:
# Initialize the model, loss function, and optimizer
# model = LSTMSoundClassifier(input_size=N_MELS, hidden_size=HIDDEN_SIZE, num_layers=1, output_size=1)
# The first Conv1d layer needs as many input channels as the dataset has feature rows,
# so the size is read from the data instead of assuming N_MELS.
# NOTE: this rebinds `model`, which until here referred to the YAMNet model; re-run the
# YAMNet loading cell before re-running any cell that extracts embeddings.
model = CNNSoundClassifier(input_size=train_dataset.mel_specs.shape[0], num_filters=32, segment_len=SEGMENT_LEN)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-3)

In [None]:
# Train the classifier for EPOCHS epochs; train_model prints the training and
# validation loss and accuracy after every epoch.
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=EPOCHS)