In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import io
import json
import librosa
from sklearn.model_selection import train_test_split
import librosa.display
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import matplotlib.pyplot as plt
import os
import torch
import torch.nn as nn
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report




In [2]:
# Find the name of the class with the top score when mean-aggregated across frames.
def class_names_from_csv(class_map_csv_text):
    """Returns list of class names corresponding to score vector."""
    class_map_csv = io.StringIO(class_map_csv_text)
    class_names = [display_name for (class_index, mid, display_name) in csv.reader(class_map_csv)]
    class_names = class_names[1:]  # Skip CSV header
    return class_names

In [3]:
# Load the model.
yamnet_model = hub.load('https://www.kaggle.com/models/google/yamnet/TensorFlow2/yamnet/1')













In [4]:
waveform, _ = librosa.load('Audios/rally_fan_noise_09.mp3', sr=16000)
waveform.shape

(491520,)

In [None]:
def load_audio_in_chunks(audio_path, chunk_duration=1.0, sample_rate=16000):
    audio, sr = librosa.load(audio_path, sr=sample_rate)
    chunk_length = int(chunk_duration * sr)  # Number of samples in each chunk
    chunks = [
        audio[i : i + chunk_length] for i in range(0, len(audio), chunk_length)
    ]
    return chunks, sr

# Example usage
audio_path = "Audios/rally_fan_noise_09.mp3"
chunks, sr = load_audio_in_chunks(audio_path)

# Process each chunk
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}: {len(chunk)} samples")
    scores, embeddings, log_mel_spectrogram = yamnet_model(chunk)
    class_map_path = yamnet_model.class_map_path().numpy()
    class_names = class_names_from_csv(tf.io.read_file(class_map_path).numpy().decode('utf-8'))
    print(class_names[scores.numpy().mean(axis=0).argmax()])


Chunk 1: 16000 samples
Speech
Chunk 2: 16000 samples
Speech
Chunk 3: 16000 samples
Speech
Chunk 4: 16000 samples
Speech
Chunk 5: 16000 samples
Speech
Chunk 6: 16000 samples
Speech
Chunk 7: 16000 samples
Speech
Chunk 8: 16000 samples
Speech
Chunk 9: 16000 samples
Speech
Chunk 10: 16000 samples
Speech
Chunk 11: 16000 samples
Speech
Chunk 12: 16000 samples
Speech
Chunk 13: 16000 samples
Speech
Chunk 14: 16000 samples
Speech
Chunk 15: 16000 samples
Speech
Chunk 16: 16000 samples
Speech
Chunk 17: 16000 samples
Speech
Chunk 18: 16000 samples
Speech
Chunk 19: 16000 samples
Speech
Chunk 20: 16000 samples
Speech
Chunk 21: 16000 samples
Speech
Chunk 22: 16000 samples
Speech
Chunk 23: 16000 samples
Speech
Chunk 24: 16000 samples
Vehicle
Chunk 25: 16000 samples
Vehicle
Chunk 26: 16000 samples
Speech
Chunk 27: 16000 samples
Speech
Chunk 28: 16000 samples
Speech
Chunk 29: 16000 samples
Speech
Chunk 30: 16000 samples
Speech
Chunk 31: 11520 samples
Speech


In [6]:
# Run the model, check the output.
scores, embeddings, log_mel_spectrogram = yamnet_model(waveform)
scores.shape.assert_is_compatible_with([None, 521])
embeddings.shape.assert_is_compatible_with([None, 1024])
log_mel_spectrogram.shape.assert_is_compatible_with([None, 64])

In [7]:
class_map_path = yamnet_model.class_map_path().numpy()
class_names = class_names_from_csv(tf.io.read_file(class_map_path).numpy().decode('utf-8'))
print(class_names[scores.numpy().mean(axis=0).argmax()]) 

Speech


In [8]:
embeddings.shape

TensorShape([63, 1024])

In [9]:
log_mel_spectrogram.shape

TensorShape([3072, 64])

In [10]:
class AudioDataset(Dataset):
    def __init__(self, data_path=None, data=None):
        
        if data is None:
            data = np.load(data_path)

        self.embeddings = data['embeddings']
        self.labels = data['labels']

        assert len(self.embeddings) == len(self.labels), "Embeddings and labels must have the same length."
        
        self.embeddings = torch.tensor(self.embeddings, dtype=torch.float32)
        self.labels = torch.tensor(self.labels, dtype=torch.float32)

    def __len__(self):
        return len(self.embeddings) 

    def __getitem__(self, idx: int):
        return self.embeddings[idx], self.labels[idx]


In [11]:
class CNNSoundClassifier(nn.Module):
    def __init__(self, input_size, num_filters, segment_len=1000):
        super(CNNSoundClassifier, self).__init__()
        self.segment_len = segment_len

        self.cnn = nn.Sequential(
            nn.Conv1d(input_size, num_filters, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),  
            nn.Dropout(p = 0.2),
            
            nn.Conv1d(num_filters, num_filters * 2, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Dropout(p = 0.2)
        )

        reduced_dim = self.segment_len // (2 * 2) 

        self.fc = nn.Linear(reduced_dim * num_filters * 2, segment_len)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.cnn(x)  # Shape: (batch_size, num_filters * 2, reduced_dim)
        x = x.view(x.size(0), -1)  # Flatten for the linear layer
        x = self.fc(x)  # Shape: (batch_size, segment_len)
        return self.sigmoid(x)


In [12]:
def extract_timestamps(predictions, hop_length, sr, merge_threshold=2, duration_threshold=1):
    """
    Extract timestamps from a binary prediction array, merging adjacent detections 
    and filtering out short detections based on duration.

    Args:
        predictions (list): Binary array of predictions (1 or 0), where 1 indicates the presence of an event.
        hop_length (int): The hop length in samples (used to calculate time from frame indices).
        sr (int): The sample rate (samples per second), used to convert indices to time.
        merge_threshold (float, optional): The minimum time gap (in seconds) between adjacent detections to consider them separate. Default is 2 seconds.
        duration_threshold (float, optional): The minimum duration (in seconds) of an event to keep. Default is 1 second.

    Returns:
        list: A list of tuples representing the start and end times of detected events (in seconds).
    """
    timestamps = []
    start = None

    for i, pred in enumerate(predictions):
        if pred > 0.5 and start is None:
            start = i
        elif pred <= 0.5 and start is not None:
            end = i
            # Convert frame indices to time and store the detected timestamp
            timestamps.append((start * hop_length / sr, end * hop_length / sr))
            start = None

    # Handle the case where an event ends at the last frame
    if start is not None:
        timestamps.append((start * hop_length / sr, len(predictions) * hop_length / sr))
    
    merged_timestamps = []
    for ts in timestamps:
        if not merged_timestamps or ts[0] - merged_timestamps[-1][1] > merge_threshold:
            # No overlap, append as new event
            merged_timestamps.append(ts)
        else:
            # Merge adjacent events into one
            merged_timestamps[-1] = (merged_timestamps[-1][0], ts[1])
    
    # Filter out events that are too short based on the duration_threshold
    merged_timestamps = [ts for ts in merged_timestamps if ts[1] - ts[0] >= duration_threshold]
    
    return merged_timestamps


In [None]:
# def preprocess_all_audios(audio_files, annotations, sr=16000, root_path='Audios', save_path='preprocessed_data.npz'):
#     all_embeddings = None 
#     all_labels = None

#     for audio_file in audio_files:
#         audio_path = os.path.join(root_path, audio_file)
#         waveform, _ = librosa.load(audio_path, sr=sr)

#         scores, embeddings, log_mel_spectrogram = yamnet_model(waveform)
#         embeddings = embeddings.numpy()

#         frame_length = 0.96  # seconds
#         frame_hop = 0.48  # seconds

#         num_embeddings = embeddings.shape[0]
#         labels = np.zeros(num_embeddings)
        
#         timestamps = annotations[audio_file]['crowd_noise']

#         for i in range(num_embeddings):
#             # Calculate start and end times for the current embedding
#             start_time = i * frame_hop
#             end_time = start_time + frame_length

#             # Check overlap with each crowd noise interval
#             for start, end in timestamps:
#                 overlap_start = max(start, start_time)
#                 overlap_end = min(end, end_time)
#                 overlap_duration = max(0, overlap_end - overlap_start)

#                 # Label as 1 if overlap exceeds half the frame length
#                 if overlap_duration >= (frame_length / 2):
#                     labels[i] = 1
#                     break  # No need to check further if labeled as 1

#         # Concatenate embeddings and labels across all files
#         if all_embeddings is None:
#             all_embeddings = embeddings
#         else:
#             all_embeddings = np.concatenate((all_embeddings, embeddings), axis=0)

#         if all_labels is None:
#             all_labels = labels
#         else:
#             all_labels = np.concatenate((all_labels, labels))
    
#     # Save the preprocessed data
#     # np.savez_compressed(save_path, embeddings=all_embeddings, labels=all_labels)
#     print(f"Saved preprocessed data: Embeddings shape {all_embeddings.shape}, Labels length {all_labels.shape}")
    
#     return {'embeddings': all_embeddings, 'labels': all_labels}

### Preprocessing audio for 4 classes

In [13]:
def preprocess_all_audios(audio_files, annotations, sr=16000, root_path='Audios', save_path='preprocessed_data.npz'):
    all_embeddings = None 
    all_labels = None

    # Define class names
    class_names = ['crowd_noise', 'long_noise', 'tyre_screech_noise', 'crash_noise']
                # [1, 2, 3, 4]
    num_classes = len(class_names)

    for audio_file in audio_files:
        audio_path = os.path.join(root_path, audio_file)
        waveform, _ = librosa.load(audio_path, sr=sr)

        # Extract YAMNet embeddings
        scores, embeddings, log_mel_spectrogram = yamnet_model(waveform)
        embeddings = embeddings.numpy()

        frame_length = 0.96  # seconds
        frame_hop = 0.48  # seconds
        num_embeddings = embeddings.shape[0]

        # Create a (num_embeddings, num_classes) binary label matrix
        labels = np.zeros((num_embeddings, num_classes))

        # Iterate over each class and assign labels
        for class_idx, class_name in enumerate(class_names):
            if class_name not in annotations[audio_file]:
                continue  # Skip if no annotations for this class

            timestamps = annotations[audio_file][class_name]

            for i in range(num_embeddings):
                start_time = i * frame_hop
                end_time = start_time + frame_length

                # Check overlap with each annotated interval
                for start, end in timestamps:
                    overlap_start = max(start, start_time)
                    overlap_end = min(end, end_time)
                    overlap_duration = max(0, overlap_end - overlap_start)

                    # Assign label if overlap exceeds half the frame length
                    if overlap_duration >= (frame_length / 2):
                        labels[i, class_idx] = 1
                        break  # Stop checking further intervals

        # Concatenate embeddings and labels across all files
        if all_embeddings is None:
            all_embeddings = embeddings
        else:
            all_embeddings = np.concatenate((all_embeddings, embeddings), axis=0)

        if all_labels is None:
            all_labels = labels
        else:
            all_labels = np.concatenate((all_labels, labels), axis=0)

    # Save the preprocessed data
    np.savez_compressed(save_path, embeddings=all_embeddings, labels=all_labels)
    # print(f"Saved preprocessed data: Embeddings shape {all_embeddings.shape}, Labels shape {all_labels.shape}")
    
    return {'embeddings': all_embeddings, 'labels': all_labels}

In [None]:
# Define the classifier head
class ClassifierHead(nn.Module):
    def __init__(self, input_dim, num_classes = 4, hidden_dim=128):
        """
        A simple classifier head to predict binary labels from embeddings.

        Args:
            input_dim (int): Dimensionality of the input embeddings.
            hidden_dim (int): Number of units in the hidden layer.
        """
        super(ClassifierHead, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_dim, num_classes) 
        # self.sigmoid = nn.Sigmoid() 

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        # x = self.sigmoid(x)
        return x

In [17]:
# Sample rate of the audio (samples per second)
SAMPLE_RATE = 16000

# Training number of epochs
EPOCHS = 20

# Audios root directory path
DATASET_ROOT_DIRECTORY = 'Audios'

LEARNING_RATE = 0.001

WEIGHT_DECAY = 1e-4  # Appropriate weight decay for small dataset

In [25]:
model = ClassifierHead(input_dim=1024, hidden_dim=128)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

In [19]:
# Load annotations from JSON file
with open("latest_timestamp_annotations.json", "r") as f:
    annotations = json.load(f)

# Extract the audio files from the annotations
audio_files = list(annotations.keys())

# Split the dataset into training and validation sets (80% training, 20% validation)
train_files, val_files = train_test_split(audio_files, test_size=0.2, random_state=21)

train_data = preprocess_all_audios(train_files, annotations, root_path=DATASET_ROOT_DIRECTORY)

val_data = preprocess_all_audios(val_files, annotations, sr=SAMPLE_RATE, root_path=DATASET_ROOT_DIRECTORY)

# # Initialize the datasets
train_dataset = AudioDataset(
    data = train_data,
)

val_dataset = AudioDataset(
    data = val_data,
)

# Create DataLoader instances for both training and validation datasets
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

In [26]:
print(len(train_loader))
print(len(val_loader))

1408
357


In [42]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10, threshold = 0.6):
    """
    Train and validate the model for the specified number of epochs.

    Args:
        model (nn.Module): The neural network model to train.
        train_loader (DataLoader): DataLoader for the training dataset.
        val_loader (DataLoader): DataLoader for the validation dataset.
        criterion (nn.Module): The loss function used for training.
        optimizer (torch.optim.Optimizer): Optimizer for updating model weights.
        epochs (int, optional): Number of training epochs. Defaults to 10.
        device (str, optional): Device to train on. Defaults to 'cuda'.

    Returns:
        None
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)  # Move model to GPU if available

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        train_correct_samples = 0
        train_total_samples = 0

        for embeddings, labels in train_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step() 

            total_train_loss += loss.item()

            # Multi-label accuracy (thresholding at 0.5)
            predictions = (outputs > threshold).float()
            train_correct_samples += (predictions == labels).all(dim=1).sum().item()  # Count only if all labels match
            train_total_samples += labels.shape[0]  # Total number of samples

        avg_train_loss = total_train_loss / len(train_loader)
        train_accuracy = train_correct_samples / train_total_samples  # Accuracy per sample

        model.eval()
        total_val_loss = 0
        val_correct_samples = 0
        val_total_samples = 0

        with torch.no_grad():
            for embeddings, labels in val_loader:
                embeddings, labels = embeddings.to(device), labels.to(device)

                outputs = model(embeddings)
                loss = criterion(outputs, labels)
                total_val_loss += loss.item()

                # Multi-label accuracy (thresholding at 0.5)
                predictions = (outputs > threshold).float()
                val_correct_samples += (predictions == labels).all(dim=1).sum().item()
                val_total_samples += labels.shape[0]

        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = val_correct_samples / val_total_samples  # Accuracy per sample

        print(f"Epoch {epoch+1}/{epochs} -> "
              f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4%}, "
              f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4%}")


In [43]:
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=EPOCHS)

Epoch 1/20 -> Train Loss: 0.1106, Train Accuracy: 86.4145%, Val Loss: 0.1549, Val Accuracy: 81.7927%
Epoch 2/20 -> Train Loss: 0.1090, Train Accuracy: 86.4145%, Val Loss: 0.1561, Val Accuracy: 81.8627%
Epoch 3/20 -> Train Loss: 0.1064, Train Accuracy: 86.9117%, Val Loss: 0.1635, Val Accuracy: 82.1429%
Epoch 4/20 -> Train Loss: 0.1044, Train Accuracy: 86.9295%, Val Loss: 0.1519, Val Accuracy: 82.0028%
Epoch 5/20 -> Train Loss: 0.1034, Train Accuracy: 87.3380%, Val Loss: 0.1665, Val Accuracy: 81.0924%
Epoch 6/20 -> Train Loss: 0.1002, Train Accuracy: 87.2669%, Val Loss: 0.1615, Val Accuracy: 81.9328%
Epoch 7/20 -> Train Loss: 0.1015, Train Accuracy: 87.6754%, Val Loss: 0.1521, Val Accuracy: 81.5826%
Epoch 8/20 -> Train Loss: 0.0981, Train Accuracy: 87.5333%, Val Loss: 0.1695, Val Accuracy: 82.1429%
Epoch 9/20 -> Train Loss: 0.0975, Train Accuracy: 87.6043%, Val Loss: 0.1596, Val Accuracy: 82.1429%


KeyboardInterrupt: 

In [44]:
result = preprocess_all_audios(['vid_741.mp3'], annotations)


In [45]:
waveform, _ = librosa.load('Audios/vid_741.mp3', sr=16000)

# Pass the waveform through YAMNet to get embeddings
with torch.no_grad():
    scores, embeddings, _ = yamnet_model(waveform)  # Replace `model` with your YAMNet instance
    embeddings = embeddings.numpy()  # Convert embeddings to NumPy array if needed

# Convert embeddings to Torch tensor
embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)

# Put the classifier in evaluation mode
model.eval()

# Get predictions
with torch.no_grad():
    predictions = model(embeddings_tensor)
    predictions = torch.sigmoid(predictions).numpy()  # Apply sigmoid if using BCEWithLogitsLoss


In [46]:
predictions

array([[8.2555128e-04, 9.9859113e-01, 8.6152412e-02, 6.3162729e-06],
       [5.4725807e-04, 9.9426454e-01, 4.9774341e-02, 1.0348833e-04],
       [3.3055272e-04, 9.9865192e-01, 8.9200310e-02, 3.6009742e-04],
       [3.8536920e-03, 9.8970360e-01, 3.9336178e-01, 1.8777044e-02],
       [7.4463952e-03, 9.7771180e-01, 3.8656232e-01, 1.0256008e-01],
       [7.6422824e-05, 9.9939656e-01, 4.4849187e-01, 8.9247350e-04],
       [4.0198922e-02, 8.0246359e-01, 8.8748038e-02, 2.9243913e-01],
       [7.4999496e-02, 2.9600933e-01, 1.2357598e-02, 3.9385554e-01],
       [2.6649131e-02, 3.7512547e-01, 4.1470532e-03, 5.9371990e-01],
       [2.8034974e-02, 4.4607252e-01, 1.4624724e-02, 2.1100454e-01],
       [1.7509377e-02, 8.5920654e-02, 8.9934701e-04, 4.0765740e-02],
       [7.1609575e-01, 8.7032475e-02, 1.8724872e-05, 2.9465836e-04],
       [6.7455369e-01, 1.2088279e-01, 9.7728735e-05, 2.3984655e-03],
       [1.5796210e-01, 1.6045561e-01, 1.1158787e-03, 2.8177034e-02],
       [7.4564181e-02, 2.5050640e-

In [47]:
result['labels']

array([[0., 1., 0., 0.],
       [0., 1., 1., 0.],
       [0., 1., 1., 0.],
       [0., 1., 1., 0.],
       [0., 1., 1., 0.],
       [0., 1., 1., 0.],
       [0., 1., 1., 0.],
       [0., 1., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.]])

In [50]:
np.min(predictions)

6.316273e-06

In [None]:
# ['crowd_noise', 'loud_car_noise', 'tyre_screech_noise', 'crash_noise']
# [1, 2, 3, 4]
threshold = 0.5  # Adjust based on your model's calibration
binary_predictions = (predictions > threshold).astype(int)
binary_predictions

array([[0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0]])

In [26]:
(binary_predictions == result['labels']).all(axis=1).sum().item()

10

In [35]:
hop_size = 0.48  # In seconds
frame_length = 0.96  # Duration of each frame

noise_labels = ['crowd_noise', 'loud_car_noise', 'tyre_screech_noise', 'crash_noise']
timestamps = {0: [], 1: [], 2: [], 3: []}

# Create timestamps from binary predictions
for i, labels in enumerate(binary_predictions):
    for idx, label in enumerate(labels):
        if label == 1:  # If the noise is detected
            start_time = i * hop_size
            end_time = start_time + frame_length
            timestamps[idx].append((start_time, end_time))

# Function to merge overlapping or adjacent intervals
def merge_intervals(intervals):
    if not intervals:
        return []
    
    # Sort intervals based on start times
    intervals.sort()
    
    merged = [intervals[0]]
    for current in intervals[1:]:
        previous = merged[-1]
        # Merge overlapping or adjacent intervals
        if current[0] <= previous[1]:  
            merged[-1] = (previous[0], max(previous[1], current[1]))
        else:
            merged.append(current)
    
    return merged

# Apply merging for each type of noise
merged_timestamps = {idx: merge_intervals(timestamps[idx]) for idx in timestamps}

# Print results
for idx, intervals in merged_timestamps.items():
    print(f"Detected {noise_labels[idx]} intervals:")
    for start, end in intervals:
        print(f"  Start: {start:.2f}s, End: {end:.2f}s")



Detected crowd_noise intervals:
  Start: 2.88s, End: 4.32s
Detected loud_car_noise intervals:
  Start: 0.00s, End: 13.92s
Detected tyre_screech_noise intervals:
Detected crash_noise intervals:


In [165]:
data = preprocess_all_audios(['vid_475.mp3'], annotations)

Saved preprocessed data: Embeddings shape (23, 1024), Labels length (23,)


In [166]:
data['labels']

array([1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.])

In [167]:
new = binary_predictions.flatten()

In [168]:
correct_predictions = new == data['labels']

# Count the number of correct predictions (True values)
correct_count = np.sum(correct_predictions)

In [169]:
correct_count

23

In [170]:
len(predictions)

23

In [None]:
def create_confusion_matrix_and_report(model, val_loader):
    """
    Evaluates the model using the validation data loader and computes the confusion matrix and classification report.
    
    Args:
        model (torch.nn.Module): The trained model.
        val_loader (DataLoader): DataLoader containing validation data.
    """
    model.eval()  # Set model to evaluation mode
    all_preds = []  # List to store all predictions
    all_labels = []  # List to store all true labels

    with torch.no_grad():  # Disable gradient computation during evaluation
        for embeddings, labels in val_loader:

            outputs = model(embeddings)
            predictions = (outputs.squeeze() > 0.5).float()
            
            all_preds.extend(predictions.cpu().numpy().flatten())
            all_labels.extend(labels.cpu().numpy().flatten())

    # Calculate confusion matrix using true and predicted labels
    cm = confusion_matrix(all_labels, all_preds, labels=[1, 0])  # 1 = Cheering, 0 = No Cheering
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Cheering", "No Cheering"])

    # Print classification report (precision, recall, f1-score, etc.)
    report = classification_report(all_labels, all_preds, target_names=["No Cheering", "Cheering"])
    print(report)

    disp.plot(cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.show()

In [None]:
create_confusion_matrix_and_report(model, val_loader)