In [1]:
import numpy as np

# Create monkey patches
# Re-create the deprecated NumPy scalar aliases (removed from NumPy in 1.24+)
# as plain Python builtins. Presumably this keeps older third-party code that
# still references np.float / np.int / np.object / np.bool working; it has to
# run before such code is imported — TODO confirm which dependency needs it.
np.float = float
np.int = int
np.object = object
np.bool = bool

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt
import gunshot_utils as utils
import importlib
import ast
import re
import os
import pickle
from glob import glob
import librosa.display
import IPython.display as ipd

import torch as th
import torchaudio
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from pydub import AudioSegment
from pydub.playback import play

importlib.reload(utils)

In [3]:
class GunshotDetectionCNN(nn.Module):
    """Two-layer CNN that scores a spectrogram window for gunshot presence.

    The network expects input of shape ``(batch, 3, n_mels, num_frames)`` and
    returns a tensor of shape ``(batch, 1)`` holding sigmoid probabilities.

    Parameters:
        num_frames (int): Width (time axis) of each input window.
        n_mels (int): Height (frequency axis) of each input window. Defaults
            to 80, the value that was previously hard-coded.

    Layer attribute names (conv1, conv2, fc1, fc2) are kept unchanged so that
    existing ``state_dict`` checkpoints remain loadable.
    """

    def __init__(self, num_frames, n_mels=80):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 10, kernel_size=(3, 7))
        self.pool1 = nn.MaxPool2d(kernel_size=(3, 1))
        self.conv2 = nn.Conv2d(10, 20, kernel_size=(3, 3))
        self.pool2 = nn.MaxPool2d(kernel_size=(3, 1))

        # Probe the convolutional stack once to learn the flattened feature
        # size. no_grad() avoids building an autograd graph for a throwaway
        # tensor.
        with th.no_grad():
            dummy_input = th.zeros(1, 3, n_mels, num_frames)  # (batch, channels, height, width)
            output_size = self._extract_features(dummy_input).numel()

        self.fc1 = nn.Linear(output_size, 256)
        self.fc2 = nn.Linear(256, 1)
        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()

    def _extract_features(self, x):
        """Run the conv/pool stack shared by ``__init__`` (shape probe) and ``forward``."""
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        return x

    def forward(self, x):
        """Return per-sample gunshot probabilities of shape ``(batch, 1)``."""
        x = self._extract_features(x)

        # Flatten everything except the batch dimension
        x = x.view(x.size(0), -1)

        x = self.dropout(F.relu(self.fc1(x)))  # Dropout is only active in train mode
        x = self.sigmoid(self.fc2(x))
        return x

# Instantiate the network for input windows of utils.NUM_FRAMES frames.
# This `model` object is the one that the training and inference cells below operate on.
model = GunshotDetectionCNN(num_frames=utils.NUM_FRAMES)

In [65]:
class GunshotDataset(th.utils.data.Dataset):
    """Dataset of framed spectrograms that are normalized on construction.

    Every (spectrogram, target) pair is split with ``utils.make_frames`` and
    the resulting frames are pooled. Mean and standard deviation are computed
    over dims (0, 2) of the concatenated frames and stored in ``self.mean`` /
    ``self.std``; the frames kept in ``self.X`` are ALREADY normalized with
    those statistics, so consumers must not normalize them a second time.

    Parameters:
        spectograms: Iterable of spectrogram tensors.
        sample_rates: Iterable aligned with ``spectograms``. It is not used by
            the framing itself and only takes part in the ``zip`` below.
        targets: Iterable of targets aligned with ``spectograms``.
        eps (float): Lower bound applied to the standard deviation so that a
            constant (zero-variance) band does not produce inf/NaN.

    Raises:
        ValueError: If no frames were produced from the inputs.
    """

    def __init__(self, spectograms, sample_rates, targets, eps=1e-8):
        self.X = []
        self.y = []

        for X, _sample_rate, y in zip(spectograms, sample_rates, targets):
            X_frames, y_frames = utils.make_frames(X, y)
            self.X += X_frames
            self.y += y_frames

        if not self.X:
            raise ValueError("GunshotDataset received no frames; check the input spectrograms.")

        # assumes each frame is a 3-D tensor so that dims (0, 2) exist and the
        # (n, 1) statistics broadcast against it — TODO confirm against utils.make_frames
        stacked = th.cat(self.X)
        self.mean = th.mean(stacked, dim=(0, 2)).unsqueeze(1)
        # Clamp so that dividing by the std of a constant band stays finite.
        self.std = th.clamp(th.std(stacked, dim=(0, 2)).unsqueeze(1), min=eps)
        del stacked  # free the temporary concatenated copy

        self.X = [(x - self.mean) / self.std for x in self.X]

    def __len__(self):
        """Return the number of frames."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the i-th ``(normalized_frame, target)`` pair."""
        return self.X[i], self.y[i]

In [66]:
# NOTE(review): machine-specific absolute path; point it at your local copy of the dataset before running.
gunshot_data_path = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Code/Audio/Gunshot/csv_combined/gunshot_dataset'

def generate_file_dataframe(folder_path):
    """Build a metadata DataFrame for the labelled audio files in a folder.

    Only ``.mp3`` and ``.wav`` files whose name contains ``with_gunshot`` or
    ``without_gunshot`` are included; everything else is skipped.

    Parameters:
        folder_path (str): Directory to scan (non-recursive).

    Returns:
        pandas.DataFrame with the columns ``filename`` (full path),
        ``gunshot_location_in_seconds`` (``[2.0]`` for gunshot files, ``[]``
        otherwise), ``num_gunshots`` and ``label`` (both 1 or 0). The columns
        are present even when no file matches.
    """
    columns = ['filename', 'gunshot_location_in_seconds', 'num_gunshots', 'label']
    records = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore any seeded split built on it) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(('.mp3', '.wav')):
            continue

        if 'with_gunshot' in filename:
            # Every positive file is assigned a fixed gunshot position of 2.0 s.
            timestamps, has_gunshot = [2.0], 1
        elif 'without_gunshot' in filename:
            timestamps, has_gunshot = [], 0
        else:
            continue

        records.append({
            'filename': os.path.join(folder_path, filename),
            'gunshot_location_in_seconds': timestamps,
            'num_gunshots': has_gunshot,
            'label': has_gunshot
        })

    return pd.DataFrame(records, columns=columns)

# Generate the DataFrame of labelled audio files found under gunshot_data_path
# (columns: filename, gunshot_location_in_seconds, num_gunshots, label)
df = generate_file_dataframe(gunshot_data_path)

In [67]:
# GLOCK DATAFRAME MERGE
# Load the Glock gunshot metadata; the path is relative to the notebook's working directory.
glock_csv = pd.read_csv('glock_gunshot_metadata.csv')

# Keep only the four columns that `df` (built by generate_file_dataframe) also has.
glock_csv = glock_csv[['filename', 'gunshot_location_in_seconds', 'num_gunshots', 'label']]

# When True, only the first parsed timestamp of each row is kept (see preprocess_gunshot_times).
include_first_gunshot_only = True

# Function to preprocess gunshot start times, converting strings to lists of floats
def preprocess_gunshot_times(gunshot_times, include_first_gunshot_only=False):
    """Parse a serialized list of gunshot timestamps into a Python list.

    Accepts comma-separated (``"[1.5, 2.3]"``) as well as whitespace-separated
    (``"[1.5 2.3  4.0]"``, ``"[1. 2.5]"``) list strings.

    Parameters:
        gunshot_times: The raw cell value. Normally a string; an existing
            list/tuple or a plain number is also accepted, and NaN/None or
            anything unparseable yields an empty list.
        include_first_gunshot_only (bool): If True, keep only the first
            timestamp of a non-empty result.

    Returns:
        list: The parsed timestamps (possibly empty). Always a list, so
        callers can safely take ``len()`` of the result.
    """
    if isinstance(gunshot_times, str):
        # Collapse runs of whitespace (including newlines) into single spaces
        text = re.sub(r'\s+', ' ', gunshot_times).strip()

        # Insert commas between whitespace-separated numbers. The '.' in the
        # lookbehind covers NumPy-style values such as "1." and the sign/dot
        # in the lookahead covers values such as "-2" or ".5".
        text = re.sub(r'(?<=[\d.])\s(?=[\d.+-])', ', ', text)

        # Ensure there are no trailing commas
        text = text.replace(', ]', ']')

        # Safely evaluate the string as a Python literal
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Not a valid literal: treat as "no gunshots"
            return []
    else:
        parsed = gunshot_times

    if isinstance(parsed, (list, tuple)):
        gunshot_list = list(parsed)
    elif (isinstance(parsed, (int, float)) and not isinstance(parsed, bool)
          and parsed == parsed):  # `parsed == parsed` is False only for NaN
        gunshot_list = [parsed]
    else:
        gunshot_list = []

    if include_first_gunshot_only and gunshot_list:
        return [gunshot_list[0]]  # Return only the first gunshot time
    return gunshot_list

# Apply the function to preprocess the 'gunshot_location_in_seconds' column with the boolean flag
glock_csv['gunshot_location_in_seconds'] = glock_csv['gunshot_location_in_seconds'].apply(
    lambda x: preprocess_gunshot_times(x, include_first_gunshot_only)
)

# If include_first_gunshot_only is True, recompute 'num_gunshots' as the length of the
# parsed list (0 or 1 once only the first gunshot is kept), not unconditionally 1
if include_first_gunshot_only:
    glock_csv['num_gunshots'] = glock_csv['gunshot_location_in_seconds'].apply(lambda x: len(x))

# Add the label column: every row of the Glock metadata is labelled 1, overwriting the CSV's own value
glock_csv['label'] = 1

In [58]:
appended_df = pd.concat([df, glock_csv], ignore_index=True)

In [48]:
# Columns consumed by the audio preprocessing, and the binary label used as the split target.
files = appended_df[['filename', 'num_gunshots', 'gunshot_location_in_seconds']]
labels = appended_df[['label']]

# 70/30 split with a fixed seed. Note that y_train_paths / y_test_paths hold labels, not paths.
X_train_paths, X_test_paths, y_train_paths, y_test_paths = train_test_split(files, labels, test_size=0.3, random_state=42)

In [49]:
# Turn both splits into (spectrograms, sample rates, labels) with the same helper and settings.
# NOTE(review): the test split also goes through preprocess_audio_train — confirm that any
# train-only behaviour of that helper is acceptable for evaluation data.
spectrograms_train, sample_rates_train, labels_train = utils.preprocess_audio_train(X_train_paths, max_non_gunshot_samples=1)
spectrograms_test, sample_rates_test, labels_test = utils.preprocess_audio_train(X_test_paths, max_non_gunshot_samples=1)

In [50]:
# importlib.reload(utils)

In [51]:
# spectrograms_train, sample_rates_train, labels_train = utils.preprocess_audio_train(X_train_paths.iloc[[0]], max_non_gunshot_samples=1)

In [52]:
# len(spectrograms_train[0])

In [63]:
# Expected shape: 3 stacked mel-spectrograms, each with 80 mel bands and 15 frames.
spectrograms_train[0].shape

In [68]:
# Build the framed training set (GunshotDataset normalizes its frames on construction)
# and a shuffled mini-batch loader over it.
dataset = GunshotDataset(spectrograms_train, sample_rates_train, labels_train)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

In [40]:
# Persist the training-set normalization statistics so that inference cells can reload them.
mean_std = {'mean': dataset.mean, 'std': dataset.std}
with open('mean_std.pkl', 'wb') as f:
    pickle.dump(mean_std, f)

# Save using torch (same two tensors as the pickle above, written with th.save)
th.save({'mean': dataset.mean, 'std': dataset.std}, 'mean_std.pth')

In [41]:
device = 'cuda' if th.cuda.is_available() else 'cpu'
print(device)

In [42]:
import torch as th
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Module-level device read by the training / evaluation helpers defined below.
device = th.device("cuda" if th.cuda.is_available() else "cpu")

def train_model(model, optimizer, criterion, train_loader, valid_features, valid_labels, epochs=10, thresholds=None, mean=None, std=None, patience=3):
    """Train ``model`` with early stopping on the validation F1 score.

    Parameters:
        model: Network producing one probability per sample.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss taking ``(outputs, labels)`` as 1-D tensors.
        train_loader: Loader yielding ``(features, labels)`` batches. The
            features must ALREADY be normalized (``GunshotDataset`` normalizes
            its frames on construction), so they are fed to the model as-is.
            Normalizing them here again would make the training distribution
            differ from the validation / inference distribution.
        valid_features: Iterable of raw (un-normalized) validation tensors.
        valid_labels: Validation labels (tensor or array-like).
        epochs (int): Maximum number of epochs.
        thresholds: Candidate decision thresholds; defaults to
            ``np.arange(0.1, 1.0, 0.01)``.
        mean, std: Normalization statistics applied to the validation
            features (inside ``evaluate_model`` / ``compute_confusion_matrix``).
        patience (int): Stop after this many epochs without F1 improvement.

    Returns:
        tuple: ``(best_threshold, best_score)``. On return, ``model`` carries
        the weights of the best-scoring epoch, so the returned threshold and
        the model state belong together.

    Raises:
        ValueError: If ``mean`` or ``std`` is missing.
    """
    if thresholds is None:
        thresholds = np.arange(0.1, 1.0, 0.01)

    if mean is None or std is None:
        raise ValueError("Mean and std must be provided for normalization.")

    mean = mean.to(device)
    std = std.to(device)

    model = model.to(device)

    # Build the validation tensors once instead of re-stacking them every epoch.
    valid_features = th.stack([f.clone().detach().float() for f in valid_features]).to(device)
    if not isinstance(valid_labels, th.Tensor):
        valid_labels = th.tensor(valid_labels).float().to(device)
    else:
        valid_labels = valid_labels.clone().detach().float().to(device)

    best_threshold = 0.0
    best_score = 0.0
    best_state = None  # snapshot of the weights of the best epoch so far
    epochs_since_improvement = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        # Training phase with batch processing
        for features, labels in train_loader:
            features = features.to(device)
            labels = labels.to(device).float().reshape(-1)

            optimizer.zero_grad()

            # reshape(-1) (instead of squeeze()) keeps a 1-D tensor even when
            # the last batch holds a single sample.
            outputs = model(features).reshape(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * features.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}")

        model.eval()

        best_epoch_threshold, current_score = evaluate_model(model, valid_features, valid_labels, thresholds, mean, std)

        if current_score > best_score:
            best_score = current_score
            best_threshold = best_epoch_threshold
            epochs_since_improvement = 0
            # Keep an in-memory copy of the best weights; nothing is written to disk.
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            print(f"New best F1 score: {best_score:.4f}, best weights kept in memory.")
        else:
            epochs_since_improvement += 1

        print(f"TEST best_f1: {best_score} with best threshold: {best_threshold}")

        # Early stopping based on patience
        if epochs_since_improvement >= patience:
            print(f"No improvement in F1 score for {patience} epochs. Stopping training.")
            break

    # Roll back to the best epoch so that best_threshold matches the model state.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()

    # Compute and display the confusion matrix on the validation set
    cm = compute_confusion_matrix(model, valid_features, valid_labels, best_threshold, mean, std)
    display_confusion_matrix(cm)

    return best_threshold, best_score

def evaluate_model(model, features, labels, thresholds, mean, std, batch_size=32):
    """
    Evaluates the model to find the best threshold based on F1 score.

    Parameters:
        model: Network producing one probability per sample. The caller is
            responsible for putting it in eval mode.
        features: Tensor of raw (un-normalized) samples, batch dimension first.
        labels: Tensor of ground-truth labels.
        thresholds: Iterable of candidate decision thresholds.
        mean, std: Normalization statistics applied to ``features``.
        batch_size: Number of samples per forward pass; ``None`` or a
            non-positive value runs everything in a single pass.

    Returns:
        tuple: ``(best_threshold, best_f1_score)``; ``(0.0, 0.0)`` when no
        threshold yields a positive F1 score or there are no samples.
    """
    best_threshold = 0.0
    best_f1_score = 0.0

    n_samples = features.shape[0]
    if n_samples == 0:
        return best_threshold, best_f1_score

    # Normalize features
    features = (features - mean) / std

    step = batch_size if batch_size and batch_size > 0 else n_samples

    with th.no_grad():
        # reshape(-1) keeps the outputs 1-D even for a single sample, where
        # squeeze() would collapse them to a 0-d value.
        outputs = th.cat([
            model(features[start:start + step]).reshape(-1)
            for start in range(0, n_samples, step)
        ]).cpu().numpy()

    y_true = labels.cpu().numpy().reshape(-1)  # hoisted out of the threshold loop

    for threshold in thresholds:
        predictions = (outputs >= threshold).astype(float)
        avg_f1_score = f1_score(y_true, predictions)

        if avg_f1_score > best_f1_score:
            best_f1_score = avg_f1_score
            best_threshold = threshold

    return best_threshold, best_f1_score

def compute_confusion_matrix(model, features, labels, threshold, mean, std, batch_size=None):
    """
    Compute the 2x2 confusion matrix of thresholded model outputs.

    Parameters:
        model: Network producing one probability per sample (expected to be
            in eval mode already).
        features: Tensor of raw (un-normalized) samples, batch dimension first.
        labels: Tensor of ground-truth 0/1 labels.
        threshold: Outputs >= threshold are predicted as class 1.
        mean, std: Normalization statistics applied to ``features``.
        batch_size: Optional number of samples per forward pass; by default
            all samples are processed in a single pass.

    Returns:
        The confusion matrix over the classes ``[0, 1]``. It is always 2x2,
        even if only one class occurs in the labels and predictions.
    """
    # Normalize features
    features = (features - mean) / std

    n_samples = features.shape[0]
    step = batch_size if batch_size and batch_size > 0 else max(n_samples, 1)

    with th.no_grad():
        # reshape(-1) keeps the outputs 1-D even for a single sample
        outputs = th.cat([
            model(features[start:start + step]).reshape(-1)
            for start in range(0, n_samples, step)
        ]).cpu().numpy()

    predictions = (outputs >= threshold).astype(int)
    all_labels = labels.cpu().numpy().reshape(-1).astype(int)

    # Passing labels=[0, 1] fixes the matrix layout so that it always matches
    # the [0, 1] display labels used when plotting.
    cm = confusion_matrix(all_labels, predictions, labels=[0, 1])
    return cm

def display_confusion_matrix(cm):
    """
    Render a confusion matrix for the classes 0 and 1 with matplotlib.

    Parameters:
        cm: Confusion matrix to plot.
    """
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1]).plot(cmap='magma')
    plt.show()


In [43]:
# Training hyper-parameters. These are the values the run actually uses:
# previously `epochs` and `lr` were declared with other values but ignored,
# while the calls below hard-coded 10 epochs and a learning rate of 1e-4.
epochs = 10
lr = 1e-4

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

best_threshold, best_f1 = train_model(model, optimizer, criterion, dataloader, spectrograms_test, labels_test, epochs=epochs, thresholds=None, mean=dataset.mean, std=dataset.std)

In [122]:
def manual_evaluate_test(model, feature, threshold, frame_size=utils.NUM_FRAMES, sampling_rate=utils.SAMPLING_RATE, hop_length=utils.HOP_LENGTH, mean=None, std=None, step_size=None, filter_time_sec=1, return_raw=True):
    """
    Slide the model over a spectrogram and report its outputs.

    Parameters:
        model: The trained model.
        feature: The feature (e.g., spectrogram) to evaluate; windows are cut
            along its third dimension.
        threshold: The prediction threshold for gunshots (only used when
            return_raw is False).
        frame_size: Number of frames to use in each evaluation.
        sampling_rate: Audio sampling rate.
        hop_length: Hop length in samples for each frame.
        mean: Mean for normalization.
        std: Standard deviation for normalization.
        step_size: Step size for moving through frames (default: 1).
        filter_time_sec: Time (in seconds) to filter out close consecutive
            predictions (only used when return_raw is False).
        return_raw: If True (default, matching the function's previous runtime
            behaviour) return the raw per-window scores. If False, apply the
            threshold and the time filter and return the detections.

    Returns:
        return_raw=True: list of (output, start) tuples, one per window, where
            start is the index of the window's first frame.
        return_raw=False: list of (minutes, seconds, output) tuples for the
            windows whose output reaches the threshold, skipping detections
            closer than filter_time_sec to the previously kept one.

    Raises:
        ValueError: If mean or std is missing.
    """
    if mean is None or std is None:
        raise ValueError("Mean and std must be provided for normalization.")

    mean = mean.to(device)
    std = std.to(device)
    model = model.to(device)
    model.eval()

    # Normalize feature
    feature = feature.to(device)
    feature = (feature - mean) / std

    num_frames = feature.shape[2]

    # If step_size is not specified, evaluate every possible window position
    if step_size is None:
        step_size = 1

    predictions = []
    with th.no_grad():
        # Loop through overlapping windows
        for start in range(0, num_frames - frame_size + 1, step_size):
            input_frame = feature[:, :, start:start + frame_size].unsqueeze(0).float()
            output = model(input_frame).squeeze().item()
            predictions.append((output, start))  # Keep track of output and start position

    if return_raw:
        return predictions

    # Threshold the scores and drop detections that follow the previously
    # kept detection too closely. Windows are already in chronological order.
    filtered_res = []
    last_detection_time = -float('inf')  # accept the first detection unconditionally

    for output, start in predictions:
        if output < threshold:
            continue
        time_in_seconds = start * hop_length / sampling_rate
        if time_in_seconds - last_detection_time >= filter_time_sec:
            minutes = int(time_in_seconds // 60)
            seconds = time_in_seconds % 60
            filtered_res.append((minutes, seconds, output))
            last_detection_time = time_in_seconds  # Update the last detected time

    return filtered_res

In [123]:
# Preprocess one audio file for inference; the commented lines below are alternative inputs.
# NOTE(review): these are machine-specific absolute paths.
spectrograms, sample_rates = utils.preprocess_audio(['/Users/borosabel/Documents/Uni/Thesis/PopMIR/M.I.A. - Paper Planes.mp3'])
# spectrograms, sample_rates = utils.preprocess_audio(['/Users/borosabel/Documents/Uni/Thesis/PopMIR/I Gave You Power.mp3'])
# spectrograms, sample_rates = utils.preprocess_audio(['/Users/borosabel/Documents/Uni/Thesis/PopMIR/50 Cent - Many Men (Wish Death) (Dirty Version).mp3'])
# spectrograms, sample_rates = utils.preprocess_audio(['/Users/borosabel/Documents/Uni/Thesis/PopMIR/50 Cent - Heat (Official Music Video).mp3'])
# spectrograms, sample_rates = utils.preprocess_audio(['/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/Combined/with_gunshot_4655.mp3'])

In [124]:
spectrograms[0].shape

In [125]:
# Load mean and std from file
# (the file is written earlier in this notebook from dataset.mean / dataset.std).
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./mean_std.pkl', 'rb') as f:
    data = pickle.load(f)
    mean = data['mean']
    std = data['std']

In [126]:
# Score the first preprocessed spectrogram. The previous `spectogram` name was
# only defined by a loop in a later cell, so this call failed on a fresh
# Restart & Run All.
predictions = manual_evaluate_test(model, spectrograms[0], threshold=best_threshold, mean=mean, std=std, step_size=1,
                     filter_time_sec=1.5)

In [127]:
# `predictions` holds (output, start) pairs, so after unzipping `preds` contains the
# model outputs and `outputs` contains the window start indices (the names are
# swapped relative to their contents).
preds, outputs = zip(*predictions)

In [128]:
# Plot the model output (y-axis, stored in `preds`) against the window start
# frame index (x-axis, stored in `outputs`).
plt.figure(figsize=(20, 15))
plt.plot(outputs, preds)

In [88]:
# Run the detector over every preprocessed spectrogram.
# NOTE(review): predicted_times is overwritten on each iteration, so only the
# result for the last spectrogram survives the loop.
for spectogram in spectrograms:
    predicted_times = manual_evaluate_test(model, spectogram, threshold=best_threshold, mean=mean, std=std, step_size=1, 
    filter_time_sec=1.5)

In [89]:
print(f"Current treshold is {best_threshold} \n")

# NOTE(review): this loop unpacks (minutes, seconds, output) triples, whereas
# manual_evaluate_test, called as in the previous cell, returns (output, start)
# pairs — confirm the format of predicted_times before running this cell.
for minutes, seconds, output in predicted_times:
    print(f"Detected gunshot at {minutes}m {seconds:.2f}s with model output: {output:.4f}")

In [90]:
# NOTE(review): `output` is the scalar left over from the loop in the previous cell and
# predicted_times is a list of tuples, so this does not draw a detection curve — TODO
# confirm what this plot was meant to show.
plt.plot(predicted_times, output)

In [48]:
from pydub import AudioSegment
from pydub.playback import play
import torchaudio
import torch as th

def extract_sample_at_time(audio_path, start_time_sec, frame_size=utils.NUM_FRAMES, hop_length=utils.HOP_LENGTH):
    """
    Extracts a sample from the audio corresponding to the specified start time.

    The extracted span is ``(frame_size - 1) * hop_length`` audio samples long.

    Parameters:
        audio_path (str): Path to the audio file.
        start_time_sec (float): The starting time in seconds to cut the sample.
        frame_size (int): Number of spectrogram frames the sample should cover
            (default: utils.NUM_FRAMES).
        hop_length (int): Hop length used in preprocessing
            (default: utils.HOP_LENGTH).

    Returns:
        waveform (Tensor): The extracted waveform, loaded with torchaudio.
        sample (AudioSegment): The same span as a pydub segment (for playback).
        sample_rate (int): Sample rate reported by torchaudio for the file.
    """
    # Load the full audio file for the pydub (playback) segment
    audio = AudioSegment.from_file(audio_path)

    # A first load is needed only to learn the file's sample rate
    _, sample_rate = torchaudio.load(audio_path)

    # Compute the length in whole audio samples with integer arithmetic.
    # Converting to seconds and back (int(duration * sample_rate)) can lose
    # one sample to floating-point rounding.
    num_frames = int((frame_size - 1) * hop_length)
    sample_duration_ms = num_frames / sample_rate * 1000

    # Calculate start time in milliseconds
    start_time_ms = start_time_sec * 1000

    # Extract the segment using pydub
    sample = audio[start_time_ms:start_time_ms + sample_duration_ms]

    # round() rather than truncation: e.g. 42.4 s is not exactly representable
    # as a float and could otherwise land one sample early.
    frame_offset = int(round(start_time_sec * sample_rate))

    waveform, _ = torchaudio.load(audio_path, frame_offset=frame_offset, num_frames=num_frames)

    return waveform, sample, sample_rate

def process_and_predict(model, audio_path, start_time_sec, mean, std, threshold=None):
    """
    Extracts a sample from the audio at a given time, plays it, preprocesses it,
    and feeds it to the model to make a prediction.

    Parameters:
        model (torch.nn.Module): The trained model.
        audio_path (str): Path to the audio file.
        start_time_sec (float): The starting time in seconds to extract the sample.
        mean (torch.Tensor): Mean used for normalization.
        std (torch.Tensor): Standard deviation used for normalization.
        threshold (float, optional): Decision threshold. When omitted, the
            current value of the notebook-level ``best_threshold`` is used,
            looked up at call time rather than frozen when this function was
            defined.

    Returns:
        prediction (str): "Gunshot" if the model predicts a gunshot, otherwise "No Gunshot".
    """
    if threshold is None:
        threshold = best_threshold

    # Extract the waveform and the audio sample
    waveform, sample, sample_rate = extract_sample_at_time(audio_path, start_time_sec)

    # Play the audio sample
    print(f"Playing the audio sample from {start_time_sec:.2f} seconds.")
    play(sample)

    # Ensure mean and std are on the correct device
    mean = mean.to(device)
    std = std.to(device)
    model = model.to(device)
    # Inference must run in eval mode, otherwise dropout makes the output random.
    model.eval()
    waveform = waveform.to(device)

    # Preprocess the first channel of the waveform using the utility function
    mel_spectrogram = utils.calculate_melbands(waveform[0], sample_rate)

    # Normalize the spectrogram
    mel_spectrogram = (mel_spectrogram - mean) / std

    # Reshape and feed to model
    with th.no_grad():
        input_tensor = mel_spectrogram.unsqueeze(0).float()  # Add batch dimension
        output = model(input_tensor).squeeze().item()

    # Apply threshold to determine if it is a gunshot
    if output >= threshold:
        prediction = "Gunshot"
    else:
        prediction = "No Gunshot"

    print(f"Model Prediction: {prediction} with output: {output}")
    return prediction

# Example usage (the commented lines are alternative inputs)
audio_path = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/50 Cent - Many Men (Wish Death) (Dirty Version).mp3'
# audio_path = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/M.I.A. - Paper Planes.mp3'
# audio_path = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/DMX - Ruff Ryders Anthem.mp3'
# audio_path = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Code/Audio/Gunshot/csv_combined/gunshot_dataset/with_gunshot_Just Playing (Dreams) (2005 Remaster)_glock_17_9mm(240)_346.mp3'

print(best_threshold)

# Predict whether the window starting at 42.4 seconds contains a gunshot
prediction = process_and_predict(model, audio_path, start_time_sec=42.4, mean=mean, std=std)

In [42]:
import torchaudio
import matplotlib.pyplot as plt
import random
from IPython.display import Audio, display

# Load the audio file
def load_audio(audio_path):
    """Read an audio file with torchaudio and return ``(waveform, sample_rate)``."""
    loaded = torchaudio.load(audio_path)
    waveform, sample_rate = loaded
    return waveform, sample_rate

# Function to plot multiple waveforms
def plot_waveforms(waveforms, sample_rate):
    """Plot the first channel of each waveform in its own vertically stacked subplot.

    Parameters:
        waveforms: Sequence of tensors indexed as ``waveform[channel]``.
        sample_rate: Accepted for interface symmetry with the other helpers;
            it is not used, the x-axis is in samples.

    Nothing is drawn when ``waveforms`` is empty.
    """
    if len(waveforms) == 0:
        # plt.subplots raises for zero rows; there is simply nothing to plot.
        return

    # squeeze=False always yields a 2-D array of axes, so a single waveform
    # needs no special casing.
    fig, axes = plt.subplots(len(waveforms), 1, figsize=(10, 10), sharex=True, squeeze=False)

    for i, (ax, waveform) in enumerate(zip(axes[:, 0], waveforms)):
        # detach().cpu() makes this work for tensors on GPU or with gradients
        ax.plot(waveform[0].detach().cpu().numpy())
        ax.set_title(f"Waveform {i+1}")
        ax.set_xlabel('Time (samples)')
        ax.set_ylabel('Amplitude')

    plt.tight_layout()
    plt.show()

# The modified function from earlier
def select_gunshot_segment(waveform, sample_rate, gunshot_time, frame_length, max_shift_sec=0.8):
    """Cut a window of ``frame_length`` samples around a randomly shifted gunshot time.

    The gunshot time is moved by a uniform random offset in
    ``[-max_shift_sec, max_shift_sec]`` and the window is centred on the
    shifted time. The window is clipped to the waveform: at the end it is
    moved back so it keeps its full length where possible, and a waveform
    shorter than ``frame_length`` is returned whole.

    Parameters:
        waveform: Tensor indexed as ``(channel, sample)``.
        sample_rate: Samples per second of ``waveform``.
        gunshot_time: Gunshot position in seconds.
        frame_length: Window length in samples.
        max_shift_sec: Maximum absolute random shift in seconds.

    Returns:
        The slice ``waveform[:, start:stop]``.
    """
    window = int(frame_length)
    half_window_sec = (frame_length / sample_rate) / 2

    jitter = random.uniform(-max_shift_sec, max_shift_sec)
    begin_sec = max(0, (gunshot_time + jitter) - half_window_sec)

    # Clip the right edge to the waveform, then re-derive the left edge from it
    stop = min(int(begin_sec * sample_rate) + window, waveform.size(1))
    start = max(0, stop - window)

    return waveform[:, start:stop]

# Function to play multiple audio segments
def play_audio_segments(waveforms, sample_rate):
    """Show one inline audio player per waveform, in order."""
    for segment in waveforms:
        # Convert waveform to numpy array and play
        player = Audio(segment.numpy(), rate=sample_rate)
        display(player)

# Example usage: load one positive example and inspect randomly shifted windows around its gunshot.
# NOTE(review): machine-specific absolute path.
audio_path = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Code/Audio/Gunshot/csv_combined/gunshot_dataset/with_gunshot_Against All Odds_glock_17_9mm(72)_1331.mp3'
waveform, sample_rate = load_audio(audio_path)

# Parameters
gunshot_time = 2.0  # Assume gunshot occurs at 2 seconds
frame_length = utils.FRAME_LENGTH  # window length in samples, as consumed by select_gunshot_segment

print(frame_length)

# Run the selection 5 times and plot + play.
# No random seed is set, so each run of this cell yields different segments.
waveforms = []
for _ in range(5):
    selected_segment = select_gunshot_segment(waveform, sample_rate, gunshot_time, frame_length)
    waveforms.append(selected_segment)

# Plot the waveforms
plot_waveforms(waveforms, sample_rate)

# Play the waveforms
play_audio_segments(waveforms, sample_rate)

In [46]:
prediction = process_and_predict(model, audio_path, start_time_sec=0.7, mean=mean, std=std)