In [1]:
import numpy as np

# Monkey-patch the scalar aliases np.float / np.int / np.object / np.bool onto
# the numpy module, each bound to the Python builtin of the same name.
# NOTE(review): presumably needed because a dependency still references these
# aliases, which NumPy 1.24 removed — confirm which package requires them.
np.float, np.int, np.object, np.bool = float, int, object, bool

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt
from IPython.display import Audio, display
import gunshot_utils as utils
import importlib
import ast
import re
import os
from pydub import AudioSegment
import random
import pickle
from glob import glob
import librosa.display
import IPython.display as ipd

import torch as th
import torchaudio
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from pydub import AudioSegment
from pydub.playback import play

# Reload gunshot_utils so that edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(utils)

<b>1. Data creation</b>

In [3]:
# Metadata CSV for the gunshot recordings: per the original note it stores the
# gunshot filenames, the number of gunshots per file and their starting points.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing it.
gunshot_csv = pd.read_csv('/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/csv/edge-collected-gunshot-audio/updated_gunshot_metadata.csv')

In [4]:
# Preview the first rows of the gunshot metadata.
gunshot_csv.head()

In [5]:
# The problem with these gunshot files is that some of them are too quiet: when they are mixed into a music track, the gunshot is barely audible.
# So the files are filtered to keep only the gunshots that are loud enough to be heard when placed on top of a music file.

In [6]:
# This cell cleans the gunshot metadata: recordings whose peak level is too low
# to be audible are filtered out.
# Inputs: the metadata table (same CSV as `gunshot_csv`) and the directory that
# holds the audio files referenced by its 'filename' column.
# NOTE(review): the name `df` is rebound later in the notebook to the generated
# sample table, so its meaning depends on which cells have been run.
df = pd.read_csv('/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/csv/edge-collected-gunshot-audio/updated_gunshot_metadata.csv')
audio_dir = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/csv/edge-collected-gunshot-audio/edge-collected-gunshot-audio'

def get_max_decibel_level(audio_segment):
    """
    Report the peak level of an audio segment.

    Parameters:
        audio_segment (AudioSegment): Segment whose ``max_dBFS`` attribute is read.

    Returns:
        max_db (float): The value of ``audio_segment.max_dBFS``.
    """
    peak_level = audio_segment.max_dBFS
    return peak_level

def filter_gunshots_by_decibel(df, audio_dir, threshold_db=-20.0):
    """
    Split gunshot recordings into "loud enough" and "too quiet" by peak level.

    Each row's ``filename`` is joined onto ``audio_dir`` and loaded with
    ``AudioSegment.from_file``; the clip's peak level (``max_dBFS``) is then
    compared with ``threshold_db``. Files that cannot be loaded are skipped
    (they end up in neither result) and are reported with ``print`` instead of
    being dropped silently.

    Parameters:
        df (DataFrame): Metadata table; must contain a 'filename' column.
        audio_dir (str): Directory where the audio files are stored.
        threshold_db (float): Peak level in dBFS. Files whose peak is not
            strictly above this value are listed for manual review.

    Returns:
        filtered_df (DataFrame): The rows of ``df`` whose peak level is above
            the threshold. Index labels, column order and dtypes of ``df`` are
            preserved, and the columns survive even when no row passes.
        low_db_files (list): Full paths of the files at or below the threshold.
    """
    kept_positions = []
    low_db_files = []
    unreadable_files = []

    for position, file_path in enumerate(df['filename']):
        full_path = os.path.join(audio_dir, file_path)

        # Load the audio file; a missing or corrupt file must not abort the scan.
        try:
            audio = AudioSegment.from_file(full_path)
        except Exception as exc:
            unreadable_files.append((full_path, exc))
            continue

        if get_max_decibel_level(audio) > threshold_db:
            kept_positions.append(position)
        else:
            low_db_files.append(full_path)

    if unreadable_files:
        print(f"Skipped {len(unreadable_files)} unreadable audio file(s):")
        for skipped_path, error in unreadable_files:
            print(f"  {skipped_path}: {error}")

    # Positional selection keeps the schema intact even for an empty result;
    # rebuilding the frame from a list of rows would drop all columns then.
    filtered_df = df.iloc[kept_positions]
    return filtered_df, low_db_files

# Peak-level threshold in dBFS. Note that -5.0 is used here, which is stricter
# than the -20.0 default of filter_gunshots_by_decibel.
threshold_db = -5.0

# Filter the DataFrame based on max decibel level and get the low decibel files
filtered_df, low_db_files = filter_gunshots_by_decibel(df, audio_dir, threshold_db=threshold_db)

# Save the filtered DataFrame to a new CSV file (written to the current
# working directory; it is read back later as './filtered_gunshot_metadata.csv')
filtered_df.to_csv('filtered_gunshot_metadata.csv', index=False)

# Report how many files were at or below the threshold
print(f"Number of files with low decibel levels:{len(low_db_files)}")

<b>With the filtered dataset in hand, we can use those gunshots to create audio samples that contain gunshots</b>

In [17]:
# Read back the filtered metadata written by the decibel-filter cell and keep
# only the three columns used from here on.
gunshots_paths = pd.read_csv('./filtered_gunshot_metadata.csv')
gunshots = gunshots_paths[['filename', 'num_gunshots', 'gunshot_location_in_seconds']].copy()  # Create a copy to avoid SettingWithCopyWarning

# Function to preprocess gunshot start times, converting strings to lists of numbers
def preprocess_gunshot_times(gunshot_times, include_first_gunshot_only=False):
    """
    Parse a stored list of gunshot start times into a Python list.

    The text may separate the numbers with commas or only with whitespace
    (e.g. ``"[0.5 1.2]"`` or ``"[1. 2.5]"``); missing commas are inserted
    before the text is evaluated with ``ast.literal_eval``.

    Parameters:
        gunshot_times (str): Textual list of start times. Non-string input is
            tolerated: a list/tuple or a single number is normalised, anything
            else (including NaN from an empty CSV cell) yields ``[]``.
        include_first_gunshot_only (bool): When True, return at most the first
            start time.

    Returns:
        list: The parsed start times, or ``[]`` when the input cannot be
        interpreted as a number or a list of numbers.
    """
    if isinstance(gunshot_times, str):
        # Collapse runs of whitespace (including newlines) to single spaces
        text = re.sub(r'\s+', ' ', gunshot_times).strip()

        # Insert commas between whitespace-separated numbers. The character
        # classes also accept '.', '+' and '-' so that values such as "1." or
        # "-0.5" are still recognised as number boundaries.
        text = re.sub(r'(?<=[\d.])\s(?=[\d.+-])', ', ', text)

        # Ensure there are no trailing commas
        text = text.replace(', ]', ']')

        # Safely evaluate the string as a literal
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            return []
    else:
        parsed = gunshot_times

    if isinstance(parsed, (list, tuple)):
        gunshot_list = list(parsed)
    elif isinstance(parsed, (int, float)) and not isinstance(parsed, bool):
        # `parsed != parsed` is only true for NaN, i.e. a missing value.
        gunshot_list = [] if parsed != parsed else [parsed]
    else:
        return []

    if include_first_gunshot_only and gunshot_list:
        return [gunshot_list[0]]  # Return only the first gunshot time
    return gunshot_list

# Directory that holds the gunshot audio files.
# NOTE(review): root_path is not referenced anywhere in the visible part of
# the notebook — confirm whether it is still needed.
root_path = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/csv/edge-collected-gunshot-audio/edge-collected-gunshot-audio/'

# Boolean flag to control if only the first gunshot time should be included
include_first_gunshot_only = False

# Parse the textual 'gunshot_location_in_seconds' column into Python lists,
# honouring the flag above
gunshots['gunshot_location_in_seconds'] = gunshots['gunshot_location_in_seconds'].apply(
    lambda x: preprocess_gunshot_times(x, include_first_gunshot_only)
)

# If include_first_gunshot_only is True, recompute 'num_gunshots' as the length
# of the parsed list: 1 normally, 0 for rows whose times could not be parsed
if include_first_gunshot_only:
    gunshots['num_gunshots'] = gunshots['gunshot_location_in_seconds'].apply(lambda x: len(x))

# Label the gunshot rows as the positive class
gunshots['label'] = 1

# Music metadata; its 'Path' column is renamed to 'filename' so it matches the
# gunshot table, and the rows are labelled as the negative class
music_df = pd.read_excel('/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Excel/baseline_data_w_topics_w_features.xlsx', engine='openpyxl')

music = music_df.rename(columns={'Path': 'filename'})
music['label'] = 0

# NOTE(review): music_labels and music_paths_df are not used in the visible
# part of the notebook — confirm they are needed further down.
music_labels = music[['label']]
music_paths_df = music[['filename']]

# Generate the sample table from the music and gunshot tables.
# NOTE(review): generate_data_samples lives in gunshot_utils; presumably it
# mixes gunshots into music and returns one row per generated file — confirm.
# This rebinds `df`, which earlier held the raw gunshot metadata.
df = utils.generate_data_samples(music, gunshots, number_of_samples_w_gunshots=1, number_of_samples_wo_gunshots=0)

In [18]:
# Display the generated sample table returned by utils.generate_data_samples.
df

In [27]:
# Load the first file listed in the sample table's 'filename' column
audio = AudioSegment.from_file(df['filename'].iloc[0])

# Play the audio through pydub's playback helper
play(audio)

<b>Model and frame sizes</b>

In [28]:
# The model can now be built for any number of input frames; the width is fixed at construction time via `num_frames`.

class GunshotDetectionCNN(nn.Module):
    """
    Small two-stage CNN that maps a spectrogram-like patch to one probability.

    Architecture: Conv(3x7) -> ReLU -> MaxPool(3x1) -> Conv(3x3) -> ReLU ->
    MaxPool(3x1) -> flatten -> Linear(256) -> ReLU -> Dropout(0.5) ->
    Linear(1) -> Sigmoid.

    Input shape is ``(batch, in_channels, input_height, num_frames)`` and the
    output shape is ``(batch, 1)`` with values in [0, 1].

    Parameters:
        num_frames (int): Width of the input the network is built for. The
            first linear layer is sized from it, so inputs passed to
            ``forward`` must have this same width.
        in_channels (int): Number of input channels (default 3, the value that
            was previously hard-coded).
        input_height (int): Height of the input (default 80, the value that
            was previously hard-coded; presumably the number of frequency
            bands — TODO confirm).
    """

    def __init__(self, num_frames, in_channels=3, input_height=80):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 10, kernel_size=(3, 7))
        self.pool1 = nn.MaxPool2d(kernel_size=(3, 1))
        self.conv2 = nn.Conv2d(10, 20, kernel_size=(3, 3))
        self.pool2 = nn.MaxPool2d(kernel_size=(3, 1))

        # Push a dummy batch through the conv stack to discover the flattened
        # feature size. no_grad() avoids building an autograd graph for a
        # pass that exists only for shape inference.
        with th.no_grad():
            dummy_input = th.zeros(1, in_channels, input_height, num_frames)
            output_size = self._extract_features(dummy_input).numel()

        self.fc1 = nn.Linear(output_size, 256)
        self.fc2 = nn.Linear(256, 1)
        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()

    def _extract_features(self, x):
        """Run the two conv/ReLU/pool stages shared by __init__ and forward."""
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        return x

    def forward(self, x):
        """Return per-sample probabilities of shape ``(batch, 1)``."""
        x = self._extract_features(x)

        # Flatten everything except the batch dimension
        x = x.view(x.size(0), -1)

        x = self.dropout(F.relu(self.fc1(x)))  # Apply dropout
        x = self.sigmoid(self.fc2(x))
        return x

# Example usage: build the network for inputs that are 150 frames wide
model = GunshotDetectionCNN(num_frames=150)

In [29]:
# Duration covered by 150 frames: frames * hop length / sample rate.
# NOTE(review): 150 and 44100 are hard-coded here; 44100 is presumably the
# sampling rate of the audio — confirm it matches the files actually used.
print(f"{150 * utils.HOP_LENGTH / 44100} seconds")

In [30]:
# Since the gunshots are always placed at exactly 2 seconds, every sample shows the same pattern. To help the model generalize, we shift the gunshot position randomly and generate multiple samples.

# Read an audio file from disk
def load_audio(audio_path):
    """Load ``audio_path`` with torchaudio and return ``(waveform, sample_rate)``."""
    signal, rate = torchaudio.load(audio_path)
    return (signal, rate)

# Function to plot multiple waveforms
def plot_waveforms(waveforms, sample_rate):
    """
    Plot the first channel of each waveform in its own row of one figure.

    Parameters:
        waveforms (sequence): Tensors indexed as ``waveform[0]`` to obtain the
            channel that is drawn. An empty sequence is a no-op.
        sample_rate (int): Accepted for interface compatibility; currently
            unused because the x-axis is expressed in samples.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # plt.subplots(0, 1) raises, so there is nothing to draw for an empty list.
    if len(waveforms) == 0:
        return

    # squeeze=False always yields a 2-D axes array, so a single waveform needs
    # no special-casing.
    _fig, axes = plt.subplots(len(waveforms), 1, figsize=(10, 10), sharex=True, squeeze=False)

    for i, (ax, waveform) in enumerate(zip(axes[:, 0], waveforms)):
        ax.plot(waveform[0].numpy())
        ax.set_title(f"Waveform {i+1}")
        ax.set_xlabel('Time (samples)')
        ax.set_ylabel('Amplitude')

    plt.tight_layout()
    plt.show()

# Cut a fixed-length window around a (randomly jittered) gunshot position
def select_gunshot_segment(waveform, sample_rate, gunshot_time, frame_length, max_shift_sec=0.3):
    """
    Return a window of ``frame_length`` samples roughly centred on a gunshot.

    The gunshot time is shifted by a random offset drawn uniformly from
    ``[-max_shift_sec, max_shift_sec]``; the window is then centred on the
    shifted time and moved back inside the waveform if it would run past
    either end. A waveform shorter than ``frame_length`` is returned whole.

    Parameters:
        waveform: Tensor indexed as ``waveform[:, start:end]``; its length is
            read from ``waveform.size(1)``.
        sample_rate: Samples per second, used to convert seconds to samples.
        gunshot_time: Gunshot position in seconds.
        frame_length: Window length in samples.
        max_shift_sec: Maximum absolute random shift in seconds.

    Returns:
        The slice ``waveform[:, start:end]``.
    """
    window = int(frame_length)
    jitter = random.uniform(-max_shift_sec, max_shift_sec)
    centre_sec = gunshot_time + jitter

    # Left edge in seconds, never before the start of the recording
    left_sec = max(0, centre_sec - (frame_length / sample_rate) / 2)
    first = int(left_sec * sample_rate)

    # Clamp the right edge to the recording, then re-derive the left edge so
    # the window keeps its full length whenever the recording is long enough
    last = min(first + window, waveform.size(1))
    first = max(0, last - window)

    return waveform[:, first:last]

# Render one inline audio player per segment
def play_audio_segments(waveforms, sample_rate):
    """Display an IPython ``Audio`` widget for every waveform, in order."""
    for segment in waveforms:
        samples = segment.numpy()  # Convert waveform to numpy array and play
        display(Audio(samples, rate=sample_rate))

# Example usage: take the first file of the sample table and load it
audio_path = df['filename'].iloc[0]
waveform, sample_rate = load_audio(audio_path)

# Parameters
# gunshot_time is the gunshot position in seconds (hard-coded to 2.0 here);
# frame_length is the window size in samples, taken from gunshot_utils.
gunshot_time = 2.0
frame_length = utils.FRAME_LENGTH

print(frame_length)

# Run the selection 5 times; each call applies a fresh random shift, so the
# five windows differ in where the gunshot falls
waveforms = []
for _ in range(5):
    selected_segment = select_gunshot_segment(waveform, sample_rate, gunshot_time, frame_length)
    waveforms.append(selected_segment)

# Plot the waveforms
plot_waveforms(waveforms, sample_rate)

# Play the waveforms
play_audio_segments(waveforms, sample_rate)