In [19]:
import numpy as np

# Restore the builtin-type aliases (np.float, np.int, np.object, np.bool) that
# NumPy removed in 1.24, for third-party code that still references them.
# Only names that are actually missing are filled in: NumPy >= 2.0 defines
# np.bool again as its own boolean scalar type, and overwriting that with the
# Python builtin would silently change behaviour for code relying on it.
for _alias, _builtin in (("float", float), ("int", int), ("object", object), ("bool", bool)):
    if not hasattr(np, _alias):
        setattr(np, _alias, _builtin)
del _alias, _builtin

In [20]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt
import gunshot_utils as utils
import importlib
import ast
import re
import os
import pickle
from glob import glob
import librosa.display
import IPython.display as ipd

import torch as th
import numpy as np
import torchaudio
from torch.utils.data import DataLoader
from IPython.display import Audio
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from pydub import AudioSegment
from pydub.playback import play

# Re-import gunshot_utils so that edits made to the module on disk are picked
# up without restarting the kernel.
importlib.reload(utils)

<module 'gunshot_utils' from '/Users/borosabel/Documents/Uni/Thesis/PopMusicInformationRetrieval/gunshot_utils.py'>

In [12]:
# Read the gunshot metadata CSV and keep only the three columns used later:
# the file path, the gunshot location(s) in seconds, and the gunshot count.
gunshot_df = pd.read_csv(
    './filtered_gunshot_metadata_glocks.csv'
)[['filename', 'gunshot_location_in_seconds', 'num_gunshots']]
gunshot_df.head()

Unnamed: 0,filename,gunshot_location_in_seconds,num_gunshots
0,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,[1.72269841],1
1,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,[1.67290249],1
2,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,[1.61977324 3.50795918 5.42746032],3
3,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,[1.75 1.98768707 2.26022676],3
4,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,[1.75 2.61845805 3.06664399],3


In [4]:
# Read the music-track spreadsheet and keep only the path and sample-rate columns.
music_df = pd.read_excel(
    '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Excel/baseline_data_w_topics_w_features.xlsx',
    engine='openpyxl',
)[['Path', 'Sample Rate (Hz)']]
music_df.head()

Unnamed: 0,Path,Sample Rate (Hz)
0,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,48000
1,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,48000
2,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,48000
3,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,48000
4,/Users/borosabel/Documents/Uni/Thesis/PopMIR/D...,48000


In [5]:
# Hold out 20% of each table for validation. The music and gunshot tables are
# split independently, both with the same fixed seed so the partition is
# reproducible across runs.
split_kwargs = dict(test_size=0.2, random_state=42)
music_train_df, music_valid_df = train_test_split(music_df, **split_kwargs)
gunshot_train_df, gunshot_valid_df = train_test_split(gunshot_df, **split_kwargs)

In [6]:
class GunshotDetectionCNN(nn.Module):
    """Small 2-D CNN that maps a spectrogram excerpt to a single probability.

    Two convolution + max-pool stages (pooling only along the first spatial
    axis) feed a two-layer fully connected head. The final sigmoid means
    ``forward`` returns one value in [0, 1] per example.

    :param num_frames: Size of the last input axis the network is built for.
    :param num_bins: Size of the second-to-last input axis. Defaults to 80, the
        value that used to be hard-coded (presumably the number of mel bins --
        confirm against gunshot_utils).
    :param in_channels: Number of input channels. Defaults to 3, the value that
        used to be hard-coded.
    """

    def __init__(self, num_frames, num_bins=80, in_channels=3):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 10, kernel_size=(3, 7))
        self.pool1 = nn.MaxPool2d(kernel_size=(3, 1))
        self.conv2 = nn.Conv2d(10, 20, kernel_size=(3, 3))
        self.pool2 = nn.MaxPool2d(kernel_size=(3, 1))

        # Infer the flattened feature size by pushing one all-zero example
        # through the convolutional stack. no_grad avoids building an autograd
        # graph for this throw-away pass.
        with th.no_grad():
            probe = self._extract_features(th.zeros(1, in_channels, num_bins, num_frames))
        output_size = probe.numel()

        self.fc1 = nn.Linear(output_size, 256)
        self.fc2 = nn.Linear(256, 1)
        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()

    def _extract_features(self, x):
        """Apply both conv -> ReLU -> max-pool stages and return the feature maps."""
        x = self.pool1(F.relu(self.conv1(x)))
        return self.pool2(F.relu(self.conv2(x)))

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of sigmoid outputs for a 4-D input batch."""
        features = self._extract_features(x)

        # Collapse everything except the batch axis for the dense layers.
        flat = features.flatten(start_dim=1)

        hidden = self.dropout(F.relu(self.fc1(flat)))  # dropout is only active in train mode
        return self.sigmoid(self.fc2(hidden))

# Instantiate the network for the frame count defined in gunshot_utils.
model = GunshotDetectionCNN(num_frames=utils.NUM_FRAMES)

In [7]:
import torchaudio
import torch as th
import numpy as np
import random
from torch.utils.data import DataLoader

class GunshotDataset(th.utils.data.Dataset):
    """Dataset that yields one spectrogram/label pair per music track.

    For each item a music excerpt is loaded and, with probability
    ``gunshot_prob``, a randomly chosen gunshot recording is mixed into it
    (label 1); otherwise the plain excerpt is used (label 0). Audio loading,
    mixing and spectrogram extraction are delegated to ``gunshot_utils``.
    """

    def __init__(self, music_df, gunshot_df, excerpt_len=5.0, gunshot_placement_sec=2.0, gunshot_prob=1.0, min_db=3, max_db=5, max_non_gunshot_samples=1, mean=None, std=None):
        """
        :param music_df: DataFrame containing paths to music files.
        :param gunshot_df: DataFrame containing paths to gunshot files and timing info.
        :param excerpt_len: Length of the music segment in seconds.
        :param gunshot_placement_sec: Time in seconds where to place the gunshot in the music.
        :param gunshot_prob: Probability of adding a gunshot to the segment.
        :param min_db: Minimum gain (in dB) to apply to the gunshot. Stored, but
            not used by ``__getitem__`` at the moment.
        :param max_db: Maximum gain (in dB) to apply to the gunshot. This is the
            gain that ``__getitem__`` actually passes on.
        :param max_non_gunshot_samples: Max number of non-gunshot samples to extract when no gunshots are present.
            Stored, but not used by ``__getitem__`` at the moment.
        :param mean: Stored as ``self.mean``; not applied by this class.
        :param std: Stored as ``self.std``; not applied by this class.
        """
        super().__init__()
        self.music_paths = music_df['Path'].tolist()
        self.gunshot_paths = gunshot_df['filename'].tolist()
        self.gunshot_truth = gunshot_df['gunshot_location_in_seconds'].apply(
            lambda x: utils.preprocess_gunshot_times(x, include_first_gunshot_only=True)
        ).tolist()
        self.excerpt_len = excerpt_len
        self.gunshot_placement_sec = gunshot_placement_sec
        self.gunshot_prob = gunshot_prob
        self.min_db = min_db
        self.max_db = max_db
        self.max_non_gunshot_samples = max_non_gunshot_samples
        self.mean = mean
        self.std = std

    def __getitem__(self, idx):
        """Return the first ``(spectrogram, label)`` pair produced for music track ``idx``.

        :raises ValueError: if a gunshot should be added but no gunshot files are
            available, or if preprocessing returns no spectrograms/labels.
        """
        fn_music = self.music_paths[idx]
        print(f"Music: {fn_music}")
        add_gunshot = (np.random.rand() < self.gunshot_prob)
        print(f"Should add gunshot on the music excerpt? {'yes' if add_gunshot else 'no'}")

        if add_gunshot:
            if not self.gunshot_paths:
                raise ValueError("Cannot add a gunshot: no gunshot files are available")

            # np.random.randint excludes its upper bound, so passing the list
            # length makes every index (including the last one) selectable and
            # also works when there is exactly one gunshot file.
            gunshot_idx = np.random.randint(len(self.gunshot_paths))
            fn_gunshot = self.gunshot_paths[gunshot_idx]
            print(f"Gunshot path: {fn_gunshot}")
            # Only the first annotated time of the chosen recording is used.
            gunshot_time = self.gunshot_truth[gunshot_idx][0]

            # NOTE(review): the gain is always max_db; min_db is never read, so
            # the gain is constant rather than drawn from [min_db, max_db].
            music_segment, sr = utils.combine_music_and_gunshot(
                music_file=fn_music,
                gunshot_file=fn_gunshot,
                gunshot_time=gunshot_time,
                gunshot_volume_increase_dB=self.max_db,
                gunshot_placement_sec=self.gunshot_placement_sec,
                excerpt_len_sec=self.excerpt_len,
                sample_rate=utils.SAMPLING_RATE
            )
            label = 1
            spectrograms, labels = utils.preprocess_audio_train(music_segment, sr, label, gunshot_time)
        else:
            # Extract a segment of music without gunshots
            music_segment, sr = utils.extract_music_segment(
                music_file=fn_music,
                excerpt_len=self.excerpt_len,
                sample_rate=utils.SAMPLING_RATE
            )
            label = 0
            spectrograms, labels = utils.preprocess_audio_train(music_segment, sr, label)

        if not spectrograms or not labels:
            raise ValueError("Spectrograms or labels are empty after preprocessing")

        return spectrograms[0], labels[0]

    def __len__(self):
        """Return the number of music tracks (one item per track)."""
        return len(self.music_paths)

In [17]:
# Build the training/validation datasets and their loaders. Both datasets share
# the same excerpt length, gunshot placement and gain settings.
# NOTE(review): the training set uses gunshot_prob=1, so every training item
# gets a gunshot mixed in, while validation uses 0.5 -- confirm this is intended.
shared_kwargs = dict(excerpt_len=5.0, gunshot_placement_sec=2.0, min_db=5, max_db=10)
train_dataset = GunshotDataset(music_train_df, gunshot_train_df, gunshot_prob=1, **shared_kwargs)
valid_dataset = GunshotDataset(music_valid_df, gunshot_valid_df, gunshot_prob=0.5, **shared_kwargs)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=32, shuffle=False)

In [21]:
# Fetch the first training item as a smoke test of the dataset pipeline.
spectogram, label = train_dataset[0]

Music: /Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/west_coast/Pac/All Eyez On Me/2Pac - How Do You Want It (feat. JoJo & K-Ci).mp3
Should add gunshot on the music excerpt? yes
Gunshot path: /Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/csv/edge-collected-gunshot-audio/edge-collected-gunshot-audio/glock_17_9mm(231).wav
Resampling music from 48000 Hz to 44100 Hz.
Extracted a 5.0 seconds music excerpt for processing.
Loading the gunshot file...
Applying a 10 dB volume increase to the gunshot.
Combining the music and gunshot. The gunshot will be placed at 2.0 seconds.
------PREPROCESSING AUDIO DATA------
Waveform shape:  torch.Size([2, 220500])
Sampling rate:  44100
------SELECTING GUNSHOT SEGMENT WITH FRAME SIZE OF 43520------
------SELECTING GUNSHOT SEGMENT------
Segment shape after cutting 43520 size: torch.Size([2, 43520])
MEL SPECTOGRAM shape of the segment torch.Size([3, 81, 86])
------PREPROCESSING AUDIO DATA------
