In [18]:
import numpy as np

# Re-create the builtin aliases (np.float, np.int, np.object, np.bool) as
# attributes of the numpy module by pointing each one at the matching builtin.
for _alias, _builtin in (("float", float), ("int", int), ("object", object), ("bool", bool)):
    setattr(np, _alias, _builtin)
del _alias, _builtin

In [19]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt
import gunshot_utils as utils
import importlib
import ast
import re
import os
import pickle
from glob import glob
import librosa.display
import IPython.display as ipd

import torch as th
import torchaudio
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from pydub import AudioSegment
from pydub.playback import play

importlib.reload(utils)

In [20]:
from pydub import AudioSegment
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import spectrogram
from IPython.display import Audio, display

# Function to plot waveform
def plot_waveform(audio, title, overlay_position=None):
    """Plot the waveform of a pydub ``AudioSegment``.

    Args:
        audio: Segment exposing ``get_array_of_samples()``, ``channels`` and
            ``frame_rate``.
        title: Title of the figure.
        overlay_position: Optional position in milliseconds; when given, a
            dashed red vertical line is drawn at the corresponding sample.
    """
    samples = np.array(audio.get_array_of_samples())
    # pydub serialises multi-channel audio interleaved (L, R, L, R, ...).
    # De-interleave and mix down to mono so that the x-axis counts frames and
    # lines up with the marker below, which is computed from frame_rate.
    samples = samples.reshape(-1, audio.channels).mean(axis=1)

    plt.figure(figsize=(10, 4))
    plt.plot(samples)
    if overlay_position is not None:
        # milliseconds -> sample index
        plt.axvline(x=overlay_position * audio.frame_rate // 1000, color='r', linestyle='--')
    plt.title(title)
    plt.xlabel('Sample Index')
    plt.ylabel('Amplitude')
    plt.show()

# Function to plot spectrogram
def plot_spectrogram(audio, title, overlay_position=None):
    """Plot a dB-scaled spectrogram of a pydub ``AudioSegment``.

    Args:
        audio: Segment exposing ``get_array_of_samples()``, ``channels`` and
            ``frame_rate``.
        title: Title of the figure.
        overlay_position: Optional position in milliseconds; when given, a
            dashed red vertical line is drawn at that time.
    """
    samples = np.array(audio.get_array_of_samples())
    # pydub serialises multi-channel audio interleaved (L, R, L, R, ...).
    # Feeding that to spectrogram() with fs=frame_rate would distort both
    # axes, so mix down to a single channel first.
    samples = samples.reshape(-1, audio.channels).mean(axis=1)

    f, t, Sxx = spectrogram(samples, fs=audio.frame_rate)
    plt.figure(figsize=(10, 4))
    # Floor the power before taking the log so silent bins map to a finite
    # value instead of -inf.
    plt.pcolormesh(t, f, 10 * np.log10(np.maximum(Sxx, 1e-10)))
    if overlay_position is not None:
        # milliseconds -> seconds (the time axis is in seconds)
        plt.axvline(x=overlay_position / 1000, color='r', linestyle='--')
    plt.title(title)
    plt.ylabel('Frequency [Hz]')
    plt.xlabel('Time [sec]')
    plt.colorbar(label='dB')
    plt.show()

# Function to convert pydub AudioSegment to numpy array for playback
def audio_segment_to_np(audio_segment):
    """Convert a pydub ``AudioSegment`` to a float32 array in [-1, 1).

    Args:
        audio_segment: Object exposing ``get_array_of_samples()`` that returns
            integer PCM samples.

    Returns:
        1-D ``np.float32`` array with the samples in the order returned by
        ``get_array_of_samples()`` (no channel de-interleaving is done here).
    """
    samples = np.array(audio_segment.get_array_of_samples())
    # Scale by the magnitude of the most negative representable value
    # (e.g. 32768 for int16). Dividing by iinfo.max instead would map the
    # most negative sample slightly below -1.
    full_scale = -float(np.iinfo(samples.dtype).min)
    return samples.astype(np.float32) / np.float32(full_scale)

# Load music and gunshot audio files
# NOTE(review): absolute, machine-specific paths; the cell only runs where
# these files exist.
music = AudioSegment.from_file("/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/west_coast/Ice-T/Retrun of The Real/Dear Homie.mp3")
gunshot = AudioSegment.from_file("/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/archive/M249/6 (8).wav")

# Keep only the first 10 seconds of the track (pydub slices in milliseconds)
music = music[:10000]

# export() hands back the open output file handle; close it so the data is
# flushed and the handle is not leaked for the lifetime of the kernel.
music.export('./temp.mp3', format='mp3').close()

In [16]:
# Load the full original track with torchaudio; `w, s` receive the pair
# returned by torchaudio.load.
w, s = torchaudio.load('/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/west_coast/Ice-T/Retrun of The Real/Dear Homie.mp3')

In [17]:
# Load the exported excerpt ('./temp.mp3', written by the pydub export cell)
# the same way, for comparison with the original track.
w_, s_ = torchaudio.load('./temp.mp3')

In [18]:
# Inline audio player for the original track
ipd.Audio('/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/west_coast/Ice-T/Retrun of The Real/Dear Homie.mp3')

In [19]:
# Inline audio player for the exported excerpt
ipd.Audio('./temp.mp3')

In [None]:
# Play the (trimmed) music segment through pydub's playback helper
play(music)

In [22]:
# Play the isolated gunshot sample
play(gunshot)

In [23]:
# Mix the gunshot into the music, starting `position` milliseconds into the track
position = 500  # position in milliseconds
combined_audio = music.overlay(gunshot, position=position)

In [24]:
# Compare the music before and after the overlay, in the time domain and in
# the time-frequency domain; the dashed line marks where the gunshot was inserted.
plot_waveform(music, "Original Music Waveform")
plot_waveform(combined_audio, "Combined Audio Waveform", overlay_position=position)
plot_spectrogram(music, "Original Music Spectrogram")
plot_spectrogram(combined_audio, "Combined Audio Spectrogram", overlay_position=position)

In [12]:
from pydub import AudioSegment
from pydub.playback import play

# One generated training clip (machine-specific absolute path)
with_gunshot_0 = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/Combined/with_gunshot_0.mp3'
# Analysis parameters used to convert a frame count into a duration.
# NOTE(review): presumably these mirror utils.SAMPLING_RATE / utils.HOP_LENGTH -- confirm.
SAMPLING_RATE = 44100
HOP_LENGTH = 512

# NOTE: `audio` is a module-level name; play_excerpt takes its own `audio` argument.
audio = AudioSegment.from_file(with_gunshot_0)

# Function to play a frame excerpt
def play_excerpt(audio, num_frames, start_time_sec, sampling_rate, hop_length):
    """Play the part of `audio` that corresponds to `num_frames` analysis frames.

    The excerpt length is ``(num_frames - 1) * hop_length / sampling_rate``
    seconds and it begins `start_time_sec` seconds into `audio`.

    Args:
        audio: Sliceable audio object (indexed in milliseconds).
        num_frames: Number of frames the excerpt should span.
        start_time_sec: Start of the excerpt in seconds.
        sampling_rate: Samples per second used for the frame computation.
        hop_length: Hop between consecutive frames, in samples.
    """
    excerpt_duration_sec = (num_frames - 1) * hop_length / sampling_rate
    print(f"Playing an excerpt of {excerpt_duration_sec:.2f} seconds ({num_frames} frames) starting at {start_time_sec} seconds")

    # The audio object is sliced in milliseconds, so convert both bounds.
    begin_ms = start_time_sec * 1000
    window = slice(begin_ms, begin_ms + excerpt_duration_sec * 1000)

    play(audio[window])

# Play a 15-frame long excerpt starting at 2 s (15 is also the default
# frame_size of manual_evaluate_test)
play_excerpt(audio, num_frames=15, start_time_sec=2, sampling_rate=SAMPLING_RATE, hop_length=HOP_LENGTH)

In [14]:
# Play an 86-frame long excerpt (85 * 512 / 44100, about 0.99 s) starting at 1.9 s
play_excerpt(audio, num_frames=86, start_time_sec=1.9, sampling_rate=SAMPLING_RATE, hop_length=HOP_LENGTH)

In [25]:
# Play the music with the gunshot mixed in
play(combined_audio)

In [88]:
class GunshotDetectionCNN(nn.Module):
    """Small CNN that maps a 3-channel spectrogram excerpt to one probability.

    Two conv + max-pool stages are followed by a 256-unit hidden layer with
    dropout and a single sigmoid output. The first linear layer expects
    20 * 7 * 8 flattened features, which is what a (3, 80, 15) input yields
    after the two conv/pool stages.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor; pooling only shrinks the first
        # spatial axis (kernel (3, 1)).
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=10, kernel_size=(3, 7))
        self.pool1 = nn.MaxPool2d(kernel_size=(3, 1))
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=(3, 3))
        self.pool2 = nn.MaxPool2d(kernel_size=(3, 1))
        # Classifier head
        self.fc1 = nn.Linear(in_features=20 * 7 * 8, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=1)
        self.dropout = nn.Dropout(p=0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape (N, 1) for input `x`."""
        hidden = self.conv1(x)
        hidden = self.pool1(F.relu(hidden))
        hidden = self.conv2(hidden)
        hidden = self.pool2(F.relu(hidden))
        # Flatten to (N, 1120) for the fully connected layers
        flat = hidden.view(-1, 20 * 7 * 8)
        flat = F.relu(self.fc1(flat))
        flat = self.dropout(flat)  # only active in training mode
        return self.sigmoid(self.fc2(flat))

# Initialize the model
model = GunshotDetectionCNN()

In [89]:
class GunshotDataset(th.utils.data.Dataset):
    """Frame-level dataset built from spectrograms and their targets.

    Every spectrogram is cut into frames with ``utils.make_frames``. The
    statistics ``mean`` and ``std`` are computed over all frames (reduced over
    dims 0 and 2 of the concatenated frames, then given a trailing singleton
    dim) and the stored frames are normalized with them.

    NOTE: items returned by ``__getitem__`` are ALREADY normalized. Code that
    consumes this dataset must not apply ``(x - mean) / std`` a second time.

    Args:
        spectograms: Iterable of spectrogram tensors.
        sample_rates: Iterable of sample rates; only used to align the zip,
            the values themselves are not read.
        targets: Iterable of targets passed to ``utils.make_frames``.
    """

    # Lower bound for the std so that a constant band does not divide by zero.
    _STD_EPS = 1e-8

    def __init__(self, spectograms, sample_rates, targets):
        self.X = []
        self.y = []

        for X, _sample_rate, y in zip(spectograms, sample_rates, targets):
            X_frames, y_frames = utils.make_frames(X, y)
            self.X += X_frames
            self.y += y_frames

        tmp = th.cat(self.X)
        self.mean = th.mean(tmp, dim=(0, 2)).unsqueeze(1)
        # A band with zero variance would otherwise turn into NaN (0 / 0)
        # after normalization; clamping maps it to 0 instead.
        self.std = th.std(tmp, dim=(0, 2)).unsqueeze(1).clamp_min(self._STD_EPS)
        del tmp  # free the concatenated copy

        self.X = [(x - self.mean) / self.std for x in self.X]

    def __len__(self):
        """Number of frames in the dataset."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the i-th (normalized frame, target) pair."""
        return self.X[i], self.y[i]

In [90]:
# Folder holding the generated 'with_gunshot*' / 'without_gunshot*' mp3 clips
# (machine-specific absolute path)
training_data_path = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/Combined'

def generate_file_dataframe(folder_path, gunshot_time_sec=2.0):
    """Build a DataFrame describing the labelled mp3 clips in `folder_path`.

    Only ``.mp3`` files whose name contains ``with_gunshot`` (label 1) or
    ``without_gunshot`` (label 0) are included; everything else is skipped.
    Files are processed in sorted name order so the resulting frame, and any
    seeded split made from it, does not depend on the order in which the
    file system happens to list the directory.

    Args:
        folder_path: Directory to scan (not recursive).
        gunshot_time_sec: Timestamp in seconds recorded for every positive
            clip. Defaults to 2.0.

    Returns:
        DataFrame with columns ``filename`` (full path),
        ``gunshot_location_in_seconds`` (list of timestamps),
        ``num_gunshots`` and ``label``.
    """
    records = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.mp3'):  # Only consider mp3 files
            continue

        if 'with_gunshot' in filename:
            gunshot_flag, label, timestamps = 1, 1, [gunshot_time_sec]
        elif 'without_gunshot' in filename:
            gunshot_flag, label, timestamps = 0, 0, []
        else:
            continue

        records.append([os.path.join(folder_path, filename), timestamps, gunshot_flag, label])

    return pd.DataFrame(records, columns=['filename', 'gunshot_location_in_seconds', 'num_gunshots', 'label'])

# One row per labelled clip found in the training data folder
df = generate_file_dataframe(training_data_path)

In [91]:
# Separate the per-file metadata (inputs) from the clip-level label (target)
files = df[['filename', 'num_gunshots', 'gunshot_location_in_seconds']]
labels = df[['label']]

In [92]:
# 70/30 split at file level with a fixed seed
X_train_paths, X_test_paths, y_train_paths, y_test_paths = train_test_split(files, labels, test_size=0.3, random_state=42)

In [93]:
# Inspect the training file table
X_train_paths

In [94]:
# Turn both splits into spectrograms and labels; the test split goes through
# the same utils.preprocess_audio_train call with the same settings.
spectrograms_train, sample_rates_train, labels_train = utils.preprocess_audio_train(X_train_paths, max_non_gunshot_samples=10)
spectrograms_test, sample_rates_test, labels_test = utils.preprocess_audio_train(X_test_paths, max_non_gunshot_samples=10)

In [95]:
# Expected shape: 3 channels of mel spectrograms, 80 mel bands, 15 frames.
spectrograms_train[0].shape

In [96]:
# Build the training dataset (it computes mean/std from the training frames
# and stores normalized frames) and a shuffled loader with batches of 256.
dataset = GunshotDataset(spectrograms_train, sample_rates_train, labels_train)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

In [97]:
# Persist the training-set normalization statistics so that inference can
# reuse them. The same values are written twice: once with pickle ...
mean_std = {'mean': dataset.mean, 'std': dataset.std}
with open('mean_std.pkl', 'wb') as f:
    pickle.dump(mean_std, f)

# Save using torch
# ... and once with torch's own serializer.
th.save({'mean': dataset.mean, 'std': dataset.std}, 'mean_std.pth')

In [98]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if th.cuda.is_available() else 'cpu'
print(device)

In [127]:
import torch as th
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Module-level device used by the training / evaluation functions defined
# in this cell (rebinds the earlier string-valued `device`).
device = th.device("cuda" if th.cuda.is_available() else "cpu")

def train_model(model, optimizer, criterion, train_loader, valid_features, valid_labels, epochs=10, thresholds=None, mean=None, std=None, patience=3, normalize_train_batches=False):
    """Train `model`, select the best decision threshold by validation F1.

    After every epoch the model is evaluated on the validation set for every
    candidate threshold. Training stops early when the best validation F1 has
    not improved for `patience` consecutive epochs. The weights of the best
    epoch are kept in memory and restored before the confusion matrix is
    computed, so the returned threshold matches the model the caller ends up
    with.

    Args:
        model: Network to train (moved to the module-level `device`).
        optimizer: Optimizer over the model's parameters.
        criterion: Loss taking (outputs, labels), both flattened to 1-D.
        train_loader: DataLoader yielding (features, labels) batches. The
            batches are expected to be already normalized (GunshotDataset
            normalizes its frames on construction).
        valid_features: Iterable of raw (un-normalized) validation features.
        valid_labels: Iterable of validation labels, aligned with the features.
        epochs: Maximum number of epochs.
        thresholds: Candidate thresholds; defaults to 0.10, 0.15, ..., 0.95.
        mean: Normalization mean (required).
        std: Normalization std (required).
        patience: Number of epochs without F1 improvement before stopping.
        normalize_train_batches: Set to True only when `train_loader` yields
            raw features; the batches are then normalized with `mean`/`std`.

    Returns:
        Tuple ``(best_threshold, best_score)``.

    Raises:
        ValueError: If `mean` or `std` is missing.
    """
    import copy

    if thresholds is None:
        thresholds = np.arange(0.1, 1.0, 0.05)  # Define a range of thresholds to test

    if mean is None or std is None:
        raise ValueError("Mean and std must be provided for normalization.")

    mean = mean.to(device)
    std = std.to(device)

    model = model.to(device)
    best_threshold = 0.0
    best_score = 0.0
    best_state = None
    epochs_since_improvement = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        # Training phase
        for features, labels in train_loader:
            features = features.to(device)
            # Flatten so a batch of size 1 keeps its batch dimension
            labels = labels.to(device).float().reshape(-1)

            optimizer.zero_grad()

            # Normalizing here as well would apply (x - mean) / std twice to
            # data that is already normalized, while validation data is
            # normalized only once.
            if normalize_train_batches:
                features = (features - mean) / std

            outputs = model(features).reshape(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * features.size(0)

        # running_loss is weighted by batch size, so divide by the sample count
        epoch_loss = running_loss / len(train_loader.dataset)

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}")

        model.eval()
        best_epoch_threshold, current_score = evaluate_model(model, valid_features, valid_labels, thresholds, mean, std)

        if current_score > best_score:
            best_score = current_score
            best_threshold = best_epoch_threshold
            epochs_since_improvement = 0
            # Snapshot the weights that produced the best score
            best_state = copy.deepcopy(model.state_dict())
            print(f"New best F1 score: {best_score:.4f}, model saved.")
        else:
            epochs_since_improvement += 1

        print(f"TEST best_f1: {best_score} with best threshold: {best_threshold}")

        # Check for early stopping
        if epochs_since_improvement >= patience:
            print(f"No improvement in F1 score for {patience} epochs. Stopping training.")
            break

    # Go back to the best epoch so that model and threshold belong together
    if best_state is not None:
        model.load_state_dict(best_state)

    # Compute and display the confusion matrix on the validation set
    cm = compute_confusion_matrix(model, valid_features, valid_labels, best_threshold, mean, std)
    display_confusion_matrix(cm)

    return best_threshold, best_score

def evaluate_model(model, features, labels, thresholds, mean, std):
    """Find the threshold with the highest F1 score on the given samples.

    Every sample is passed through the model once; the resulting scores are
    then thresholded for each candidate value. The model is switched to eval
    mode for the forward passes and put back into its previous mode afterwards.

    Args:
        model: Network returning a single score per sample.
        features: Iterable of raw feature tensors (one sample each, without
            batch dimension).
        labels: Iterable of single-valued labels aligned with `features`.
        thresholds: Iterable of candidate thresholds.
        mean: Normalization mean.
        std: Normalization std.

    Returns:
        Tuple ``(best_threshold, best_f1_score)``; ``(0.0, 0.0)`` when no
        threshold reaches an F1 above zero or there are no samples.
    """
    best_threshold = 0.0
    best_f1_score = 0.0

    mean = mean.to(device)
    std = std.to(device)

    scores = []
    targets = []
    was_training = model.training
    model.eval()
    try:
        with th.no_grad():
            for feature, label in zip(features, labels):
                # Normalize feature
                feature = (feature.to(device) - mean) / std
                # Add batch dimension, reduce the output to a Python float
                scores.append(model(feature.unsqueeze(0)).squeeze().item())
                # as_tensor avoids the copy warning when label is a tensor
                targets.append(th.as_tensor(label).float().item())
    finally:
        model.train(was_training)

    if not targets:
        return best_threshold, best_f1_score

    # The scores do not depend on the threshold, so only the comparison is
    # repeated per candidate.
    scores = np.asarray(scores)
    for threshold in thresholds:
        predictions = (scores >= threshold).astype(float)
        current_f1 = f1_score(targets, predictions, zero_division=0)

        if current_f1 > best_f1_score:
            best_f1_score = current_f1
            best_threshold = threshold

    return best_threshold, best_f1_score

def compute_confusion_matrix(model, features, labels, threshold, mean, std):
    """Compute the 2x2 confusion matrix of `model` at a fixed threshold.

    Args:
        model: Network returning a single score per sample.
        features: Iterable of raw feature tensors (one sample each, without
            batch dimension).
        labels: Iterable of single-valued 0/1 labels aligned with `features`.
        threshold: Scores greater than or equal to this value count as class 1.
        mean: Normalization mean.
        std: Normalization std.

    Returns:
        Confusion matrix with rows/columns ordered as classes 0, 1. The shape
        is always (2, 2), even when only one class occurs in the data.
    """
    all_predictions = []
    all_labels = []

    mean = mean.to(device)
    std = std.to(device)

    model.eval()
    with th.no_grad():
        for feature, label in zip(features, labels):
            # Normalize feature
            feature = (feature.to(device) - mean) / std

            # Add batch dimension, reduce the output to a Python float
            score = model(feature.unsqueeze(0)).squeeze().item()

            # Apply threshold
            all_predictions.append(int(score >= threshold))
            all_labels.append(int(th.as_tensor(label).float().item()))

    # Fixing the label set keeps the matrix 2x2 when a class is absent, which
    # the display with two labels relies on.
    cm = confusion_matrix(all_labels, all_predictions, labels=[0, 1])
    return cm

def display_confusion_matrix(cm):
    """Draw confusion matrix `cm` with class labels 0 and 1 using the 'magma' colormap."""
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1]).plot(cmap='magma')
    plt.show()

def evaluate_model_accuracy(model, features, labels, thresholds, mean, std):
    """Find the threshold with the highest accuracy on the given samples.

    Every sample is passed through the model once (with an added batch
    dimension, as in evaluate_model); the scores are then thresholded for
    each candidate value. The model is switched to eval mode for the forward
    passes and put back into its previous mode afterwards.

    Args:
        model: Network returning a single score per sample.
        features: Iterable of raw feature tensors (one sample each, without
            batch dimension).
        labels: Iterable of single-valued labels aligned with `features`.
        thresholds: Iterable of candidate thresholds.
        mean: Normalization mean.
        std: Normalization std.

    Returns:
        Tuple ``(best_threshold, best_accuracy)``; ``(0.0, 0.0)`` when no
        threshold reaches an accuracy above zero or there are no samples.
    """
    best_threshold = 0.0
    best_accuracy = 0.0

    mean = mean.to(device)
    std = std.to(device)

    scores = []
    targets = []
    was_training = model.training
    model.eval()
    try:
        with th.no_grad():
            for feature, label in zip(features, labels):
                # Normalize feature
                feature = (feature.to(device) - mean) / std
                # Add batch dimension, reduce the output to a Python float
                scores.append(model(feature.unsqueeze(0)).squeeze().item())
                targets.append(th.as_tensor(label).float().item())
    finally:
        model.train(was_training)

    if not targets:
        return best_threshold, best_accuracy

    scores = np.asarray(scores)
    targets = np.asarray(targets)
    for threshold in thresholds:
        predictions = (scores >= threshold).astype(float)
        # Fraction of samples whose thresholded prediction equals the label
        avg_accuracy = np.mean(predictions == targets)
        if avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_threshold = threshold

    return best_threshold, best_accuracy

In [128]:
# Training hyper-parameters. These are the values the training call actually
# used before (25 epochs, Adam lr 1e-4); they are now defined once and
# passed through instead of being shadowed by hard-coded literals.
epochs = 25
lr = 1e-4

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

# The test split serves as the validation set for threshold selection
best_threshold, best_f1 = train_model(model, optimizer, criterion, dataloader, spectrograms_test, labels_test, epochs=epochs, thresholds=None, mean=dataset.mean, std=dataset.std)

In [15]:
def manual_evaluate_test(model, feature, threshold, frame_size=15, mean=None, std=None):
    """Slide a window over one spectrogram and report where the model fires.

    The normalized `feature` is scanned along its third dimension with windows
    of `frame_size` frames and a stride of one frame (the windows overlap).
    Every window whose score is at least `threshold` is printed and converted
    to a time stamp via ``utils.HOP_LENGTH`` and ``utils.SAMPLING_RATE``.

    Args:
        model: Network returning a single score per window.
        feature: Spectrogram tensor; windows are taken along dim 2.
        threshold: Minimum score for a window to be reported.
        frame_size: Window length in frames.
        mean: Normalization mean (required).
        std: Normalization std (required).

    Returns:
        List of ``(minutes, seconds)`` tuples, one per reported window, where
        the time refers to the start of the window.

    Raises:
        ValueError: If `mean` or `std` is missing.
    """
    if mean is None or std is None:
        raise ValueError("Mean and std must be provided for normalization.")

    mean = mean.to(device)
    std = std.to(device)

    model = model.to(device)
    model.eval()

    with th.no_grad():
        total_frames = feature.shape[2]
        normalized = (feature.to(device) - mean) / std

        # One score per window start position
        predictions = [
            model(normalized[:, :, begin:begin + frame_size].unsqueeze(0).float()).squeeze().item()
            for begin in range(0, total_frames - frame_size + 1, 1)
        ]

        res = []
        for idx, score in enumerate(predictions):
            if score < threshold:
                continue
            print(idx, score)
            # window index -> seconds -> (minutes, seconds)
            time_in_seconds = idx * utils.HOP_LENGTH / utils.SAMPLING_RATE
            res.append((int(time_in_seconds // 60), time_in_seconds % 60))

    return res

In [16]:
# Pre-process a full track for inference (machine-specific absolute path);
# the commented line below is an alternative track.
spectrograms, sample_rates = utils.preprocess_audio(['/Users/borosabel/Documents/Uni/Thesis/PopMIR/M.I.A. - Paper Planes.mp3'])
# spectrograms, sample_rates = utils.preprocess_audio(['/Users/borosabel/Documents/Uni/Thesis/PopMIR/50 Cent - Many Men (Wish Death) (Dirty Version).mp3'])

In [17]:
# Shape of the full-track spectrogram; manual_evaluate_test slides over dim 2
spectrograms[0].shape

In [131]:
# Load mean and std from file
# NOTE(review): pickle.load can execute arbitrary code, so this is only
# acceptable for a file written by this notebook itself. The same values are
# also saved with th.save to 'mean_std.pth'.
with open('./mean_std.pkl', 'rb') as f:
    data = pickle.load(f)
    mean = data['mean']
    std = data['std']

In [132]:
# Run the sliding-window detector on every pre-processed track.
# NOTE: predicted_times is overwritten on each iteration, so only the result
# for the last spectrogram is kept.
for spectogram in spectrograms:
    predicted_times = manual_evaluate_test(model, spectogram, threshold=best_threshold, mean=mean, std=std)

In [133]:
# Threshold selected during training (by validation F1)
best_threshold

In [134]:
# Each entry is a (minutes, seconds) tuple returned by manual_evaluate_test
for time in predicted_times:
    print(f"Prediction at {time[0]} minutes and {time[1]} seconds")