In [1]:
import numpy as np

# Monkey-patch the alias names np.float / np.int / np.object / np.bool onto the
# numpy module, each pointing at the Python builtin of the same name.
# NOTE(review): presumably needed because a dependency still uses these aliases,
# which recent NumPy releases no longer provide — confirm which one.
for _alias, _builtin in (("float", float), ("int", int), ("object", object), ("bool", bool)):
    setattr(np, _alias, _builtin)
del _alias, _builtin

In [2]:
import pandas as pd
import re
import ast
import gunshot_utils as utils
import importlib
import os
import random
import pandas as pd
import torchaudio

# Re-execute gunshot_utils so that edits made to the module after its first
# import take effect in this kernel without a restart.
importlib.reload(utils)

In [3]:
# Load the gunshot metadata table from the working directory.
gunshots_paths = pd.read_csv('./filtered_gunshot_metadata.csv')

# Keep only the columns used below. The explicit copy makes `gunshots`
# independent of `gunshots_paths`, so later column assignments do not raise
# SettingWithCopyWarning.
gunshots = gunshots_paths.loc[
    :, ['filename', 'num_gunshots', 'gunshot_location_in_seconds']
].copy()

# Function to preprocess gunshot start times, converting strings to lists of floats
def preprocess_gunshot_times(gunshot_times, include_first_gunshot_only=False):
    """
    Parse one 'gunshot_location_in_seconds' cell into a list of numbers.

    The cell is expected to be the text of a list whose items may be separated
    by commas or only by whitespace (e.g. ``'[0.5 1.  2.25]'``).

    Parameters:
        gunshot_times (str | list | tuple | number): Raw cell value. Anything
            that is not a string, sequence or real number (e.g. NaN/None from
            an empty CSV cell) is treated as "no gunshots".
        include_first_gunshot_only (bool): When True, keep only the first time.

    Returns:
        list: The parsed gunshot times; an empty list when the value is
        missing or cannot be parsed as a list/number.
    """
    if isinstance(gunshot_times, str):
        # Collapse runs of whitespace so the separator handling below only
        # has to deal with single spaces.
        normalized = re.sub(r'\s+', ' ', gunshot_times).strip()

        # Insert commas between whitespace-separated numbers. The left side
        # may end in a digit or a bare decimal point ("1."), the right side
        # may start with a digit, a sign or a decimal point (".5", "-1").
        normalized = re.sub(r'(?<=[\d.])\s(?=[-+.\d])', ', ', normalized)

        # Ensure there are no trailing commas
        normalized = normalized.replace(', ]', ']')

        # Safely evaluate the string as a Python literal
        try:
            parsed = ast.literal_eval(normalized)
        except (ValueError, SyntaxError):
            # Not a valid literal: report "no gunshots"
            return []
    else:
        parsed = gunshot_times

    if isinstance(parsed, (list, tuple)):
        gunshot_list = list(parsed)
    elif isinstance(parsed, (int, float)) and not isinstance(parsed, bool) and parsed == parsed:
        # A single bare number (NaN excluded via the self-comparison) is
        # wrapped so callers always receive a list.
        gunshot_list = [parsed]
    else:
        return []

    if include_first_gunshot_only and gunshot_list:
        return [gunshot_list[0]]  # Return only the first gunshot time
    return gunshot_list

# Root folder of the gunshot audio files
root_path = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/csv/edge-collected-gunshot-audio/edge-collected-gunshot-audio/'

# Switch: when True, only the first gunshot time of each recording is kept
include_first_gunshot_only = False

# Replace the raw 'gunshot_location_in_seconds' values with their parsed form
gunshots['gunshot_location_in_seconds'] = gunshots['gunshot_location_in_seconds'].apply(
    lambda raw_times: preprocess_gunshot_times(raw_times, include_first_gunshot_only)
)

# With the switch on, recompute 'num_gunshots' as the number of times that
# remain in each row after preprocessing
if include_first_gunshot_only:
    gunshots['num_gunshots'] = gunshots['gunshot_location_in_seconds'].apply(len)

# Every row of this table gets label 1
gunshots['label'] = 1

In [72]:
# Read the music feature spreadsheet
music_df = pd.read_excel(
    '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Excel/baseline_data_w_topics_w_features.xlsx',
    engine='openpyxl',
)

# Use the same path column name as the gunshot table ('filename') and give
# every music row label 0
music = music_df.rename(columns={'Path': 'filename'}).assign(label=0)

# Single-column views of the paths and of the labels
music_paths_df = music[['filename']]
music_labels = music[['label']]

In [79]:
# Build the sample table from the music and gunshot tables, requesting 5000
# samples with gunshots and none without.
# NOTE(review): the sampling itself is implemented in gunshot_utils — confirm
# its exact behaviour and return schema there.
df = utils.generate_data_samples(
    music,
    gunshots,
    number_of_samples_w_gunshots=5000,
    number_of_samples_wo_gunshots=0,
)

In [45]:
# Display the full gunshot metadata table as read from the CSV
gunshots_paths

In [46]:
# Distinct values present in the 'firearm' column
set(gunshots_paths['firearm'])

In [47]:
# Distinct values present in the 'caliber' column
set(gunshots_paths['caliber'])

In [48]:
# Firearm name -> lowercase, underscore-separated identifier
gun_type_mapping = dict([
    ('Glock 17', 'glock_17'),
    ('Remington 870', 'remington_870'),
    ('Ruger 556', 'ruger_556'),
    ('Smith & Wesson', 'smith_wesson'),
])

# Caliber label -> compact identifier without spaces or dots
# NOTE(review): the key '12 guage' presumably mirrors the spelling used in the
# metadata — verify against the 'caliber' column before correcting it.
caliber_mapping = dict([
    ('.223 mm', '223mm'),
    ('.38 cal', '38cal'),
    ('12 guage', '12gauge'),
    ('9mm', '9mm'),
])

# Folder that holds the gunshot audio files
audio_dir = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/csv/edge-collected-gunshot-audio/edge-collected-gunshot-audio'

# Complete (unfiltered) gunshot metadata table
df = pd.read_csv('/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/csv/edge-collected-gunshot-audio/gunshot-audio-all-metadata.csv')

In [63]:
# This cell was used to clean the CSV metadata: some gunshot recordings are too quiet (too low in decibels) to be audible, so they are filtered out here.

import os
import pandas as pd
from pydub import AudioSegment

# Folder that holds the gunshot audio files
audio_dir = '/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/csv/edge-collected-gunshot-audio/edge-collected-gunshot-audio'

# Metadata table that the decibel filter below is applied to
df = pd.read_csv('/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/csv/edge-collected-gunshot-audio/updated_gunshot_metadata.csv')

def get_max_decibel_level(audio_segment):
    """
    Report the peak level of an audio segment.

    Parameters:
        audio_segment (AudioSegment): Segment to inspect; only its
            ``max_dBFS`` attribute is read.

    Returns:
        max_db (float): The value of ``audio_segment.max_dBFS``.
    """
    peak_level = audio_segment.max_dBFS
    return peak_level

def filter_gunshots_by_decibel(df, audio_dir, threshold_db=-20.0):
    """
    Split gunshot recordings into "loud enough" and "too quiet" by peak level.

    Every file named in ``df['filename']`` is loaded from ``audio_dir`` and its
    maximum level (as reported by ``get_max_decibel_level``) is compared with
    ``threshold_db``. A file that cannot be loaded is reported on stdout and
    appears in neither result.

    Parameters:
        df (DataFrame): Metadata with a 'filename' column holding paths
            relative to ``audio_dir``.
        audio_dir (str): Directory where the audio files are stored.
        threshold_db (float): Files whose maximum level is not strictly above
            this value are listed for manual review.

    Returns:
        filtered_df (DataFrame): The rows of ``df`` whose file is strictly
            above the threshold. Index labels, columns and dtypes of ``df``
            are preserved, including when no row qualifies.
        low_db_files (list): Full paths of the files at or below the threshold.
    """
    kept_positions = []
    low_db_files = []

    for position, file_path in enumerate(df['filename']):
        full_path = os.path.join(audio_dir, file_path)

        # Load the audio file; a file that fails to load is skipped on purpose
        # so one bad recording does not abort the whole pass.
        try:
            audio = AudioSegment.from_file(full_path)
        except Exception as e:
            print(f"Error loading {full_path}: {e}")
            continue

        # Get the maximum decibel level
        max_db = get_max_decibel_level(audio)

        # Filter based on the decibel threshold
        if max_db > threshold_db:
            kept_positions.append(position)
        else:
            low_db_files.append(full_path)

    # Select by position instead of rebuilding a frame from row Series: this
    # keeps the column schema even when nothing passes (so a saved CSV still
    # has its header) and avoids the dtype changes a row-wise rebuild can cause.
    filtered_df = df.iloc[kept_positions].copy()
    return filtered_df, low_db_files

# Cutoff for the maximum level; recordings that do not exceed it are excluded
threshold_db = -1.0

# Split the metadata into rows to keep and files that are too quiet
filtered_df, low_db_files = filter_gunshots_by_decibel(df, audio_dir, threshold_db=threshold_db)

# Persist the kept rows (without the index column) in the working directory
filtered_df.to_csv('filtered_gunshot_metadata.csv', index=False)

# List the excluded files, one path per line
print("Files with low decibel levels:")
for low_db_path in low_db_files:
    print(low_db_path)

In [64]:
# Number of files at or below the decibel threshold
len(low_db_files)

In [65]:
# Read the filtered metadata from the dataset folder.
# NOTE(review): the filtering cell writes 'filtered_gunshot_metadata.csv'
# relative to the working directory, while this reads it from the dataset
# folder — confirm both refer to the same file.
df = pd.read_csv('/Users/borosabel/Documents/Uni/Thesis/PopMIR/Data/Audio/Gunshots/csv/edge-collected-gunshot-audio/filtered_gunshot_metadata.csv')

In [66]:
# Display the metadata table currently bound to `df`
df

In [1]:
# Read the filtered gunshot metadata from the working directory
df2 = pd.read_csv('./filtered_gunshot_metadata.csv')

In [68]:
# Keep only rows whose filename contains 'glock' (case-insensitive);
# rows with a missing filename count as non-matches.
df2 = df2.loc[
    df2['filename'].str.contains('glock', case=False, na=False)
]

In [77]:
# Write the glock-only rows to a CSV in the working directory, without the index column
df2.to_csv('glock_gunshot_metadata.csv', index=False)

In [80]:
# Display the glock-only metadata table
df2

In [None]:
# Copy every file referenced in `df[source_column]` into one destination folder.
# shutil was never imported in this notebook, so every copy attempt raised a
# NameError that the old blanket `except Exception` merely printed — no file
# was ever copied.
import shutil

# Define the source column and the destination directory
source_column = 'filepath'  # Column containing the file paths
# NOTE(review): the other cells of this notebook read the path from a
# 'filename' column — confirm that `df` really has a 'filepath' column.
destination_dir = '/path/to/new/location'  # The destination folder where you want to copy files

# Make sure the destination directory exists
os.makedirs(destination_dir, exist_ok=True)

# Iterate over the file paths stored in the DataFrame
for source_file in df[source_column]:
    # Files are placed flat in the destination, so only the base name is kept;
    # two sources with the same base name overwrite each other.
    filename = os.path.basename(source_file)

    # Define the destination file path
    destination_file = os.path.join(destination_dir, filename)

    try:
        # Copy the file
        shutil.copy(source_file, destination_file)
        print(f"Copied: {source_file} to {destination_file}")
    except OSError as e:
        # Only file-system failures (missing file, permissions, same file) are
        # tolerated; programming errors now surface instead of being hidden.
        print(f"Error copying {source_file}: {e}")