# Adding random field audio to my negatives dataset
1. Generate list of random field audio files from rloc2025a, one from each recorder. 
2. Create .pkl file with each hour split into 1.5 second chunks, matching the df format of the train and val dfs.
3. Integrate that field_negatives.pkl into my existing dataset.

In [None]:
import os
import random
from pathlib import Path
import pandas as pd

In [None]:

def randomly_select_wav_files(root_dir, file_duration=15 * 60, clip_duration=1.5):
    """
    Randomly pick one .WAV file per subdirectory and list fixed-length clips for it.

    The audio itself is never opened: every selected file is assumed to last
    ``file_duration`` seconds, and clip boundaries are computed from that value.

    Args:
        root_dir: Directory whose immediate subdirectories are scanned.
        file_duration: Assumed length of each recording in seconds
            (default 900, i.e. 15 minutes).
        clip_duration: Length of each clip in seconds (default 1.5).

    Returns:
        A list with one dict per clip, holding the keys 'subdirectory',
        'selected_file', 'full_path', 'total_wav_files', 'start_time',
        'end_time', 'field_data' (always 1) and 'clip_number' (1-based).
        Subdirectories without any ``*.WAV`` file contribute no entries.

    Raises:
        ValueError: If ``clip_duration`` is not positive.
    """
    if clip_duration <= 0:
        raise ValueError("clip_duration must be positive")

    root_path = Path(root_dir)
    selected_files = []

    # iterdir()/glob() yield entries in filesystem-dependent order, so sort them;
    # otherwise the same random seed could pick different files on another run
    subdirs = sorted(d for d in root_path.iterdir() if d.is_dir())

    print(f"Found {len(subdirs)} subdirectories in {root_dir}")

    # The clip count is identical for every file, so compute it once up front
    num_clips = int(file_duration / clip_duration)

    for subdir in subdirs:
        # The pattern is upper-case only; on a case-sensitive filesystem
        # lower-case ".wav" files are not matched
        wav_files = sorted(subdir.glob("*.WAV"))

        if not wav_files:
            print(f"{subdir.name}: No .WAV files found")
            continue

        selected_file = random.choice(wav_files)

        for clip_num in range(num_clips):
            start_time = clip_num * clip_duration
            selected_files.append({
                'subdirectory': subdir.name,
                'selected_file': selected_file.name,
                'full_path': str(selected_file),
                'total_wav_files': len(wav_files),
                'start_time': start_time,
                'end_time': start_time + clip_duration,
                'field_data': 1,
                'clip_number': clip_num + 1
            })

        print(f"{subdir.name}: Selected {selected_file.name} from {len(wav_files)} .WAV files, created {num_clips} clips")

    return selected_files

In [None]:

# Seed the RNG so the per-recorder picks can be reproduced on re-runs
random.seed(42)

# Build the clip list: one randomly chosen recording per recorder directory
root_directory = "/media/kiwi/datasets/unfinalized/rloc2025a"
selected_files = randomly_select_wav_files(root_directory)

# Each entry in selected_files describes one clip, not one file, so report both counts
num_source_files = len({clip['full_path'] for clip in selected_files})
print(f"\nTotal clips: {len(selected_files)} (from {num_source_files} selected files)")

In [None]:

# Convert the clip list to a DataFrame for easier viewing and manipulation.
# NOTE: df_selected is only defined when at least one clip was produced.
if selected_files:
    df_selected = pd.DataFrame(selected_files)
    print("\nSelected files summary:")
    print(df_selected)

    # Save to CSV for reference; create the target folder first because
    # to_csv does not create missing directories
    output_file = "/home/brg226/projects/vira_beg/training_data/field_negatives/selected_field_negatives.csv"
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    df_selected.to_csv(output_file, index=False)
    print(f"\nSaved selected files to {output_file}")
else:
    print("No files were selected")

In [None]:
# Rename and clean up the DataFrame so only file, start_time, end_time and
# field_data remain
df_selected = df_selected.rename(columns={'full_path': 'file'})
# errors='ignore' keeps this cell re-runnable: on a second run the bookkeeping
# columns are already gone and a plain drop would raise KeyError
df_selected = df_selected.drop(
    columns=['clip_number', 'subdirectory', 'total_wav_files', 'selected_file'],
    errors='ignore',
)
df_selected = df_selected.reset_index(drop=True)
print("Cleaned DataFrame")

In [None]:
# Preview the first rows of the cleaned field-negatives table
df_selected.head()

In [None]:
# Persist the cleaned clip table as a pickle, then report what was written
pkl_output_file = "/home/brg226/projects/vira_beg/training_data/field_negatives/field_negatives.pkl"
df_selected.to_pickle(pkl_output_file)
print(f"Saved cleaned DataFrame to {pkl_output_file}")

final_shape = df_selected.shape
final_columns = df_selected.columns.tolist()
print(f"\nFinal DataFrame shape: {final_shape}")
print(f"Columns: {final_columns}")

# Merge field_negatives.pkl with update1_fulltrain.pkl

In [None]:
# Read the existing training table from disk.
# NOTE: read_pickle unpickles arbitrary objects; only load files you trust.
update1_path = "/home/brg226/projects/vira_beg/training_data/update1_fulltrain.pkl"
df_original = pd.read_pickle(update1_path)

original_index = df_original.index
print(f"Original dataset shape: {df_original.shape}")
print(f"Original columns: {df_original.columns.tolist()}")
print(f"Index type: {type(original_index)}")
print(f"Index names: {original_index.names}")

# Flatten a multi-level index into ordinary columns; a single-level index is left as is.
# Every pandas index exposes nlevels, so the level count alone decides the branch.
if original_index.nlevels <= 1:
    print("Index is not MultiIndex, no conversion needed")
else:
    print("Converting MultiIndex to regular columns...")
    df_original = df_original.reset_index()
    print(f"After reset_index - shape: {df_original.shape}")
    print(f"After reset_index - columns: {df_original.columns.tolist()}")

In [None]:

# Flag every pre-existing row as non-field data (field_data = 0)
df_original = df_original.assign(field_data=0)

In [None]:
# Summarise the field-negatives table before aligning it with the original
field_shape = df_selected.shape
field_columns = df_selected.columns.tolist()
print(f"\nField negatives dataset shape: {field_shape}")
print(f"Field negatives columns: {field_columns}")

In [None]:

# Columns the original training table has that the field negatives lack
original_only_cols = set(df_original.columns).difference(set(df_selected.columns))
print(f"\nColumns in original dataset that need to be added to field negatives: {original_only_cols}")


In [None]:

# Give the field negatives every column they lack, filled with 0
# (presumably these are label columns where 0 means "absent" - confirm)
for missing_col in original_only_cols:
    df_selected[missing_col] = 0


In [None]:

# The selection below keeps only the original dataset's columns, so any column
# that exists solely in the field negatives would vanish silently (e.g. when the
# original uses different names for file/start_time/end_time). Make that visible.
dropped_cols = [c for c in df_selected.columns if c not in df_original.columns]
if dropped_cols:
    print(f"\nWARNING: dropping field negatives columns not present in original dataset: {dropped_cols}")

# Reorder field_negatives columns to match original dataset
df_selected = df_selected[df_original.columns]

print(f"\nAligned field negatives shape: {df_selected.shape}")
print(f"Aligned field negatives columns: {df_selected.columns.tolist()}")


In [None]:

# Stack the field negatives underneath the original rows and renumber the index
frames_to_stack = [df_original, df_selected]
df_merged = pd.concat(frames_to_stack, ignore_index=True)

merged_shape = df_merged.shape
merged_columns = df_merged.columns.tolist()
print(f"\nMerged dataset shape: {merged_shape}")
print(f"Merged dataset columns: {merged_columns}")


In [None]:

# Count rows per field_data value (0 = original rows, 1 = field negatives)
print("\nField data distribution:")
field_data_counts = df_merged['field_data'].value_counts()
print(field_data_counts)


In [None]:

# Write the combined training table to its own pickle file
merged_output_path = "/home/brg226/projects/vira_beg/training_data/update2_fulltrain_with_field.pkl"
df_merged.to_pickle(path=merged_output_path)
print(f"\nSaved merged dataset to: {merged_output_path}")


In [None]:

# Show the top of the merged table; the rows from df_original were concatenated first
print("\nFirst few rows of merged dataset:")
df_merged.head()


In [None]:

# Show the bottom of the merged table; the field negatives were appended last,
# so these rows are expected to have field_data == 1
print("\nLast few rows of merged dataset (should be field data):")
df_merged.tail()