In [None]:
# --- 📚 Import Libraries ---

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

In [None]:
# --- ⚙️ Configuration ---

# Expected layout of the data directory: one sub-folder per category ('human' /
# 'nonhuman'), each holding one sub-folder per scenario with a CSV file inside.
"""
📁 Required Folder Structure

my_data_directory/
│
├── human/
│   │
│   ├── person1_standing_with_jacket/
│   │   └── signal_5000.csv
│   │
│   ├── person1_sitting_with_jacket/
│   │   └── signal_5000.csv
│   │
│   └── ... (more scenario folders for 'human')
│
└── nonhuman/
    │
    ├── chair_with_bag_and_jacket/
    │   └── signal_5000.csv
    │
    ├── chair_without_bag/
    │   └── signal_5000.csv
    │
    └── ... (more scenario folders for 'nonhuman')

"""

# Folder containing the 'human' and 'nonhuman' subdirectories.
# This data is used for testing our deep learning model.
# The default below is machine-specific; set the VAL_DATA_DIRECTORY environment
# variable to point the notebook at another location without editing this cell.
# An unset or empty variable falls back to the default.
VAL_DATA_DIRECTORY = (
    os.environ.get('VAL_DATA_DIRECTORY')
    or '/Users/anuragde/Documents/project-work/myvaldata'
)

# Path where the preprocessed .npy files will be saved.
OUTPUT_DIRECTORY = './preprocessed_val_data'

In [3]:
# --- 📂 Discover and List Folders ---

print("--- Discovering Folders (Scenarios) ---")

# Maps each category name to the alphabetically sorted names of its scenario
# sub-folders; total_scenarios counts the folders across all categories.
scenarios_to_process = {}
total_scenarios = 0

if os.path.isdir(VAL_DATA_DIRECTORY):
    for category in ('human', 'nonhuman'):
        category_path = os.path.join(VAL_DATA_DIRECTORY, category)

        # A missing category folder is reported and recorded as an empty list.
        if not os.path.isdir(category_path):
            print(f"\nWarning: Directory for category '{category}' not found.")
            scenarios_to_process[category] = []
            continue

        # Keep only sub-directories; sorting gives a consistent order between runs.
        scenarios = sorted(
            entry
            for entry in os.listdir(category_path)
            if os.path.isdir(os.path.join(category_path, entry))
        )
        scenarios_to_process[category] = scenarios
        total_scenarios += len(scenarios)

        # Report the discovered folders on a single compact line.
        print(f"\nFound {len(scenarios)} folders(scenarios) for '{category}':")
        if scenarios:
            print(f"  [{', '.join(scenarios)}]")

    print(f"\n✅ Discovery complete. Found a total of {total_scenarios} folders(scenarios).")
else:
    print(f"❌ Error: The specified data directory does not exist: {VAL_DATA_DIRECTORY}")

--- Discovering Folders (Scenarios) ---

Found 6 folders(scenarios) for 'human':
  [person4_chair_sitting_with_jacket, person4_chair_sitting_without_jacket, person4_sitting_office_table_with_jacket, person4_sitting_office_table_without_jacket, person4_with_jacket_standing, person4_without_jacket_standing]

Found 6 folders(scenarios) for 'nonhuman':
  [chair_with_bag_and_jacket, drawer_with_basket, empty_chair, empty_floor, office_table_with_books_and_computer, office_table_with_laptop]

✅ Discovery complete. Found a total of 12 folders(scenarios).


In [4]:
# --- 🎛️ Preprocessing Function ---

def process_and_save_spectrograms(data_path, output_path, scenarios_dict,
                                  skip_samples=5500, fixed_length=19517,
                                  n_fft=256, hop_length=128):
    """
    Turn every raw signal under ``data_path`` into a magnitude spectrogram and save
    the stacked spectrograms and their labels as two ``.npy`` files.

    A spectrogram is a representation of the spectrum of frequencies of a signal as
    it varies with time. Here it is generated with a Short-Time Fourier Transform:

    1. STFT parameters: the signal is analysed in overlapping chunks of ``n_fft``
       samples, the analysis window slides forward ``hop_length`` samples at a time,
       and a Hann window (``torch.hann_window(n_fft)``) is applied to each chunk.
    2. ``torch.stft`` computes an FFT on each chunk and returns a 2D array of
       complex numbers per signal.
    3. ``torch.abs`` keeps the magnitude of those complex numbers and discards the
       phase; this real-valued array is the spectrogram.

    For each scenario folder the first ``.csv`` file (in sorted name order) is read
    without a header row; every CSV row is treated as one signal. The first
    ``skip_samples`` values of each row are dropped (presumably a leading segment
    that should not be analysed -- TODO confirm with the data owner), and the rest
    is truncated or zero-padded at the end to exactly ``fixed_length`` samples.
    Scenarios with no CSV, an empty CSV, or rows of at most ``skip_samples`` values
    are skipped with a warning.

    Args:
        data_path: Directory containing the 'human' and 'nonhuman' sub-folders.
            Its base name is used as the prefix of the output file names.
        output_path: Directory for the ``.npy`` files; created if missing.
        scenarios_dict: Maps 'human' / 'nonhuman' to a list of scenario folder
            names to process. Missing or empty categories are skipped.
        skip_samples: Number of leading values dropped from every row.
        fixed_length: Length every signal is truncated / zero-padded to.
        n_fft: STFT window size.
        hop_length: STFT step size.

    Returns:
        None. On success writes ``<name>_spectrograms.npy`` (float32, one
        spectrogram per signal) and ``<name>_labels.npy`` (int64, 1 for 'human',
        0 for 'nonhuman') into ``output_path``. If nothing was processed, an error
        message is printed and no file is written.
    """
    print("--- Starting Preprocessing ---")

    window = torch.hann_window(n_fft)

    # One entry per processed CSV file: a (rows, freq_bins, frames) spectrogram
    # batch and the matching (rows,) label vector.
    spectrogram_batches = []
    label_batches = []

    for category, label in [('human', 1), ('nonhuman', 0)]:
        scenarios = scenarios_dict.get(category, [])
        if not scenarios:
            continue

        category_path = os.path.join(data_path, category)

        for scenario in tqdm(scenarios, desc=f"Processing '{category}'"):
            scenario_path = os.path.join(category_path, scenario)

            # Sort so the chosen file is deterministic if a folder holds several CSVs.
            csv_file = next(
                (f for f in sorted(os.listdir(scenario_path)) if f.endswith('.csv')),
                None,
            )
            if not csv_file:
                tqdm.write(f"Warning: no .csv file found in '{scenario_path}'; skipping.")
                continue

            csv_path = os.path.join(scenario_path, csv_file)
            try:
                df = pd.read_csv(csv_path, header=None)
            except pd.errors.EmptyDataError:
                # read_csv raises (rather than returning an empty frame) for a
                # file with no parsable content.
                tqdm.write(f"Warning: '{csv_path}' is empty; skipping.")
                continue
            if df.empty:
                tqdm.write(f"Warning: '{csv_path}' is empty; skipping.")
                continue

            # All rows of a DataFrame share the same width, so one check per file
            # is enough.
            if df.shape[1] <= skip_samples:
                tqdm.write(
                    f"Warning: rows in '{csv_path}' have only {df.shape[1]} values "
                    f"(need more than {skip_samples}); skipping."
                )
                continue

            # Convert the whole file once instead of building a Series per row.
            signals = df.to_numpy(dtype=np.float32)[:, skip_samples:]

            n_samples = signals.shape[1]
            if n_samples > fixed_length:
                signals = signals[:, :fixed_length]
            elif n_samples < fixed_length:
                # Zero-pad at the end of every row.
                signals = np.pad(signals, ((0, 0), (0, fixed_length - n_samples)))

            # torch.stft accepts a (batch, length) tensor, so all rows of the file
            # are transformed in a single call.
            signal_tensor = torch.from_numpy(np.ascontiguousarray(signals))
            spec = torch.stft(signal_tensor, n_fft=n_fft, hop_length=hop_length,
                              window=window, return_complex=True)

            spectrogram_batches.append(torch.abs(spec).numpy())
            label_batches.append(np.full(signals.shape[0], label, dtype=np.int64))

    print("\n-> Stacking and saving data to .npy files...")
    if not spectrogram_batches:
        print("Error: No data was processed. Aborting save.")
        return

    spectrograms_array = np.concatenate(spectrogram_batches, axis=0).astype(np.float32, copy=False)
    labels_array = np.concatenate(label_batches)

    os.makedirs(output_path, exist_ok=True)

    # Output files are named after the last component of data_path.
    dataset_name = os.path.basename(os.path.normpath(data_path))
    spectrograms_file = os.path.join(output_path, f'{dataset_name}_spectrograms.npy')
    labels_file = os.path.join(output_path, f'{dataset_name}_labels.npy')
    np.save(spectrograms_file, spectrograms_array)
    np.save(labels_file, labels_array)

    print(f"✅ Preprocessing complete for {dataset_name}.")
    print(f"   - Spectrograms saved to: {spectrograms_file} with shape {spectrograms_array.shape}")
    print(f"   - Labels saved to:     {labels_file} with shape {labels_array.shape}")

In [5]:
# --- ▶️ Execute Processing ---

# Preprocessing only makes sense when the discovery step found at least one
# scenario folder; otherwise point the user at the configured path.
if total_scenarios <= 0:
    print("No scenarios were found to process. Please check your VAL_DATA_DIRECTORY path.")
else:
    # Hand the discovered category -> scenario-folder mapping to the function.
    process_and_save_spectrograms(
        data_path=VAL_DATA_DIRECTORY,
        output_path=OUTPUT_DIRECTORY,
        scenarios_dict=scenarios_to_process,
    )

--- Starting Preprocessing ---


Processing 'human': 100%|██████████| 6/6 [00:17<00:00,  2.89s/it]
Processing 'nonhuman': 100%|██████████| 6/6 [00:16<00:00,  2.82s/it]



-> Stacking and saving data to .npy files...
✅ Preprocessing complete for myvaldata.
   - Spectrograms saved to: ./preprocessed_val_data/myvaldata_spectrograms.npy with shape (12000, 129, 153)
   - Labels saved to:     ./preprocessed_val_data/myvaldata_labels.npy with shape (12000,)
