In [None]:
import pandas as pd
import numpy as np
import os
from glob import glob

def load_s1p_file(file_path):
    """
    Read one .s1p measurement file into its three data columns.

    The first line of the file is treated as metadata and skipped; the
    remaining lines are parsed as whitespace-separated values with no header.

    Parameters:
    file_path: Path of the .s1p file to read

    Returns:
    (frequency, gain, phase) taken from columns 0, 1 and 2 of the parsed
    table, or (None, None, None) if reading or column access fails.
    """
    try:
        table = pd.read_csv(file_path, sep=r'\s+', header=None, skiprows=1)

        # A row count other than 101 is only reported here, not rejected.
        n_rows = len(table)
        if n_rows != 101:
            print(f"Warning: {file_path} has {n_rows} rows instead of 101")

        frequency, gain, phase = (table.iloc[:, idx] for idx in range(3))
        return frequency, gain, phase
    except Exception as exc:
        print(f"Error loading file {file_path}: {exc!s}")
        return None, None, None

def extract_metadata(filename):
    """
    Extract product type and storage condition from an s1p filename.

    The base name is split on underscores: the first field selects the
    product type and the second field selects the storage condition.
    Example: A_1_1.s1p -> Product Type: A (Bread), Storage: 1 (Open)

    Parameters:
    filename: Path or bare name of the measurement file

    Returns:
    (product_type, storage_condition) tuple of strings. The product type is
    'Bread' when the first field is 'A' and 'Cookies' otherwise. The storage
    condition is 'Unknown' when the second field is missing or is not one of
    '1', '2', '3'.
    """
    storage_labels = {
        '1': 'Open',
        '2': 'Wrapped',
        '3': 'Humid',
    }

    parts = os.path.basename(filename).split('_')
    product_type = 'Bread' if parts[0] == 'A' else 'Cookies'

    # A name without any underscore has no storage field; report 'Unknown'
    # instead of raising IndexError on parts[1].
    storage_code = parts[1] if len(parts) > 1 else None
    storage_condition = storage_labels.get(storage_code, 'Unknown')

    return product_type, storage_condition

def create_dataset(data_folder):
    """
    Create a dataset from all s1p files in the folder.

    Each accepted file contributes one row: the file's base name, the
    product type and storage condition derived from that name, followed by
    101 gain values and 101 phase values (columns gain_0..gain_100 and
    phase_0..phase_100). Files that fail to load or that do not hold exactly
    101 points are skipped.

    Parameters:
    data_folder: Directory searched (non-recursively) for '*.s1p' files

    Returns:
    DataFrame with columns Filename, Product_Type, Storage_Condition and the
    202 feature columns, one row per accepted file in sorted path order.

    Raises:
    ValueError: if the folder contains no .s1p files, or if none of the
    files found could be used.
    """
    n_points = 101  # Number of rows required in every file

    # Sort files to ensure consistent ordering
    s1p_files = sorted(glob(os.path.join(data_folder, '*.s1p')))
    if not s1p_files:
        raise ValueError(f"No .s1p files found in {data_folder}")

    all_features = []
    product_types = []
    storage_conditions = []
    filenames = []

    for file_path in s1p_files:
        # The frequency column is not used as a feature.
        _, gain, phase = load_s1p_file(file_path)

        if gain is None or phase is None:
            continue

        if len(gain) != n_points or len(phase) != n_points:
            print(f"Skipping {file_path}: Invalid data length")
            continue

        product_type, storage = extract_metadata(file_path)

        # One feature vector per file: all gains followed by all phases.
        all_features.append(np.concatenate([gain, phase]))
        product_types.append(product_type)
        storage_conditions.append(storage)
        filenames.append(os.path.basename(file_path))

    # Without this guard the feature array would be 1-D and the column
    # construction below would fail with an unhelpful indexing error.
    if not all_features:
        raise ValueError(
            f"No valid .s1p files could be processed in {data_folder}")

    feature_names = ([f'gain_{i}' for i in range(n_points)] +
                     [f'phase_{i}' for i in range(n_points)])

    # Metadata columns come first, then the features.
    metadata = pd.DataFrame({
        'Filename': filenames,
        'Product_Type': product_types,
        'Storage_Condition': storage_conditions
    })

    # Build all feature columns in one step; inserting 202 columns one at a
    # time fragments the frame and triggers a pandas PerformanceWarning.
    features = pd.DataFrame(np.array(all_features), columns=feature_names)

    return pd.concat([metadata, features], axis=1)

def main():
    """
    Build the bakery dataset, write it to CSV and print a short report.

    Writes the full table to 'processed_bakery_data.csv' and its first five
    rows to 'sample_processed_data.csv'. Any exception raised along the way
    is caught and reported on stdout instead of propagating.
    """
    source_dir = './Food/Bakery'  # Adjust this path as needed

    try:
        print("Processing files...")
        dataset = create_dataset(source_dir)

        csv_path = 'processed_bakery_data.csv'
        dataset.to_csv(csv_path, index=False)

        print(f"\nDataset created successfully!")
        print(f"Shape: {dataset.shape}")
        print(f"\nNumber of samples processed: {len(dataset)}")
        # The three leading metadata columns are not features.
        print(f"Number of features: {dataset.shape[1] - 3}")

        # Class balance for both label columns
        print("\nProduct Type distribution:")
        print(dataset['Product_Type'].value_counts())
        print("\nStorage Condition distribution:")
        print(dataset['Storage_Condition'].value_counts())

        # Count feature columns by prefix
        print("\nFeature verification:")
        n_gain = sum(1 for name in dataset.columns if name.startswith('gain'))
        print(f"Number of gain features: {n_gain}")
        n_phase = sum(1 for name in dataset.columns if name.startswith('phase'))
        print(f"Number of phase features: {n_phase}")

        # Show the leading columns so their order can be checked by eye
        print("\nFirst few columns:")
        print(dataset.columns[:10].tolist())

        # Keep a five-row extract for manual inspection
        dataset.head().to_csv('sample_processed_data.csv', index=False)
        print("\nSample data saved to 'sample_processed_data.csv' for verification")

    except Exception as exc:
        print(f"Error creating dataset: {exc!s}")

# Run the dataset build only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

In [None]:
# Sanity check: reload the CSV written by the dataset build step and confirm
# that the three metadata columns come first, in the expected order.
data = pd.read_csv('processed_bakery_data.csv')

# Show the first five column names for a visual check
print("First columns:", data.columns[:5])

# Fail loudly (AssertionError) if the metadata columns are not in the
# positions the later steps rely on
assert data.columns[0] == 'Filename'
assert data.columns[1] == 'Product_Type'
assert data.columns[2] == 'Storage_Condition'

In [4]:
import pandas as pd
import numpy as np

def add_gaussian_noise(data, noise_level):
    """
    Return a copy of the data with zero-mean Gaussian noise on the gains.

    Only columns whose name starts with 'gain' are perturbed; every other
    column (including the phases) is copied through unchanged. The input
    DataFrame itself is not modified.

    Parameters:
    data: DataFrame containing the original data
    noise_level: Standard deviation of the Gaussian noise

    Returns:
    New DataFrame of the same layout with noisy gains limited to [0, 1].
    """
    result = data.copy()

    for name in data.columns:
        if not name.startswith('gain'):
            continue

        # Draw one noise value per row from the global NumPy random state.
        clean = result[name].values
        jitter = np.random.normal(0, noise_level, size=len(clean))

        # Keep the perturbed gains inside the 0-to-1 range.
        result[name] = np.clip(clean + jitter, 0, 1)

    return result

def _plot_signal_comparison(original_data, low_noise_data, high_noise_data,
                            gain_columns, sample_idx=0,
                            output_path='signal_comparison.png'):
    """Save a line plot of one sample's gains: original vs. both noisy copies."""
    # Local import: matplotlib is only needed for this verification plot.
    import matplotlib.pyplot as plt

    # Size the x axis from the actual gain columns rather than assuming 101
    # points, so the plot cannot fail on an x/y length mismatch.
    x_values = range(len(gain_columns))

    plt.figure(figsize=(15, 5))
    curves = (
        (original_data, 'Original'),
        (low_noise_data, 'Low Noise'),
        (high_noise_data, 'High Noise'),
    )
    for frame, label in curves:
        # Select the gain columns first so the row keeps a numeric dtype.
        plt.plot(x_values, frame[gain_columns].iloc[sample_idx],
                 label=label, alpha=0.7)

    plt.title('Comparison of Original and Noisy Signals')
    plt.xlabel('Frequency Index')
    plt.ylabel('Gain Value')
    plt.legend()
    plt.grid(True)
    plt.savefig(output_path)
    plt.close()


def augment_dataset(input_file, output_file):
    """
    Augment the dataset by adding noise-based samples.

    Reads the processed CSV, creates a low-noise and a high-noise copy of
    every row with add_gaussian_noise, prefixes the copies' Filename values
    with 'low_noise_' / 'high_noise_', and writes the original rows plus both
    copies to output_file. It also prints a summary and saves
    'signal_comparison.png', a plot of the first sample's gains for all
    three variants.

    Parameters:
    input_file: Path of the CSV to read; it must provide the Filename,
                Product_Type and Storage_Condition columns plus the gain*
                columns
    output_file: Path the augmented CSV is written to

    Returns:
    DataFrame holding the original rows followed by the low-noise and then
    the high-noise rows, with a fresh 0..n-1 index.
    """
    # Load the original data
    print("Loading original data...")
    original_data = pd.read_csv(input_file)

    # Standard deviations of the two noise settings
    low_noise_level = 0.001
    high_noise_level = 0.005

    # Generate noisy versions
    print("Generating low-noise samples...")
    low_noise_data = add_gaussian_noise(original_data, low_noise_level)
    print("Generating high-noise samples...")
    high_noise_data = add_gaussian_noise(original_data, high_noise_level)

    # Tag the copies so they stay distinguishable from their source rows
    low_noise_data['Filename'] = low_noise_data['Filename'].apply(lambda x: f"low_noise_{x}")
    high_noise_data['Filename'] = high_noise_data['Filename'].apply(lambda x: f"high_noise_{x}")

    # Concatenate all datasets
    print("Combining datasets...")
    augmented_data = pd.concat([original_data, low_noise_data, high_noise_data],
                               axis=0, ignore_index=True)

    # Save augmented dataset
    print("Saving augmented dataset...")
    augmented_data.to_csv(output_file, index=False)

    # Print summary
    print("\nAugmentation Summary:")
    print(f"Original samples: {len(original_data)}")
    print(f"Total augmented samples: {len(augmented_data)}")
    print("\nProduct Type distribution:")
    print(augmented_data['Product_Type'].value_counts())
    print("\nStorage Condition distribution:")
    print(augmented_data['Storage_Condition'].value_counts())

    # Compare the overall gain mean of each variant as a quick noise check
    gain_columns = [col for col in augmented_data.columns if col.startswith('gain')]
    original_stats = original_data[gain_columns].mean().mean()
    low_noise_stats = low_noise_data[gain_columns].mean().mean()
    high_noise_stats = high_noise_data[gain_columns].mean().mean()

    print("\nGain Values Statistics:")
    print(f"Original data mean: {original_stats:.6f}")
    print(f"Low noise data mean: {low_noise_stats:.6f}")
    print(f"High noise data mean: {high_noise_stats:.6f}")

    # Visual check of original vs noisy signals for the first sample
    _plot_signal_comparison(original_data, low_noise_data, high_noise_data,
                            gain_columns, sample_idx=0,
                            output_path='signal_comparison.png')

    return augmented_data

def main():
    """
    Run the augmentation step on the processed bakery dataset.

    Reads 'processed_bakery_data.csv', writes 'augmented_bakery_data.csv',
    and reports either success or the error message on stdout.
    """
    source_csv = 'processed_bakery_data.csv'
    target_csv = 'augmented_bakery_data.csv'

    try:
        augment_dataset(source_csv, target_csv)
        print(f"\nAugmented dataset saved to {target_csv}")
        print("Signal comparison plot saved to 'signal_comparison.png'")

    except Exception as exc:
        print(f"Error during data augmentation: {exc!s}")

# Run the augmentation only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

Loading original data...
Generating low-noise samples...
Generating high-noise samples...
Combining datasets...
Saving augmented dataset...

Augmentation Summary:
Original samples: 60
Total augmented samples: 180

Product Type distribution:
Product_Type
Bread      90
Cookies    90
Name: count, dtype: int64

Storage Condition distribution:
Storage_Condition
Open       60
Wrapped    60
Humid      60
Name: count, dtype: int64

Gain Values Statistics:
Original data mean: 0.986492
Low noise data mean: 0.986480
High noise data mean: 0.986158

Augmented dataset saved to augmented_bakery_data.csv
Signal comparison plot saved to 'signal_comparison.png'
