In [1]:
import os
import pandas as pd
from io import StringIO
import re
import glob

# Define the data path
data_dir = './Bakery/*.s1p'
data_files = glob.glob(data_dir)

# Check if files are detected
if not data_files:
    print("No files found in the specified directory. Check the path:", data_dir)
else:
    print(f"Found {len(data_files)} files.")

# Helper function to parse filenames for metadata
def parse_filename(filepath):
    filename = os.path.basename(filepath)
    match = re.match(r'([A-B])_([1-3])_([1-9][0-9]*).s1p', filename)
    if match:
        product_type = match.group(1)
        storage_condition = int(match.group(2))
        replicate = int(match.group(3))
        return product_type, storage_condition, replicate
    else:
        raise ValueError(f"Filename format not recognized: {filename}")

# List to hold processed data
processed_data = []

# Process each file
for file in data_files:
    print(f"Processing file: {file}")
    with open(file, 'r') as f:
        # Filter out comment lines
        lines = [line for line in f if not line.startswith('#')]
    
    # Debug: Check the number of lines
    print(f"File {file} has {len(lines)} data lines after filtering.")

    # Load the data into a DataFrame
    try:
        df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
    except Exception as e:
        print(f"Error reading file {file}: {e}")
        continue
    
    # Ensure the correct number of rows (101 rows expected)
    if len(df) != 101:
        print(f"Skipping {file} due to incorrect row count ({len(df)} rows found).")
        continue
    
    # Parse labels from the filename
    try:
        product_type, storage_condition, replicate = parse_filename(file)
    except ValueError as e:
        print(e)
        continue
    
    # Create a feature vector
    new_feature = df['gain'].tolist() + df['phase'].tolist()
    
    # Add to the processed data
    processed_data.append([file, new_feature, product_type, replicate, storage_condition])

# If no data is processed, alert the user
if not processed_data:
    print("No valid data was processed. Check the file format and content.")
else:
    # Convert processed data to a DataFrame with corrected column names
    df = pd.DataFrame(processed_data, columns=['Replica', 'New Feature', 'Product_Type', 'Replicate', 'Storage_Condition'])

    # Optionally expand 'New Feature' into individual columns
    new_feature_df = pd.DataFrame(df['New Feature'].to_list(), columns=[f"Feature_{i+1}" for i in range(len(processed_data[0][1]))])

    # Combine with metadata
    combined_df = pd.concat([df[['Replica', 'Product_Type', 'Replicate', 'Storage_Condition']], new_feature_df], axis=1)

    # Encode Product Type
    combined_df['Product_Type'] = combined_df['Product_Type'].map({'A': 0, 'B': 1})  # 0 for bread, 1 for cookies

    # One-hot encode Storage Condition
    storage_one_hot = pd.get_dummies(combined_df['Storage_Condition'], prefix='storage')
    combined_df = pd.concat([combined_df.drop('Storage_Condition', axis=1), storage_one_hot], axis=1)

    # Save to CSV
    combined_df.to_csv('MMMatrix_bakery_data_encoded.csv', index=False)

    # Display the final DataFrame
    print("\nEncoded DataFrame:")
    print(combined_df.head())


Found 60 files.
Processing file: ./Bakery/A_2_2.s1p
File ./Bakery/A_2_2.s1p has 101 data lines after filtering.
Processing file: ./Bakery/A_2_10.s1p
File ./Bakery/A_2_10.s1p has 101 data lines after filtering.
Processing file: ./Bakery/A_1_7.s1p
File ./Bakery/A_1_7.s1p has 101 data lines after filtering.
Processing file: ./Bakery/A_3_5.s1p
File ./Bakery/A_3_5.s1p has 101 data lines after filtering.
Processing file: ./Bakery/A_2_8.s1p
File ./Bakery/A_2_8.s1p has 101 data lines after filtering.
Processing file: ./Bakery/B_1_2.s1p
File ./Bakery/B_1_2.s1p has 101 data lines after filtering.
Processing file: ./Bakery/B_1_4.s1p
File ./Bakery/B_1_4.s1p has 101 data lines after filtering.
Processing file: ./Bakery/A_3_9.s1p
File ./Bakery/A_3_9.s1p has 101 data lines after filtering.
Processing file: ./Bakery/A_3_6.s1p
File ./Bakery/A_3_6.s1p has 101 data lines after filtering.
Processing file: ./Bakery/A_3_10.s1p
File ./Bakery/A_3_10.s1p has 101 data lines after filtering.
Processing file: ./B

  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), delim_whitespace=True, names=['frequency', 'gain', 'phase'])
  df = pd.read_csv(StringIO(''.join(lines)), d