In [None]:
import re
from pathlib import Path
import numpy as np
import os
import fnmatch

In [None]:
averaging_matches = ["Hor_scan"]

In [2]:
# This regex expression matches everything until a scan\d is found. 
# Needs to have Run followed by 1-3 digits at some points.
# If two output files should be averaged, they should have the exact same
# match on this regex expression. All files should match the regex
file_match_expr = r".*Run\d{1,3}.*scan\d{1,2}_"
path_used = Path("larger_test/1D/")

In [3]:
def group_files_by_pattern(file_list: list[Path], pattern: str) -> dict:
    """
    Groups files based on regex pattern matches
    Input: file_path: List of file paths
            pattern: regex pattern used
    Output: Dictionary mapping group names to list of file paths
    """
    groups = {}
    
    for file_path in file_list:
        file_name = file_path.name
        match = re.match(pattern, file_name)
        
        if match is None:
            raise RuntimeError(f"file {file_name} does not match pattern: {pattern}. Change pattern")
        
        group_key = match.group(0)  # The matched part becomes the group key
        
        if group_key not in groups:
            groups[group_key] = []
        groups[group_key].append(file_path)
    
    return groups


def read_dat_file(file_path: Path) -> tuple:
    """
    Reads a .dat file and extracts header comments and data
    Returns: (header_lines, q_values, intensity_values, sigma_values)
    """
    header_lines = []
    data_lines = []
    
    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('#'):
                header_lines.append(line.strip())
            else:
                # Skip empty lines
                if line.strip():
                    data_lines.append(line.strip())
    
    # Parse data (skip the column header line if it exists in data_lines)
    data_arrays = []
    for line in data_lines:
        # Skip column header lines that might not start with #
        if 'q_nm' in line or 'sigma' in line:
            continue
        try:
            values = [float(x) for x in line.split()]
            if len(values) == 3:  # q, I, sigma
                data_arrays.append(values)
        except ValueError:
            continue  # Skip malformed lines
    
    if not data_arrays:
        raise ValueError(f"No valid data found in {file_path}")
    
    data_arrays = np.array(data_arrays)
    q_values = data_arrays[:, 0]
    intensity_values = data_arrays[:, 1]
    sigma_values = data_arrays[:, 2]
    
    return header_lines, q_values, intensity_values, sigma_values


def average_dat_files(file_paths: list) -> tuple:
    """
    Averages multiple .dat files using simple column averaging
    Input: List of .dat file paths to average
    Output: (header_lines, avg_q, avg_intensity, avg_sigma)
    """
    if not file_paths:
        raise ValueError("No files to average")
    
    # Use header from first file
    header_lines, first_q, first_intensity, first_sigma = read_dat_file(file_paths[0])
    
    # Initialize arrays for averaging
    all_q = [first_q]
    all_intensity = [first_intensity]
    all_sigma = [first_sigma]
    
    # Read remaining files
    for file_path in file_paths[1:]:
        _, q_vals, intensity_vals, sigma_vals = read_dat_file(file_path)
        all_q.append(q_vals)
        all_intensity.append(intensity_vals)
        all_sigma.append(sigma_vals)
    
    # Convert to numpy arrays and average
    all_q = np.array(all_q)
    all_intensity = np.array(all_intensity)
    all_sigma = np.array(all_sigma)
    
    # Simple averaging across files (axis=0)
    avg_q = np.mean(all_q, axis=0)
    avg_intensity = np.mean(all_intensity, axis=0) 
    avg_sigma = np.mean(all_sigma, axis=0)
    
    return header_lines, avg_q, avg_intensity, avg_sigma


def save_averaged_data(header_lines: list, q, intensity, sigma, output_path: Path):
    """
    Saves averaged data in .dat format with original header
    """
    with open(output_path, 'w') as f:
        # Write header comments from first file
        for line in header_lines:
            f.write(f"{line}\n")
                
        # Write data
        for i in range(len(q)):
            f.write(f"  {q[i]:e}    {intensity[i]:e}   {sigma[i]:e}\n")


def average_files_in_directory(input_dir: Path, detector_type: str):
    """
    Takes in an individual SAXS/Reduction or WAXS/Reduction directory and performs averaging
    for files and writes averaged files to SAXS/Averaged or WAXS/Averaged
    """
    # Get all .dat files in the directory
    dat_files = list(input_dir.glob("*.dat"))
    
    if not dat_files:
        print(f"No .dat files found in {input_dir}")
        return
    
    # Group files by pattern
    file_groups = group_files_by_pattern(dat_files, file_match_expr)

    
    # Create output directory
    output_dir = input_dir.parent / "Averaged"
    output_dir.mkdir(exist_ok=True)
    
    print(f"Processing {len(file_groups)} groups in {input_dir}")
    
    # Process each group
    for group_key, files_in_group in file_groups.items():
        print(f"  Averaging {len(files_in_group)} files for pattern: {group_key}")
        
        # Average the files
        header_lines, avg_q, avg_intensity, avg_sigma = average_dat_files(files_in_group)
        
        # Create output filename
        output_filename = f"{group_key}{detector_type}.dat"
        output_path = output_dir / output_filename
        
        # Save averaged data
        save_averaged_data(header_lines, avg_q, avg_intensity, avg_sigma, output_path)
        
        print(f"    Saved: {output_path}")
            

In [4]:
def process_directory(base_dir: Path):
    """
    Main function to process both SAXS and WAXS directories
    Input: Path to 1D/ directory containing SAXS/ and WAXS/ subdirectories
    """
    saxs_reduction_dir = base_dir / "SAXS" / "Reduction"
    waxs_reduction_dir = base_dir / "WAXS" / "Reduction"
    
    print(f"Processing directory: {base_dir}")
    
    if saxs_reduction_dir.exists():
        print("\nProcessing SAXS files...")
        average_files_in_directory(saxs_reduction_dir, "SAXS")
    else:
        print(f"SAXS Reduction directory not found: {saxs_reduction_dir}")
    
    if waxs_reduction_dir.exists():
        print("\nProcessing WAXS files...")
        average_files_in_directory(waxs_reduction_dir, "WAXS")
    else:
        print(f"WAXS Reduction directory not found: {waxs_reduction_dir}")
    
    print("\nAveraging complete!")


def average_files(raw_file_paths: list, detector_shape: list[int]):
    """
    LEGACY FUNCTION - kept for compatibility
    raw_file_paths: list of raw file paths to be averaged
    detector_shape: shape of the detector used to perform this averaging
    Output: Numpy array representing processed data
    """
    
    shape_tuple = tuple(detector_shape)
    avg_image = np.zeros(shape_tuple)

    for raw_file_path in raw_file_paths:
        # Read the raw file
        data = np.fromfile(raw_file_path, dtype=np.int32).reshape(shape_tuple)
        avg_image += data

        # file_name = os.path.splitext(os.path.basename(raw_file_path))[0]

    return avg_image/len(raw_file_paths)

In [5]:
# Test the averaging system
# Run the complete averaging process on the larger_test directory

# Process the entire 1D directory
process_directory(path_used)

Processing directory: larger_test/1D

Processing SAXS files...
Processing 141 groups in larger_test/1D/SAXS/Reduction
  Averaging 3 files for pattern: Run11_PS_AcOH_RampT37_ctr7_scan1_
    Saved: larger_test/1D/SAXS/Averaged/Run11_PS_AcOH_RampT37_ctr7_scan1_SAXS.dat
  Averaging 1 files for pattern: Hor_scan_Run6_RampT20_ctr0_scan1_
    Saved: larger_test/1D/SAXS/Averaged/Hor_scan_Run6_RampT20_ctr0_scan1_SAXS.dat
  Averaging 3 files for pattern: Run10_AcOH_RampT87_ctr27_scan1_
    Saved: larger_test/1D/SAXS/Averaged/Run10_AcOH_RampT87_ctr27_scan1_SAXS.dat
  Averaging 3 files for pattern: Run11_PS_AcOH_RampT25_ctr2_scan1_
    Saved: larger_test/1D/SAXS/Averaged/Run11_PS_AcOH_RampT25_ctr2_scan1_SAXS.dat
  Averaging 3 files for pattern: Run1_Empty_capi_RampT25_ctr1_scan1_
    Saved: larger_test/1D/SAXS/Averaged/Run1_Empty_capi_RampT25_ctr1_scan1_SAXS.dat
  Averaging 3 files for pattern: Run10_AcOH_RampT60_ctr16_scan1_
    Saved: larger_test/1D/SAXS/Averaged/Run10_AcOH_RampT60_ctr16_scan1_S