In [None]:
import os
import glob
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance

def _load_observables(file_path, n_observables):
    """Read one CSV and return its leading observable columns as float arrays.

    Args:
        file_path: Path of the CSV file to read.
        n_observables: Number of leading columns to extract.

    Returns:
        A list of ``n_observables`` 1-D float arrays (one per column), or
        ``None`` when the file has fewer than ``n_observables`` columns.

    Raises:
        Exception: Whatever ``pd.read_csv`` raises, or the conversion error
            raised when a column cannot be represented as floats.
    """
    df = pd.read_csv(file_path)
    if len(df.columns) < n_observables:
        return None
    # Converting to float here rejects non-numeric columns up front, so a
    # single malformed file cannot contaminate the shared ensemble.
    # copy=True yields arrays that do not depend on the DataFrame.
    return [
        df.iloc[:, i].to_numpy(dtype=float, copy=True)
        for i in range(n_observables)
    ]


def _build_ensemble(per_file, n_observables):
    """Concatenate the per-file columns into one array per observable."""
    if not per_file:
        # np.concatenate rejects an empty sequence.
        return [np.array([], dtype=float) for _ in range(n_observables)]
    return [
        np.concatenate([columns[i] for _, columns in per_file])
        for i in range(n_observables)
    ]


def process_folders(folders, output_dir="AVMs", *, n_observables=4):
    """Score every CSV in each folder against that folder's pooled distribution.

    For each folder, the first ``n_observables`` columns of all readable
    ``*.csv`` files are pooled into one ensemble per observable. Each file is
    then compared to the ensemble with the 1-D Wasserstein distance, and one
    row per file is appended to ``<output_dir>/<folder name>_results.csv``.

    Args:
        folders: Iterable of directory paths to process.
        output_dir: Directory receiving the result files; created if missing.
        n_observables: Number of leading columns treated as observables.
            Files with fewer columns are ignored.

    Returns:
        None. Results are written to disk and progress is printed.

    Notes:
        * The result file is named after the last component of the folder
          path, so two input folders sharing that name write the same file.
        * Files that cannot be read, or whose observable columns are not
          numeric, are reported and excluded from both the ensemble and the
          results.
        * NaN values are passed through to ``wasserstein_distance`` unchanged.
    """
    if not os.path.exists(output_dir):
        # exist_ok guards against another process creating it in between.
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created directory: {output_dir}")

    headers = ["file_name"] + [f"W1_obs{i + 1}" for i in range(n_observables)]

    for folder in folders:
        print(f"\n--- Processing Folder: {folder} ---")

        # Sorted so that row order in the result file is reproducible.
        csv_files = sorted(glob.glob(os.path.join(folder, "*.csv")))
        if not csv_files:
            print(f"No CSV files found in {folder}. Skipping.")
            continue

        # 1. Read every file once and keep its observable columns.
        print("Step 1: Building global ensemble distributions...")
        per_file = []
        for file_path in csv_files:
            try:
                columns = _load_observables(file_path, n_observables)
            except Exception as e:
                print(f"Error reading {file_path} for ensemble: {e}")
                continue
            if columns is not None:
                per_file.append((file_path, columns))

        global_distributions = _build_ensemble(per_file, n_observables)

        # 2. Calculate W1 and write to CSV row-by-row.
        # Only the folder's own name goes into the file name: joining the raw
        # path would escape output_dir (absolute path) or point into a
        # sub-directory that does not exist (nested or trailing-slash path).
        folder_name = os.path.basename(os.path.abspath(folder))
        output_file = os.path.join(output_dir, f"{folder_name}_results.csv")

        # Initialize the CSV with headers.
        pd.DataFrame(columns=headers).to_csv(output_file, index=False)

        print(f"Step 2: Calculating W1 and saving to {output_file}...")

        total = len(per_file)
        for idx, (file_path, columns) in enumerate(per_file, start=1):
            try:
                scores = [os.path.basename(file_path)]
                scores.extend(
                    wasserstein_distance(ensemble, local_values)
                    for ensemble, local_values in zip(global_distributions, columns)
                )

                # Appending one row at a time keeps partial results on disk
                # if a later file fails.
                pd.DataFrame([scores]).to_csv(
                    output_file, mode="a", index=False, header=False
                )

                if idx % 10 == 0 or idx == total:
                    print(f" Progress: {idx}/{total} files processed.")

            except Exception as e:
                print(f"Error processing {file_path}: {e}")

# --- Execution ---
# Guarded so that importing this module does not start the batch run;
# running it as a script or notebook cell behaves as before.
if __name__ == "__main__":
    folders_to_process = ["paths1", "paths2"]
    process_folders(folders_to_process)
    print("\nProcessing complete.")