In [1]:
import os
import glob
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d

def _finite_column(df, col_idx):
    """Return column *col_idx* of *df* as a float array without NaN/inf."""
    values = df.iloc[:, col_idx].to_numpy(dtype=float)
    return values[np.isfinite(values)]


def get_global_stats(files, cols=4, bins=2000):
    """
    Build a binned "global CDF" for each observable across all files.

    The ensemble is summarised as one histogram per column so that later
    comparisons do not have to hold millions of raw points in memory.

    Parameters
    ----------
    files : iterable of str
        Paths of CSV files readable by ``pd.read_csv``. The first ``cols``
        columns (by position) of every file are used.
    cols : int, default 4
        Number of leading columns to summarise.
    bins : int, default 2000
        Number of equal-width histogram bins per column.

    Returns
    -------
    global_cdfs : list of scipy.interpolate.interp1d
        One piecewise-linear CDF per column. Each evaluates to 0 below the
        first bin edge and 1 above the last one.
    bin_edges : list of numpy.ndarray
        The ``bins + 1`` edges used for each column.

    Raises
    ------
    ValueError
        If ``files`` is empty, ``bins`` is smaller than 1, or a column holds
        no finite value in any file.
    """
    files = list(files)
    if not files:
        raise ValueError("get_global_stats() needs at least one file")
    if bins < 1:
        raise ValueError("bins must be a positive integer")

    print("Pre-calculating Global Distributions (Pass 1)...")
    all_min = np.full(cols, np.inf)
    all_max = np.full(cols, -np.inf)

    # Pass 1: scan every row for the global range. Peeking at only the first
    # rows can miss the extremes, and np.histogram silently drops any value
    # that falls outside the edges.
    for f in files:
        df = pd.read_csv(f)
        for i in range(cols):
            values = _finite_column(df, i)
            if values.size:
                all_min[i] = min(all_min[i], values.min())
                all_max[i] = max(all_max[i], values.max())

    bin_edges = []
    for i in range(cols):
        lo, hi = float(all_min[i]), float(all_max[i])
        if not np.isfinite(lo):
            raise ValueError(f"column {i} has no finite values in any file")
        if lo == hi:
            # Constant column: widen the range so that the edges (and hence
            # the interpolation knots) stay strictly increasing.
            pad = 0.5 * max(abs(lo), 1.0)
            lo, hi = lo - pad, hi + pad
        bin_edges.append(np.linspace(lo, hi, bins + 1))

    # Pass 2: accumulate the histograms (the ensemble).
    global_counts = [np.zeros(bins) for _ in range(cols)]
    for idx, f in enumerate(files):
        df = pd.read_csv(f)
        for i in range(cols):
            counts, _ = np.histogram(_finite_column(df, i), bins=bin_edges[i])
            global_counts[i] += counts
        if (idx + 1) % 100 == 0:
            print(f" Loaded {idx+1}/{len(files)} files into ensemble...")

    # Convert counts to CDFs.
    global_cdfs = []
    for i in range(cols):
        total = global_counts[i].sum()
        # cumsum[k] is the probability mass up to the RIGHT edge of bin k, so
        # the knots are the bin edges, with an explicit 0 at the first edge.
        cdf = np.concatenate(([0.0], np.cumsum(global_counts[i]) / total))
        global_cdfs.append(
            interp1d(bin_edges[i], cdf, bounds_error=False, fill_value=(0, 1))
        )

    return global_cdfs, bin_edges

def _w1_sample_vs_cdf(sample, global_cdf, edges):
    """
    Return the area between a sample's empirical CDF and *global_cdf*.

    The integral of |F_local - F_global| is taken over the union of the
    sample range and the global bin range, so mass that the global
    distribution has outside the sample's own range is counted too.
    On each grid interval the empirical CDF is constant and the global CDF is
    treated as linear, which lets the absolute difference be integrated in
    closed form (including intervals where the two curves cross).

    Returns NaN when the sample holds no finite value.
    """
    sample = np.asarray(sample, dtype=float)
    sample = np.sort(sample[np.isfinite(sample)])
    if sample.size == 0:
        return float("nan")

    # Include the interpolator's own knots (when it exposes them) so the
    # global CDF really is linear between consecutive grid points.
    knots = getattr(global_cdf, "x", edges)
    grid = np.unique(
        np.concatenate(
            (
                np.asarray(edges, dtype=float),
                np.asarray(knots, dtype=float),
                sample,
            )
        )
    )
    grid = grid[np.isfinite(grid)]
    if grid.size < 2:
        return 0.0

    glob_y = np.asarray(global_cdf(grid), dtype=float)
    # Every sample point is a grid point, so the ECDF is constant on each
    # half-open interval [grid[k], grid[k+1]).
    local_y = np.searchsorted(sample, grid[:-1], side="right") / sample.size

    diff_left = local_y - glob_y[:-1]
    diff_right = local_y - glob_y[1:]
    abs_sum = np.abs(diff_left) + np.abs(diff_right)

    # Same sign at both ends: plain trapezoid. Opposite signs: two triangles
    # meeting at the crossing point.
    crossing = np.divide(
        diff_left ** 2 + diff_right ** 2,
        2.0 * abs_sum,
        out=np.zeros_like(abs_sum),
        where=abs_sum > 0,
    )
    mean_abs = np.where(diff_left * diff_right >= 0, 0.5 * abs_sum, crossing)
    return float(np.sum(mean_abs * np.diff(grid)))


def process_fast(folders, output_dir="AVMs2"):
    """
    Score every CSV file in each folder against that folder's ensemble.

    For each folder, a binned global CDF is built per column with
    ``get_global_stats``. Each file then gets one Wasserstein-1 score per
    column: the area between the file's empirical CDF and the global CDF.
    Rows are appended to ``<output_dir>/<folder name>_results.csv`` as they
    are computed, so partial results survive an interruption.

    Parameters
    ----------
    folders : iterable of str
        Directories to scan for ``*.csv`` files. Folders without CSV files
        are skipped. Only the last path component is used for the results
        file name, so two folders with the same name overwrite each other.
    output_dir : str, default "AVMs2"
        Directory for the result files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    for folder in folders:
        print(f"\n--- Processing {folder} ---")
        # Sorted so the row order of the results file is reproducible.
        csv_files = sorted(glob.glob(os.path.join(folder, "*.csv")))
        if not csv_files:
            continue

        # 1. Get Global Representation
        global_cdfs, bin_edges = get_global_stats(csv_files)
        n_cols = len(global_cdfs)

        # 2. Compute W1 via CDF integration
        # Use only the folder's own name so nested paths or a trailing slash
        # do not point the output into a non-existent sub-directory.
        folder_name = os.path.basename(os.path.normpath(folder))
        output_file = os.path.join(output_dir, f"{folder_name}_results.csv")
        header = ["file_name"] + [f"W1_{i + 1}" for i in range(n_cols)]
        pd.DataFrame(columns=header).to_csv(output_file, index=False)

        print("Calculating W1 scores (Pass 2)...")
        for idx, f in enumerate(csv_files):
            df = pd.read_csv(f)
            row = [os.path.basename(f)]

            for i in range(n_cols):
                sample = df.iloc[:, i].to_numpy(dtype=float)
                row.append(_w1_sample_vs_cdf(sample, global_cdfs[i], bin_edges[i]))

            pd.DataFrame([row]).to_csv(output_file, mode='a', index=False, header=False)
            if (idx + 1) % 50 == 0:
                print(f" Progress: {idx+1}/{len(csv_files)} done.")

# Run the pipeline on the "paths1" and "paths2" folders (resolved relative to
# the current working directory; a folder without CSV files is skipped).
# Results go to the default output directory "AVMs2".
process_fast(["paths1", "paths2"])


--- Processing paths1 ---
Pre-calculating Global Distributions (Pass 1)...
 Loaded 100/1000 files into ensemble...
 Loaded 200/1000 files into ensemble...
 Loaded 300/1000 files into ensemble...
 Loaded 400/1000 files into ensemble...
 Loaded 500/1000 files into ensemble...
 Loaded 600/1000 files into ensemble...
 Loaded 700/1000 files into ensemble...
 Loaded 800/1000 files into ensemble...
 Loaded 900/1000 files into ensemble...
 Loaded 1000/1000 files into ensemble...
Calculating W1 scores (Pass 2)...
 Progress: 50/1000 done.
 Progress: 100/1000 done.
 Progress: 150/1000 done.
 Progress: 200/1000 done.
 Progress: 250/1000 done.
 Progress: 300/1000 done.
 Progress: 350/1000 done.
 Progress: 400/1000 done.
 Progress: 450/1000 done.
 Progress: 500/1000 done.
 Progress: 550/1000 done.
 Progress: 600/1000 done.
 Progress: 650/1000 done.
 Progress: 700/1000 done.
 Progress: 750/1000 done.
 Progress: 800/1000 done.
 Progress: 850/1000 done.
 Progress: 900/1000 done.
 Progress: 950/1000 do