In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import wasserstein_distance

def _wasserstein_to_sorted_reference(reference_sorted, sample):
    """
    Return the 1-D Wasserstein distance between a pre-sorted reference and a sample.

    Computes the same quantity as ``scipy.stats.wasserstein_distance`` (the
    integral of the absolute difference between the two empirical CDFs), but
    expects the reference to be sorted already so that it can be reused for
    many samples without being re-sorted each time.

    Args:
        reference_sorted: 1-D float array in ascending order, non-empty.
        sample: 1-D float array in any order, non-empty.

    Returns:
        The distance as a NumPy float.
    """
    sample_sorted = np.sort(sample)
    n_ref = reference_sorted.size
    n_sample = sample_sorted.size
    n_total = n_ref + n_sample

    # Slot of each sample value in the merged, sorted array: the number of
    # reference values <= it, plus the number of sample values before it.
    sample_slots = (
        np.searchsorted(reference_sorted, sample_sorted, side="right")
        + np.arange(n_sample)
    )
    from_sample = np.zeros(n_total, dtype=bool)
    from_sample[sample_slots] = True

    # Linear-time merge of the two sorted arrays.
    merged = np.empty(n_total, dtype=float)
    merged[from_sample] = sample_sorted
    merged[~from_sample] = reference_sorted

    # Running counts give each CDF just after every merged value. Inside a run
    # of tied values the counts are only partial, but the gap to the next
    # value is zero there, so those positions contribute nothing.
    sample_count = np.cumsum(from_sample)[:-1]
    reference_count = np.arange(1, n_total) - sample_count
    cdf_gap = np.abs(reference_count / n_ref - sample_count / n_sample)

    return np.sum(cdf_gap * np.diff(merged))


def load_and_calculate_avm(search_dir, column_index=0, min_columns=4):
    """
    Load CSV files and score each one against the pooled distribution.

    Every ``*.csv`` file directly inside ``search_dir`` is read with pandas.
    The values of one column are pooled across all usable files, and each file
    gets an AVM score: the area between the pooled empirical CDF and the
    file's own empirical CDF (the 1-D Wasserstein distance).

    Files are processed in sorted path order so results are reproducible.
    A file is skipped, with a printed message, when it cannot be read, has
    fewer than ``min_columns`` columns, does not contain ``column_index``,
    or has no data rows.

    Args:
        search_dir: Directory to search for ``*.csv`` files.
        column_index: Positional index of the column to analyse.
        min_columns: Minimum number of columns a file must have to be used.

    Returns:
        A tuple ``(data_frames, avm_scores)`` of two lists of equal length;
        ``avm_scores[i]`` is the score of ``data_frames[i]``. Both lists are
        empty when no usable file is found.

    Raises:
        ValueError: If the selected column cannot be converted to float.
    """
    # 1. Find files (sorted: glob order depends on the filesystem)
    search_pattern = os.path.join(search_dir, "*.csv")
    csv_files = sorted(glob.glob(search_pattern))

    if not csv_files:
        print(f"Error: No .csv files found in directory '{search_dir}'")
        return [], []

    print(f"Found {len(csv_files)} files. Loading data...")

    # 2. Load the usable files and pull out the analysed column once per file
    data_frames = []
    columns = []

    for file_path in csv_files:
        try:
            df = pd.read_csv(file_path)
        except (OSError, ValueError) as e:
            # pandas parser/empty-data errors and decoding errors are ValueErrors
            print(f"Could not read {file_path}: {e}")
            continue

        n_cols = len(df.columns)
        if n_cols < min_columns:
            print(f"Skipping {file_path}: {n_cols} columns, need at least {min_columns}")
            continue
        if not -n_cols <= column_index < n_cols:
            print(f"Skipping {file_path}: no column at index {column_index}")
            continue
        if df.empty:
            # An empty sample has no CDF, so no distance can be computed for it
            print(f"Skipping {file_path}: no data rows")
            continue

        data_frames.append(df)
        columns.append(df.iloc[:, column_index].to_numpy(dtype=float))

    if not data_frames:
        print("No valid dataframes loaded.")
        return [], []

    print(f"Successfully loaded {len(data_frames)} DataFrames.")

    # 3. Aggregate Global Data (The "Total Distribution")
    # Sorted once here so the per-file loop below never has to sort it again.
    print("Aggregating global distribution...")
    global_data = np.concatenate(columns)
    global_data.sort()

    # 4. Calculate AVM for each file
    # AVM := Area between Global CDF and File CDF
    print("Calculating AVMs...")
    avm_scores = [
        _wasserstein_to_sorted_reference(global_data, local_data)
        for local_data in columns
    ]

    return data_frames, avm_scores

# --- Run the analysis ---

# Directory that holds the input CSV files
search_dir = "paths1"

# Score every file on column 0 ('th1')
dfs, avm_results = load_and_calculate_avm(search_dir, column_index=0)

# --- Histogram of the per-file scores ---

# Nothing to draw when no file produced a score
if avm_results:
    fig, ax = plt.subplots(figsize=(10, 6))

    # Distribution of the AVM scores across files
    ax.hist(avm_results, bins=50, color='skyblue', edgecolor='black', alpha=0.7)

    # Title and axis labels
    ax.set_title(r'Histogram of Area Validation Metrics (AVM) for $th_1$', fontsize=14)
    ax.set_xlabel(r'AVM: $\int | F_{total}(s) - F_{file}(s) | ds$', fontsize=12)
    ax.set_ylabel('Frequency (Number of Files)', fontsize=12)

    # Dashed vertical marker at the average score, listed in the legend
    mean_avm = np.mean(avm_results)
    ax.axvline(mean_avm, color='red', linestyle='dashed', linewidth=1, label=f'Mean AVM: {mean_avm:.4f}')
    ax.legend()

    # Horizontal grid lines only
    ax.grid(axis='y', alpha=0.5)
    plt.show()
    

Found 1000 files. Loading data...
Successfully loaded 1000 DataFrames.
Aggregating global distribution...
Calculating AVMs...


KeyboardInterrupt: 