In [None]:
# Vergleich zwischen den OSM "type" Kategorien: Signifikanztest mit Kruskal-Wallis Test, Dunn's post-hoc Test, Cohen's d Effektmatrix (Segmente gewichtet nach Länge)
# Zur Erstellung des Codes wurde die generative Künstliche Intelligenz (KI) „Claude AI“ des Anbieters Anthropic in Version 3.7 genutzt

In [6]:
# Vergleich zwischen den OSM "type" Kategorien: Signifikanztest mit Kruskal-Wallis Test, Dunn's post-hoc Test, Cohen's d Effektmatrix (Segmente gewichtet nach Länge)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import gaussian_kde
import pyarrow.parquet as pq
import gc
from tqdm.notebook import tqdm
import matplotlib.patches as mpatches
import os
from scikit_posthocs import posthoc_dunn

# --- Analysis configuration (adjust to the local setup) ---
# Parquet file that the analysis reads.
PARQUET_FILE = 'data/network_all_months_plus_25833_length_with_fahrradstrasse.parquet'
# Directory that receives the plots and CSV files written below.
OUTPUT_DIR = 'analysis_results/005_Infra_OSM_type'
# Names of the parquet columns used by the analysis.
COLUMN_TYPE = "type"  # street type
COLUMN_HIST = "2304-2412_speeds"  # speed histogram data
COLUMN_LENGTH = "length_m"  # length in meters

# Create the output directory up front; an already existing one is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Styling and labels shared by the plotting functions
PLOT_CONFIG = dict(
    # Font sizes and output resolution
    TEXT_FONT_SIZE=10,
    TITLE_FONT_SIZE=16,
    AXIS_LABEL_FONT_SIZE=12,
    LEGEND_FONT_SIZE=9,
    DPI=300,
    # Fill, median and mean colors
    BOX_COLOR_1="#a0ddff",
    BOX_COLOR_2="#c8e9a0",
    MEDIAN_COLOR="darkblue",
    MEAN_COLOR="red",
    # Titles and axis labels (German, shown in the figures)
    HISTOGRAM_TITLE='Geschwindigkeitshistogramm nach OSM "type", 04/23 - 12/24',
    BOXPLOT_TITLE='Boxplot Geschwindigkeitsverteilung nach OSM "type", 04/23 - 12/24',
    VIOLINPLOT_TITLE='Geigenplot Geschwindigkeitsverteilung nach OSM "type", 04/23 - 12/24',
    X_LABEL="Geschwindigkeit (km/h)",
    Y_LABEL="Prozentsatz (%)",
    # Box annotations
    ANNOTATION_BBOX_STYLE={
        "boxstyle": "round,pad=0.3",
        "fc": "white",
        "ec": "gray",
        "alpha": 0.8,
    },
)

# OSM "type" values that are included in the comparison
STREET_TYPES_TO_ANALYZE = [
    "footway", "residential", "secondary", "service_driveway", "tertiary",
    "service", "path", "cycleway", "track", "primary",
    "service_parking_aisle", "unclassified", "living_street", "steps", "pedestrian",
]
def log(message):
    """Write *message* to standard output via ``print``."""
    print(message)

def force_gc():
    """Trigger a garbage-collection pass; the collector's result is discarded."""
    gc.collect()

def parse_histogram(hist_str, n_bins=32):
    """Parse a serialized histogram such as ``"[1, 2, 3]"`` into a NumPy array.

    Args:
        hist_str: Comma-separated numbers, optionally wrapped in square
            brackets and surrounding whitespace. Any non-string value is
            treated as a missing histogram.
        n_bins: Length of the all-zero array returned when there is no
            usable data (default 32).

    Returns:
        A float array with the parsed values, or ``np.zeros(n_bins)`` when
        the input is missing, empty, or cannot be parsed. The length of a
        successfully parsed array is not checked against ``n_bins``.
    """
    if not isinstance(hist_str, str):
        return np.zeros(n_bins)  # missing histogram

    # Remove outer whitespace first so the brackets are actually at the edges
    text = hist_str.strip().strip('[]').strip()
    if not text:
        # "[]" / "" carries no counts; return the same zero histogram as for
        # missing data instead of relying on NumPy's empty-string handling.
        return np.zeros(n_bins)

    try:
        try:
            # Fast NumPy parsing
            return np.fromstring(text, sep=',')
        except ValueError:
            # Fallback to manual parsing if the NumPy method rejects the text
            return np.array([float(token) for token in text.split(',')])
    except Exception as e:
        # Best effort: one bad row must not abort the whole run
        log(f"Error parsing histogram: {e}")
        return np.zeros(n_bins)

def calculate_all_weighted_statistics(histogram):
    """
    Derive summary statistics from a 32-bin speed histogram (bins 0-31 km/h).

    The counts are normalized to shares; mean and standard deviation are the
    share-weighted moments of the bin values. Percentiles are read off the
    cumulative distribution by linear interpolation, and min/max are the
    lowest and highest bins that contain data.

    Returns a dictionary with the keys 'mean', 'median', 'std',
    'percentile_5', 'percentile_25', 'percentile_75', 'percentile_95',
    'min' and 'max'. Every value is NaN when the histogram sums to zero.
    """
    stat_names = (
        'mean', 'median', 'std',
        'percentile_5', 'percentile_25', 'percentile_75', 'percentile_95',
        'min', 'max',
    )

    total = np.sum(histogram)
    if total == 0:
        # No observations at all: nothing can be computed
        return dict.fromkeys(stat_names, np.nan)

    bins = np.arange(32)  # 0-31 km/h
    shares = histogram / total

    # Weighted first and second moments
    weighted_mean = np.sum(bins * shares)
    weighted_std = np.sqrt(np.sum(((bins - weighted_mean) ** 2) * shares))

    # Cumulative distribution, computed once for all percentiles
    cdf = np.cumsum(shares)

    def quantile(q):
        # Speed at which the cumulative share reaches q
        return np.interp(q, cdf, bins)

    # Indices of the bins that hold any data
    occupied = np.nonzero(histogram)[0]

    return {
        'mean': weighted_mean,
        'median': quantile(0.5),
        'std': weighted_std,
        'percentile_5': quantile(0.05),
        'percentile_25': quantile(0.25),
        'percentile_75': quantile(0.75),
        'percentile_95': quantile(0.95),
        'min': bins[occupied[0]],
        'max': bins[occupied[-1]],
    }

def calculate_cohens_d_from_histograms(hist1, hist2):
    """Return ``(d, mean1, mean2)`` for two 32-bin speed histograms.

    ``d`` is the absolute difference of the two histogram means divided by
    the pooled standard deviation ``sqrt((var1 + var2) / 2)``; it is 0 when
    that pooled value is not positive. A histogram that sums to zero is
    treated as all-zero shares, which gives it mean 0 and variance 0.
    """
    bins = np.arange(32)  # 0-31 km/h

    def moments(hist):
        # Mean and variance of the bin values, weighted by the histogram shares
        total = np.sum(hist)
        shares = hist / total if total > 0 else np.zeros_like(hist)
        mu = np.sum(bins * shares)
        return mu, np.sum(((bins - mu) ** 2) * shares)

    mean1, var1 = moments(hist1)
    mean2, var2 = moments(hist2)

    pooled_std = np.sqrt((var1 + var2) / 2)

    if pooled_std > 0:
        d = abs(mean1 - mean2) / pooled_std
    else:
        d = 0

    return d, mean1, mean2

def calculate_effective_n(weights):
    """Return the effective sample size ``(sum w)**2 / sum(w**2)`` (Kish's formula).

    Returns 0 for an empty weight collection or when the weights sum to zero.
    """
    if len(weights) == 0:
        return 0
    if np.sum(weights) == 0:
        return 0

    w = np.array(weights)
    return np.sum(w) ** 2 / np.sum(w ** 2)
def create_length_weighted_histogram_comparison(histograms, street_types, stats_info):
    """Plot the normalized speed histograms of several street types with their means.

    The bars of all street types are drawn side by side within each speed
    bin and a vertical line marks each type's mean. The figure is saved as
    ``length_weighted_histograms.png`` in ``OUTPUT_DIR`` and then closed.

    Args:
        histograms: Sequence of 32-bin histograms, one per street type.
        street_types: Street type names, parallel to ``histograms``.
        stats_info: Sequence parallel to ``histograms``; each entry provides
            ``'total_length_km'``, ``'effective_n'`` and ``'stats'['mean']``.
    """
    log("Creating length-weighted histogram comparison")

    # Create speed bins
    speed_bins = np.arange(32)

    # All bars of one bin together span 0.8 of the bin, whatever the number
    # of street types (max() only guards the empty case).
    bar_width = 0.8 / max(len(histograms), 1)

    # Create figure
    plt.figure(figsize=(16, 10))

    for i, (hist, street_type) in enumerate(zip(histograms, street_types)):
        info = stats_info[i]

        # Express counts as shares; an empty histogram stays all-zero
        total = np.sum(hist)
        hist_norm = hist / total if total > 0 else np.zeros_like(hist)

        # Shift by whole bar widths. A fixed 0.1 step matches the bar width
        # only for exactly 8 types; with more types the bars would spill
        # into the neighbouring speed bins.
        plt.bar(speed_bins + i * bar_width, hist_norm * 100, alpha=0.7,
                label=f"{street_type} ({info['total_length_km']:.1f} km, n={info['effective_n']:.1f})",
                width=bar_width)

        # Vertical line for the mean; "C{i}" picks entry i of the default color cycle
        mean_speed = info['stats']['mean']
        plt.axvline(x=mean_speed, color=f"C{i}", linestyle='-',
                    label=f"{street_type} Mean: {mean_speed:.2f} km/h")

    # Add legends
    plt.legend(loc='upper left', fontsize=PLOT_CONFIG['LEGEND_FONT_SIZE'])

    # Add labels and title
    plt.xlabel(PLOT_CONFIG['X_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    plt.ylabel(PLOT_CONFIG['Y_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    plt.title(PLOT_CONFIG['HISTOGRAM_TITLE'], fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])
    plt.grid(alpha=0.3)
    plt.xticks(range(0, 32, 2))

    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "length_weighted_histograms.png"), dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
    log("Saved length-weighted histogram comparison")
    plt.close()

def create_median_weighted_histogram_comparison(histograms, street_types, stats_info):
    """Plot the normalized speed histograms of several street types with their medians.

    The bars of all street types are drawn side by side within each speed
    bin and a dashed vertical line marks each type's median. The figure is
    saved as ``median_weighted_histograms.png`` in ``OUTPUT_DIR`` and then
    closed.

    Args:
        histograms: Sequence of 32-bin histograms, one per street type.
        street_types: Street type names, parallel to ``histograms``.
        stats_info: Sequence parallel to ``histograms``; each entry provides
            ``'total_length_km'``, ``'effective_n'`` and ``'stats'['median']``.
    """
    log("Creating median-weighted histogram comparison")

    # Create speed bins
    speed_bins = np.arange(32)

    # All bars of one bin together span 0.8 of the bin, whatever the number
    # of street types (max() only guards the empty case).
    bar_width = 0.8 / max(len(histograms), 1)

    # Create figure
    plt.figure(figsize=(16, 10))

    for i, (hist, street_type) in enumerate(zip(histograms, street_types)):
        info = stats_info[i]

        # Express counts as shares; an empty histogram stays all-zero
        total = np.sum(hist)
        hist_norm = hist / total if total > 0 else np.zeros_like(hist)

        # Shift by whole bar widths. A fixed 0.1 step matches the bar width
        # only for exactly 8 types; with more types the bars would spill
        # into the neighbouring speed bins.
        plt.bar(speed_bins + i * bar_width, hist_norm * 100, alpha=0.7,
                label=f"{street_type} ({info['total_length_km']:.1f} km, n={info['effective_n']:.1f})",
                width=bar_width)

        # Dashed vertical line for the median; "C{i}" picks entry i of the default color cycle
        median_speed = info['stats']['median']
        plt.axvline(x=median_speed, color=f"C{i}", linestyle='--',
                    label=f"{street_type} Median: {median_speed:.2f} km/h")

    # Add legends
    plt.legend(loc='upper left', fontsize=PLOT_CONFIG['LEGEND_FONT_SIZE'])

    # Add labels and title
    plt.xlabel(PLOT_CONFIG['X_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    plt.ylabel(PLOT_CONFIG['Y_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    plt.title("Geschwindigkeitshistogramm nach OSM \"type\" (sortiert nach Median), 04/23 - 12/24",
              fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])
    plt.grid(alpha=0.3)
    plt.xticks(range(0, 32, 2))

    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "median_weighted_histograms.png"), dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
    log("Saved median-weighted histogram comparison")
    plt.close()
def create_boxplot_from_histograms(street_type_data, valid_types_sorted):
    """Draw one box per street type from precomputed histogram statistics.

    Boxes span the 25th to 75th percentile, whiskers reach the 5th and 95th
    percentile, the median is drawn as a line and the mean as a star; both
    values are also written next to the box. The figure is saved as
    ``street_type_boxplot.png`` in ``OUTPUT_DIR`` and then closed.

    Args:
        street_type_data: Mapping of street type to a dict with the keys
            ``'stats'``, ``'effective_n'``, ``'count'`` and
            ``'total_length_km'``.
        valid_types_sorted: Street types to plot, in plotting order.
    """
    log("Creating boxplot from length-weighted histograms")

    fig, ax = plt.subplots(figsize=(16, 10))

    # Collect the numbers and the tick label for every street type
    records = []
    for name in valid_types_sorted:
        hist_stats = street_type_data[name]['stats']
        record = {
            'mean': hist_stats['mean'],
            'median': hist_stats['median'],
            'q1': hist_stats['percentile_25'],
            'q3': hist_stats['percentile_75'],
            'whislo': hist_stats['percentile_5'],
            'whishi': hist_stats['percentile_95'],
        }
        record['label'] = (f"{name}\n"
                           f"(effective_n={street_type_data[name]['effective_n']:.1f}\n"
                           f"{street_type_data[name]['count']:,} Segmente\n"
                           f"{street_type_data[name]['total_length_km']:.1f} km")
        records.append(record)

    # Axes.bxp() draws boxes straight from summary statistics
    bxp_input = []
    for record in records:
        bxp_input.append({
            'med': record['median'],
            'q1': record['q1'],
            'q3': record['q3'],
            'whislo': record['whislo'],
            'whishi': record['whishi'],
            'fliers': [],
            'label': record['label'],
        })

    ax.bxp(
        bxp_input,
        positions=range(len(records)),
        showfliers=False,
        patch_artist=True,
        boxprops={'facecolor': PLOT_CONFIG['BOX_COLOR_1'], 'alpha': 0.7},
        medianprops={'color': PLOT_CONFIG['MEDIAN_COLOR'], 'linewidth': 2},
        whiskerprops={'color': 'black', 'linestyle': '-', 'linewidth': 1},
        capprops={'color': 'black', 'linewidth': 1}
    )

    # Mean star plus the median (above) and mean (below) as text
    for position, record in enumerate(records):
        mean_value = record['mean']
        median_value = record['median']

        ax.scatter(
            position, mean_value,
            marker='*',
            s=150,
            color=PLOT_CONFIG['MEAN_COLOR'],
            zorder=3
        )

        ax.text(
            position, median_value + 0.7,
            f"{median_value:.1f}",
            ha='center', va='bottom',
            color=PLOT_CONFIG['MEDIAN_COLOR'],
            fontsize=PLOT_CONFIG['TEXT_FONT_SIZE'],
            weight='bold'
        )

        ax.text(
            position, mean_value - 0.7,
            f"{mean_value:.1f}",
            ha='center', va='top',
            color=PLOT_CONFIG['MEAN_COLOR'],
            fontsize=PLOT_CONFIG['TEXT_FONT_SIZE'],
            weight='bold'
        )

    # Axis formatting
    tick_labels = [record['label'] for record in records]
    ax.set_xticks(range(len(records)))
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    ax.set_ylabel(PLOT_CONFIG['X_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    ax.set_title(
        "Boxplot Geschwindigkeitsverteilungen DB Rad+ nach OSM 'type'\n"
        "Zeitraum 04/23 - 12/24",
        fontsize=PLOT_CONFIG['TITLE_FONT_SIZE']
    )
    ax.grid(axis='y', alpha=0.3)

    # Hand-built legend: box fill, median line, mean star
    iqr_patch = mpatches.Patch(facecolor=PLOT_CONFIG['BOX_COLOR_1'], alpha=0.7, label='IQR')
    median_line = plt.Line2D([0], [0], color=PLOT_CONFIG['MEDIAN_COLOR'], lw=2, label='Median')
    mean_star = plt.Line2D([0], [0], marker='*', color=PLOT_CONFIG['MEAN_COLOR'], markersize=10,
                           linestyle='None', label='Mittelwert')
    ax.legend(handles=[iqr_patch, median_line, mean_star], loc='upper right')

    plt.tight_layout()
    target_path = os.path.join(OUTPUT_DIR, 'street_type_boxplot.png')
    plt.savefig(target_path, dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
    plt.close()
def create_violin_plot_from_histograms(street_type_data, valid_types_sorted):
    """Draw one violin per street type from its length-weighted histogram.

    Each violin is a weighted Gaussian KDE over the bin values 0-31, scaled
    to a maximum half-width of 0.4. The median is drawn as a horizontal line
    and the mean as a star, both annotated with their values. The figure is
    saved as ``street_type_weighted_violin.png`` in ``OUTPUT_DIR`` and then
    closed.

    Args:
        street_type_data: Mapping of street type to a dict with the keys
            ``'length_weighted_agg_hist'``, ``'stats'``, ``'effective_n'``,
            ``'count'`` and ``'total_length_km'``.
        valid_types_sorted: Street types to plot, in plotting order.
    """
    log("Creating violin plot from length-weighted histograms")

    fig, ax = plt.subplots(figsize=(16, 10))

    bin_values = np.arange(32)
    grid = np.linspace(0, 31, 100)  # evaluation points of the density
    median_color = PLOT_CONFIG['MEDIAN_COLOR']
    mean_color = PLOT_CONFIG['MEAN_COLOR']
    text_size = PLOT_CONFIG['TEXT_FONT_SIZE']

    for position, name in enumerate(valid_types_sorted):
        entry = street_type_data[name]

        # Weighted KDE over the bin values with a fixed bandwidth factor
        kde = gaussian_kde(
            bin_values,
            weights=entry['length_weighted_agg_hist'],
            bw_method=0.25
        )
        density = kde(grid)

        # Scale so that the widest point of every violin is 0.4 on each side
        half_width = 0.4 * density / density.max()

        # Violin body
        ax.fill_betweenx(
            grid,
            position - half_width,
            position + half_width,
            color=PLOT_CONFIG['BOX_COLOR_2'],
            alpha=0.7
        )

        # Median line across the full violin width
        median = entry['stats']['median']
        widest = half_width.max()
        ax.hlines(
            median,
            position - widest,
            position + widest,
            colors=median_color,
            linewidths=1.5
        )

        # Median value just above its line
        ax.text(
            position, median + 0.35,
            f"{median:.1f}",
            ha='center', va='bottom',
            color=median_color,
            fontsize=text_size,
            weight='bold'
        )

    # Means as stars, drawn in a single call
    means = [street_type_data[name]['stats']['mean'] for name in valid_types_sorted]
    ax.scatter(
        range(len(valid_types_sorted)),
        means,
        marker='*',
        s=150,
        color=mean_color,
        zorder=3
    )

    # Mean value just below each star
    for position, mean in enumerate(means):
        ax.text(
            position, mean - 0.35,
            f"{mean:.1f}",
            ha='center', va='top',
            color=mean_color,
            fontsize=text_size,
            weight='bold'
        )

    # Tick labels carry effective n, segment count and total length
    labels = []
    for name in valid_types_sorted:
        entry = street_type_data[name]
        labels.append(
            f"{name}\n"
            f"n={entry['effective_n']:.1f}\n"
            f"{entry['count']:,} segments\n"
            f"{entry['total_length_km']:.1f} km"
        )

    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(
        labels,
        rotation=45,
        ha='right',
        fontsize=text_size
    )

    # Axis title, figure title and grid
    ax.set_ylabel(PLOT_CONFIG['X_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    ax.set_title(
        "Geschwindigkeitsverteilungen DB Rad+ nach OSM 'type'\n"
        "Geigendiagramm, Zeitraum 04/23 - 12/24",
        fontsize=PLOT_CONFIG['TITLE_FONT_SIZE']
    )
    ax.grid(axis='y', alpha=0.3)

    # Hand-built legend: density fill, median line, mean star
    density_patch = mpatches.Patch(facecolor=PLOT_CONFIG['BOX_COLOR_2'], alpha=0.7, label='Density')
    median_line = plt.Line2D([0], [0], color=median_color, lw=1.5, label='Median')
    mean_star = plt.Line2D([0], [0], marker='*', color=mean_color, markersize=10,
                           linestyle='None', label='Mean')
    ax.legend(
        handles=[density_patch, median_line, mean_star],
        loc='upper right',
        fontsize=PLOT_CONFIG['LEGEND_FONT_SIZE']
    )

    plt.tight_layout()
    target_path = os.path.join(OUTPUT_DIR, 'street_type_weighted_violin.png')
    plt.savefig(target_path, dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
    plt.close()
def create_mean_vs_median_comparison(street_type_data, valid_types):
    """Plot weighted mean against weighted median per street type and export a table.

    Points are colored by ``mean - median`` and labelled with the street
    type; a dashed ``y = x`` line is added for reference. The figure is
    saved as ``mean_vs_median_comparison.png`` and the same numbers, plus
    effective n and total length, as ``mean_vs_median_comparison.csv`` in
    ``OUTPUT_DIR``. The table rows are also logged.

    Args:
        street_type_data: Mapping of street type to a dict with the keys
            ``'stats'``, ``'effective_n'`` and ``'total_length_km'``.
        valid_types: Street types to include.
    """
    log("Creating mean vs median comparison plot")

    plt.figure(figsize=(10, 8))

    # Gather both location measures per street type
    means = []
    for name in valid_types:
        means.append(street_type_data[name]['stats']['mean'])
    medians = []
    for name in valid_types:
        medians.append(street_type_data[name]['stats']['median'])

    # Mean minus median serves as a simple skewness indicator
    skewness = [m - md for m, md in zip(means, medians)]

    scatter = plt.scatter(
        means, medians,
        c=skewness, cmap='coolwarm', alpha=0.9,
        s=100, edgecolors='black'
    )

    # Reference diagonal (y = x), padded by 1 km/h on both ends
    lower = min(min(means), min(medians)) - 1
    upper = max(max(means), max(medians)) + 1
    plt.plot([lower, upper], [lower, upper], 'k--', alpha=0.5)

    # Name every point
    for name, x_val, y_val in zip(valid_types, means, medians):
        plt.annotate(name, (x_val, y_val),
                     xytext=(5, 5), textcoords='offset points')

    # Colorbar explains the point colors
    cbar = plt.colorbar(scatter)
    cbar.set_label('Skewness (Mean - Median)')

    axis_font = PLOT_CONFIG['AXIS_LABEL_FONT_SIZE']
    plt.xlabel('Weighted Mean Speed (km/h)', fontsize=axis_font)
    plt.ylabel('Weighted Median Speed (km/h)', fontsize=axis_font)
    plt.title('Comparison of Weighted Mean vs Median Speeds by Street Type', fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])

    # Same scale on both axes
    plt.axis('equal')
    plt.grid(alpha=0.3)

    plt.tight_layout()
    figure_path = os.path.join(OUTPUT_DIR, 'mean_vs_median_comparison.png')
    plt.savefig(figure_path, dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
    plt.close()

    # Summary table with the plotted values
    table = {
        'Street Type': valid_types,
        'Weighted Mean': means,
        'Weighted Median': medians,
        'Difference (Mean-Median)': skewness,
        'Effective N': [street_type_data[name]['effective_n'] for name in valid_types],
        'Total Length (km)': [street_type_data[name]['total_length_km'] for name in valid_types],
    }
    comparison_df = pd.DataFrame(table)

    csv_path = os.path.join(OUTPUT_DIR, 'mean_vs_median_comparison.csv')
    comparison_df.to_csv(csv_path, index=False)

    # Echo the table to the log
    log("\nComparison of weighted means and medians:")
    for _, row in comparison_df.iterrows():
        location_part = f"{row['Street Type']}: Mean = {row['Weighted Mean']:.2f}, Median = {row['Weighted Median']:.2f}, "
        spread_part = f"Diff = {row['Difference (Mean-Median)']:.2f}, Effective N = {row['Effective N']:.1f}"
        log(location_part + spread_part)

def perform_kruskal_wallis_and_dunn_tests(street_type_data, valid_types, test_type="mean"):
    """
    Run a Kruskal-Wallis test across street types and, if it is significant
    (p < 0.05), Dunn's post-hoc test with Benjamini-Hochberg correction.

    The tests operate on samples drawn at random from each type's
    length-weighted histogram (10,000 per type). Cohen's d and the mean
    difference are computed for every pair directly from the histograms.

    Args:
        street_type_data: Mapping of street type to a dict that provides
            'length_weighted_agg_hist' (32-bin histogram).
        valid_types: List of street types to compare.
        test_type: Label used only in the log message, the plot titles and
            the output file names; it does not change any computation or
            the sorting of results.

    Returns:
        (dunn_results, (d_matrix, mean_diffs)) where dunn_results is the
        DataFrame returned by posthoc_dunn, or None when the Kruskal-Wallis
        test is not significant. d_matrix and mean_diffs are square arrays
        indexed in the order of valid_types.

    Side effects (significant case only): writes the Dunn p-value CSV and
    heatmap, the effect-size CSV, and the Cohen's d matrix CSV and heatmap
    to OUTPUT_DIR.
    """
    log(f"\nPerforming Kruskal-Wallis test followed by Dunn's post-hoc test (sorting by {test_type})")
    
    # Generate samples from histograms for the Kruskal-Wallis test
    # We'll limit this to a reasonable number of samples per type to avoid memory issues
    # NOTE(review): every type gets the same sample size regardless of its
    # effective n, so the p-values reflect this fixed size.
    max_samples_per_type = 10000
    samples_by_type = []
    
    for street_type in valid_types:
        hist = street_type_data[street_type]['length_weighted_agg_hist']
        # NOTE(review): assumes the histogram has a non-zero sum; there is no guard here
        norm_hist = hist / np.sum(hist)
        
        # Generate samples efficiently using numpy's random choice
        # Uses NumPy's global random state and no seed is set in this
        # function, so results can differ between runs.
        bins = np.arange(32)
        samples = np.random.choice(bins, size=max_samples_per_type, p=norm_hist)
        samples_by_type.append(samples)
    
    # 1. Perform Kruskal-Wallis test
    h_stat, p_value = stats.kruskal(*samples_by_type)
    
    log(f"Kruskal-Wallis test results:")
    log(f"H statistic: {h_stat:.4f}")
    log(f"p-value: {p_value:.6f}")
    
    # Create DataFrame for Dunn's test (long format: one row per sample)
    all_samples = []
    all_types = []
    
    for i, samples in enumerate(samples_by_type):
        all_samples.extend(samples)
        all_types.extend([valid_types[i]] * len(samples))
    
    df_samples = pd.DataFrame({
        'speed': all_samples,
        'type': all_types
    })
    
    # Calculate Cohen's d effect sizes between all pairs
    # Store in a matrix for fast lookups (rows/columns follow valid_types)
    d_matrix = np.zeros((len(valid_types), len(valid_types)))
    mean_diffs = np.zeros((len(valid_types), len(valid_types)))
    
    for i, type1 in enumerate(valid_types):
        for j, type2 in enumerate(valid_types):
            if i < j:  # Only calculate once per pair
                hist1 = street_type_data[type1]['length_weighted_agg_hist']
                hist2 = street_type_data[type2]['length_weighted_agg_hist']
                
                # Calculate Cohen's d directly from histograms
                d, mean1, mean2 = calculate_cohens_d_from_histograms(hist1, hist2)
                d_matrix[i, j] = d
                d_matrix[j, i] = d  # Mirror for convenience
                
                diff = mean1 - mean2
                mean_diffs[i, j] = diff
                mean_diffs[j, i] = -diff  # Mirror with sign change
    
    # If the Kruskal-Wallis test is significant, perform Dunn's post-hoc test
    if p_value < 0.05:
        log("\nSignificant differences found between street types (p < 0.05)")
        
        # Perform Dunn's test (p-values adjusted with Benjamini-Hochberg FDR)
        log("Performing Dunn's test for post-hoc analysis")
        dunn_results = posthoc_dunn(df_samples, val_col='speed', group_col='type', p_adjust='fdr_bh')
        
        # Save Dunn's test results
        dunn_results.to_csv(os.path.join(OUTPUT_DIR, f'dunn_test_{test_type}_results.csv'))
        
        # Create a heatmap of p-values
        plt.figure(figsize=(14, 12))
        # Mask the upper triangle and the diagonal so only the lower triangle is drawn
        mask = np.triu(np.ones_like(dunn_results, dtype=bool))
        
        sns.heatmap(dunn_results, mask=mask, annot=True, cmap='coolwarm_r', 
                  vmin=0, vmax=0.05, center=0.025, 
                  annot_kws={"size": 10}, fmt='.3f')
        
        plt.title(f"Vergleich nach OSM 'type': Dunn's post-hoc p-Werte ({test_type}-basiert)\nSignifikanz: p < 0.05 (rot)", 
                fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])
        plt.tight_layout(pad=2.0)
        plt.savefig(os.path.join(OUTPUT_DIR, f'dunn_test_{test_type}_heatmap.png'), dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
        plt.close()
        
        # Find significantly different pairs and organize into a list
        sig_pairs = []
        for i in range(len(valid_types)):
            for j in range(i+1, len(valid_types)):
                type1 = valid_types[i]
                type2 = valid_types[j]
                
                # Get corresponding indices in dunn_results
                # (looked up by label, so the order of dunn_results need not match valid_types)
                try:
                    dunn_i = dunn_results.index.get_loc(type1)
                    dunn_j = dunn_results.columns.get_loc(type2)
                    p_val = dunn_results.iloc[dunn_i, dunn_j]
                    
                    if p_val < 0.05:  # Only include significant pairs
                        # Get effect size from pre-computed matrices
                        d = d_matrix[i, j]
                        diff = mean_diffs[i, j]
                        
                        sig_pairs.append((type1, type2, p_val, diff, d))
                except (KeyError, IndexError):
                    log(f"Warning: Could not find {type1} vs {type2} in Dunn results")
        
        # Sort by significance (ascending p-value)
        sig_pairs.sort(key=lambda x: x[2])
        
        log("\nSignificantly different street type pairs with length-weighted effect sizes:")
        for type1, type2, p_val, diff, d in sig_pairs:
            # Thresholds used throughout this function: < 0.2 small, < 0.75 medium, otherwise large
            effect_size_interp = "small" if d < 0.2 else "medium" if d < 0.75 else "large"
            log(f"{type1} vs {type2}: p={p_val:.6f}, weighted mean diff={diff:.2f} km/h, Cohen's d={d:.3f} ({effect_size_interp})")
        
        # Save effect size results to CSV
        effect_size_df = pd.DataFrame(sig_pairs, 
                                     columns=['Type1', 'Type2', 'p_value', 'weighted_mean_diff', 'cohens_d'])
        effect_size_df['effect_size_interpretation'] = effect_size_df['cohens_d'].apply(
            lambda d: "small" if d < 0.2 else "medium" if d < 0.75 else "large")
        effect_size_df.to_csv(os.path.join(OUTPUT_DIR, f'effect_size_{test_type}_results.csv'), index=False)

        # Create a matrix of Cohen's d values with the same structure as dunn_results
        # Important: We'll use the same index/column order as dunn_results
        cohens_d_df = pd.DataFrame(
            np.zeros((len(dunn_results.index), len(dunn_results.columns))),
            index=dunn_results.index,
            columns=dunn_results.columns
        )
        
        # First, initialize all values to NaN
        cohens_d_df.iloc[:, :] = np.nan
        
        # Then fill in Cohen's d for the significant pairs only, and only in
        # the upper triangle; every other cell stays NaN
        for i in range(len(dunn_results.index)):
            for j in range(len(dunn_results.columns)):
                if i < j:  # Upper triangle (to match standard scientific reporting)
                    type_i = dunn_results.index[i]
                    type_j = dunn_results.columns[j]
                    
                    # Check if this pair is statistically significant
                    if dunn_results.iloc[i, j] < 0.05:
                        # Get indices in valid_types list
                        try:
                            idx_i = valid_types.index(type_i)
                            idx_j = valid_types.index(type_j)
                            
                            # Get Cohen's d value from our precomputed matrix
                            # (d_matrix is mirrored above, so both branches read the same value)
                            if idx_i < idx_j:
                                cohens_d_df.iloc[i, j] = d_matrix[idx_i, idx_j]
                            else:
                                cohens_d_df.iloc[i, j] = d_matrix[idx_j, idx_i]
                        except ValueError:
                            log(f"Warning: Could not find {type_i} or {type_j} in valid_types")
        
        # Save the Cohen's d matrix (only significant differences in upper triangle)
        cohens_d_df.to_csv(os.path.join(OUTPUT_DIR, f'cohens_d_matrix_{test_type}.csv'))
        
        # Create a heatmap visualization of Cohen's d values
        plt.figure(figsize=(14, 12))
        
        # Use a diverging colormap to highlight differences
        cmap = sns.diverging_palette(240, 10, as_cmap=True)
        
        # Create mask for lower triangle and diagonal (opposite of Dunn's test)
        mask = np.tril(np.ones_like(cohens_d_df, dtype=bool))
        
        # Create heatmap for Cohen's d values
        sns.heatmap(cohens_d_df, annot=True, cmap=cmap, 
                  vmin=0, vmax=1.5, center=0.75,  # 0.75 is the threshold for "large" effect
                  annot_kws={"size": 10}, fmt='.3f',
                  mask=mask)  # Apply mask to hide lower triangle
        
        # Add colorbar labels for effect size interpretation
        # The last axes of the figure is taken to be the colorbar added by seaborn
        colorbar = plt.gcf().axes[-1]
        colorbar.text(3.5, 0.1, 'Kleiner Effekt (<0.2)', ha='left', va='center')
        colorbar.text(3.5, 0.4, 'Mittlerer Effekt (0.2-0.75)', ha='left', va='center')
        colorbar.text(3.5, 0.9, 'Großer Effekt (>0.75)', ha='left', va='center')
        
        plt.title(f"Vergleich nach OSM 'type': Effektstärken nach Cohen's d ({test_type}-basiert)\nDifferenzen zwischen OSM 'type' (04/23 - 12/24)", 
                 fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])
        plt.tight_layout(pad=2.0)
        plt.savefig(os.path.join(OUTPUT_DIR, f'cohens_d_heatmap_{test_type}.png'), dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
        plt.close()
        
        return dunn_results, (d_matrix, mean_diffs)
    else:
        # Not significant: no post-hoc test and no files are written
        log("No significant differences found between street types (p >= 0.05)")
        return None, (d_matrix, mean_diffs)

def _significant_pairs(dunn_results, ordered_types, alpha=0.05):
    """
    Collect the pairs of street types whose Dunn's test p-value is below alpha.

    Args:
        dunn_results: DataFrame of pairwise p-values, read positionally with .iloc.
            Row/column i is assumed to correspond to ordered_types[i]
            (TODO confirm against perform_kruskal_wallis_and_dunn_tests).
        ordered_types: Street type names in the order that was used for the test.
        alpha: Significance threshold.

    Returns:
        set: Each significant pair as an alphabetically sorted 2-tuple, so that the
        same two street types always yield the same key no matter how
        ordered_types is sorted.
    """
    pairs = set()
    n_types = len(ordered_types)
    for i in range(n_types):
        for j in range(i + 1, n_types):
            if dunn_results.iloc[i, j] < alpha:
                pairs.add(tuple(sorted((ordered_types[i], ordered_types[j]))))
    return pairs


def _histogram_comparison_inputs(street_type_data, ordered_types):
    """Build the (histograms, labels, stats_info) lists for the histogram comparison plots."""
    histograms = [street_type_data[t]['length_weighted_agg_hist'] for t in ordered_types]
    labels = list(ordered_types)
    stats_info = [
        {
            'stats': street_type_data[t]['stats'],
            'total_length_km': street_type_data[t]['total_length_km'],
            'effective_n': street_type_data[t]['effective_n']
        }
        for t in ordered_types
    ]
    return histograms, labels, stats_info


def analyze_street_types():
    """
    Main analysis function that consistently uses length-weighted histograms for all calculations.

    Steps:
        1. Read PARQUET_FILE row group by row group (columns COLUMN_TYPE, COLUMN_HIST,
           COLUMN_LENGTH) and, for every street type in STREET_TYPES_TO_ANALYZE,
           accumulate the segment histograms multiplied by the segment length.
        2. Keep the street types with at least 30 usable segments and compute their
           statistics from the aggregated histogram.
        3. Create the plots, run the Kruskal-Wallis / Dunn's tests for the mean-sorted
           and the median-sorted order, and compare the significant pairs.
        4. Write 'street_type_summary_stats.csv' into OUTPUT_DIR.

    Returns:
        dict | None: Mapping street type -> dict with the keys 'lengths',
        'length_weighted_agg_hist' and 'count' (plus 'stats', 'effective_n' and
        'total_length_km' for the types with sufficient data). None if PARQUET_FILE
        does not exist.
    """
    log(f"Starting consistent length-weighted analysis of parquet file: {PARQUET_FILE}")

    # Verify the parquet file exists
    if not os.path.exists(PARQUET_FILE):
        log(f"Error: File {PARQUET_FILE} not found")
        return None

    # Get file info
    parquet_file = pq.ParquetFile(PARQUET_FILE)
    num_row_groups = parquet_file.metadata.num_row_groups
    total_rows = parquet_file.metadata.num_rows
    log(f"Parquet file has {num_row_groups} row groups and approximately {total_rows:,} rows")

    # Only these columns are read from disk
    columns = [COLUMN_TYPE, COLUMN_HIST, COLUMN_LENGTH]

    # street type -> accumulated data
    street_type_data = {}

    log(f"Processing {num_row_groups} row groups to extract speed data by street type")

    # Rows converted to pandas at once within a row group
    CHUNK_SIZE = 50000
    # Number of speed bins of the aggregated histogram (one per km/h, 0-31)
    n_bins = 32
    # Minimum number of usable segments for a street type to be analyzed
    min_segments = 30
    # Segments dropped because their histogram did not have n_bins entries
    malformed_histograms = 0

    for rg in tqdm(range(num_row_groups), desc="Processing row groups"):
        try:
            # Get the row group size to determine number of chunks needed
            row_group_size = parquet_file.metadata.row_group(rg).num_rows
            num_chunks = (row_group_size + CHUNK_SIZE - 1) // CHUNK_SIZE  # Ceiling division

            # Read the entire row group
            table = parquet_file.read_row_group(rg, columns=columns)

            for chunk_idx in tqdm(range(num_chunks), desc=f"Chunks in row group {rg}", leave=False):
                offset = chunk_idx * CHUNK_SIZE
                length = min(CHUNK_SIZE, row_group_size - offset)

                # Extract just this chunk from the row group
                chunk_table = table.slice(offset, length)
                df_chunk = chunk_table.to_pandas()

                # Drop rows with missing values, then keep only the requested street types
                df_chunk = df_chunk.dropna(subset=[COLUMN_TYPE, COLUMN_HIST, COLUMN_LENGTH])
                df_chunk = df_chunk[df_chunk[COLUMN_TYPE].isin(STREET_TYPES_TO_ANALYZE)]

                for _, row in df_chunk.iterrows():
                    street_type = row[COLUMN_TYPE]
                    hist = parse_histogram(row[COLUMN_HIST])
                    segment_length = row[COLUMN_LENGTH]

                    # A histogram of any other length would either raise on the in-place
                    # add below (aborting the rest of this row group) or be silently
                    # broadcast over all bins, so such segments are skipped.
                    if len(hist) != n_bins:
                        if len(hist) > 0:
                            malformed_histograms += 1
                        continue

                    if np.sum(hist) > 0:
                        # Initialize entry for this street type if it doesn't exist
                        if street_type not in street_type_data:
                            street_type_data[street_type] = {
                                'lengths': [],  # Store segment lengths
                                'length_weighted_agg_hist': np.zeros(n_bins),  # Aggregated histogram
                                'count': 0
                            }

                        entry = street_type_data[street_type]
                        entry['lengths'].append(segment_length)
                        # Multiply histogram by segment length for length-weighting
                        entry['length_weighted_agg_hist'] += hist * segment_length
                        entry['count'] += 1

                # Clean up to free memory after each chunk
                del df_chunk, chunk_table
                force_gc()

            # Clean up the entire row group data
            del table
            force_gc()

        except Exception as e:
            # Best effort: report the failure and continue with the next row group
            log(f"Error processing row group {rg}: {e}")

    if malformed_histograms:
        log(f"Warning: Skipped {malformed_histograms:,} segments whose histogram did not have {n_bins} bins")

    # Count total processed segments
    total_segments = sum(data['count'] for data in street_type_data.values())
    log(f"Processed {total_segments:,} total street segments across {len(street_type_data)} different street types")

    # Check if we have data for all the requested street types
    for street_type in STREET_TYPES_TO_ANALYZE:
        if street_type not in street_type_data:
            log(f"Warning: No data found for street type '{street_type}'")
        elif street_type_data[street_type]['count'] < min_segments:
            log(f"Warning: Insufficient data for street type '{street_type}' (only {street_type_data[street_type]['count']} segments)")

    # Get the street types that have sufficient data (at least 30 segments)
    valid_types = [st for st in STREET_TYPES_TO_ANALYZE
                   if st in street_type_data and street_type_data[st]['count'] >= min_segments]

    log(f"Analysis will include {len(valid_types)} street types with sufficient data")
    for i, type_name in enumerate(valid_types, 1):
        count = street_type_data[type_name]['count']
        log(f"{i}. {type_name}: {count:,} segments")

    # Calculate statistics ONCE per street type from the length-weighted histogram
    # and store them for reuse by all plots and tests
    log("Calculating length-weighted statistics for each street type")
    for street_type in valid_types:
        entry = street_type_data[street_type]

        # Segment lengths are summed and divided by 1000 (COLUMN_LENGTH is in meters)
        total_length_km = sum(entry['lengths']) / 1000
        effective_n = calculate_effective_n(entry['lengths'])
        # Named type_stats so that the imported scipy `stats` module is not shadowed
        type_stats = calculate_all_weighted_statistics(entry['length_weighted_agg_hist'])

        entry['stats'] = type_stats
        entry['effective_n'] = effective_n
        entry['total_length_km'] = total_length_km

        log(f"{street_type}: weighted mean = {type_stats['mean']:.2f} km/h, weighted median = {type_stats['median']:.2f} km/h, effective n = {effective_n:.1f}")

    # Sort valid types by weighted mean / weighted median (descending)
    valid_types_sorted_by_mean = sorted(valid_types,
                                        key=lambda x: street_type_data[x]['stats']['mean'],
                                        reverse=True)
    valid_types_sorted_by_median = sorted(valid_types,
                                          key=lambda x: street_type_data[x]['stats']['median'],
                                          reverse=True)

    # Create visualizations
    log("Creating visualizations based on length-weighted histograms")

    # 1. Create length-weighted histogram comparison (sorted by mean)
    create_length_weighted_histogram_comparison(
        *_histogram_comparison_inputs(street_type_data, valid_types_sorted_by_mean))

    # 2. Create median-sorted histogram comparison
    create_median_weighted_histogram_comparison(
        *_histogram_comparison_inputs(street_type_data, valid_types_sorted_by_median))

    # 3. Create boxplot visualization
    create_boxplot_from_histograms(street_type_data, valid_types_sorted_by_mean)

    # 4. Create violin plot
    create_violin_plot_from_histograms(street_type_data, valid_types_sorted_by_mean)

    # 5. Create mean vs median comparison
    create_mean_vs_median_comparison(street_type_data, valid_types_sorted_by_mean)

    # 6. Perform Kruskal-Wallis and Dunn's tests based on mean-sorted order
    dunn_results_mean, _ = perform_kruskal_wallis_and_dunn_tests(
        street_type_data, valid_types_sorted_by_mean, "mean")

    # 7. Perform Kruskal-Wallis and Dunn's tests based on median-sorted order
    dunn_results_median, _ = perform_kruskal_wallis_and_dunn_tests(
        street_type_data, valid_types_sorted_by_median, "median")

    # 8. Compare the two approaches if both tests were significant
    if dunn_results_mean is not None and dunn_results_median is not None:
        log("\nComparing mean-based and median-based significant pair findings:")

        # Pairs are stored in a canonical order; otherwise two types that swap places
        # between the mean-sorted and the median-sorted list would show up as
        # (a, b) in one set and (b, a) in the other and never intersect.
        mean_pairs = _significant_pairs(dunn_results_mean, valid_types_sorted_by_mean)
        median_pairs = _significant_pairs(dunn_results_median, valid_types_sorted_by_median)

        common_pairs = mean_pairs & median_pairs
        only_mean_pairs = mean_pairs - median_pairs
        only_median_pairs = median_pairs - mean_pairs

        log(f"Number of pairs significant in both tests: {len(common_pairs)}")
        log(f"Number of pairs significant only in mean-based test: {len(only_mean_pairs)}")
        log(f"Number of pairs significant only in median-based test: {len(only_median_pairs)}")

        # Sorted so that the log output is reproducible between runs
        if common_pairs:
            log("\nPairs significant in both tests:")
            for pair in sorted(common_pairs):
                log(f"{pair[0]} vs {pair[1]}")

        if only_mean_pairs:
            log("\nPairs significant only in mean-based test:")
            for pair in sorted(only_mean_pairs):
                log(f"{pair[0]} vs {pair[1]}")

        if only_median_pairs:
            log("\nPairs significant only in median-based test:")
            for pair in sorted(only_median_pairs):
                log(f"{pair[0]} vs {pair[1]}")

        # Create Venn diagram of significant pairs if matplotlib_venn is available
        try:
            from matplotlib_venn import venn2

            plt.figure(figsize=(10, 8))
            venn2(subsets=(len(only_mean_pairs), len(only_median_pairs), len(common_pairs)),
                  set_labels=('Mean-based', 'Median-based'))
            plt.title("Significant Differences Found: Mean vs Median Approach", fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])
            plt.savefig(os.path.join(OUTPUT_DIR, 'mean_vs_median_venn.png'), dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
            plt.close()
        except ImportError:
            log("matplotlib_venn not available for creating Venn diagram. Install with 'pip install matplotlib-venn'")

    # 9. Create summary statistics table
    summary_data = []
    for street_type in valid_types_sorted_by_mean:
        entry = street_type_data[street_type]
        type_stats = entry['stats']

        summary_data.append({
            'type': street_type,
            'count': entry['count'],
            'total_length_km': entry['total_length_km'],
            'effective_n': entry['effective_n'],
            'mean': type_stats['mean'],
            'median': type_stats['median'],
            'std': type_stats['std'],
            'min': type_stats['min'],
            'max': type_stats['max'],
            'percentile_5': type_stats['percentile_5'],
            'percentile_25': type_stats['percentile_25'],
            'percentile_75': type_stats['percentile_75'],
            'percentile_95': type_stats['percentile_95'],
            # Mean minus median, used here as a simple asymmetry indicator
            'skewness': type_stats['mean'] - type_stats['median']
        })

    summary_df = pd.DataFrame(summary_data)
    summary_df.to_csv(os.path.join(OUTPUT_DIR, 'street_type_summary_stats.csv'), index=False)

    log("Consistent length-weighted analysis complete. All results are based on the same length-weighted histograms.")
    return street_type_data

# Run the analysis when this cell/script is executed directly.
# The returned per-street-type dictionary (or None if the parquet file is missing)
# is kept in `street_data` for further inspection.
if __name__ == "__main__":
    street_data = analyze_street_types()

Starting consistent length-weighted analysis of parquet file: data/network_all_months_plus_25833_length_with_fahrradstrasse.parquet
Parquet file has 1 row groups and approximately 592,136 rows
Processing 1 row groups to extract speed data by street type


Processing row groups:   0%|          | 0/1 [00:00<?, ?it/s]

Chunks in row group 0:   0%|          | 0/12 [00:00<?, ?it/s]

Processed 587,967 total street segments across 15 different street types
Analysis will include 15 street types with sufficient data
1. footway: 236,139 segments
2. residential: 121,106 segments
3. secondary: 41,187 segments
4. service_driveway: 31,506 segments
5. tertiary: 30,352 segments
6. service: 28,692 segments
7. path: 28,627 segments
8. cycleway: 16,089 segments
9. track: 13,789 segments
10. primary: 12,001 segments
11. service_parking_aisle: 10,052 segments
12. unclassified: 6,897 segments
13. living_street: 6,162 segments
14. steps: 4,065 segments
15. pedestrian: 1,303 segments
Calculating length-weighted statistics for each street type
footway: weighted mean = 18.76 km/h, weighted median = 18.76 km/h, effective n = 91694.3
residential: weighted mean = 18.98 km/h, weighted median = 18.92 km/h, effective n = 58311.4
secondary: weighted mean = 20.86 km/h, weighted median = 20.73 km/h, effective n = 19492.7
service_driveway: weighted mean = 12.51 km/h, weighted median = 12.05 km/

In [1]:
# copy_clipped Vergleich zwischen den OSM "type" Kategorien: Signifikanztest mit Kruskal-Wallis Test, Dunn's post-hoc Test, Cohen's d Effektmatrix (Segmente gewichtet nach Länge)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import gaussian_kde
import pyarrow.parquet as pq
import gc
from tqdm.notebook import tqdm
import matplotlib.patches as mpatches
import os
from scikit_posthocs import posthoc_dunn

# Configuration - modify to match your setup
PARQUET_FILE = "data/network_all_months_plus_25833_length_with_fahrradstrasse.parquet"  # Input parquet file
OUTPUT_DIR = "analysis_results/005_Infra_OSM_type_CLIP"  # Directory for the CSV/PNG files written by this cell
COLUMN_TYPE = 'type'  # Column for street type
COLUMN_HIST = '2304-2412_speeds'  # Column for speed histogram data
COLUMN_LENGTH = 'length_m'  # Column for length in meters

# Make sure output directory exists (no error if it is already there)
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Plot configurations shared by the plotting functions
PLOT_CONFIG = {
    # General (font sizes are passed as matplotlib `fontsize`, DPI to `savefig`)
    'TEXT_FONT_SIZE': 10,
    'TITLE_FONT_SIZE': 16,
    'AXIS_LABEL_FONT_SIZE': 12,
    'LEGEND_FONT_SIZE': 9,
    'DPI': 300,
    
    # Colors
    'BOX_COLOR_1': '#a0ddff',  # Face color of the boxes in the boxplot
    'BOX_COLOR_2': '#c8e9a0',  # Fill color of the violin bodies
    'MEDIAN_COLOR': 'darkblue',  # Median lines and median value labels
    'MEAN_COLOR': 'red',  # Mean star markers and mean value labels
    
    # Labels (German, shown in the figures)
    'HISTOGRAM_TITLE': 'Geschwindigkeitshistogramm nach OSM "type", 04/23 - 12/24',
    # NOTE(review): the boxplot and violin plot functions set hard-coded titles;
    # the next two entries do not appear to be used by them - confirm before removing
    'BOXPLOT_TITLE': 'Boxplot Geschwindigkeitsverteilung nach OSM "type", 04/23 - 12/24',
    'VIOLINPLOT_TITLE': 'Geigenplot Geschwindigkeitsverteilung nach OSM "type", 04/23 - 12/24',
    'X_LABEL': 'Geschwindigkeit (km/h)',  # Speed axis label (x-axis of histograms, y-axis of box/violin plots)
    'Y_LABEL': 'Prozentsatz (%)',  # Percentage axis label of the histograms
    
    # Box annotations (keyword arguments in matplotlib `bbox` style)
    'ANNOTATION_BBOX_STYLE': dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8)
}

# Specific street types to analyze (matched against the values of COLUMN_TYPE)
STREET_TYPES_TO_ANALYZE = [
    'footway',
    'residential',
    'secondary',
    'service_driveway',
    'tertiary',
    'service',
    'path',
    'cycleway',
    'track',
    'primary',
    'service_parking_aisle',
    'unclassified',
    'living_street',
    'steps',
    'pedestrian'
]
def log(message):
    """Write `message` to standard output (single place to swap in real logging later)."""
    print(message)

def force_gc():
    """Trigger a garbage collection run; the collector's return value is discarded."""
    gc.collect()

def parse_histogram(hist_str):
    """
    Parse a histogram string such as "[1,2,3]" into a float numpy array.

    Args:
        hist_str: Comma-separated numbers, optionally wrapped in square brackets
            and surrounded by whitespace. Any non-string value (e.g. None or NaN)
            is treated as a missing histogram.

    Returns:
        np.ndarray: The parsed values. An empty array for an empty histogram string
        ("" or "[]"), and np.zeros(32) for non-string input or when parsing fails
        (the failure is reported via log()).
    """
    if not isinstance(hist_str, str):
        return np.zeros(32)  # Return zeros for missing histograms

    try:
        # Strip whitespace first so that the brackets are really at the ends
        text = hist_str.strip().strip('[]').strip()
        if not text:
            # Do not rely on how np.fromstring treats empty input
            return np.array([], dtype=float)

        try:
            # Fast NumPy parsing
            return np.fromstring(text, sep=',')
        except (ValueError, DeprecationWarning):
            # Fallback to manual parsing if the NumPy method rejects the text.
            # Deliberately not a bare `except:` so that KeyboardInterrupt and
            # SystemExit are never swallowed here.
            return np.array([float(x) for x in text.split(',')])
    except Exception as e:
        log(f"Error parsing histogram: {e}")
        return np.zeros(32)

def calculate_all_weighted_statistics(histogram):
    """
    Derive summary statistics from a 32-bin speed histogram (bin i <-> i km/h).

    Returns a dict with the keys 'mean', 'median', 'std', 'percentile_5',
    'percentile_25', 'percentile_75', 'percentile_95', 'min' and 'max'.
    Every value is NaN when the histogram sums to zero.
    """
    result_keys = ('mean', 'median', 'std',
                   'percentile_5', 'percentile_25', 'percentile_75', 'percentile_95',
                   'min', 'max')

    total_count = np.sum(histogram)
    if total_count == 0:
        return dict.fromkeys(result_keys, np.nan)

    speed_bins = np.arange(32)  # 0-31 km/h
    probabilities = histogram / total_count

    # First and second central moment of the normalized histogram
    mean = np.sum(speed_bins * probabilities)
    std = np.sqrt(np.sum(((speed_bins - mean) ** 2) * probabilities))

    # Quantiles are read off the cumulative distribution by linear interpolation;
    # each quantile is looked up with its own np.interp call
    cumulative = np.cumsum(probabilities)
    quantile_levels = {
        'percentile_5': 0.05,
        'percentile_25': 0.25,
        'median': 0.5,
        'percentile_75': 0.75,
        'percentile_95': 0.95,
    }
    quantiles = {key: np.interp(level, cumulative, speed_bins)
                 for key, level in quantile_levels.items()}

    # Lowest and highest bin that actually contain data
    occupied = np.nonzero(histogram)[0]

    values = dict(quantiles)
    values['mean'] = mean
    values['std'] = std
    values['min'] = speed_bins[occupied[0]]
    values['max'] = speed_bins[occupied[-1]]

    # Emit the keys in the established order
    return {key: values[key] for key in result_keys}

def calculate_cohens_d_from_histograms(hist1, hist2):
    """
    Compute the absolute Cohen's d between two 32-bin speed histograms.

    Returns a tuple (d, mean1, mean2). The pooled standard deviation is the square
    root of the average of both variances; d is 0 when that pooled value is 0.
    """
    speed_bins = np.arange(32)  # 0-31 km/h

    def _mean_and_variance(hist):
        # Normalize to probabilities; an all-zero histogram stays all-zero
        total = np.sum(hist)
        if total > 0:
            probs = hist / total
        else:
            probs = np.zeros_like(hist)
        mean = np.sum(speed_bins * probs)
        variance = np.sum(((speed_bins - mean) ** 2) * probs)
        return mean, variance

    mean1, var1 = _mean_and_variance(hist1)
    mean2, var2 = _mean_and_variance(hist2)

    pooled_std = np.sqrt((var1 + var2) / 2)
    if pooled_std > 0:
        d = abs(mean1 - mean2) / pooled_std
    else:
        d = 0

    return d, mean1, mean2

def calculate_effective_n(weights):
    """
    Effective sample size of a set of weights: (sum of w)^2 / (sum of w^2).

    Returns 0 for an empty sequence or when the weights sum to zero.
    """
    if len(weights) == 0:
        return 0

    weight_sum = np.sum(weights)
    if weight_sum == 0:
        return 0

    squared_sum = np.sum(np.array(weights) ** 2)
    return weight_sum ** 2 / squared_sum
def create_length_weighted_histogram_comparison(histograms, street_types, stats_info):
    """
    Create a length-weighted histogram comparison of multiple street types.

    Each histogram is normalized to percentages and drawn as one bar series; a solid
    vertical line marks the mean of each street type. The figure is saved as
    'length_weighted_histograms.png' in OUTPUT_DIR.

    Args:
        histograms: Sequence of 32-bin histograms (numpy arrays), one per street type.
        street_types: Street type names, parallel to `histograms`.
        stats_info: Sequence of dicts, parallel to `histograms`, with the keys
            'stats' (dict containing 'mean'), 'total_length_km' and 'effective_n'.
    """
    log("Creating length-weighted histogram comparison")

    # Create speed bins
    speed_bins = np.arange(32)

    # All series of one speed bin share a band of width 0.8 centered on the bin
    n_series = len(histograms)
    bar_width = 0.8 / n_series if n_series else 0.8

    # Create figure
    plt.figure(figsize=(16, 10))

    # Plot histograms
    for i, (hist, street_type) in enumerate(zip(histograms, street_types)):
        info = stats_info[i]

        # Normalize histogram
        hist_norm = hist / np.sum(hist) if np.sum(hist) > 0 else np.zeros_like(hist)

        # The offset is derived from the bar width. A fixed step of 0.1 per series
        # pushes the bars of later series over the neighboring speed bins as soon
        # as more than 8 street types are plotted.
        offset = (i - (n_series - 1) / 2) * bar_width

        # Plot histogram
        plt.bar(speed_bins + offset, hist_norm * 100, alpha=0.7,
                label=f"{street_type} ({info['total_length_km']:.1f} km, n={info['effective_n']:.1f})",
                width=bar_width)

        # Add vertical line for mean
        plt.axvline(x=info['stats']['mean'], color=f"C{i}", linestyle='-',
                    label=f"{street_type} Mean: {info['stats']['mean']:.2f} km/h")

    # Add legends
    plt.legend(loc='upper left', fontsize=PLOT_CONFIG['LEGEND_FONT_SIZE'])

    # Add labels and title
    plt.xlabel(PLOT_CONFIG['X_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    plt.ylabel(PLOT_CONFIG['Y_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    plt.title(PLOT_CONFIG['HISTOGRAM_TITLE'], fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])
    plt.grid(alpha=0.3)
    plt.xticks(range(0, 32, 2))

    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "length_weighted_histograms.png"), dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
    log("Saved length-weighted histogram comparison")
    plt.close()

def create_median_weighted_histogram_comparison(histograms, street_types, stats_info):
    """
    Create a median-based histogram comparison of multiple street types.

    Each histogram is normalized to percentages and drawn as one bar series; a dashed
    vertical line marks the median of each street type. The figure is saved as
    'median_weighted_histograms.png' in OUTPUT_DIR.

    Args:
        histograms: Sequence of 32-bin histograms (numpy arrays), one per street type.
        street_types: Street type names, parallel to `histograms`.
        stats_info: Sequence of dicts, parallel to `histograms`, with the keys
            'stats' (dict containing 'median'), 'total_length_km' and 'effective_n'.
    """
    log("Creating median-weighted histogram comparison")

    # Create speed bins
    speed_bins = np.arange(32)

    # All series of one speed bin share a band of width 0.8 centered on the bin
    n_series = len(histograms)
    bar_width = 0.8 / n_series if n_series else 0.8

    # Create figure
    plt.figure(figsize=(16, 10))

    # Plot histograms
    for i, (hist, street_type) in enumerate(zip(histograms, street_types)):
        info = stats_info[i]

        # Normalize histogram
        hist_norm = hist / np.sum(hist) if np.sum(hist) > 0 else np.zeros_like(hist)

        # The offset is derived from the bar width. A fixed step of 0.1 per series
        # pushes the bars of later series over the neighboring speed bins as soon
        # as more than 8 street types are plotted.
        offset = (i - (n_series - 1) / 2) * bar_width

        # Plot histogram
        plt.bar(speed_bins + offset, hist_norm * 100, alpha=0.7,
                label=f"{street_type} ({info['total_length_km']:.1f} km, n={info['effective_n']:.1f})",
                width=bar_width)

        # Add vertical line for median
        plt.axvline(x=info['stats']['median'], color=f"C{i}", linestyle='--',
                    label=f"{street_type} Median: {info['stats']['median']:.2f} km/h")

    # Add legends
    plt.legend(loc='upper left', fontsize=PLOT_CONFIG['LEGEND_FONT_SIZE'])

    # Add labels and title
    plt.xlabel(PLOT_CONFIG['X_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    plt.ylabel(PLOT_CONFIG['Y_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    plt.title("Geschwindigkeitshistogramm nach OSM \"type\" (sortiert nach Median), 04/23 - 12/24",
              fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])
    plt.grid(alpha=0.3)
    plt.xticks(range(0, 32, 2))

    # Save figure
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, "median_weighted_histograms.png"), dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
    log("Saved median-weighted histogram comparison")
    plt.close()
def create_boxplot_from_histograms(street_type_data, valid_types_sorted):
    """
    Create boxplot using statistics derived from length-weighted histograms.

    The boxes are drawn from precomputed statistics (no raw samples): box = 25th to
    75th percentile, whiskers = 5th and 95th percentile, line = median, star = mean.
    The figure is saved as 'street_type_boxplot.png' in OUTPUT_DIR.

    Args:
        street_type_data: Mapping street type -> dict with the keys 'stats'
            (dict with 'mean', 'median', 'percentile_5/25/75/95'), 'effective_n',
            'count' and 'total_length_km'.
        valid_types_sorted: Street types to plot, in left-to-right order.
    """
    log("Creating boxplot from length-weighted histograms")

    fig, ax = plt.subplots(figsize=(16, 10))

    # Prepare statistics
    all_stats = []
    for street_type in valid_types_sorted:
        entry = street_type_data[street_type]
        type_stats = entry['stats']

        all_stats.append({
            'mean': type_stats['mean'],
            'median': type_stats['median'],
            'q1': type_stats['percentile_25'],
            'q3': type_stats['percentile_75'],
            'whislo': type_stats['percentile_5'],
            'whishi': type_stats['percentile_95'],
            # The parenthesis opened before "effective_n" is closed after "km"
            'label': (f"{street_type}\n"
                      f"(effective_n={entry['effective_n']:.1f}\n"
                      f"{entry['count']:,} Segmente\n"
                      f"{entry['total_length_km']:.1f} km)")
        })

    # Create boxplot using Axes.bxp(), which draws from precomputed statistics
    ax.bxp(
        [{
            'med': box['median'],
            'q1': box['q1'],
            'q3': box['q3'],
            'whislo': box['whislo'],
            'whishi': box['whishi'],
            'fliers': [],
            'label': box['label']
        } for box in all_stats],
        positions=range(len(all_stats)),
        showfliers=False,
        patch_artist=True,
        boxprops={'facecolor': PLOT_CONFIG['BOX_COLOR_1'], 'alpha': 0.7},
        medianprops={'color': PLOT_CONFIG['MEDIAN_COLOR'], 'linewidth': 2},
        whiskerprops={'color': 'black', 'linestyle': '-', 'linewidth': 1},
        capprops={'color': 'black', 'linewidth': 1}
    )

    # Add mean markers and annotations
    for i, box in enumerate(all_stats):
        # Mean star
        ax.scatter(
            i, box['mean'],
            marker='*',
            s=150,
            color=PLOT_CONFIG['MEAN_COLOR'],
            zorder=3
        )

        # Blue median text (above the median line)
        ax.text(
            i, box['median'] + 0.7,
            f"{box['median']:.1f}",
            ha='center', va='bottom',
            color=PLOT_CONFIG['MEDIAN_COLOR'],
            fontsize=PLOT_CONFIG['TEXT_FONT_SIZE'],
            weight='bold'
        )

        # Red mean text (below the mean marker)
        ax.text(
            i, box['mean'] - 0.7,
            f"{box['mean']:.1f}",
            ha='center', va='top',
            color=PLOT_CONFIG['MEAN_COLOR'],
            fontsize=PLOT_CONFIG['TEXT_FONT_SIZE'],
            weight='bold'
        )

    # Formatting
    ax.set_xticks(range(len(all_stats)))
    ax.set_xticklabels([box['label'] for box in all_stats], rotation=45, ha='right')
    # Speeds are on the y-axis here, so the speed label ('X_LABEL') is used for it
    ax.set_ylabel(PLOT_CONFIG['X_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    ax.set_title(
        "Boxplot Geschwindigkeitsverteilungen DB Rad+ nach OSM 'type'\n"
        "Zeitraum 04/23 - 12/24",
        fontsize=PLOT_CONFIG['TITLE_FONT_SIZE']
    )
    ax.grid(axis='y', alpha=0.3)

    # Legend
    legend_elements = [
        mpatches.Patch(facecolor=PLOT_CONFIG['BOX_COLOR_1'], alpha=0.7, label='IQR'),
        plt.Line2D([0], [0], color=PLOT_CONFIG['MEDIAN_COLOR'], lw=2, label='Median'),
        plt.Line2D([0], [0], marker='*', color=PLOT_CONFIG['MEAN_COLOR'], markersize=10,
                   linestyle='None', label='Mittelwert')
    ]
    ax.legend(handles=legend_elements, loc='upper right')

    plt.tight_layout()
    plt.savefig(
        os.path.join(OUTPUT_DIR, 'street_type_boxplot.png'),
        dpi=PLOT_CONFIG['DPI'],
        bbox_inches='tight'
    )
    plt.close()
def create_violin_plot_from_histograms(street_type_data, valid_types_sorted):
    """
    Draw one violin per street type from its length-weighted speed histogram.

    The violin outline is a weighted Gaussian KDE over the 32 speed bins, scaled to
    a half-width of 0.4. Medians are drawn as horizontal lines, means as stars, both
    with their numeric value. Saves 'street_type_weighted_violin.png' in OUTPUT_DIR.
    """
    log("Creating violin plot from length-weighted histograms")

    _, ax = plt.subplots(figsize=(16, 10))

    bin_speeds = np.arange(32)
    speed_grid = np.linspace(0, 31, 100)
    median_color = PLOT_CONFIG['MEDIAN_COLOR']
    mean_color = PLOT_CONFIG['MEAN_COLOR']
    text_size = PLOT_CONFIG['TEXT_FONT_SIZE']

    # Violin bodies with median line and median value
    for position, type_name in enumerate(valid_types_sorted):
        type_entry = street_type_data[type_name]

        # Weighted KDE of the bin speeds, evaluated on the plotting grid
        estimator = gaussian_kde(
            bin_speeds,
            weights=type_entry['length_weighted_agg_hist'],
            bw_method=0.25
        )
        density = estimator(speed_grid)

        # Scale so that the widest point reaches 0.4 to each side
        half_width = 0.4 * density / density.max()

        ax.fill_betweenx(
            speed_grid,
            position - half_width,
            position + half_width,
            color=PLOT_CONFIG['BOX_COLOR_2'],
            alpha=0.7
        )

        median = type_entry['stats']['median']
        widest = half_width.max()
        ax.hlines(
            median,
            position - widest,
            position + widest,
            colors=median_color,
            linewidths=1.5
        )

        # Median value just above its line
        ax.text(
            position, median + 0.35,
            f"{median:.1f}",
            ha='center', va='bottom',
            color=median_color,
            fontsize=text_size,
            weight='bold'
        )

    # Weighted means as stars
    means = [street_type_data[name]['stats']['mean'] for name in valid_types_sorted]
    ax.scatter(
        range(len(valid_types_sorted)),
        means,
        marker='*',
        s=150,
        color=mean_color,
        zorder=3
    )

    # Mean value just below each star
    for position, mean in enumerate(means):
        ax.text(
            position, mean - 0.35,
            f"{mean:.1f}",
            ha='center', va='top',
            color=mean_color,
            fontsize=text_size,
            weight='bold'
        )

    # Tick labels carry the per-type metadata
    tick_labels = []
    for name in valid_types_sorted:
        meta = street_type_data[name]
        tick_labels.append(
            f"{name}\n"
            f"n={meta['effective_n']:.1f}\n"
            f"{meta['count']:,} segments\n"
            f"{meta['total_length_km']:.1f} km"
        )

    ax.set_xticks(range(len(tick_labels)))
    ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=text_size)

    # Axis label, title and grid
    ax.set_ylabel(PLOT_CONFIG['X_LABEL'], fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    ax.set_title(
        "Geschwindigkeitsverteilungen DB Rad+ nach OSM 'type'\n"
        "Geigendiagramm, Zeitraum 04/23 - 12/24",
        fontsize=PLOT_CONFIG['TITLE_FONT_SIZE']
    )
    ax.grid(axis='y', alpha=0.3)

    # Hand-built legend entries for body, median and mean
    density_patch = mpatches.Patch(facecolor=PLOT_CONFIG['BOX_COLOR_2'], alpha=0.7, label='Density')
    median_handle = plt.Line2D([0], [0], color=median_color, lw=1.5, label='Median')
    mean_handle = plt.Line2D([0], [0], marker='*', color=mean_color, markersize=10,
                             linestyle='None', label='Mean')
    ax.legend(
        handles=[density_patch, median_handle, mean_handle],
        loc='upper right',
        fontsize=PLOT_CONFIG['LEGEND_FONT_SIZE']
    )

    plt.tight_layout()
    output_path = os.path.join(OUTPUT_DIR, 'street_type_weighted_violin.png')
    plt.savefig(output_path, dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
    plt.close()
def create_mean_vs_median_comparison(street_type_data, valid_types):
    """Plot weighted mean against weighted median per street type and export a table.

    For every entry of ``valid_types`` the function reads
    ``street_type_data[t]['stats']['mean']`` and ``['stats']['median']``,
    draws one scatter point per street type (coloured by mean - median),
    adds a y = x reference line, and saves the figure as
    ``mean_vs_median_comparison.png`` in ``OUTPUT_DIR``. The same values,
    together with ``effective_n`` and ``total_length_km``, are written to
    ``mean_vs_median_comparison.csv`` and echoed through ``log``.

    Args:
        street_type_data: Mapping of street type to a dict that provides the
            keys ``'stats'`` (with ``'mean'`` and ``'median'``),
            ``'effective_n'`` and ``'total_length_km'``.
        valid_types: Street types to include, in the order they should
            appear in the table.

    Returns:
        None. If ``valid_types`` is empty nothing is plotted or written.
    """
    log("Creating mean vs median comparison plot")

    # min()/max() below raise ValueError on empty sequences, so bail out early.
    if not valid_types:
        log("No street types available for mean vs median comparison; skipping")
        return

    means = [street_type_data[t]['stats']['mean'] for t in valid_types]
    medians = [street_type_data[t]['stats']['median'] for t in valid_types]

    # Mean minus median is used as a simple asymmetry indicator
    # (positive: mean pulled above the median).
    skewness = [mean - median for mean, median in zip(means, medians)]
    effective_ns = [street_type_data[t]['effective_n'] for t in valid_types]
    total_lengths_km = [street_type_data[t]['total_length_km'] for t in valid_types]

    plt.figure(figsize=(10, 8))

    scatter = plt.scatter(means, medians,
                          c=skewness, cmap='coolwarm', alpha=0.9,
                          s=100, edgecolors='black')

    # Diagonal y = x: points on it have identical mean and median.
    min_val = min(min(means), min(medians)) - 1
    max_val = max(max(means), max(medians)) + 1
    plt.plot([min_val, max_val], [min_val, max_val], 'k--', alpha=0.5)

    # Label every point with its street type.
    for txt, mean, median in zip(valid_types, means, medians):
        plt.annotate(txt, (mean, median),
                     xytext=(5, 5), textcoords='offset points')

    cbar = plt.colorbar(scatter)
    cbar.set_label('Skewness (Mean - Median)')

    plt.xlabel('Weighted Mean Speed (km/h)', fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    plt.ylabel('Weighted Median Speed (km/h)', fontsize=PLOT_CONFIG['AXIS_LABEL_FONT_SIZE'])
    plt.title('Comparison of Weighted Mean vs Median Speeds by Street Type', fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])

    # Equal aspect ratio so the y = x line runs at 45 degrees.
    plt.axis('equal')
    plt.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'mean_vs_median_comparison.png'), dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
    plt.close()

    # Summary table with the same numbers as the plot.
    comparison_df = pd.DataFrame({
        'Street Type': valid_types,
        'Weighted Mean': means,
        'Weighted Median': medians,
        'Difference (Mean-Median)': skewness,
        'Effective N': effective_ns,
        'Total Length (km)': total_lengths_km
    })

    comparison_df.to_csv(os.path.join(OUTPUT_DIR, 'mean_vs_median_comparison.csv'), index=False)

    # Echo the table to the log; iterate the source lists directly instead of
    # building one Series per row with DataFrame.iterrows().
    log("\nComparison of weighted means and medians:")
    for street_type, mean, median, diff, eff_n in zip(
            valid_types, means, medians, skewness, effective_ns):
        log(f"{street_type}: Mean = {mean:.2f}, Median = {median:.2f}, " +
            f"Diff = {diff:.2f}, Effective N = {eff_n:.1f}")

def _interpret_effect_size(d):
    """Classify a Cohen's d value by magnitude using this file's cut-offs (0.2 / 0.75)."""
    magnitude = abs(d)
    if magnitude < 0.2:
        return "small"
    if magnitude < 0.75:
        return "medium"
    return "large"


def perform_kruskal_wallis_and_dunn_tests(street_type_data, valid_types, test_type="mean"):
    """
    Run a Kruskal-Wallis test and, if significant, Dunn's post-hoc test on
    samples drawn from the length-weighted histograms.

    For every street type 10,000 bin indices are drawn with probabilities
    proportional to ``street_type_data[t]['length_weighted_agg_hist']``.
    The sampling uses NumPy's global random state, so results vary between
    calls unless the caller seeds it.

    Cohen's d and the mean difference are computed for every pair via
    ``calculate_cohens_d_from_histograms`` regardless of the test outcome.
    When the Kruskal-Wallis p-value is below 0.05 the function additionally
    writes (all into ``OUTPUT_DIR``, file names suffixed with ``test_type``):
    the Dunn p-value matrix (CSV + heatmap), the list of significant pairs
    with effect sizes (CSV), and a matrix/heatmap of Cohen's d magnitudes for
    the significant pairs.

    Args:
        street_type_data: Mapping of street type to a dict containing
            ``'length_weighted_agg_hist'``.
        valid_types: Sequence of street types to compare; its order defines
            the row/column order of the returned matrices.
        test_type: Label ('mean' or 'median') used in log messages, plot
            titles and output file names. It does not alter the statistics.

    Returns:
        ``(dunn_results, (d_matrix, mean_diffs))`` where ``dunn_results`` is
        the DataFrame returned by ``posthoc_dunn`` or ``None`` when the
        Kruskal-Wallis test is not significant. ``d_matrix`` is symmetric;
        ``mean_diffs[j, i] == -mean_diffs[i, j]``.

    Raises:
        ValueError: If a histogram has no positive total weight and therefore
            cannot be sampled.
    """
    log(f"\nPerforming Kruskal-Wallis test followed by Dunn's post-hoc test (sorting by {test_type})")

    # Cap the number of samples per type to keep memory use bounded.
    max_samples_per_type = 10000
    samples_by_type = []

    for street_type in valid_types:
        hist = street_type_data[street_type]['length_weighted_agg_hist']
        total = np.sum(hist)
        if not total > 0:
            raise ValueError(
                f"Length-weighted histogram for '{street_type}' has no positive weight; cannot sample"
            )
        norm_hist = hist / total

        # Bin indices are derived from the histogram itself rather than a
        # hard-coded bin count.
        bins = np.arange(len(hist))
        samples = np.random.choice(bins, size=max_samples_per_type, p=norm_hist)
        samples_by_type.append(samples)

    # 1. Kruskal-Wallis test across all street types
    h_stat, p_value = stats.kruskal(*samples_by_type)

    log("Kruskal-Wallis test results:")
    log(f"H statistic: {h_stat:.4f}")
    log(f"p-value: {p_value:.6f}")

    # Long-format frame for Dunn's test: one row per drawn sample.
    df_samples = pd.DataFrame({
        'speed': np.concatenate(samples_by_type),
        'type': np.repeat(np.asarray(valid_types, dtype=object), max_samples_per_type)
    })

    # 2. Pairwise Cohen's d and mean differences, indexed like valid_types
    n_types = len(valid_types)
    d_matrix = np.zeros((n_types, n_types))
    mean_diffs = np.zeros((n_types, n_types))

    for i in range(n_types):
        hist1 = street_type_data[valid_types[i]]['length_weighted_agg_hist']
        for j in range(i + 1, n_types):  # each unordered pair once
            hist2 = street_type_data[valid_types[j]]['length_weighted_agg_hist']

            d, mean1, mean2 = calculate_cohens_d_from_histograms(hist1, hist2)
            d_matrix[i, j] = d
            d_matrix[j, i] = d  # mirrored unchanged

            diff = mean1 - mean2
            mean_diffs[i, j] = diff
            mean_diffs[j, i] = -diff  # mirrored with sign change

    # `not p < 0.05` (rather than `p >= 0.05`) keeps a NaN p-value on the
    # "not significant" path.
    if not p_value < 0.05:
        log("No significant differences found between street types (p >= 0.05)")
        return None, (d_matrix, mean_diffs)

    log("\nSignificant differences found between street types (p < 0.05)")

    # 3. Dunn's post-hoc test
    log("Performing Dunn's test for post-hoc analysis")
    dunn_results = posthoc_dunn(df_samples, val_col='speed', group_col='type', p_adjust='fdr_bh')
    dunn_results.to_csv(os.path.join(OUTPUT_DIR, f'dunn_test_{test_type}_results.csv'))

    # Heatmap of p-values; upper triangle (incl. diagonal) is hidden.
    plt.figure(figsize=(14, 12))
    mask = np.triu(np.ones(dunn_results.shape, dtype=bool))

    sns.heatmap(dunn_results, mask=mask, annot=True, cmap='coolwarm_r',
                vmin=0, vmax=0.05, center=0.025,
                annot_kws={"size": 10}, fmt='.3f')

    plt.title(f"Vergleich nach OSM 'type': Dunn's post-hoc p-Werte ({test_type}-basiert)\nSignifikanz: p < 0.05 (rot)",
              fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])
    plt.tight_layout(pad=2.0)
    plt.savefig(os.path.join(OUTPUT_DIR, f'dunn_test_{test_type}_heatmap.png'), dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
    plt.close()

    # 4. Collect significant pairs. The Dunn matrix is looked up by label
    # because its row/column order is not tied to the order of valid_types.
    sig_pairs = []
    for i in range(n_types):
        for j in range(i + 1, n_types):
            type1 = valid_types[i]
            type2 = valid_types[j]
            try:
                p_val = dunn_results.loc[type1, type2]
            except KeyError:
                log(f"Warning: Could not find {type1} vs {type2} in Dunn results")
                continue

            if p_val < 0.05:
                sig_pairs.append((type1, type2, p_val, mean_diffs[i, j], d_matrix[i, j]))

    # Most significant first
    sig_pairs.sort(key=lambda pair: pair[2])

    log("\nSignificantly different street type pairs with length-weighted effect sizes:")
    for type1, type2, p_val, diff, d in sig_pairs:
        # Classify by magnitude so that a negative d is not always "small".
        effect_size_interp = _interpret_effect_size(d)
        log(f"{type1} vs {type2}: p={p_val:.6f}, weighted mean diff={diff:.2f} km/h, Cohen's d={d:.3f} ({effect_size_interp})")

    effect_size_df = pd.DataFrame(sig_pairs,
                                  columns=['Type1', 'Type2', 'p_value', 'weighted_mean_diff', 'cohens_d'])
    effect_size_df['effect_size_interpretation'] = effect_size_df['cohens_d'].apply(_interpret_effect_size)
    effect_size_df.to_csv(os.path.join(OUTPUT_DIR, f'effect_size_{test_type}_results.csv'), index=False)

    # 5. Cohen's d matrix laid out like dunn_results; only significant pairs
    # in the upper triangle are filled, everything else stays NaN.
    cohens_d_df = pd.DataFrame(np.nan, index=dunn_results.index, columns=dunn_results.columns)

    for i in range(len(dunn_results.index)):
        for j in range(i + 1, len(dunn_results.columns)):
            if not dunn_results.iloc[i, j] < 0.05:
                continue

            type_i = dunn_results.index[i]
            type_j = dunn_results.columns[j]
            try:
                idx_i = valid_types.index(type_i)
                idx_j = valid_types.index(type_j)
            except ValueError:
                log(f"Warning: Could not find {type_i} or {type_j} in valid_types")
                continue

            # d_matrix is symmetric, so the index order does not matter.
            # The magnitude is stored because the sign of d refers to the
            # order of valid_types, not to this matrix's row/column order,
            # and the colour scale below starts at 0.
            cohens_d_df.iloc[i, j] = abs(d_matrix[idx_i, idx_j])

    cohens_d_df.to_csv(os.path.join(OUTPUT_DIR, f'cohens_d_matrix_{test_type}.csv'))

    # Heatmap of Cohen's d; lower triangle (incl. diagonal) is hidden,
    # i.e. the opposite half of the Dunn p-value heatmap.
    plt.figure(figsize=(14, 12))
    cmap = sns.diverging_palette(240, 10, as_cmap=True)
    mask = np.tril(np.ones(cohens_d_df.shape, dtype=bool))

    sns.heatmap(cohens_d_df, annot=True, cmap=cmap,
                vmin=0, vmax=1.5, center=0.75,  # 0.75 is the threshold for "large" effect
                annot_kws={"size": 10}, fmt='.3f',
                mask=mask)

    # The last axes of the figure is the colorbar added by seaborn;
    # annotate it with the effect-size classes.
    colorbar = plt.gcf().axes[-1]
    colorbar.text(3.5, 0.1, 'Kleiner Effekt (<0.2)', ha='left', va='center')
    colorbar.text(3.5, 0.4, 'Mittlerer Effekt (0.2-0.75)', ha='left', va='center')
    colorbar.text(3.5, 0.9, 'Großer Effekt (>0.75)', ha='left', va='center')

    plt.title(f"Vergleich nach OSM 'type': Effektstärken nach Cohen's d ({test_type}-basiert)\nDifferenzen zwischen OSM 'type' (04/23 - 12/24)",
              fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])
    plt.tight_layout(pad=2.0)
    plt.savefig(os.path.join(OUTPUT_DIR, f'cohens_d_heatmap_{test_type}.png'), dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
    plt.close()

    return dunn_results, (d_matrix, mean_diffs)

def _histogram_plot_inputs(street_type_data, ordered_types):
    """Build the parallel (histograms, labels, stats) lists passed to the histogram plots."""
    histograms = [street_type_data[t]['length_weighted_agg_hist'] for t in ordered_types]
    labels = list(ordered_types)
    plot_stats = [
        {
            'stats': street_type_data[t]['stats'],
            'total_length_km': street_type_data[t]['total_length_km'],
            'effective_n': street_type_data[t]['effective_n']
        }
        for t in ordered_types
    ]
    return histograms, labels, plot_stats


def _significant_pair_set(dunn_results, types, alpha=0.05):
    """Return the unordered type pairs whose Dunn p-value is below ``alpha``.

    Pairs are looked up by label and stored in a canonical (sorted) order so
    that sets built from differently ordered ``types`` lists are comparable.
    """
    pairs = set()
    for i, type1 in enumerate(types):
        for type2 in types[i + 1:]:
            if dunn_results.loc[type1, type2] < alpha:
                pairs.add(tuple(sorted((type1, type2))))
    return pairs


def analyze_street_types():
    """
    Main analysis function that consistently uses length-weighted histograms for all calculations.

    Steps:
      1. Read ``PARQUET_FILE`` row group by row group (columns ``COLUMN_TYPE``,
         ``COLUMN_HIST``, ``COLUMN_LENGTH``) and, for every street type listed
         in ``STREET_TYPES_TO_ANALYZE``, accumulate ``hist * segment_length``
         into one aggregated histogram. A row group that raises is logged and
         skipped; anything accumulated from it before the error is kept.
      2. Keep the street types with at least 30 segments and compute their
         weighted statistics, effective sample size and total length.
      3. Produce the histogram, boxplot, violin and mean-vs-median outputs.
      4. Run the Kruskal-Wallis / Dunn tests once with the mean-sorted and
         once with the median-sorted type order and compare the significant
         pairs of both runs.
      5. Write ``street_type_summary_stats.csv`` to ``OUTPUT_DIR``.

    Returns:
        The ``street_type_data`` dict keyed by street type, or ``None`` if
        ``PARQUET_FILE`` does not exist.
    """
    log(f"Starting consistent length-weighted analysis of parquet file: {PARQUET_FILE}")

    if not os.path.exists(PARQUET_FILE):
        log(f"Error: File {PARQUET_FILE} not found")
        return None

    parquet_file = pq.ParquetFile(PARQUET_FILE)
    num_row_groups = parquet_file.metadata.num_row_groups
    total_rows = parquet_file.metadata.num_rows
    log(f"Parquet file has {num_row_groups} row groups and approximately {total_rows:,} rows")

    # Only these columns are read from the file.
    columns = [COLUMN_TYPE, COLUMN_HIST, COLUMN_LENGTH]

    street_type_data = {}

    log(f"Processing {num_row_groups} row groups to extract speed data by street type")

    # Rows converted to pandas at a time within each row group.
    CHUNK_SIZE = 50000
    # Minimum number of segments for a street type to be analysed.
    MIN_SEGMENTS = 30

    for rg in tqdm(range(num_row_groups), desc="Processing row groups"):
        try:
            row_group_size = parquet_file.metadata.row_group(rg).num_rows
            num_chunks = (row_group_size + CHUNK_SIZE - 1) // CHUNK_SIZE  # Ceiling division

            table = parquet_file.read_row_group(rg, columns=columns)

            for chunk_idx in tqdm(range(num_chunks), desc=f"Chunks in row group {rg}", leave=False):
                offset = chunk_idx * CHUNK_SIZE
                length = min(CHUNK_SIZE, row_group_size - offset)

                chunk_table = table.slice(offset, length)
                df_chunk = chunk_table.to_pandas()

                # Drop rows with missing values, then keep only requested types.
                df_chunk = df_chunk.dropna(subset=[COLUMN_TYPE, COLUMN_HIST, COLUMN_LENGTH])
                df_chunk = df_chunk[df_chunk[COLUMN_TYPE].isin(STREET_TYPES_TO_ANALYZE)]

                # Iterate the three columns in lockstep; this avoids building
                # a Series per row as DataFrame.iterrows() does.
                for street_type, raw_hist, segment_length in zip(
                        df_chunk[COLUMN_TYPE], df_chunk[COLUMN_HIST], df_chunk[COLUMN_LENGTH]):
                    hist = parse_histogram(raw_hist)

                    if len(hist) > 0 and np.sum(hist) > 0:
                        entry = street_type_data.get(street_type)
                        if entry is None:
                            entry = {
                                'lengths': [],  # Segment lengths, one per counted segment
                                'length_weighted_agg_hist': np.zeros(32),  # Aggregated histogram
                                'count': 0
                            }
                            street_type_data[street_type] = entry

                        entry['lengths'].append(segment_length)

                        # Length weighting: each segment's histogram is
                        # scaled by the segment length before being added.
                        entry['length_weighted_agg_hist'] += hist * segment_length

                        entry['count'] += 1

                # Free memory after each chunk
                del df_chunk, chunk_table
                force_gc()

            # Free the row group data
            del table
            force_gc()

        except Exception as e:
            # Best effort: report the failing row group and continue.
            log(f"Error processing row group {rg}: {e}")

    total_segments = sum(data['count'] for data in street_type_data.values())
    log(f"Processed {total_segments:,} total street segments across {len(street_type_data)} different street types")

    # Report requested street types that are missing or too small.
    for street_type in STREET_TYPES_TO_ANALYZE:
        if street_type not in street_type_data:
            log(f"Warning: No data found for street type '{street_type}'")
        elif street_type_data[street_type]['count'] < MIN_SEGMENTS:
            log(f"Warning: Insufficient data for street type '{street_type}' (only {street_type_data[street_type]['count']} segments)")

    # Street types with sufficient data, in the order of STREET_TYPES_TO_ANALYZE.
    valid_types = [st for st in STREET_TYPES_TO_ANALYZE
                   if st in street_type_data and street_type_data[st]['count'] >= MIN_SEGMENTS]

    log(f"Analysis will include {len(valid_types)} street types with sufficient data")
    for i, type_name in enumerate(valid_types, 1):
        count = street_type_data[type_name]['count']
        log(f"{i}. {type_name}: {count:,} segments")

    # Statistics are computed once per street type and stored for reuse.
    log("Calculating length-weighted statistics for each street type")
    for street_type in valid_types:
        entry = street_type_data[street_type]

        total_length_km = sum(entry['lengths']) / 1000
        effective_n = calculate_effective_n(entry['lengths'])

        # Named type_stats so the file-level `stats` import (scipy.stats)
        # is not shadowed inside this function.
        type_stats = calculate_all_weighted_statistics(entry['length_weighted_agg_hist'])

        entry['stats'] = type_stats
        entry['effective_n'] = effective_n
        entry['total_length_km'] = total_length_km

        log(f"{street_type}: weighted mean = {type_stats['mean']:.2f} km/h, weighted median = {type_stats['median']:.2f} km/h, effective n = {effective_n:.1f}")

    # Descending order by weighted mean / weighted median.
    valid_types_sorted_by_mean = sorted(valid_types,
                                        key=lambda t: street_type_data[t]['stats']['mean'],
                                        reverse=True)
    valid_types_sorted_by_median = sorted(valid_types,
                                          key=lambda t: street_type_data[t]['stats']['median'],
                                          reverse=True)

    log("Creating visualizations based on length-weighted histograms")

    # 1. Length-weighted histogram comparison (sorted by mean)
    histograms, hist_labels, hist_stats = _histogram_plot_inputs(
        street_type_data, valid_types_sorted_by_mean)
    create_length_weighted_histogram_comparison(histograms, hist_labels, hist_stats)

    # 2. Median-sorted histogram comparison
    histograms_by_median, hist_labels_by_median, hist_stats_by_median = _histogram_plot_inputs(
        street_type_data, valid_types_sorted_by_median)
    create_median_weighted_histogram_comparison(histograms_by_median, hist_labels_by_median, hist_stats_by_median)

    # 3. Boxplot
    create_boxplot_from_histograms(street_type_data, valid_types_sorted_by_mean)

    # 4. Violin plot
    create_violin_plot_from_histograms(street_type_data, valid_types_sorted_by_mean)

    # 5. Mean vs median comparison
    create_mean_vs_median_comparison(street_type_data, valid_types_sorted_by_mean)

    # 6./7. Kruskal-Wallis and Dunn's tests in mean- and median-sorted order.
    # NOTE(review): both calls sample from the same histograms; they differ
    # only in type order, output labels and the random draw.
    dunn_results_mean, effect_sizes_mean = perform_kruskal_wallis_and_dunn_tests(
        street_type_data, valid_types_sorted_by_mean, "mean")

    dunn_results_median, effect_sizes_median = perform_kruskal_wallis_and_dunn_tests(
        street_type_data, valid_types_sorted_by_median, "median")

    # 8. Compare the two runs if both were significant
    if dunn_results_mean is not None and dunn_results_median is not None:
        log("\nComparing mean-based and median-based significant pair findings:")

        # The Dunn matrices are read by label: their row/column order is not
        # the mean-/median-sorted order, so positional indexing with these
        # lists would read the wrong cells.
        mean_pairs = _significant_pair_set(dunn_results_mean, valid_types_sorted_by_mean)
        median_pairs = _significant_pair_set(dunn_results_median, valid_types_sorted_by_median)

        common_pairs = mean_pairs & median_pairs
        only_mean_pairs = mean_pairs - median_pairs
        only_median_pairs = median_pairs - mean_pairs

        log(f"Number of pairs significant in both tests: {len(common_pairs)}")
        log(f"Number of pairs significant only in mean-based test: {len(only_mean_pairs)}")
        log(f"Number of pairs significant only in median-based test: {len(only_median_pairs)}")

        # Sorted so the log output has a stable order between runs.
        if common_pairs:
            log("\nPairs significant in both tests:")
            for pair in sorted(common_pairs):
                log(f"{pair[0]} vs {pair[1]}")

        if only_mean_pairs:
            log("\nPairs significant only in mean-based test:")
            for pair in sorted(only_mean_pairs):
                log(f"{pair[0]} vs {pair[1]}")

        if only_median_pairs:
            log("\nPairs significant only in median-based test:")
            for pair in sorted(only_median_pairs):
                log(f"{pair[0]} vs {pair[1]}")

        # Venn diagram is optional: skipped when matplotlib_venn is missing.
        try:
            from matplotlib_venn import venn2

            plt.figure(figsize=(10, 8))
            venn2(subsets=(len(only_mean_pairs), len(only_median_pairs), len(common_pairs)),
                  set_labels=('Mean-based', 'Median-based'))
            plt.title("Significant Differences Found: Mean vs Median Approach", fontsize=PLOT_CONFIG['TITLE_FONT_SIZE'])
            plt.savefig(os.path.join(OUTPUT_DIR, 'mean_vs_median_venn.png'), dpi=PLOT_CONFIG['DPI'], bbox_inches='tight')
            plt.close()
        except ImportError:
            log("matplotlib_venn not available for creating Venn diagram. Install with 'pip install matplotlib-venn'")

    # 9. Summary statistics table (mean-sorted order)
    summary_data = []
    for street_type in valid_types_sorted_by_mean:
        entry = street_type_data[street_type]
        type_stats = entry['stats']

        summary_data.append({
            'type': street_type,
            'count': entry['count'],
            'total_length_km': entry['total_length_km'],
            'effective_n': entry['effective_n'],
            'mean': type_stats['mean'],
            'median': type_stats['median'],
            'std': type_stats['std'],
            'min': type_stats['min'],
            'max': type_stats['max'],
            'percentile_5': type_stats['percentile_5'],
            'percentile_25': type_stats['percentile_25'],
            'percentile_75': type_stats['percentile_75'],
            'percentile_95': type_stats['percentile_95'],
            'skewness': type_stats['mean'] - type_stats['median']
        })

    summary_df = pd.DataFrame(summary_data)
    summary_df.to_csv(os.path.join(OUTPUT_DIR, 'street_type_summary_stats.csv'), index=False)

    log("Consistent length-weighted analysis complete. All results are based on the same length-weighted histograms.")
    return street_type_data

# Run the full analysis when this code is executed as the main module.
# The return value of analyze_street_types() (the per-street-type data dict,
# or None if the parquet file was not found) is kept in `street_data`.
if __name__ == "__main__":
    street_data = analyze_street_types()

Starting consistent length-weighted analysis of parquet file: data/network_all_months_plus_25833_length_with_fahrradstrasse.parquet
Parquet file has 12 row groups and approximately 466,957 rows
Processing 12 row groups to extract speed data by street type


Processing row groups:   0%|          | 0/12 [00:00<?, ?it/s]

Chunks in row group 0:   0%|          | 0/1 [00:00<?, ?it/s]

Chunks in row group 1:   0%|          | 0/1 [00:00<?, ?it/s]

Chunks in row group 2:   0%|          | 0/1 [00:00<?, ?it/s]

Chunks in row group 3:   0%|          | 0/1 [00:00<?, ?it/s]

Chunks in row group 4:   0%|          | 0/1 [00:00<?, ?it/s]

Chunks in row group 5:   0%|          | 0/1 [00:00<?, ?it/s]

Chunks in row group 6:   0%|          | 0/1 [00:00<?, ?it/s]

Chunks in row group 7:   0%|          | 0/1 [00:00<?, ?it/s]

Error processing row group 8: Corrupt snappy compressed data.


Chunks in row group 9:   0%|          | 0/1 [00:00<?, ?it/s]

Chunks in row group 10:   0%|          | 0/1 [00:00<?, ?it/s]

Chunks in row group 11:   0%|          | 0/1 [00:00<?, ?it/s]

Processed 428,595 total street segments across 15 different street types
Analysis will include 15 street types with sufficient data
1. footway: 192,850 segments
2. residential: 81,186 segments
3. secondary: 25,847 segments
4. service_driveway: 27,412 segments
5. tertiary: 17,130 segments
6. service: 22,425 segments
7. path: 13,612 segments
8. cycleway: 12,586 segments
9. track: 7,245 segments
10. primary: 8,601 segments
11. service_parking_aisle: 7,834 segments
12. unclassified: 2,973 segments
13. living_street: 4,291 segments
14. steps: 3,594 segments
15. pedestrian: 1,009 segments
Calculating length-weighted statistics for each street type
footway: weighted mean = 18.72 km/h, weighted median = 18.72 km/h, effective n = 72937.3
residential: weighted mean = 18.97 km/h, weighted median = 18.93 km/h, effective n = 38380.4
secondary: weighted mean = 20.68 km/h, weighted median = 20.57 km/h, effective n = 12104.6
service_driveway: weighted mean = 12.57 km/h, weighted median = 12.11 km/h, e