# Variable Sets Comparison: Final Variables vs Optimal Variables

This notebook compares the `final_variable_set` from Variable Importance Analysis with `optimal_variables` from Spatial Spread Analysis and creates 2 optimized variable sets as arrays.

## Overview:
- **Variable Comparison**: Compare final variables from both approaches
- **Overlap Analysis**: Analyze common and unique variables
- **Performance Assessment**: Evaluate variable set performance
- **2 Variable Sets Creation**: Generate optimized sets as arrays (Common Variables and Best Performing)
- **Variable Name Cleaning**: Strip model/statistic prefixes, convert bioclim identifiers to integers, and drop SRTM/NDVI variables


## 1. Configuration and Setup


In [None]:
# =============================================================================
# CONFIGURATION AND SETUP
# =============================================================================

import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

# Configuration
# NOTE(review): `specie`, `training` and `bio` are referenced by later cells (the
# loader call and the output file names) but only appear here as commented-out
# examples — confirm they are injected before the notebook runs, otherwise those
# cells raise NameError. `bio = bio1` would itself need `bio1` to be defined.
#specie = 'leptocybe-invasa'  # Target species: 'leptocybe-invasa' or 'thaumastocoris-peregrinus'
# training = 'south-east-asia' 
# bio = bio1  # Bioclimatic variable identifier
savefig = True  # Set to True to write the CSV/JSON outputs in the "Save Results" cell

# Optional filters (leave empty to include all results)
model_prefixes = []  # e.g., ['CHELSA', 'MPI-ESM']
scenarios = []       # e.g., ['historical', 'future']

# Paths (all resolved relative to the parent of the current working directory)
# base_path is the directory searched recursively for the comprehensive analysis
# summary JSON files; it currently points at the same folder as figs_path.
base_path = os.path.join(os.path.dirname(os.getcwd()), 'figs')
figs_path = os.path.join(os.path.dirname(os.getcwd()), 'figs')  # Figures directory
results_path = os.path.join(os.path.dirname(os.getcwd()), 'results')  # Results directory (CSV/JSON outputs)
# results_path = os.path.join(base_path, "results")

# Create directories if they don't exist
os.makedirs(figs_path, exist_ok=True)
os.makedirs(results_path, exist_ok=True)

## 2. Variable Name Cleaning Functions


In [None]:
# =============================================================================
# VARIABLE NAME CLEANING FUNCTIONS
# =============================================================================

def infer_model_prefix(var_list):
    """
    Guess the model prefix from the variable names themselves.

    Non-string entries and SRTM/NDVI variables are ignored. The first remaining
    name decides the result: either the text before ``_bioclim_`` or, failing
    that, everything before a trailing ``_<digits>`` suffix.

    Parameters:
    -----------
    var_list : list
        Raw variable names

    Returns:
    --------
    str or None
        Inferred prefix, or None when no name yields one
    """
    for name in (entry for entry in var_list if isinstance(entry, str)):
        folded = name.lower()
        if any(tag in folded for tag in ('srtm', 'ndvi')):
            continue

        # Preferred form: "<prefix>_bioclim_<n>"
        head, marker, _ = name.partition('_bioclim_')
        if marker and head:
            return head

        # Fallback form: "<prefix>_<n>"
        stem, separator, tail = name.rpartition('_')
        if separator and stem and tail.isdigit():
            return stem

    return None


def clean_variable_name(var_name, model_prefix=None):
    """
    Clean variable name by removing prefixes, filtering out SRTM/NDVI, and converting to numeric.

    Parameters:
    -----------
    var_name : str
        Original variable name
    model_prefix : str, optional
        Model prefix to strip (e.g., 'ensemble_mean', 'MPI-M-MPI-ESM-LR_GERICS-REMO2015').
        Non-string values are converted with ``str()``.

    Returns:
    --------
    int or None
        Numeric variable value. None is returned for non-string input, for
        SRTM/NDVI variables (to be filtered out), and for names that carry no
        numeric identifier — including names that are empty once the prefixes
        are removed (e.g. 'bioclim_').
    """
    import re

    if not isinstance(var_name, str):
        return None

    # Filter out SRTM and NDVI variables immediately
    lowered = var_name.lower()
    if 'srtm' in lowered or 'ndvi' in lowered:
        return None

    cleaned_name = var_name

    # Remove dynamic model-prefix patterns first (handles ensemble_mean and CMIP models).
    # For each case variant the longer "<prefix>_bioclim_" form is tried before "<prefix>_";
    # only the first matching candidate is stripped.
    if model_prefix:
        model_prefix = str(model_prefix)
        candidates = [
            f"{variant}{suffix}"
            for variant in (model_prefix, model_prefix.lower(), model_prefix.upper())
            for suffix in ('_bioclim_', '_')
        ]
        for candidate in candidates:
            if cleaned_name.startswith(candidate):
                cleaned_name = cleaned_name[len(candidate):]
                break

    # Remove common static prefixes (at most one)
    common_prefixes = [
        'ensemble_mean_bioclim_',
        'bioclim_',
        'ensemble_mean_',
        'mean_',
        'std_',
        'min_',
        'max_'
    ]
    for prefix in common_prefixes:
        if cleaned_name.startswith(prefix):
            cleaned_name = cleaned_name[len(prefix):]
            break

    # Extract the trailing run of digits; this also covers the "_bioclim_<n>"
    # and "_<n>" endings because the run after the last underscore is the same.
    match = re.search(r'(\d+)$', cleaned_name)
    if match:
        cleaned_name = match.group(1)
    elif not cleaned_name:
        # Nothing left after stripping prefixes: there is no identifier to
        # convert, so do not fabricate variable 0.
        return None

    try:
        return int(cleaned_name.lstrip('0') or '0')
    except ValueError:
        return None


def clean_variable_list(var_list, model_prefix=None):
    """
    Convert every name in a list to its numeric identifier.

    Each entry is passed through ``clean_variable_name``; entries for which
    that function returns None (e.g. SRTM/NDVI variables) are dropped.

    Parameters:
    -----------
    var_list : list
        List of variable names
    model_prefix : str, optional
        Model prefix to strip from variable names

    Returns:
    --------
    list
        Numeric variable values in input order (duplicates are kept)
    """
    converted = (
        clean_variable_name(raw_name, model_prefix=model_prefix)
        for raw_name in var_list
    )
    return [value for value in converted if value is not None]

def remove_duplicates_preserve_order(var_list):
    """
    Drop repeated entries, keeping the first occurrence of each.

    Parameters:
    -----------
    var_list : list
        List of hashable variables (may contain duplicates)

    Returns:
    --------
    list
        New list with each value once, in order of first appearance
    """
    # dict keys are unique and keep insertion order
    return list(dict.fromkeys(var_list))

## 3. Load Comprehensive Analysis Results


In [None]:
# =============================================================================
# LOAD COMPREHENSIVE ANALYSIS RESULTS
# =============================================================================

def load_comprehensive_analysis_results(base_path, specie, training, bio, model_prefixes=None, scenarios=None):
    """
    Load comprehensive analysis results across iterations, supporting multiple model prefixes and scenarios.

    Recursively collects ``06_comprehensive_analysis_summary_*.json`` files below
    ``base_path`` (anything inside ``.ipynb_checkpoints`` is skipped), optionally
    narrows them by file name, and returns their parsed content. Files are
    processed in sorted path order so that the generated keys do not depend on
    directory listing order.

    Parameters:
    -----------
    base_path : str
        Directory searched recursively for summary files
    specie, training, bio :
        Accepted for interface compatibility; the loader does not use them
    model_prefixes : list of str, optional
        Keep only files whose name contains ``_<prefix>_`` or ends with
        ``_<prefix>.json`` (case-sensitive; surrounding underscores in the
        given prefixes are ignored). If no file matches, a message is printed
        and the filter is not applied.
    scenarios : list of str, optional
        Keep only files whose lower-cased name contains ``_<scenario>_``.
        If no file matches, a message is printed and the filter is not applied.

    Returns:
    --------
    results_dict : dict
        Parsed JSON objects keyed by ``iteration_<n>`` (or by the file's base
        name when no number can be extracted). Each entry gets an extra
        ``'source_file'`` item. Files that cannot be read, are not valid JSON,
        or do not contain a JSON object are reported and skipped.
    """
    import re

    results_dict = {}

    # Normalize filters
    model_prefixes = [mp.strip('_') for mp in (model_prefixes or []) if mp]
    scenarios = [sc.lower() for sc in (scenarios or []) if sc]

    # Scan only base_path directory
    if not base_path or not os.path.isdir(base_path):
        print("No valid base_path directory found to search for comprehensive summaries.")
        return results_dict

    # Use single pattern: 06_comprehensive_analysis_summary_*.json
    pattern = "06_comprehensive_analysis_summary_*.json"

    # rglob yields paths in an unspecified order; sort so repeated runs assign
    # the same keys when several files share an iteration number.
    files = sorted(
        str(found)
        for found in Path(base_path).rglob(pattern)
        if '.ipynb_checkpoints' not in str(found)
    )

    if not files:
        print("No comprehensive analysis summary files found with the expected patterns.")
        return results_dict

    # Apply model prefix filter if provided (falls back to all files when nothing matches)
    if model_prefixes:
        filtered_files = [
            fpath for fpath in files
            if any(
                f"_{mp}_" in os.path.basename(fpath)
                or os.path.basename(fpath).endswith(f"_{mp}.json")
                for mp in model_prefixes
            )
        ]
        if not filtered_files:
            print(f"No files matched the specified model prefixes: {model_prefixes}")
        else:
            files = filtered_files

    # Apply scenario filter if provided (falls back to all files when nothing matches)
    if scenarios:
        filtered_files = [
            fpath for fpath in files
            if any(f"_{sc}_" in os.path.basename(fpath).lower() for sc in scenarios)
        ]
        if not filtered_files:
            print(f"No files matched the specified scenarios: {scenarios}")
        else:
            files = filtered_files

    print(f"Found {len(files)} comprehensive analysis summary files")

    for file_path in files:
        filename = os.path.basename(file_path)
        base_name = filename.replace('.json', '')

        # Iteration label: explicit "iteration<n>" first, then a trailing "_<n>",
        # otherwise the whole base name.
        match = (
            re.search(r'iteration[_-]?(\d+)', base_name, re.IGNORECASE)
            or re.search(r'_(\d+)$', base_name)
        )
        iteration = f"iteration_{match.group(1)}" if match else base_name

        # Load JSON content; ValueError covers malformed JSON and decoding errors
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            print(f"Error loading {file_path}: {str(e)}")
            continue

        if not isinstance(data, dict):
            print(f"Error loading {file_path}: expected a JSON object, got {type(data).__name__}")
            continue

        data['source_file'] = filename

        # Ensure unique iteration keys (important when multiple model prefixes share numbers)
        key = iteration
        if key in results_dict:
            hints = [iteration]
            if data.get('model_prefix'):
                hints.append(str(data['model_prefix']))
            if data.get('scenario'):
                hints.append(str(data['scenario']))
            key = '_'.join(hints)
            # Last resort when the hinted key is taken as well: numeric suffix
            suffix = 1
            while key in results_dict:
                key = f"{iteration}_{suffix}"
                suffix += 1

        results_dict[key] = data
        print(f"Loaded {key} from {filename}")

    return results_dict

# Load results
# NOTE(review): `specie`, `training` and `bio` are not assigned in any visible cell
# (the configuration cell only has commented-out examples) — confirm they are
# injected before this cell runs, otherwise the call below raises NameError.
print("Loading comprehensive analysis results...")
comprehensive_results = load_comprehensive_analysis_results(
    base_path,
    specie,
    training,
    bio,
    model_prefixes=model_prefixes,
    scenarios=scenarios
)

# List the keys under which each summary file was stored
print(f"\nLoaded {len(comprehensive_results)} comprehensive analysis results")
for iteration in comprehensive_results.keys():
    print(f"  - {iteration}")


## 4. Extract and Compare Variable Sets


In [None]:
# =============================================================================
# EXTRACT AND COMPARE VARIABLE SETS
# =============================================================================

def extract_variable_sets(comprehensive_results):
    """
    Extract final_variable_set and optimal_variables from comprehensive results.

    For every iteration the two raw variable lists are cleaned to numeric
    identifiers (SRTM/NDVI dropped), de-duplicated, and compared. Missing or
    ``null`` summaries and variable lists are treated as empty.

    Parameters:
    -----------
    comprehensive_results : dict
        Dictionary containing comprehensive results from all iterations

    Returns:
    --------
    variable_comparison : dict
        Per iteration: 'final_variables', 'optimal_variables',
        'common_variables', 'final_only', 'optimal_only' (lists),
        'overlap_percentage' (shared / union * 100, 0 when both are empty) and
        the counts 'total_final', 'total_optimal', 'total_common'.
        'common_variables' and 'final_only' follow the order of the final
        variable list; 'optimal_only' follows the order of the optimal list.
    """

    variable_comparison = {}

    for iteration, results in comprehensive_results.items():
        # `or {}` / `or []` also cover keys that are present but null in the JSON
        importance_summary = results.get('variable_importance_summary') or {}
        spread_summary = results.get('spatial_spread_summary') or {}

        # Extract final variables from variable importance analysis
        final_variables_raw = importance_summary.get('final_variable_set') or []

        # Extract optimal variables from spatial spread analysis
        optimal_variables_raw = spread_summary.get('optimal_variables') or []

        # Determine model prefix from metadata or infer from variable names (supports ensemble_mean-only runs)
        metadata = results.get('metadata')
        experiment_metadata = results.get('experiment_metadata')
        metadata_candidates = [
            results.get('model_prefix'),
            results.get('model'),
            results.get('model_name'),
            metadata.get('model_prefix') if isinstance(metadata, dict) else None,
            experiment_metadata.get('model_prefix') if isinstance(experiment_metadata, dict) else None,
        ]
        model_prefix = next((candidate for candidate in metadata_candidates if candidate), None)
        if not model_prefix:
            model_prefix = infer_model_prefix(list(final_variables_raw) + list(optimal_variables_raw))
        if not model_prefix:
            model_prefix = 'ensemble_mean'

        # Clean variable names, then remove duplicates while preserving order
        final_variables = remove_duplicates_preserve_order(
            clean_variable_list(final_variables_raw, model_prefix=model_prefix)
        )
        optimal_variables = remove_duplicates_preserve_order(
            clean_variable_list(optimal_variables_raw, model_prefix=model_prefix)
        )

        # Calculate overlap. Sets are used for membership tests only; the result
        # lists are built from the ordered inputs so they do not depend on hash order.
        final_set = set(final_variables)
        optimal_set = set(optimal_variables)

        common_variables = [var for var in final_variables if var in optimal_set]
        final_only = [var for var in final_variables if var not in optimal_set]
        optimal_only = [var for var in optimal_variables if var not in final_set]

        # Calculate overlap percentage
        total_unique = len(final_set | optimal_set)
        overlap_percentage = (len(common_variables) / total_unique * 100) if total_unique > 0 else 0

        variable_comparison[iteration] = {
            'final_variables': final_variables,
            'optimal_variables': optimal_variables,
            'common_variables': common_variables,
            'final_only': final_only,
            'optimal_only': optimal_only,
            'overlap_percentage': overlap_percentage,
            'total_final': len(final_variables),
            'total_optimal': len(optimal_variables),
            'total_common': len(common_variables)
        }

    return variable_comparison

def create_variable_comparison_table(variable_comparison):
    """
    Turn the per-iteration comparison dictionary into a flat table.

    Parameters:
    -----------
    variable_comparison : dict
        Dictionary containing variable sets comparison

    Returns:
    --------
    df : pandas.DataFrame
        One row per iteration with the three counts, the overlap percentage
        and the variable lists rendered as comma-separated strings
    """

    def _as_text(values):
        return ', '.join(str(value) for value in values)

    rows = [
        {
            'Iteration': label,
            'Final_Variables_Count': entry['total_final'],
            'Optimal_Variables_Count': entry['total_optimal'],
            'Common_Variables_Count': entry['total_common'],
            'Overlap_Percentage': entry['overlap_percentage'],
            'Final_Variables': _as_text(entry['final_variables']),
            'Optimal_Variables': _as_text(entry['optimal_variables']),
            'Common_Variables': _as_text(entry['common_variables']),
            'Final_Only': _as_text(entry['final_only']),
            'Optimal_Only': _as_text(entry['optimal_only']),
        }
        for label, entry in variable_comparison.items()
    ]
    return pd.DataFrame(rows)

# Build the per-iteration comparison and its tabular view
print("Extracting variable sets from comprehensive results...")
variable_comparison = extract_variable_sets(comprehensive_results)
comparison_table = create_variable_comparison_table(variable_comparison)

print(f"\nVariable Sets Comparison Table:")
print(comparison_table.to_string(index=False))

# Column averages are only reported when at least one iteration was loaded
if not comparison_table.empty:
    print(f"\nSummary Statistics:")
    for stat_label, stat_column, stat_unit in (
        ("Final Variables", 'Final_Variables_Count', ""),
        ("Optimal Variables", 'Optimal_Variables_Count', ""),
        ("Common Variables", 'Common_Variables_Count', ""),
        ("Overlap", 'Overlap_Percentage', "%"),
    ):
        print(f"• Average {stat_label}: {comparison_table[stat_column].mean():.1f}{stat_unit}")


## 5. Create 2 Optimized Variable Sets as Arrays


In [None]:
# =============================================================================
# CREATE 2 OPTIMIZED VARIABLE SETS AS ARRAYS
# =============================================================================

def create_optimized_variable_sets(variable_comparison, top_common=5, top_scoring=6):
    """
    Create 2 optimized variable sets as arrays based on analysis results.

    Variables are ranked purely by how often they occur across iterations:

    - "Set 1 - Common Variables": the ``top_common`` variables that appear most
      often in the per-iteration 'common_variables' lists.
    - "Set 2 - Best Performing": the ``top_scoring`` variables with the highest
      score ``3 * common_count + final_count + optimal_count``.

    Ties are broken by first appearance in the input (final variables before
    optimal variables, iterations in dictionary order), so the result is
    independent of hash/set iteration order.

    Parameters:
    -----------
    variable_comparison : dict
        Dictionary containing variable sets comparison; each value must provide
        'final_variables', 'optimal_variables' and 'common_variables'
    top_common : int or None, optional
        Maximum size of Set 1 (default 5; None keeps every common variable)
    top_scoring : int or None, optional
        Maximum size of Set 2 (default 6; None keeps every variable)

    Returns:
    --------
    optimized_sets : dict
        The two sets as NumPy arrays plus, under ``"<set name>_metadata"``, a
        dict with 'variable_count', 'common_variables', 'final_variables',
        'optimal_variables' and 'consensus_score' (share of the set's
        variables that occur in any 'common_variables' list; 0 for an empty set)
    variable_scores : dict
        Score of every variable found in the final or optimal lists
    """
    from collections import Counter

    # Count in how many iterations each variable occurs in each list
    final_freq = Counter()
    optimal_freq = Counter()
    common_freq = Counter()
    for comparison in variable_comparison.values():
        final_freq.update(comparison['final_variables'])
        optimal_freq.update(comparison['optimal_variables'])
        common_freq.update(comparison['common_variables'])

    # Unique variables in first-seen order (Counters keep insertion order).
    # A plain set here would make tie-breaking depend on hash order.
    all_variables = dict.fromkeys([*final_freq, *optimal_freq])

    # Combined score: common variables get highest weight
    variable_scores = {
        var: common_freq[var] * 3 + final_freq[var] + optimal_freq[var]
        for var in all_variables
    }

    # sorted() is stable, so equal scores/frequencies keep first-seen order
    sorted_variables = sorted(variable_scores.items(), key=lambda item: item[1], reverse=True)
    common_vars_sorted = sorted(common_freq.items(), key=lambda item: item[1], reverse=True)

    optimized_sets = {
        # Set 1: Most frequent common variables (highest consensus)
        "Set 1 - Common Variables": np.array([var for var, _ in common_vars_sorted[:top_common]]),
        # Set 2: Highest combined score
        "Set 2 - Best Performing": np.array([var for var, _ in sorted_variables[:top_scoring]]),
    }

    # Describe each set; collected separately so the dict is not modified while iterating
    metadata_dict = {}
    for set_name, variables in optimized_sets.items():
        common_count = sum(1 for var in variables if var in common_freq)
        final_count = sum(1 for var in variables if var in final_freq)
        optimal_count = sum(1 for var in variables if var in optimal_freq)

        metadata_dict[f"{set_name}_metadata"] = {
            'variable_count': len(variables),
            'common_variables': common_count,
            'final_variables': final_count,
            'optimal_variables': optimal_count,
            'consensus_score': common_count / len(variables) if len(variables) > 0 else 0
        }

    optimized_sets.update(metadata_dict)

    return optimized_sets, variable_scores

def create_variable_sets_table(optimized_sets):
    """
    Summarize the 2 optimized variable sets in a table.

    Parameters:
    -----------
    optimized_sets : dict
        Dictionary containing optimized variable sets and, optionally, their
        ``"<set name>_metadata"`` entries

    Returns:
    --------
    df : pandas.DataFrame
        One row per set that is present, with its metadata values (0 when a
        metadata entry is missing) and the variables as a comma-separated string
    """
    rows = []
    for name in ("Set 1 - Common Variables", "Set 2 - Best Performing"):
        if name not in optimized_sets:
            continue

        members = optimized_sets[name]
        info = optimized_sets.get(f"{name}_metadata", {})
        member_total = len(members)

        rows.append({
            'Set_Name': name,
            'Variable_Count': info.get('variable_count', member_total),
            'Common_Variables': info.get('common_variables', 0),
            'Final_Variables': info.get('final_variables', 0),
            'Optimal_Variables': info.get('optimal_variables', 0),
            'Consensus_Score': info.get('consensus_score', 0),
            'Variables': ', '.join(str(member) for member in members) if member_total > 0 else '',
        })

    return pd.DataFrame(rows)

# Create optimized variable sets
print("Creating 2 optimized variable sets as arrays...")
optimized_sets, variable_scores = create_optimized_variable_sets(variable_comparison)

# Create variable sets table
sets_table = create_variable_sets_table(optimized_sets)

print(f"\nOptimized Variable Sets:")
print(sets_table.to_string(index=False))

# Display the sets in the requested array format
print(f"\nVariable Sets as Arrays (with Variable Name Cleaning Applied):")
print("# Variable Sets as NumPy Arrays")
for set_name in ["Set 1 - Common Variables", "Set 2 - Best Performing"]:
    if set_name in optimized_sets:
        variables = optimized_sets[set_name]
        # .tolist() yields plain Python scalars. list(array) keeps NumPy scalars,
        # which NumPy >= 2 prints as "np.int64(12)" and would make the printed
        # snippet unusable for copy-paste.
        variable_values = variables.tolist()
        print(f"\n{set_name}:")
        print(f"variables_{set_name.lower().replace(' ', '_').replace('-', '_')} = np.array({variable_values})")
        print(f"# Array shape: {variables.shape}")
        print(f"# Variables: {variable_values}")

# Display variable scores for reference
print(f"\nVariable Scores (Top 10):")
sorted_scores = sorted(variable_scores.items(), key=lambda x: x[1], reverse=True)
for i, (var, score) in enumerate(sorted_scores[:10]):
    print(f"{i+1:2d}. {var}: {score:.1f}")


## 6. Save Results


In [None]:
# =============================================================================
# SAVE RESULTS
# =============================================================================

# Everything below writes into results_path; file names are tagged with
# specie / training / bio.
if not savefig:
    print("Skipping file saving (savefig=False)")
else:
    print("Saving results...")

    saved_set_names = ("Set 1 - Common Variables", "Set 2 - Best Performing")

    # Per-iteration comparison table (skipped when there is nothing to write)
    if not comparison_table.empty:
        comparison_file = os.path.join(results_path, f"08_variable_comparison_{specie}_{training}_{bio}.csv")
        comparison_table.to_csv(comparison_file, index=False)
        print(f"Variable comparison table saved to: {comparison_file}")

    # Optimized sets table (skipped when there is nothing to write)
    if not sets_table.empty:
        sets_file = os.path.join(results_path, f"08_optimized_variable_sets_{specie}_{training}_{bio}.csv")
        sets_table.to_csv(sets_file, index=False)
        print(f"Optimized variable sets table saved to: {sets_file}")

    # Non-empty optimized sets as plain lists (NumPy arrays are not JSON serializable)
    json_sets = {
        name: optimized_sets[name].tolist()
        for name in saved_set_names
        if name in optimized_sets and len(optimized_sets[name]) > 0
    }
    json_file = os.path.join(results_path, f"08_optimized_variable_sets_{specie}_{training}_{bio}.json")
    with open(json_file, 'w') as f:
        json.dump(json_sets, f, indent=2)
    print(f"Optimized variable sets JSON saved to: {json_file}")

    # Full analysis dump: comparison, the 2 sets (metadata entries excluded) and scores
    optimized_sets_only = {
        name: (
            optimized_sets[name].tolist()
            if isinstance(optimized_sets[name], np.ndarray)
            else optimized_sets[name]
        )
        for name in saved_set_names
        if name in optimized_sets
    }
    comprehensive_file = os.path.join(results_path, f"08_comprehensive_variable_analysis_{specie}_{training}_{bio}.json")
    comprehensive_data = {
        'variable_comparison': variable_comparison,
        'optimized_sets': optimized_sets_only,
        'variable_scores': variable_scores
    }
    with open(comprehensive_file, 'w') as f:
        # default=str stringifies any value json cannot encode natively
        json.dump(comprehensive_data, f, indent=2, default=str)
    print(f"Comprehensive variable analysis saved to: {comprehensive_file}")


## 7. Final Summary


In [None]:
# =============================================================================
# FINAL SUMMARY
# =============================================================================

print("="*100)
print("VARIABLE SETS COMPARISON COMPLETED")
print("="*100)

if variable_comparison and optimized_sets:
    print(f"\nANALYSIS SUMMARY:")
    print(f"• Iterations Analyzed: {len(variable_comparison)}")
    print(f"• Comparison Tables Created: 2")
    print(f"• Optimized Variable Sets: 2 (as NumPy arrays)")
    print(f"• Variable Names Cleaned: Yes")
    print(f"• Prefixes Removed: Yes")
    print(f"• SRTM/NDVI Filtered Out: Yes")
    print(f"• Converted to Numeric: Yes")

    print(f"\nOPTIMIZED VARIABLE SETS (ARRAYS):")
    # NOTE(review): `sets` is not used in this cell and its keys ("Set2"/"Set3")
    # do not match the set names — presumably consumed elsewhere; confirm before renaming.
    sets = {
        "Set2": list(optimized_sets.get("Set 1 - Common Variables", [])),
        "Set3": list(optimized_sets.get("Set 2 - Best Performing", [])),
    }

    for set_name in ["Set 1 - Common Variables", "Set 2 - Best Performing"]:
        if set_name in optimized_sets:
            variables = optimized_sets[set_name]
            metadata = optimized_sets.get(f"{set_name}_metadata", {})
            print(f"• {set_name}: {len(variables)} variables")
            # .tolist() prints plain numbers; list(array) would show NumPy
            # scalar reprs such as "np.int64(12)" on NumPy >= 2.
            print(f"  - Array: {variables.tolist()}")
            print(f"  - Consensus Score: {metadata.get('consensus_score', 0):.2f}")

    # Only advertise output locations when the save cell was allowed to write
    if savefig:
        print(f"\nOUTPUT FILES:")
        print(f"• CSV Tables: {results_path}")
        print(f"• JSON Files: {results_path}")

    print(f"\nKEY INSIGHTS:")
    if not comparison_table.empty:
        avg_overlap = comparison_table['Overlap_Percentage'].mean()
        avg_final = comparison_table['Final_Variables_Count'].mean()
        avg_optimal = comparison_table['Optimal_Variables_Count'].mean()

        print(f"• Average Variable Overlap: {avg_overlap:.1f}%")
        print(f"• Average Final Variables: {avg_final:.1f}")
        print(f"• Average Optimal Variables: {avg_optimal:.1f}")

        # Thresholds: > 70 % high, > 50 % moderate, otherwise low
        if avg_overlap > 70:
            print(f"• Integration Level: HIGH - Strong consensus between approaches")
        elif avg_overlap > 50:
            print(f"• Integration Level: MODERATE - Some consensus between approaches")
        else:
            print(f"• Integration Level: LOW - Limited consensus between approaches")

    print(f"\nRECOMMENDATIONS:")
    print(f"1. Use Set 1 for maximum consensus (common variables)")
    print(f"2. Use Set 2 for best performance (highest scoring variables)")
    print(f"3. All variable names are cleaned and ready for use")
    print(f"4. Variables are provided as numeric arrays for easy integration")
    print(f"5. SRTM and NDVI variables have been filtered out")

    closing_banner = "VARIABLE SETS COMPARISON COMPLETED SUCCESSFULLY"

else:
    print(f"\nNO DATA FOUND:")
    print(f"• No comprehensive analysis results found")
    print(f"• Please run the main analysis notebook first")
    print(f"• Ensure JSON files are saved in the correct location")

    # Do not claim success when nothing was analysed
    closing_banner = "VARIABLE SETS COMPARISON FINISHED WITHOUT RESULTS"

print(f"\n" + "="*100)
print(closing_banner)
print("="*100)
