# Variable Sets Comparison: Final Variables vs Optimal Variables

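This notebook compares the `final_variable_set` from Variable Importance Analysis with `optimal_variables` from Spatial Spread Analysis and builds optimized variable sets as NumPy arrays. Three sets are planned; the code currently generates two ("Common Variables" and "Best Performing"), because the third ("Balanced Approach") is commented out.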

## Overview:
- **Variable Comparison**: Compare final variables from both approaches
- **Overlap Analysis**: Analyze common and unique variables
- **Performance Assessment**: Evaluate variable set performance
- **3 Variable Sets Creation**: Generate optimized sets as arrays
- **Variable Name Cleaning**: Remove prefixes and handle special variables


## 1. Configuration and Setup


In [None]:
# =============================================================================
# CONFIGURATION AND SETUP
# =============================================================================

import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

# Configuration
# NOTE(review): `specie`, `training` and `bio` are commented out here, yet the
# loading and saving cells further down reference them. They must be defined
# before those cells run — presumably they are injected by a driver or a
# parameterised run; confirm, or uncomment and set them here.
#specie = 'leptocybe-invasa'  # Target species: 'leptocybe-invasa' or 'thaumastocoris-peregrinus'
# training = 'south-east-asia' 
# bio = bio1  # Bioclimatic variable identifier
savefig = True  # Gates the whole "Save Results" cell (CSV/JSON/.py outputs), not just figures

# Paths
# All three locations are siblings of the current working directory's parent.
# `base_path` currently points at the same directory as `figs_path`.
base_path = os.path.join(os.path.dirname(os.getcwd()), 'figs')
figs_path = os.path.join(os.path.dirname(os.getcwd()), 'figs')  # Figures directory
results_path = os.path.join(os.path.dirname(os.getcwd()), 'results')  # Results directory
# results_path = os.path.join(base_path, "results")

# Create directories if they don't exist
os.makedirs(figs_path, exist_ok=True)
os.makedirs(results_path, exist_ok=True)

## 2. Variable Name Cleaning Functions


In [None]:
# =============================================================================
# VARIABLE NAME CLEANING FUNCTIONS
# =============================================================================

def clean_variable_name(var_name):
    """
    Reduce a raw variable name to its numeric identifier.

    At most one known prefix is stripped (the first one in the list that
    matches). Names that mention SRTM or NDVI, and names whose remainder is
    not an integer, are rejected.

    Parameters:
    -----------
    var_name : str
        Original variable name

    Returns:
    --------
    int or None
        Numeric variable value, or None when the variable must be dropped
        (SRTM/NDVI, or a remainder that cannot be parsed as an integer)
    """
    # Ordered so that the compound prefix is tried before its components.
    known_prefixes = (
        'ensemble_mean_bioclim_',
        'bioclim_',
        'ensemble_mean_',
        'mean_',
        'std_',
        'min_',
        'max_',
    )

    matched_prefix = next(
        (prefix for prefix in known_prefixes if var_name.startswith(prefix)),
        None,
    )
    stem = var_name if matched_prefix is None else var_name[len(matched_prefix):]

    # Terrain / vegetation layers are not part of the numeric variable sets.
    lowered_stem = stem.lower()
    if 'srtm' in lowered_stem or 'ndvi' in lowered_stem:
        return None

    # Drop leading zeros; an all-zero or empty remainder collapses to "0".
    digits = stem.lstrip('0') or '0'
    try:
        return int(digits)
    except ValueError:
        # Not a number after cleaning: signal "filter out".
        return None

def clean_variable_list(var_list):
    """
    Apply ``clean_variable_name`` to every entry and keep the usable results.

    Parameters:
    -----------
    var_list : list
        List of variable names

    Returns:
    --------
    list
        Numeric variable values in input order; entries that cleaned to None
        (SRTM/NDVI or non-numeric names) are omitted
    """
    cleaned_candidates = (clean_variable_name(raw_name) for raw_name in var_list)
    # Compare against None explicitly: 0 is a valid cleaned value.
    return [value for value in cleaned_candidates if value is not None]

def remove_duplicates_preserve_order(var_list):
    """
    Return the distinct entries of ``var_list`` in first-occurrence order.

    Parameters:
    -----------
    var_list : list
        List of variables (may contain duplicates); entries must be hashable

    Returns:
    --------
    list
        List without duplicates
    """
    # dict keys are unique and keep insertion order, so the first occurrence
    # of each value determines its position.
    return list(dict.fromkeys(var_list))

## 3. Load Comprehensive Analysis Results


In [None]:
# =============================================================================
# LOAD COMPREHENSIVE ANALYSIS RESULTS
# =============================================================================

def load_comprehensive_analysis_results(base_path, specie, training, bio):
    """
    Load comprehensive analysis summaries (JSON) for every available iteration.

    Files named ``06_comprehensive_analysis_summary_{specie}_{training}_{bio}_*.json``
    are searched in the ``figs`` directory next to the current working
    directory's parent. If none match, every ``*comprehensive*.json`` file in
    that directory is loaded instead. Files are processed in sorted path order
    so the resulting dictionary order is reproducible across runs and
    platforms. Files that cannot be read or parsed are reported and skipped.

    Parameters:
    -----------
    base_path : str
        Base path to the project directory.
        NOTE(review): accepted for interface compatibility but not consulted;
        the search directory is derived from the current working directory.
    specie : str
        Species name
    training : str
        Training region
    bio : str
        Bio variables type

    Returns:
    --------
    results_dict : dict
        Maps an iteration key (``iteration_<n>`` when the filename contains an
        iteration number, otherwise the filename without ``.json``) to the
        parsed JSON content
    """
    # Function-scope imports: the notebook's import cell does not provide these.
    import glob
    import re

    results_dict = {}
    figs_path = os.path.join(os.path.dirname(os.getcwd()), 'figs')  # Figures directory

    # Look for comprehensive analysis summary files
    pattern = f"06_comprehensive_analysis_summary_{specie}_{training}_{bio}_*.json"

    print(f"Looking for files matching pattern: {pattern}")

    # glob returns matches in arbitrary order; sort for a stable load order.
    files = sorted(glob.glob(os.path.join(figs_path, pattern)))

    if not files:
        print(f"No comprehensive analysis summary files found in {figs_path}")
        print(f"Looking for any JSON files with 'comprehensive' in the name...")

        # Broader fallback search; may pick up summaries of other runs.
        all_json_files = sorted(glob.glob(os.path.join(figs_path, "*comprehensive*.json")))
        print(f"Found {len(all_json_files)} files with 'comprehensive' in name:")
        for json_path in all_json_files:
            print(f"  - {os.path.basename(json_path)}")

        files = all_json_files

    print(f"Found {len(files)} comprehensive analysis summary files")

    # Compiled once instead of re-importing/re-building inside the loop.
    iteration_regex = re.compile(r'iteration[_-]?(\d+)', re.IGNORECASE)

    for file_path in files:
        filename = os.path.basename(file_path)

        # Derive the dictionary key from the filename.
        match = iteration_regex.search(filename)
        if match:
            iteration = f"iteration_{match.group(1)}"
        else:
            iteration = filename.replace('.json', '')

        try:
            # JSON is UTF-8 by specification; do not rely on the locale default.
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"‚úó Error loading {file_path}: {str(e)}")
            continue

        if iteration in results_dict:
            # Two files resolved to the same key; the later one wins.
            print(f"Warning: {filename} replaces previously loaded data for {iteration}")

        results_dict[iteration] = data
        print(f"‚úì Loaded {iteration} from {filename}")

    return results_dict

# Load results
# NOTE(review): `specie`, `training` and `bio` are not assigned anywhere in the
# visible configuration (their assignments are commented out); this call
# raises NameError unless they are defined beforehand.
print("Loading comprehensive analysis results...")
comprehensive_results = load_comprehensive_analysis_results(base_path, specie, training, bio)

# List the iteration keys that were loaded.
print(f"\nLoaded {len(comprehensive_results)} comprehensive analysis results")
for iteration in comprehensive_results.keys():
    print(f"  - {iteration}")


## 4. Extract and Compare Variable Sets


In [None]:
# =============================================================================
# EXTRACT AND COMPARE VARIABLE SETS
# =============================================================================

def extract_variable_sets(comprehensive_results):
    """
    Extract final_variable_set and optimal_variables from comprehensive results
    and compare them per iteration.

    Variable names are cleaned with ``clean_variable_list`` and de-duplicated
    with ``remove_duplicates_preserve_order``. The common / unique lists keep
    the order in which the variables appear in their source list, so the
    outcome does not depend on set iteration order. Missing or null summary
    sections are treated as empty.

    Parameters:
    -----------
    comprehensive_results : dict
        Dictionary containing comprehensive results from all iterations

    Returns:
    --------
    variable_comparison : dict
        Maps each iteration key to a dict with the keys 'final_variables',
        'optimal_variables', 'common_variables', 'final_only', 'optimal_only',
        'overlap_percentage', 'total_final', 'total_optimal', 'total_common'
    """

    variable_comparison = {}

    for iteration, results in comprehensive_results.items():
        # `or` also covers sections that are present but null in the JSON.
        importance_summary = results.get('variable_importance_summary') or {}
        spread_summary = results.get('spatial_spread_summary') or {}

        # Extract final variables from variable importance analysis
        final_variables_raw = importance_summary.get('final_variable_set') or []

        # Extract optimal variables from spatial spread analysis
        optimal_variables_raw = spread_summary.get('optimal_variables') or []

        # Clean variable names, then remove duplicates while preserving order
        final_variables = remove_duplicates_preserve_order(
            clean_variable_list(final_variables_raw)
        )
        optimal_variables = remove_duplicates_preserve_order(
            clean_variable_list(optimal_variables_raw)
        )

        # Sets are used for membership tests only; the result lists are built
        # from the ordered source lists to keep a meaningful, stable order.
        final_set = set(final_variables)
        optimal_set = set(optimal_variables)

        common_variables = [var for var in final_variables if var in optimal_set]
        final_only = [var for var in final_variables if var not in optimal_set]
        optimal_only = [var for var in optimal_variables if var not in final_set]

        # Overlap = shared variables as a percentage of all distinct variables.
        total_unique = len(final_set | optimal_set)
        overlap_percentage = (len(common_variables) / total_unique * 100) if total_unique > 0 else 0

        variable_comparison[iteration] = {
            'final_variables': final_variables,
            'optimal_variables': optimal_variables,
            'common_variables': common_variables,
            'final_only': final_only,
            'optimal_only': optimal_only,
            'overlap_percentage': overlap_percentage,
            'total_final': len(final_variables),
            'total_optimal': len(optimal_variables),
            'total_common': len(common_variables)
        }

    return variable_comparison

def create_variable_comparison_table(variable_comparison):
    """
    Flatten the per-iteration comparison into one table row per iteration.

    Parameters:
    -----------
    variable_comparison : dict
        Dictionary containing variable sets comparison

    Returns:
    --------
    df : pandas.DataFrame
        Comparison table; list-valued fields are rendered as ", "-joined text
    """

    def _as_text(values):
        # Render a list of variables as "a, b, c".
        return ', '.join(str(value) for value in values)

    rows = [
        {
            'Iteration': iteration_key,
            'Final_Variables_Count': entry['total_final'],
            'Optimal_Variables_Count': entry['total_optimal'],
            'Common_Variables_Count': entry['total_common'],
            'Overlap_Percentage': entry['overlap_percentage'],
            'Final_Variables': _as_text(entry['final_variables']),
            'Optimal_Variables': _as_text(entry['optimal_variables']),
            'Common_Variables': _as_text(entry['common_variables']),
            'Final_Only': _as_text(entry['final_only']),
            'Optimal_Only': _as_text(entry['optimal_only']),
        }
        for iteration_key, entry in variable_comparison.items()
    ]

    return pd.DataFrame(rows)

# Extract variable sets
# Builds the per-iteration comparison from the results loaded earlier.
print("Extracting variable sets from comprehensive results...")
variable_comparison = extract_variable_sets(comprehensive_results)

# Create comparison table
comparison_table = create_variable_comparison_table(variable_comparison)

print(f"\nüìä Variable Sets Comparison Table:")
print(comparison_table.to_string(index=False))

# Display summary statistics
# Skipped when no iteration was loaded, because the count columns would not exist.
if not comparison_table.empty:
    print(f"\nüìà Summary Statistics:")
    print(f"‚Ä¢ Average Final Variables: {comparison_table['Final_Variables_Count'].mean():.1f}")
    print(f"‚Ä¢ Average Optimal Variables: {comparison_table['Optimal_Variables_Count'].mean():.1f}")
    print(f"‚Ä¢ Average Common Variables: {comparison_table['Common_Variables_Count'].mean():.1f}")
    print(f"‚Ä¢ Average Overlap: {comparison_table['Overlap_Percentage'].mean():.1f}%")


## 5. Create 3 Optimized Variable Sets as Arrays


In [None]:
# =============================================================================
# CREATE 3 OPTIMIZED VARIABLE SETS AS ARRAYS
# =============================================================================

def create_optimized_variable_sets(variable_comparison, n_common=5, n_best=6):
    """
    Create optimized variable sets as arrays based on analysis results.

    Each variable is scored across all iterations as
    ``3 * common_count + final_count + optimal_count``, where the counts are
    the number of iterations in which it appears in the respective list.
    Ties are resolved by first appearance (final variables first, then
    optimal variables, in iteration order), so the selection is reproducible.

    Parameters:
    -----------
    variable_comparison : dict
        Dictionary containing variable sets comparison; every entry must
        provide 'final_variables', 'optimal_variables' and 'common_variables'
    n_common : int, optional
        Size limit of "Set 1 - Common Variables" (default 5)
    n_best : int, optional
        Size limit of "Set 2 - Best Performing" (default 6)

    Returns:
    --------
    optimized_sets : dict
        "Set 1 - Common Variables" and "Set 2 - Best Performing" map to NumPy
        arrays; "<set name>_metadata" maps to a dict with 'variable_count',
        'common_variables', 'final_variables', 'optimal_variables' and
        'consensus_score'
    variable_scores : dict
        Maps every variable to its combined score

    Raises:
    -------
    ValueError
        If ``n_common`` or ``n_best`` is negative
    """
    from collections import Counter

    if n_common < 0 or n_best < 0:
        raise ValueError("n_common and n_best must be non-negative")

    # Collect all variables from all iterations
    all_final_variables = []
    all_optimal_variables = []
    all_common_variables = []

    for comparison in variable_comparison.values():
        all_final_variables.extend(comparison['final_variables'])
        all_optimal_variables.extend(comparison['optimal_variables'])
        all_common_variables.extend(comparison['common_variables'])

    # Count in how many iterations each variable occurs per category
    final_freq = Counter(all_final_variables)
    optimal_freq = Counter(all_optimal_variables)
    common_freq = Counter(all_common_variables)

    # Distinct variables in first-seen order. A plain set would make the order
    # of equally scored variables depend on hash iteration order.
    all_variables = dict.fromkeys(all_final_variables + all_optimal_variables)

    # Combined score: common variables get the highest weight. Looking up a
    # missing key in a Counter yields 0 without inserting it.
    variable_scores = {
        var: common_freq[var] * 3 + final_freq[var] + optimal_freq[var]
        for var in all_variables
    }

    # Stable sort: equal scores keep their first-seen order.
    sorted_variables = sorted(variable_scores.items(), key=lambda item: item[1], reverse=True)

    optimized_sets = {}

    # Set 1: Most frequent common variables (highest consensus)
    common_vars_sorted = sorted(common_freq.items(), key=lambda item: item[1], reverse=True)
    set1_vars = [var for var, _ in common_vars_sorted[:n_common]]
    optimized_sets["Set 1 - Common Variables"] = np.array(set1_vars)

    # Set 2: Best performing variables (highest combined score)
    set2_vars = [var for var, _ in sorted_variables[:n_best]]
    optimized_sets["Set 2 - Best Performing"] = np.array(set2_vars)

    # Set 3 ("Balanced Approach": top 3 common + top 3 remaining high scorers)
    # is currently disabled:
    # common_top3 = [var for var, freq in common_vars_sorted[:3]]
    # unique_vars = [var for var, score in sorted_variables if var not in common_top3][:3]
    # set3_vars = common_top3 + unique_vars
    # optimized_sets["Set 3 - Balanced Approach"] = np.array(set3_vars)

    # Metadata is collected separately so the dict is not modified while it is
    # being iterated.
    metadata_dict = {}
    for set_name, variables in optimized_sets.items():
        common_count = sum(1 for var in variables if var in common_freq)
        final_count = sum(1 for var in variables if var in final_freq)
        optimal_count = sum(1 for var in variables if var in optimal_freq)

        metadata_dict[f"{set_name}_metadata"] = {
            'variable_count': len(variables),
            'common_variables': common_count,
            'final_variables': final_count,
            'optimal_variables': optimal_count,
            # Share of the set's variables that both approaches agree on.
            'consensus_score': common_count / len(variables) if len(variables) > 0 else 0
        }

    optimized_sets.update(metadata_dict)

    return optimized_sets, variable_scores

def create_variable_sets_table(optimized_sets):
    """
    Summarise the optimized variable sets as one table row per set.

    Only "Set 1 - Common Variables" and "Set 2 - Best Performing" are
    reported ("Set 3 - Balanced Approach" is currently not listed); a set
    that is absent from ``optimized_sets`` is skipped.

    Parameters:
    -----------
    optimized_sets : dict
        Dictionary containing optimized variable sets and, optionally, a
        "<set name>_metadata" dict per set

    Returns:
    --------
    df : pandas.DataFrame
        Variable sets table
    """
    reported_sets = ("Set 1 - Common Variables", "Set 2 - Best Performing")

    rows = []
    for set_name in reported_sets:
        if set_name not in optimized_sets:
            continue

        members = optimized_sets[set_name]
        info = optimized_sets.get(f"{set_name}_metadata", {})
        member_count = len(members)

        rows.append({
            'Set_Name': set_name,
            # Metadata wins; the actual length is the fallback.
            'Variable_Count': info.get('variable_count', member_count),
            'Common_Variables': info.get('common_variables', 0),
            'Final_Variables': info.get('final_variables', 0),
            'Optimal_Variables': info.get('optimal_variables', 0),
            'Consensus_Score': info.get('consensus_score', 0),
            'Variables': ', '.join(map(str, members)) if member_count > 0 else '',
        })

    return pd.DataFrame(rows)

# Create optimized variable sets
print("Creating 3 optimized variable sets as arrays...")
optimized_sets, variable_scores = create_optimized_variable_sets(variable_comparison)

# Create variable sets table
sets_table = create_variable_sets_table(optimized_sets)

print(f"\nüéØ Optimized Variable Sets:")
print(sets_table.to_string(index=False))

# Display the sets in the requested array format
print(f"\nüìã Variable Sets as Arrays (with Variable Name Cleaning Applied):")
print("# Variable Sets as NumPy Arrays")
# for set_name in ["Set 1 - Common Variables", "Set 2 - Best Performing", "Set 3 - Balanced Approach"]:
for set_name in ["Set 1 - Common Variables", "Set 2 - Best Performing"]:
    if set_name in optimized_sets:
        variables = optimized_sets[set_name]
        # .tolist() yields built-in Python scalars. list(ndarray) keeps NumPy
        # scalars, whose repr under NumPy >= 2 is e.g. "np.int64(3)" and would
        # end up inside the printed snippet.
        variable_values = variables.tolist()
        print(f"\n{set_name}:")
        print(f"variables_{set_name.lower().replace(' ', '_').replace('-', '_')} = np.array({variable_values})")
        print(f"# Array shape: {variables.shape}")
        print(f"# Variables: {variable_values}")

# Display variable scores for reference
print(f"\nüìä Variable Scores (Top 10):")
sorted_scores = sorted(variable_scores.items(), key=lambda x: x[1], reverse=True)
for i, (var, score) in enumerate(sorted_scores[:10]):
    print(f"{i+1:2d}. {var}: {score:.1f}")


## 6. Save Results


In [None]:
# =============================================================================
# SAVE RESULTS
# =============================================================================

if savefig:
    print("Saving results...")

    # Save comparison table
    if not comparison_table.empty:
        comparison_file = os.path.join(results_path, f"08_variable_comparison_{specie}_{training}_{bio}.csv")
        comparison_table.to_csv(comparison_file, index=False)
        print(f"‚úì Variable comparison table saved to: {comparison_file}")

    # Save variable sets table
    if not sets_table.empty:
        sets_file = os.path.join(results_path, f"08_optimized_variable_sets_{specie}_{training}_{bio}.csv")
        sets_table.to_csv(sets_file, index=False)
        print(f"‚úì Optimized variable sets table saved to: {sets_file}")

    # Save optimized sets in JSON format (convert numpy arrays to lists)
    json_sets = {}
    # for set_name in ["Set 1 - Common Variables", "Set 2 - Best Performing", "Set 3 - Balanced Approach"]:
    for set_name in ["Set 1 - Common Variables", "Set 2 - Best Performing"]:
        if set_name in optimized_sets and len(optimized_sets[set_name]) > 0:
            # Convert numpy array to list for JSON serialization
            json_sets[set_name] = optimized_sets[set_name].tolist()

    json_file = os.path.join(results_path, f"08_optimized_variable_sets_{specie}_{training}_{bio}.json")
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(json_sets, f, indent=2)
    print(f"‚úì Optimized variable sets JSON saved to: {json_file}")

    # Save variable scores (non-string keys are written as JSON strings)
    scores_file = os.path.join(results_path, f"08_variable_scores_{specie}_{training}_{bio}.json")
    with open(scores_file, 'w', encoding='utf-8') as f:
        json.dump(variable_scores, f, indent=2)
    print(f"‚úì Variable scores saved to: {scores_file}")

    # Save comprehensive results
    comprehensive_file = os.path.join(results_path, f"08_comprehensive_variable_analysis_{specie}_{training}_{bio}.json")
    comprehensive_data = {
        'variable_comparison': variable_comparison,
        'optimized_sets': {k: v.tolist() if isinstance(v, np.ndarray) else v for k, v in optimized_sets.items()},
        'variable_scores': variable_scores
    }
    with open(comprehensive_file, 'w', encoding='utf-8') as f:
        # default=str stringifies anything json cannot encode natively.
        json.dump(comprehensive_data, f, indent=2, default=str)
    print(f"‚úì Comprehensive variable analysis saved to: {comprehensive_file}")

    # Save Python code for the variable sets
    python_file = os.path.join(results_path, f"08_variable_sets_code_{specie}_{training}_{bio}.py")
    with open(python_file, 'w', encoding='utf-8') as f:
        # Real newlines ("\n"): an escaped "\\n" would write a literal
        # backslash-n and collapse the generated module into a single comment
        # line.
        f.write("# Optimized Variable Sets (with Variable Name Cleaning Applied)\n")
        f.write("import numpy as np\n\n")

        # for set_name in ["Set 1 - Common Variables", "Set 2 - Best Performing", "Set 3 - Balanced Approach"]:
        for set_name in ["Set 1 - Common Variables", "Set 2 - Best Performing"]:
            if set_name in optimized_sets and len(optimized_sets[set_name]) > 0:
                variables = optimized_sets[set_name]
                var_name = set_name.lower().replace(' ', '_').replace('-', '_')
                f.write(f"# {set_name}\n")
                # .tolist() gives built-in scalars so the emitted literal reads
                # "[1, 2]" rather than "[np.int64(1), np.int64(2)]" (NumPy >= 2).
                f.write(f"variables_{var_name} = np.array({variables.tolist()})\n\n")

    print(f"‚úì Python code for variable sets saved to: {python_file}")

else:
    print("Skipping file saving (savefig=False)")


## 7. Final Summary


In [None]:
# =============================================================================
# FINAL SUMMARY
# =============================================================================

print("="*100)
print("üéØ VARIABLE SETS COMPARISON COMPLETED")
print("="*100)

# Remember which branch ran so the closing banner does not announce success
# right after reporting that no data was found.
analysis_succeeded = bool(variable_comparison and optimized_sets)

if analysis_succeeded:
    # Report only the sets that were actually generated; "Set 3" appears here
    # automatically once it is produced.
    produced_set_names = [
        name
        for name in ["Set 1 - Common Variables", "Set 2 - Best Performing", "Set 3 - Balanced Approach"]
        if name in optimized_sets
    ]

    print(f"\nüìä ANALYSIS SUMMARY:")
    print(f"‚Ä¢ Iterations Analyzed: {len(variable_comparison)}")
    print(f"‚Ä¢ Comparison Tables Created: 2")
    print(f"‚Ä¢ Optimized Variable Sets: {len(produced_set_names)} (as NumPy arrays)")
    print(f"‚Ä¢ Variable Names Cleaned: ‚úì")
    print(f"‚Ä¢ Prefixes Removed: ‚úì")
    print(f"‚Ä¢ SRTM/NDVI Filtered Out: ‚úì")
    print(f"‚Ä¢ Converted to Numeric: ‚úì")

    print(f"\nüéØ OPTIMIZED VARIABLE SETS (ARRAYS):")
    # NOTE(review): the keys "Set2"/"Set3" hold Set 1 / Set 2 respectively and
    # the dict is not used again in this cell — presumably consumed elsewhere;
    # confirm the intended naming.
    sets = {
        "Set2": list(optimized_sets.get("Set 1 - Common Variables", [])),
        "Set3": list(optimized_sets.get("Set 2 - Best Performing", [])),
    }

    for set_name in produced_set_names:
        variables = optimized_sets[set_name]
        metadata = optimized_sets.get(f"{set_name}_metadata", {})
        print(f"‚Ä¢ {set_name}: {len(variables)} variables")
        # .tolist() prints built-in scalars instead of NumPy scalar reprs.
        print(f"  - Array: {variables.tolist()}")
        print(f"  - Consensus Score: {metadata.get('consensus_score', 0):.2f}")

    print(f"\nüìÅ OUTPUT FILES:")
    print(f"‚Ä¢ CSV Tables: {results_path}")
    print(f"‚Ä¢ JSON Files: {results_path}")
    print(f"‚Ä¢ Python Code: {results_path}")

    print(f"\nüîç KEY INSIGHTS:")
    if not comparison_table.empty:
        avg_overlap = comparison_table['Overlap_Percentage'].mean()
        avg_final = comparison_table['Final_Variables_Count'].mean()
        avg_optimal = comparison_table['Optimal_Variables_Count'].mean()

        print(f"‚Ä¢ Average Variable Overlap: {avg_overlap:.1f}%")
        print(f"‚Ä¢ Average Final Variables: {avg_final:.1f}")
        print(f"‚Ä¢ Average Optimal Variables: {avg_optimal:.1f}")

        # Thresholds: above 70% high, above 50% moderate, otherwise low.
        if avg_overlap > 70:
            print(f"‚Ä¢ Integration Level: HIGH - Strong consensus between approaches")
        elif avg_overlap > 50:
            print(f"‚Ä¢ Integration Level: MODERATE - Some consensus between approaches")
        else:
            print(f"‚Ä¢ Integration Level: LOW - Limited consensus between approaches")

    print(f"\nüéØ RECOMMENDATIONS:")
    # Recommend a set only when it exists, then number the list consecutively.
    set_recommendations = {
        "Set 1 - Common Variables": "Use Set 1 for maximum consensus (common variables)",
        "Set 2 - Best Performing": "Use Set 2 for best performance (highest scoring variables)",
        "Set 3 - Balanced Approach": "Use Set 3 for balanced approach (mix of common and unique)",
    }
    recommendations = [set_recommendations[name] for name in produced_set_names]
    recommendations.extend([
        "All variable names are cleaned and ready for use",
        "Variables are provided as numeric arrays for easy integration",
        "SRTM and NDVI variables have been filtered out",
    ])
    for number, recommendation in enumerate(recommendations, start=1):
        print(f"{number}. {recommendation}")

else:
    print(f"\n‚ö†Ô∏è  NO DATA FOUND:")
    print(f"‚Ä¢ No comprehensive analysis results found")
    print(f"‚Ä¢ Please run the main analysis notebook first")
    print(f"‚Ä¢ Ensure JSON files are saved in the correct location")

print(f"\n" + "="*100)
if analysis_succeeded:
    print("‚úÖ VARIABLE SETS COMPARISON COMPLETED SUCCESSFULLY")
else:
    print("VARIABLE SETS COMPARISON FINISHED WITHOUT DATA")
print("="*100)
