# Predictive Variables

In [None]:
############### CONFIGURATION PARAMETERS ###############
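# Reference list of the parameters the cells below rely on. Every assignment here is
# commented out, so these names must already be defined when the notebook runs
# (presumably injected by a driver notebook or a parameterization tool - confirm).
# Uncomment and edit them to run this notebook on its own.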

# Species name for the analysis
# specie = 'leptocybe-invasa'  # alternative: 'thaumastocoris-peregrinus'

# Geographic region for analysis
# region = region_train
# region = 'south-east-asia'  # alternative: 'east-asia'

# training = 'south-east-asia'

# Save figures flag (True to save plots, False to only display)
# savefig = True

# Time period for multi-year mean calculations
# year00 = year0 # start year multiyear mean
# year11 = year1 # end year multiyear mean

# Future climate scenario flag (True for future projections, False for historical)
# Future = False

# models = ["CHELSA"]

# project_map = {
#     'hist': 'historical',
#     'future': 'ssp370'
# }

# bioclim_folder = "/scratch/gito_aciar/data/CHELSA/chelsav2/GLOBAL/climatologies/"
###########################################################

In [None]:
# Import required libraries for geospatial data processing and visualization

# Standard library imports
import os
import shutil
import time
import sys
import glob
import subprocess

# Scientific computing libraries
import numpy as np
import pandas as pd

# Geospatial data processing libraries
import xarray as xr
import rioxarray  # For raster I/O with xarray
import geopandas as gpd  # For vector data processing
from geocube.api.core import make_geocube  # For raster-vector operations
from shapely.geometry import Polygon  # For geometric operations

# Raster processing libraries
import rasterio  # For low-level raster operations
from rasterio.warp import reproject, Resampling  # For raster reprojection
from rasterio.enums import Resampling  # For resampling methods

# Visualization libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import cartopy.io.shapereader as shapereader  # For map data

# Climate data processing
import cdo  # Climate Data Operators for netCDF processing

# Use the same enlarged font size for every text element drawn around the data:
# legend entries, axis labels, axis titles and both sets of tick labels.
params = dict.fromkeys(
    (
        'legend.fontsize',
        'axes.labelsize',
        'axes.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
    ),
    'x-large',
)
plt.rcParams.update(params)

In [None]:
# Named study regions for species distribution modelling.
# Keys are region identifiers; values are the country/territory names that make up the region.
# NOTE(review): 'india-sri-lanka' only lists Sri Lanka - confirm whether India is omitted on purpose.
regions = {
    'east-asia': [
        'China', 'Taiwan', 'Japan', 'North Korea', 'South Korea',
    ],
    'indo': [
        'Indonesia', 'Malaysia', 'Singapore', 'Brunei', 'East Timor',
    ],
    'sea': [
        'Myanmar', 'Cambodia', 'Laos', 'Philippines', 'Thailand', 'Vietnam',
    ],
    'south-east-asia': [
        'Brunei', 'Myanmar', 'Cambodia', 'East Timor', 'Indonesia', 'Laos',
        'Malaysia', 'Philippines', 'Singapore', 'Thailand', 'Vietnam',
    ],
    'australia': [
        'Australia',
    ],
    'australasia': [
        'Australia', 'New Zealand',
    ],
    'india-sri-lanka': [
        'Sri Lanka',
    ],
    'all': [
        'Australia', 'France', 'Italy', 'Portugal', 'South Africa',
        'United States of America', 'Madagascar', 'Spain', 'Greece', 'Cyprus',
        'Mexico', 'Kenya', 'Algeria', 'Israel', 'Egypt', 'Ethiopia', 'Ghana',
        'Malawi', 'Mauritius', 'Morocco', 'Mozambique', 'Rwanda', 'Sierra Leone',
        'United Republic of Tanzania', 'Tunisia', 'Uganda', 'Zimbabwe', 'China',
        'India', 'Iran', 'Iraq', 'Jordan', 'Sri Lanka', 'Syria', 'Taiwan',
        'Turkey', 'Malta', 'Montenegro', 'United Kingdom', 'Argentina', 'Brazil',
        'Chile', 'Paraguay', 'Uruguay',
    ],
}

In [None]:
# Directory layout: every project folder sits next to (not inside) the current working directory

# Parent of the current working directory; all paths below are resolved against it
_project_root = os.path.dirname(os.getcwd())

# Shared data, documentation and figure folders
data_path, docs_path, figs_path = (
    os.path.join(_project_root, folder) for folder in ('data', 'docs', 'figs')
)

# Outputs are grouped per species: out/<specie>
out_path = os.path.join(_project_root, 'out', specie)

# Model inputs for this species: out/<specie>/input
input_path = os.path.join(out_path, 'input')

# Scratch folder for intermediate processing files: out/<specie>/input/tmp
dirtmp = os.path.join(input_path, 'tmp')

In [None]:
# Load the study-region boundaries and derive the geometries used for raster clipping

# Country polygons for the training/test region, read from <training>.shp in the input folder
gdf_countries = gpd.read_file(os.path.join(input_path, '%s.shp' % training))

# Plain list of geometry objects, the form the raster clipping calls below expect
shapes = list(gdf_countries.geometry)

# Bounding box of all country polygons, in [minx, miny, maxx, maxy] order
minx, miny, maxx, maxy = gdf_countries.total_bounds
bbox = [minx, miny, maxx, maxy]
print("Bounding box:", bbox)

# Rectangular polygon covering the bounding box.
# Its coordinates come from gdf_countries, so it is labelled with that layer's CRS;
# EPSG:4326 (WGS84) is only the fallback when the shapefile carries no CRS.
poly = Polygon([(minx, miny), (minx, maxy), (maxx, maxy), (maxx, miny)])
bbox_crs = gdf_countries.crs if gdf_countries.crs is not None else "EPSG:4326"
gdf = gpd.GeoDataFrame(geometry=[poly], crs=bbox_crs)

## 1. Bioclimatic Variables Processing (WorldClim)

This section processes bioclimatic variables from WorldClim data, including:
- **Temperature variables (BIO1-BIO11)**: Converting from Kelvin to Celsius when necessary
- **Precipitation variables (BIO12-BIO19)**: Processing precipitation data
- **CRS Assignment**: Ensuring all rasters have proper coordinate reference systems

In [None]:
# Define output directories for processed bioclimatic variables
hist_output_path = os.path.join(input_path, 'worldclim')  # Historical climate data
future_output_path = os.path.join(input_path, 'worldclim_future')  # Future climate projections

# Create mapping for destination folders
destination_map = {
    'hist': hist_output_path,
    'future': future_output_path
}

# Define bioclimatic variables to process (BIO1-BIO19)
bio_variables = range(1, 20)

# Define threshold for Kelvin to Celsius conversion (typical Earth temperatures in Kelvin are > 200)
KELVIN_THRESHOLD = 200.0

# BIO variables that hold an absolute temperature. Only these may need the -273.15 shift:
# BIO2 and BIO7 are temperature ranges, BIO3 is a ratio and BIO4 a standard deviation,
# so an offset must never be applied to them even if their values exceed the threshold.
ABSOLUTE_TEMPERATURE_BIOS = frozenset({1, 5, 6, 8, 9, 10, 11})


def _export_bioclim_layer(source_path, destination_path, bio_index):
    """Scale, unit-correct, clip and write one bioclimatic raster.

    Steps:
      1. Open ``source_path`` lazily and apply ``scale_factor`` / ``add_offset``
         from the raster attributes when they differ from the identity.
      2. For absolute-temperature variables only, subtract 273.15 when the mean of
         the valid (non-nodata) pixels is above ``KELVIN_THRESHOLD``.
      3. Tag the data as EPSG:4326, clip it to ``shapes`` and write it to
         ``destination_path``.

    Parameters
    ----------
    source_path : str
        Raster file to read.
    destination_path : str
        GeoTIFF file to write.
    bio_index : int
        BIO variable number (1-19); decides whether the Kelvin check applies.

    Returns
    -------
    bool
        True when a Kelvin -> Celsius conversion was applied, False otherwise.
    """
    raw = rioxarray.open_rasterio(source_path, chunks=True)
    try:
        scale_factor = raw.attrs.get("scale_factor", 1.0)
        add_offset = raw.attrs.get("add_offset", 0.0)
        nodata = raw.rio.nodata

        data = raw
        if scale_factor != 1.0 or add_offset != 0.0:
            data = raw * scale_factor + add_offset
            print(f"    - Applied scale_factor={scale_factor}, add_offset={add_offset}")

        converted = False
        if bio_index in ABSOLUTE_TEMPERATURE_BIOS:
            # Detect the unit from valid pixels only: nodata fill values (e.g. a large
            # sentinel over the ocean) would otherwise dominate the mean and could
            # trigger a spurious conversion of data that is already in Celsius.
            valid = data if nodata is None else data.where(raw != nodata)
            mean_val = float(valid.mean().compute())
            if mean_val > KELVIN_THRESHOLD:
                data = data - 273.15
                data.attrs['units'] = 'degrees Celsius'
                converted = True

        # Set coordinate reference system, clip to the study region and write
        data = data.rio.write_crs("EPSG:4326")
        clipped = data.rio.clip(shapes, gdf_countries.crs, drop=True)
        clipped.rio.to_raster(destination_path)
        return converted
    finally:
        # Always release the file handle of the raster that was actually opened,
        # including when scaling, clipping or writing raised.
        raw.close()


# Process bioclimatic variables based on time period
if Future:
    print("\nFuture processing block is not yet implemented.")
else:
    print("\nStarting file copy, conversion, and CRS assignment process...")
    print("!!! WARNING: Files from previous models in the list will be overwritten. !!!")
    print("-" * 50)

    # Initialize counters for processing statistics
    file_copy_count = 0
    file_convert_count = 0
    file_missing_count = 0
    file_error_count = 0

    # Process each climate model
    for model_prefix in models:
        print(f"\nProcessing Model: {model_prefix.strip('_')}")

        # Process every configured project (scenario) into its destination folder
        for project_key, project_name in project_map.items():
            destination_folder = destination_map[project_key]
            print(f"  -> Project: {project_name} | Destination: {destination_folder}")
            os.makedirs(destination_folder, exist_ok=True)

            # Process each bioclimatic variable (BIO1-BIO19)
            for i in bio_variables:
                # Search for matching files; sorted so the chosen file is deterministic
                pattern = f"{model_prefix}_{project_name}_*_bio_{i}.tif"
                matches = sorted(glob.glob(os.path.join(bioclim_folder, pattern)))

                if not matches:
                    print(f"bio_{i}: not found")
                    file_missing_count += 1
                    continue  # Skip to next variable

                # Use the first matching file
                source_path = matches[0]
                print(f"bio_{i}: {source_path}")
                if len(matches) > 1:
                    print(f"    - WARNING: {len(matches)} files match {pattern}; using the first one")

                # Define output filename and path
                destination_filename = f"{model_prefix}_bio_{i}.tif"
                destination_path = os.path.join(destination_folder, destination_filename)

                # Remove existing output file to ensure fresh processing
                if os.path.exists(destination_path):
                    os.remove(destination_path)
                    print(f"    - Removed existing file: {destination_filename}")

                try:
                    converted = _export_bioclim_layer(source_path, destination_path, i)
                except Exception as e:
                    # Keep going with the remaining variables, but make the failure
                    # visible in the final summary instead of silently dropping it.
                    print(f"    - ERROR processing {source_path}: {e}")
                    file_error_count += 1
                    continue

                if converted:
                    print(f"    - CONVERTED (K->C) & CRS Set: {destination_filename}")
                    file_convert_count += 1
                elif i in ABSOLUTE_TEMPERATURE_BIOS:
                    print(f"    - COPIED & CRS Set (Already Celsius): {destination_filename}")
                    file_copy_count += 1
                elif i <= 11:
                    print(f"    - COPIED & CRS Set (Temperature-derived, no unit conversion): {destination_filename}")
                    file_copy_count += 1
                else:
                    print(f"    - COPIED & CRS Set (Precipitation): {destination_filename}")
                    file_copy_count += 1

    print("-" * 50)
    print("\nProcessing complete.")
    print(f"Summary: {file_convert_count} files converted, {file_copy_count} files copied with CRS set, {file_missing_count} files not found.")
    if file_error_count:
        print(f"Errors: {file_error_count} files failed during processing (see messages above).")

In [None]:
# Descriptive names of the 19 bioclimatic variables, keyed by BIO number (1-19).
# The labels are listed in BIO order, so position + 1 is the variable number.
bioclim_names = dict(enumerate((
    # Temperature variables (BIO1-BIO11)
    'Annual Mean Temperature',
    'Mean Diurnal Range (Mean of monthly (max temp - min temp))',
    'Isothermality (BIO2/BIO7) (×100)',
    'Temperature Seasonality (standard deviation ×100)',
    'Max Temperature of Warmest Month',
    'Min Temperature of Coldest Month',
    'Temperature Annual Range (BIO5-BIO6)',
    'Mean Temperature of Wettest Quarter',
    'Mean Temperature of Driest Quarter',
    'Mean Temperature of Warmest Quarter',
    'Mean Temperature of Coldest Quarter',
    # Precipitation variables (BIO12-BIO19)
    'Annual Precipitation',
    'Precipitation of Wettest Month',
    'Precipitation of Driest Month',
    'Precipitation Seasonality (Coefficient of Variation)',
    'Precipitation of Wettest Quarter',
    'Precipitation of Driest Quarter',
    'Precipitation of Warmest Quarter',
    'Precipitation of Coldest Quarter',
), start=1))

In [None]:
# BIO numbers to process: all 19 variables, unpacked from a NumPy range into a plain list
bioclim_no = [*np.arange(1, 20)]
# bioclim_no = [1, 5, 6, 12, 13, 14]  # Alternative: subset of variables

# NOTE: the per-model file name list (file_names_worldclim) is not built here; it is
# generated inside the loop over model_prefix so each model gets its own file names

In [None]:
# Select bioclimatic variables for visualization
bioclim_plot = [1, 5, 6, 8, 16, 17]  # Selected variables for plotting

# Configure the subplot grid from the number of variables to plot.
# At most MAX_PLOT_COLUMNS panels per row; rows are added as needed, so any number of
# variables gets a layout (previously more than 9 left nrows/ncols/figsize undefined).
MAX_PLOT_COLUMNS = 3

n_panels = len(bioclim_plot)
if n_panels == 0:
    raise ValueError("bioclim_plot must list at least one bioclimatic variable to plot")

ncols = min(n_panels, MAX_PLOT_COLUMNS)
nrows = -(-n_panels // MAX_PLOT_COLUMNS)  # ceiling division without importing math

# Fixed 18-inch width; height grows by 3 inches per row plus 1 inch of margin,
# which gives (18, 4), (18, 7) and (18, 10) for 1, 2 and 3 rows.
figsize = (18, 3 * nrows + 1)

In [None]:
# Initialize lists to store raster information
rasters, labels = [], []

# Historical and future runs differ only in the source folder and the output suffix
source_subdir = 'worldclim_future' if Future else 'worldclim'
scenario_suffix = '_future' if Future else ''

# === STEP 1: Process and clip bioclimatic rasters for each model ===
for model_prefix in models:
    # File names depend on the model prefix, so they are rebuilt for every model
    file_names_worldclim = [f'{model_prefix}_bio_{no}.tif' for no in bioclim_no]

    for no, file_name in zip(bioclim_no, file_names_worldclim):
        # Open with nodata masked, tag as EPSG:4326 and clip to the study-region polygons
        wc_region = rioxarray.open_rasterio(
            os.path.join(input_path, source_subdir, file_name), masked=True)
        wc_region = wc_region.rio.write_crs("EPSG:4326").rio.clip(geometries=shapes)
        # Replace masked pixels by an explicit -9999 nodata value before writing
        wc_region = wc_region.fillna(-9999).rio.write_nodata(-9999)
        wc_region.name = bioclim_names[no]

        # Save the clipped raster directly under input_path
        out_name = f"{model_prefix}_bio_{no}_{region}{scenario_suffix}.tif"
        out_raster = os.path.join(input_path, out_name)
        wc_region.rio.to_raster(out_raster)

        # Remember which file belongs to which model for the plotting step
        rasters.append((model_prefix, out_name))

# === STEP 2: Create visualization plots for each climate model ===
# Generate titles for each bioclimatic variable (list position = BIO number - 1)
titles = [f"bio-{no:02d}: {name}" for (no, name) in bioclim_names.items()]

# Make sure the figure folder exists before anything is saved into it
if savefig:
    os.makedirs(figs_path, exist_ok=True)

# Create plots for each climate model
for model_prefix in models:
    # squeeze=False keeps `ax` two-dimensional for every grid shape (including 1x1 and
    # single-row grids), so panels can always be addressed as ax[row, col]
    fig, ax = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=figsize, sharex=True, sharey=True, squeeze=False)

    # Plot each selected bioclimatic variable
    for iax, bc_no in enumerate(bioclim_plot):
        row, col = divmod(iax, ncols)
        axes = ax[row, col]

        # Find raster files for this model and bioclimatic variable
        matches = [name for m, name in rasters if m == model_prefix and f"_bio_{bc_no}_" in name]

        # Select appropriate color map based on variable type
        if bc_no < 12:  # Temperature variables
            cmap = plt.cm.Spectral_r
        elif bc_no == 12:  # Annual precipitation
            cmap = plt.cm.BuGn
        else:  # Other precipitation variables
            cmap = plt.cm.BrBG

        # Reset per panel so a colorbar is never built from a previous panel's data
        pcol = None

        for raster in matches:
            # Load raster data
            ds = rioxarray.open_rasterio(os.path.join(input_path, raster), masked=True)
            print(f"Opened {raster}")

            # Plot country boundaries as background
            gdf_countries.plot(ax=axes, facecolor='lightgray', edgecolor='k')

            # Plot raster data
            pcol = ds.plot(
                ax=axes, cmap=cmap, add_colorbar=False, add_labels=False
            )

            # Set subplot title
            axes.set_title(titles[bc_no - 1])

        if pcol is None:
            # Nothing was drawn for this variable: report it and skip the colorbar
            print(f"No raster found for bio_{bc_no} ({model_prefix}); panel left empty")
            continue

        # Add colorbar to each subplot
        fig.colorbar(pcol, ax=axes, extend="both")

    plt.tight_layout()

    # Save figure if requested
    if savefig:
        fig.savefig(
            os.path.join(
                figs_path,
                f"03a_wordclim_bioclim_{region}_{model_prefix}{scenario_suffix}.png"),
            transparent=True,
        )

    plt.close(fig)  # Close figure to free memory


In [None]:
# Import required libraries for raster reprojection and alignment
import os
import numpy as np
import rasterio
from rasterio.warp import reproject, Resampling
import matplotlib.pyplot as plt

# === RASTER REPROJECTION AND ALIGNMENT ===
# This section aligns SRTM elevation and NDVI data to match the bioclimatic variables grid

# STEP 1: Load reference raster (BIO1) to get target properties
ref_path = os.path.join(input_path, "%s_bio_1_%s.tif" %(models[0], region))

with rasterio.open(ref_path) as src1:
    # Extract reference raster properties for alignment
    ref_profile = src1.profile  # Raster metadata
    ref_transform = src1.transform  # Geospatial transformation
    ref_crs = src1.crs  # Coordinate reference system
    ref_width = src1.width  # Raster width in pixels
    ref_height = src1.height  # Raster height in pixels
    ref_nodata = src1.nodata  # No-data value
    bio1 = src1.read(1)  # Read BIO1 data for reference

# Nodata value written to every aligned raster: the reference one, or -9999 if it has none
nodata_val = ref_nodata if ref_nodata is not None else -9999


def _align_to_reference(source_path, target_path, resampling=Resampling.bilinear):
    """Warp band 1 of a raster onto the BIO1 reference grid and save it as float32.

    The output has the reference raster's size, transform and CRS. After warping,
    every negative value is replaced by ``nodata_val``.

    Parameters
    ----------
    source_path : str
        Raster to align.
    target_path : str
        GeoTIFF to write.
    resampling : rasterio.enums.Resampling, optional
        Interpolation method; bilinear by default.

    Returns
    -------
    tuple
        ``(destination, output_profile)``: the aligned array and the profile it
        was written with.
    """
    with rasterio.open(source_path) as src:
        # Destination array with the reference raster dimensions
        destination = np.zeros((ref_height, ref_width), np.float32)

        # NOTE(review): no dst_nodata is passed, so pixels without source coverage get
        # rasterio's default fill - confirm this is the intended behaviour.
        reproject(
            source=rasterio.band(src, 1),  # Source band
            destination=destination,  # Destination array
            src_transform=src.transform,  # Source transformation
            src_crs=src.crs,  # Source coordinate system
            dst_transform=ref_transform,  # Target transformation
            dst_crs=ref_crs,  # Target coordinate system
            resampling=resampling  # Interpolation method
        )

    # Negative values are treated as invalid and set to nodata
    destination[destination < 0] = nodata_val

    # Reuse the reference profile so the output shares its grid definition
    output_profile = ref_profile.copy()
    output_profile.update({
        'dtype': 'float32',
        'nodata': nodata_val
    })

    with rasterio.open(target_path, 'w', **output_profile) as dst:
        dst.write(destination, 1)

    return destination, output_profile


# STEP 2: Reproject and align SRTM elevation data
src2_path = os.path.join(input_path, "tmp", f'srtm_{region}.tif')
output_path = os.path.join(input_path, f'srtm_{region}.tif')
destination, output_profile = _align_to_reference(src2_path, output_path)

# STEP 3: Reproject and align NDVI data
# NOTE(review): the input name suggests a classed (categorical) NDVI; bilinear resampling
# is kept as before, but nearest-neighbour may be more appropriate - confirm.
src3_path_ndvi = os.path.join(input_path, f'ndvi-median-classed_{region}.tif')
output_path_ndvi = os.path.join(input_path, f'ndvi_{region}.tif')
destination, output_profile = _align_to_reference(src3_path_ndvi, output_path_ndvi)

print(f"Raster alignment completed successfully. Output saved to: {output_path}")
print(f"Aligned NDVI raster saved to: {output_path_ndvi}")