# Predictive Variables

In [None]:
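############### CONFIGURATION VARIABLES - MODIFY AS NEEDED ###############
# NOTE: every assignment in this cell is commented out, yet later cells read
# `specie`, `training` and `savefig`. These names must therefore already exist
# in the kernel namespace before the notebook runs (presumably injected by a
# driver notebook or script - TODO confirm); otherwise uncomment and set them here.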

# Species name for the distribution modeling study
# Options: 'leptocybe-invasa', 'thaumastocoris-peregrinus', or other species names
# specie = 'leptocybe-invasa' # 'leptocybe-invasa' # 'thaumastocoris-peregrinus' # 

# Geographic region for analysis - determines which countries/areas to include
# region = region_test  # Use predefined test region
# region = 'south-east-asia' # 'east-asia' # 'south-east-asia' # 

# Training mode flag - determines whether to process training or test data
# training = False  # Set to True for training data, False for test data

# Figure saving flag - controls whether to save generated plots
# savefig = True  # Set to True to save figures, False to only display

# Multi-year mean calculation parameters
# year00 = year0 # Start year for multi-year mean calculation
# year11 = year1 # End year for multi-year mean calculation

###########################################################

In [None]:
# Standard library imports
import os  # Operating system interface for file/directory operations
import shutil  # High-level file operations (copy, move, etc.)
import time  # Time-related functions for timing operations
import sys  # System-specific parameters and functions
import glob  # Unix shell-style pathname pattern expansion
import subprocess  # Subprocess management for external commands

# Scientific computing libraries
import numpy as np  # Numerical computing with arrays and mathematical functions
import pandas as pd  # Data manipulation and analysis with DataFrames

# Geospatial data processing libraries
import xarray as xr  # N-dimensional labeled arrays for scientific data (NetCDF, etc.)
import rioxarray  # Raster I/O operations for xarray (GeoTIFF, etc.)
import geopandas as gpd  # Geospatial data manipulation with pandas-like interface
from geocube.api.core import make_geocube  # Convert vector data to raster format

# Visualization libraries
import matplotlib.pyplot as plt  # Plotting and visualization
import matplotlib as mpl  # Matplotlib configuration and styling
import cartopy.io.shapereader as shapereader  # Reading shapefiles for cartopy

# Use the same enlarged font size for the legend, axis labels, titles and tick labels
params = dict.fromkeys(
    [
        'legend.fontsize',
        'axes.labelsize',
        'axes.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
    ],
    'x-large',
)
plt.rcParams.update(params)

In [None]:
# Project directory layout
# Every folder hangs off the parent of the current working directory.
# `specie` is not assigned in this cell; it must already be defined.
project_root = os.path.dirname(os.getcwd())

data_path, docs_path, figs_path = (
    os.path.join(project_root, folder) for folder in ('data', 'docs', 'figs')
)

# Outputs are grouped per species
out_path = os.path.join(project_root, 'out', specie)

# input_path = os.path.join(out_path, 'input', 'train') if training else os.path.join(out_path, 'input', 'test')
input_path = os.path.join(out_path, 'input')

# Scratch directory for intermediate rasters; created if it does not exist yet
dirtmp = os.path.join(input_path, 'tmp')
os.makedirs(dirtmp, exist_ok=True)

In [None]:
# Study regions for species distribution modeling
# Maps a region key to the list of country names that make up that region.
# NOTE(review): 'india-sri-lanka' lists only 'Sri Lanka' - confirm whether
# 'India' was left out on purpose.

regions = {
    'east-asia': [
        'China', 'Taiwan', 'Japan', 'North Korea', 'South Korea',
    ],
    'indo': [
        'Indonesia', 'Malaysia', 'Singapore', 'Brunei', 'East Timor',
    ],
    'sea': [
        'Myanmar', 'Cambodia', 'Laos', 'Philippines', 'Thailand', 'Vietnam',
    ],
    'south-east-asia': [
        'Brunei', 'Myanmar', 'Cambodia', 'East Timor', 'Indonesia', 'Laos',
        'Malaysia', 'Philippines', 'Singapore', 'Thailand', 'Vietnam',
    ],
    'australia': ['Australia'],
    'australasia': ['Australia', 'New Zealand'],
    'india-sri-lanka': ['Sri Lanka'],
    'all': [
        'Australia', 'France', 'Italy', 'Portugal', 'South Africa',
        'United States of America', 'Madagascar', 'Spain', 'Greece', 'Cyprus',
        'Mexico', 'Kenya', 'Algeria', 'Israel', 'Egypt', 'Ethiopia', 'Ghana',
        'Malawi', 'Mauritius', 'Morocco', 'Mozambique', 'Rwanda',
        'Sierra Leone', 'United Republic of Tanzania', 'Tunisia', 'Uganda',
        'Zimbabwe', 'China', 'India', 'Iran', 'Iraq', 'Jordan', 'Sri Lanka',
        'Syria', 'Taiwan', 'Turkey', 'Malta', 'Montenegro', 'United Kingdom',
        'Argentina', 'Brazil', 'Chile', 'Paraguay', 'Uruguay',
    ],
}

In [None]:
# Read the boundary shapefile used to clip every raster in this notebook
# The file name is built as '<training>.shp' inside input_path.
# NOTE(review): the configuration cell describes `training` as a True/False flag,
# which would yield 'True.shp' / 'False.shp' - confirm what value is actually passed.
boundary_shapefile = os.path.join(input_path, '%s.shp' %training)
gdf_countries = gpd.read_file(boundary_shapefile)

# Plain list of the geometry objects, in the form expected by `rio.clip(geometries=...)`
shapes = list(gdf_countries.geometry)

## 1. Elevation Data Processing

This section processes elevation data from WorldClim, which provides global elevation data at 30 arc-second resolution (~1km).

**Data Source**: WorldClim elevation dataset (wc2.1_30s_elev.tif)
- **Resolution**: 30 arc-seconds (~1km at equator)
- **Coverage**: Global
- **Format**: GeoTIFF

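**Alternative Data Source**: ETOPO Global Relief Model
- https://www.ncei.noaa.gov/products/etopo-global-relief-model
- ETOPO 2022 is distributed at 15, 30 and 60 arc-second resolution. The commented-out example in the next cell uses the 60 arc-second file, which is coarser than the 30 arc-second WorldClim grid; the 15 arc-second version is finer but much larger.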

In [None]:
# Alternative elevation data processing using ETOPO Global Relief Model
# This section is commented out; it shows how the 60 arc-second ETOPO 2022 grid could be used instead of WorldClim

# Load ETOPO elevation data (60 arc-second resolution, ~2km at equator)
# etopo = rioxarray.open_rasterio(os.path.join(data_path, 'topo', 'ETOPO_2022_v1_60s_N90W180_bed.tif'), masked=True)

# Clip ETOPO data to the study region boundaries
# etopo_east_asia = etopo.rio.clip(geometries=shapes)

# Export clipped ETOPO elevation data
# etopo_east_asia.rio.to_raster(os.path.join('output', 'rasters', 'etopo_east_asia.tif'))

In [None]:
# Clip the WorldClim elevation raster to the study region and export it

# Location of the global 30 arc-second elevation GeoTIFF
elev_worldclim = os.path.join(data_path, 'worldclim', 'wc2.1_30s_elev.tif')

# masked=True asks rioxarray to mask the raster's NoData cells on load
srtm = rioxarray.open_rasterio(elev_worldclim, masked=True)

# Keep only the cells covered by the country geometries
srtm_region = srtm.rio.clip(geometries=shapes)
srtm_region.name = 'elevation'

# Write the clipped raster into the 'tmp' folder under input_path;
# the file name carries the `training` identifier
clipped_elevation_tif = os.path.join(input_path, "tmp", 'srtm_%s.tif' %training)
srtm_region.rio.to_raster(clipped_elevation_tif)

In [None]:
# Elevation map drawn on top of the country polygons

# Wide canvas suited to a regional map
fig, ax = plt.subplots(figsize=(18,6))

# Upper colour limit: 85% of the highest elevation in the clipped raster
vmax = 0.85*srtm_region.max()

# Lower colour limit: vmax - 1.21875 * vmax, rounded to the nearest integer.
# This equals -0.21875 * vmax, so it is negative whenever vmax is positive.
vmin = np.round((vmax - (1.21875 * vmax)))

# Background layer: light gray countries with black outlines
gdf_countries.plot(ax=ax, facecolor='lightgray', edgecolor='k')

# Alternative (disabled): ETOPO data with a fixed colour scale
# etopo_east_asia.plot(cmap=plt.cm.terrain, vmin=-2000, vmax=8000)

# Elevation layer with the gist_earth colormap and the limits computed above
srtm_region.plot(ax=ax, cmap=plt.cm.gist_earth, vmin=vmin, vmax=vmax)

# Optional title (disabled to keep the plot clean)
# ax.set_title('Elevation')

In [None]:
# Save the elevation figure only when the `savefig` flag is truthy
# NOTE(review): 'wordclim' in the file name looks like a typo of 'worldclim';
# left unchanged because other files may refer to this name.
if savefig:
    elevation_png = os.path.join(figs_path, '03_wordclim_elevation_%s.png' %training)
    fig.savefig(elevation_png, transparent=True)

## 2. Normalized Difference Vegetation Index (NDVI) Processing

This section processes NDVI data from the Copernicus Global Land Service, which provides vegetation health and density information.

**Data Source**: Copernicus Global Land Service NDVI Long-Term Statistics
- **Product**: NDVI-LTS (Long-Term Statistics) 1999-2019
- **Resolution**: 1km
- **Coverage**: Global
- **Format**: NetCDF
- **Statistics**: Median values over 20-year period

**NDVI Values and Interpretation**:
- **-1 to 0**: Water, snow, ice, or clouds
- **0 to 0.1**: Bare soil, rocks, sand
- **0.1 to 0.25**: Sparse vegetation, grasslands
- **0.25 to 0.4**: Dense grasslands, agricultural areas
- **0.4 to 1.0**: Dense vegetation, forests

**Data Source**: https://land.copernicus.eu/global/products/ndvi

In [None]:
# Open the Copernicus Global Land Service NDVI statistics file

# Alternative (disabled): Short-Term Statistics (2015-2019)
# https://land.copernicus.vgt.vito.be/manifest/ndvi_stats_all/manifest_cgls_ndvi_stats_all_latest.txt
# ndvi_nc_file = os.path.join(data_path, 'ndvi', 'c_gls_NDVI-STS_2015-2019-0611_GLOBE_PROBAV_V3.0.1.nc')

# File in use: Long-Term Statistics (1999-2019)
ndvi_lts_name = 'c_gls_NDVI-LTS_1999-2019-1221_GLOBE_VGT-PROBAV_V3.0.1.nc'
ndvi_nc_file = os.path.join(data_path, 'ndvi', ndvi_lts_name)

# Load the NetCDF file as an xarray Dataset
ndvi_nc = xr.open_dataset(ndvi_nc_file)

In [None]:
# Tag the dataset with a CRS, then cut it down to the study region

# EPSG:4326 = WGS84 geographic coordinates (latitude/longitude).
# inplace=True modifies `ndvi_nc` itself rather than returning a tagged copy.
ndvi_nc.rio.write_crs(input_crs="epsg:4326", inplace=True)

# Keep only the cells covered by the country geometries
ndvi_nc_region = ndvi_nc.rio.clip(shapes)

In [None]:
# Classify the median NDVI layer into discrete vegetation classes and export it

# Median NDVI layer of the clipped dataset
ndvi_median_region = ndvi_nc_region['median']

# Bin edges for the classification.
# np.digitize (default right=False) returns i when bins[i-1] <= x < bins[i], so:
#   Class 1: NDVI < 0            (no vegetation)
#   Class 2: 0    <= NDVI < 0.1  (bare area)
#   Class 3: 0.1  <= NDVI < 0.25 (low vegetation)
#   Class 4: 0.25 <= NDVI < 0.4  (moderate vegetation)
#   Class 5: NDVI >= 0.4         (high vegetation)
# Because the first edge is -inf, class 0 is never produced for valid data.
ndvi_class_bins = [-np.inf, 0, 0.1, 0.25, 0.4, np.inf]

# Work on the plain NumPy array so the classification and its mask share one source
ndvi_values = ndvi_median_region.values
ndvi_class = np.digitize(ndvi_values, ndvi_class_bins)

# NaN cells get an out-of-range bin index from digitize; mask them so they stay "no data"
ndvi_class = np.ma.masked_where(np.isnan(ndvi_values), ndvi_class)

# Rebuild a labelled DataArray on the original grid
ndvi_median_class_region = xr.DataArray(
    ndvi_class,
    coords={
        'lat': ndvi_median_region.lat,
        'lon': ndvi_median_region.lon,
        'crs': ndvi_median_region.crs,
    },
    dims=["lat", "lon"],
)

# Set descriptive name for the classified variable
ndvi_median_class_region.name = 'ndvi_median_classed'

# The rebuilt array only carries a plain `crs` coordinate and no `grid_mapping`
# attribute, so rioxarray may not find a CRS on it. Write the CRS explicitly
# (same EPSG:4326 as the source dataset) so the GeoTIFF is georeferenced.
ndvi_median_class_region = ndvi_median_class_region.rio.write_crs("epsg:4326")

# Export classified NDVI data as GeoTIFF
ndvi_median_class_region.rio.to_raster(os.path.join(input_path, 'ndvi-median-classed_%s.tif' %training))

In [None]:
# Colour scheme, normalisation and tick formatter for the classified NDVI map

# Class number -> [colour, label].
# Intervals follow np.digitize with right=False (lower edge included, upper edge excluded).
col_dict = {0: ['white', 'Unknown'],                 # never produced for valid data; kept so class numbers index the colormap directly
            1: ['gray', 'No Vegetation'],            # NDVI < 0
            2: ['y', 'Bare Area'],                   # 0 <= NDVI < 0.1
            3: ['yellowgreen', 'Low Vegetation'],    # 0.1 <= NDVI < 0.25
            4: ['g', 'Moderate Vegetation'],         # 0.25 <= NDVI < 0.4
            5: ['darkgreen', 'High Vegetation']}     # NDVI >= 0.4

# Discrete colormap with one colour per class, in class order
cmap = mpl.colors.ListedColormap([colour for colour, _ in col_dict.values()])

# Class labels, in the same order, for the colorbar ticks
labels = [label for _, label in col_dict.values()]

# Draw masked/invalid cells in black.
# set_bad is a method: the previous `cmap.set_bad='k'` replaced the method with a
# string and never changed the colour used for masked cells.
cmap.set_bad('k')

# One colour bin per class: boundaries at -0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5
norm = mpl.colors.BoundaryNorm(np.arange(-0.5,6), cmap.N) 

# Colorbar tick formatter: map a tick value to its bin index, then to the class label
fmt = mpl.ticker.FuncFormatter(lambda x, pos: labels[norm(x)])

In [None]:
# Side-by-side view of continuous NDVI (left) and classified NDVI (right)

# Two panels on one wide canvas
fig, ax = plt.subplots(ncols=2, figsize=(18, 6))

# --- Left panel: continuous NDVI ---
# Country polygons as background
gdf_countries.plot(ax=ax[0], facecolor='lightgray', edgecolor='k')

# Yellow-green colormap on a fixed 0-1 scale; the colorbar is added manually
ndvi_mesh = ndvi_median_region.plot(ax=ax[0], cmap=plt.cm.YlGn, vmin=0, vmax=1, add_colorbar=False)
# Bind the colorbar to its own panel explicitly: without `ax=`, older Matplotlib
# versions take the space from the current axes instead of the mappable's axes.
# extend='both' marks values outside the 0-1 range.
plt.colorbar(ndvi_mesh, ax=ax[0], extend='both')
ax[0].set_title('Normalized Difference Vegetation Index')

# --- Right panel: classified NDVI ---
# Country polygons as background
gdf_countries.plot(ax=ax[1], facecolor='lightgray', edgecolor='k')

# Discrete colormap and norm defined for the NDVI classes
class_mesh = ndvi_median_class_region.plot(ax=ax[1], cmap=cmap, norm=norm, add_colorbar=False)
# One tick per class (0..5), rendered as the class label by `fmt`
plt.colorbar(class_mesh, ax=ax[1], format=fmt, ticks=np.linspace(0,5,6))
ax[1].set_title('NDVI subdivided in classes')

# Auto adjust subplot spacing to fit figure size properly
plt.tight_layout()

In [None]:
# Save the NDVI figure only when the `savefig` flag is truthy
if savefig:
    ndvi_png = os.path.join(figs_path, '03_ndvi_%s.png' %training)
    fig.savefig(ndvi_png, transparent=True)