# Species distribution

In [None]:
# =============================================================================
# CONFIGURATION SECTION - MODIFY THESE SETTINGS AS NEEDED
# =============================================================================

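# The cells below read `specie`, `training`, `interest`, `savefig` and `n_iteration`.
# They are commented out here, so they must be defined before running the notebook
# (uncomment and edit the lines below, or supply them externally).

# specie = 'leptocybe-invasa' # 'thaumastocoris-peregrinus' # 
# training = 'sea' # 'australia'
# interest = 'sea'
# savefig = False
# n_iteration = <number of train/test splits to generate>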

In [None]:
# =============================================================================
# IMPORT REQUIRED LIBRARIES
# =============================================================================

import os
import shutil
import pandas as pd 
import geopandas as gpd
import elapid as ela

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cartopy.io.shapereader as shapereader

from cartopy.io.shapereader import Reader, natural_earth
from cartopy.feature import ShapelyFeature

import warnings
warnings.filterwarnings("ignore")

# Use one enlarged font size for the legend, axis labels, titles and tick labels
params = {
    rc_key: 'x-large'
    for rc_key in (
        'legend.fontsize',
        'axes.labelsize',
        'axes.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
    )
}

# Apply the overrides to matplotlib's global rc settings
plt.rcParams.update(params)

In [None]:
# =============================================================================
# MAPPING CONFIGURATION AND UTILITY FUNCTIONS
# =============================================================================

# Defaults shared by every map drawn in this notebook
projection = ccrs.PlateCarree()  # plate carree projection: x/y are plain lon/lat
figsize = (18, 12)  # figure size in inches (width, height)

def make_map(figsize, projection, res):
    """
    Build a figure with a single cartopy axes and a standard set of base layers.

    The layers are added in a fixed order: country borders, state/province
    borders (thin dotted gray lines), land (light gray) and coastlines.
    Labelled, dotted grey gridlines are drawn on top.

    Parameters:
    -----------
    figsize : tuple
        Figure size in inches (width, height)
    projection : cartopy.crs
        Projection used for the axes
    res : str
        Scale passed to ``with_scale`` for every feature ('110m', '50m', '10m')

    Returns:
    --------
    fig, ax, gl : the matplotlib figure, the axes and the gridlines object
    """

    fig = plt.figure(figsize=figsize)
    ax = plt.axes(projection=projection)

    # (feature, styling keyword arguments) pairs, added in this exact order
    layers = (
        (cfeature.BORDERS, {}),
        (cfeature.STATES, {'linestyle': ':', 'edgecolor': 'gray', 'linewidth': 0.5}),
        (cfeature.LAND, {'color': 'lightgray'}),
        (cfeature.COASTLINE, {}),
    )
    for feature, style in layers:
        ax.add_feature(feature.with_scale(res), **style)

    # Coordinate grid with tick labels kept horizontal
    gl = ax.gridlines(color='grey', linestyle=':', draw_labels=True, rotate_labels=False)

    return fig, ax, gl

## 1. Species Occurrence

In [None]:
# =============================================================================
# DIRECTORY SETUP AND PATH CONFIGURATION
# =============================================================================

# All project folders live next to (not inside) the current working directory
project_root = os.path.dirname(os.getcwd())

# Input data directory
data_path = os.path.join(project_root, 'data')

# Figures directory: wiped and recreated on every run.
# NOTE(review): this deletes every file already in 'figs' (the folder is not
# species-specific and the wipe happens even when `savefig` is False) - confirm
# that this is intended.
figs_path = os.path.join(project_root, 'figs')
if os.path.exists(figs_path):
    shutil.rmtree(figs_path)
os.makedirs(figs_path, exist_ok=True)

# Per-species output tree: <root>/out/<specie>/input/{train,test}
out_path = os.path.join(project_root, 'out', specie)
input_path = os.path.join(out_path, 'input')
for subdir in ('train', 'test'):
    os.makedirs(os.path.join(input_path, subdir), exist_ok=True)

In [None]:
# =============================================================================
# LOAD GLOBAL SPECIES OCCURRENCE DATA
# =============================================================================

# Read the aggregated occurrence table of the selected species from
# <data_path>/species/<specie>/<specie>_aggregated.csv
# (later cells use its 'lon', 'lat' and 'source' columns)
occurrence_csv = os.path.join(data_path, 'species', specie, '%s_aggregated.csv' % specie)
occurences_global = pd.read_csv(occurrence_csv)

# Report the number of records and show the table
print('Number of occurences globally is: %s' % len(occurences_global))
display(occurences_global)

In [None]:
# =============================================================================
# REGIONAL DEFINITIONS FOR SPECIES DISTRIBUTION ANALYSIS
# =============================================================================

# Named geographic regions: region key -> list of country names.
# The names are matched against the 'ADMIN' column of the country boundary
# table when a region is turned into polygons, so the spelling must be exact.
regions = {
    'east-asia': [
        'China', 'Taiwan', 'Japan', 'North Korea', 'South Korea',
    ],
    'indo': [
        'Indonesia', 'Malaysia', 'Singapore', 'Brunei', 'East Timor',
    ],
    'sea': [
        'Myanmar', 'Cambodia', 'Laos', 'Philippines', 'Thailand', 'Vietnam',
    ],
    'south-east-asia': [
        'Brunei', 'Myanmar', 'Cambodia', 'East Timor', 'Indonesia', 'Laos',
        'Malaysia', 'Philippines', 'Singapore', 'Thailand', 'Vietnam',
    ],
    'australia': [
        'Australia',
    ],
    'australasia': [
        'Australia', 'New Zealand',
    ],
    # NOTE(review): despite the key, only Sri Lanka is listed - confirm whether
    # 'India' is left out on purpose.
    'india-sri-lanka': [
        'Sri Lanka',
    ],
    'all': [
        'Australia', 'France', 'Italy', 'Portugal', 'South Africa',
        'United States of America', 'Madagascar', 'Spain', 'Greece', 'Cyprus',
        'Mexico', 'Kenya', 'Algeria', 'Israel', 'Egypt', 'Ethiopia', 'Ghana',
        'Malawi', 'Mauritius', 'Morocco', 'Mozambique', 'Rwanda',
        'Sierra Leone', 'United Republic of Tanzania', 'Tunisia', 'Uganda',
        'Zimbabwe', 'China', 'India', 'Iran', 'Iraq', 'Jordan', 'Sri Lanka',
        'Syria', 'Taiwan', 'Turkey', 'Malta', 'Montenegro', 'United Kingdom',
        'Argentina', 'Brazil', 'Chile', 'Paraguay', 'Uruguay',
    ],
}

In [None]:
# =============================================================================
# DATA CLEANING FOR LONGITUDE VALUES
# =============================================================================

# Clean longitude data for the leptocybe-invasa species: strip spaces from the
# longitude values and convert them to float.
# The conversion is skipped when the column is already numeric, so the cell can
# be re-run safely (the `.str` accessor raises on a numeric column).
if specie == 'leptocybe-invasa':
    lon_raw = occurences_global['lon']
    if not pd.api.types.is_numeric_dtype(lon_raw):
        # astype(str) first: in a mixed-type column `.str.replace` would turn
        # every non-string entry into NaN instead of keeping its value
        occurences_global['lon'] = lon_raw.astype(str).str.replace(' ', '', regex=False).astype(float)

In [None]:
# =============================================================================
# CREATE GEOPANDAS DATAFRAME AND FILTER VALID COORDINATES
# =============================================================================

# Convert the longitude and latitude columns to Point geometries
geometry = gpd.points_from_xy(occurences_global['lon'], occurences_global['lat'])
# GeoDataFrame holding the 'source' column plus the point geometry; it keeps the
# row index of `occurences_global`
occurences = gpd.GeoDataFrame(occurences_global['source'], geometry=geometry)

# Drop occurrences without usable coordinates.
# `is_empty` alone is not relied on: depending on the shapely/GEOS version, NaN
# coordinates may produce a non-empty 'POINT (nan nan)', so missing lon/lat
# values are checked explicitly as well.
has_coords = occurences_global[['lon', 'lat']].notna().all(axis=1)
# .copy() gives an independent frame, because a later cell adds a column to it
occurences = occurences[has_coords & ~occurences.is_empty].copy()
print('Number of occurences globally after removing occurrences without coordinates: %s' %len(occurences))

In [None]:
# =============================================================================
# CREATE GLOBAL OCCURRENCE MAP BY DATA SOURCE
# =============================================================================

# Assign a colour to each data source for visualization.
# The Tableau palette is finite, so indices wrap around with a modulo instead
# of raising IndexError when there are as many (or more) sources than colours.
colors = list(mcolors.TABLEAU_COLORS.keys())
sources = occurences.source.unique()
dict_color = {src: colors[i % len(colors)] for i, src in enumerate(sources)}
occurences['color'] = occurences['source'].map(dict_color)

# Create a global map with cartographic features
fig, ax, gl = make_map(figsize, projection, res='110m')

# Plot occurrence points, one scatter call (and legend entry) per data source
for src, df in occurences.groupby('source'):
    ax.scatter(x=df.geometry.x, y=df.geometry.y, color=df.color, label=src, transform=ccrs.PlateCarree())

# Add a reference box showing the SEA-22 region boundaries, drawn in the next
# palette colour after the ones used for the sources
ax.plot([89.26, 146.96, 146.96, 89.26, 89.26], [-15.14, -15.14, 27.26, 27.26, -15.14], transform=ccrs.PlateCarree(), label='SEA-22', color=colors[len(sources) % len(colors)])
ax.legend(loc='lower left')

In [None]:
# =============================================================================
# SAVE GLOBAL OCCURRENCE MAP
# =============================================================================

# Write the global occurrence map to the figures folder (transparent background)
# only when figure saving is switched on
if savefig:
    global_map_png = os.path.join(figs_path, '01_presence_%s_global.png' % specie)
    fig.savefig(global_map_png, transparent=True)

In [None]:
# =============================================================================
# LOAD COUNTRY BOUNDARIES FROM NATURAL EARTH DATA
# =============================================================================

# Natural Earth layer to use: country-level (admin 0) boundaries from the
# 'cultural' category at the '50m' resolution
name = 'admin_0_countries'
category = 'cultural'
resolution = '50m'

# Ask cartopy for the path of the corresponding Natural Earth shapefile
shpfilename = shapereader.natural_earth(resolution=resolution, category=category, name=name)
# Read the country boundaries into a GeoDataFrame
countries = gpd.read_file(shpfilename)
# display(countries)  # Uncomment to inspect the loaded country data

In [None]:
# =============================================================================
# SELECT COUNTRIES FOR TRAINING REGION AND CREATE BOUNDARY SHAPEFILE
# =============================================================================

# Collect the boundary rows of every country in the training region, in the
# order in which the countries are listed in `regions[training]`
region_names = regions[training]
country_rows = [countries.loc[countries['ADMIN'] == country] for country in region_names]

# Report names that match no 'ADMIN' entry instead of dropping them silently
missing = [country for country, rows in zip(region_names, country_rows) if rows.empty]
if missing:
    print('Warning: no country boundary found for: %s' % ', '.join(missing))
if len(missing) == len(region_names):
    raise ValueError('No country boundaries found for region %r' % training)

# Concatenate once (rather than growing a frame inside the loop)
gdf_countries = pd.concat([rows for rows in country_rows if not rows.empty])

# Save the regional boundaries as a shapefile for use in downstream analysis
# (one row per selected country; the polygons are not merged)
gdf_countries.to_file(os.path.join(input_path, '%s.shp' %training))
# Alternative save locations (commented out):
# gdf_countries.to_file(os.path.join(input_path, 'train', '%s.shp' %training))
# gdf_countries.to_file(os.path.join(input_path, 'test', '%s.shp' %training))

# Uncomment the following lines to inspect the regional boundary data:
# gdf_countries
# pd.set_option('display.max_columns', None)
# display(gdf_countries)

In [None]:
# =============================================================================
# FILTER OCCURRENCES BY TRAINING REGION
# =============================================================================

# Flag the occurrences that lie within at least one country of the training
# region. Starting from an all-False mask and OR-ing in one country at a time
# also covers an empty region (no IndexError; the result is simply empty).
in_region = pd.Series(False, index=occurences.index)
for country_geom in gdf_countries.geometry.values:
    in_region = in_region | occurences.within(country_geom)

# Keep only the flagged occurrences. .copy() makes the result an independent
# frame, because a later cell adds 'lon'/'lat' columns to it.
occurences_region = occurences[in_region].copy()

print('Number of occurences in %s is: %s' %(training, len(occurences_region)))

In [None]:
# =============================================================================
# ALTERNATIVE SPATIAL JOIN METHOD (COMMENTED OUT)
# =============================================================================

# Alternative approach using spatial join to identify country for each occurrence
# This method would add country information to each occurrence point
# Currently commented out as we use the within() method above for filtering

# result = gpd.sjoin(occurences, countries, how='left')  # Spatial join to get country info
# result2 = result['ADMIN'].unique()  # Get unique country names
# print(result2)  # Display all countries where occurrences are found

## 2. Define Training and Test Area

In [None]:
# =============================================================================
# EXTRACT COORDINATES FROM GEOMETRY FOR TRAIN/TEST SPLITTING
# =============================================================================

# Expose the point coordinates as plain 'lon' / 'lat' columns; the train/test
# CSV files written further down contain exactly these two columns
region_points = occurences_region['geometry']
occurences_region = occurences_region.assign(lon=region_points.x, lat=region_points.y)

In [None]:
# =============================================================================
# GENERATE MULTIPLE TRAIN/TEST SPLITS FOR MODEL VALIDATION
# =============================================================================

from sklearn.model_selection import ShuffleSplit

# --- Split Parameters ---
n_repeats = n_iteration  # Number of train/test splits to generate
train_size = 0.7  # 70% of data for training, the remaining 30% for testing

# --- Random Sampling Setup ---
# ShuffleSplit draws each split at random, without stratification;
# the fixed random_state makes the splits reproducible across runs
splitter = ShuffleSplit(n_splits=n_repeats, train_size=train_size, random_state=42)

# Generate the splits; for each one write the CSV files and draw a map
for i, (train_idx, test_idx) in enumerate(splitter.split(occurences_region), start=1):
    # Create training and testing datasets
    train_df = occurences_region.iloc[train_idx].reset_index(drop=True)
    test_df = occurences_region.iloc[test_idx].reset_index(drop=True)

    # Save train and test datasets as CSV files (longitude and latitude columns only)
    # NOTE(review): the test points are drawn from the training region but the
    # file is labelled with `interest` - confirm this naming is what downstream
    # steps expect.
    train_df.to_csv(os.path.join(input_path, 'train', '%s_presence_%s_%s.csv' %(specie, training, i)), columns=['lon', 'lat'], index=False)
    test_df.to_csv(os.path.join(input_path, 'test', '%s_presence_%s_%s.csv' %(specie, interest, i)), columns=['lon', 'lat'], index=False)

    print(f"Split {i} -> Train: {len(train_df)} rows, Test: {len(test_df)} rows")

    # Create visualization of the train/test split
    fig, ax, gl = make_map(figsize, projection, res='50m')

    # Plot regional boundaries and train/test points
    gdf_countries.plot(ax=ax, alpha=0.5)  # Regional boundaries (semi-transparent)
    train_df.plot(ax=ax, marker='*', markersize=100, color='tab:red', label="train")  # Training points (red)
    test_df.plot(ax=ax, marker='*', markersize=100, color='tab:blue', label="test")  # Testing points (blue)

    ax.legend(loc='lower left')

    # Save the visualization if savefig is enabled
    if savefig:
        fig.savefig(os.path.join(figs_path, '01_presence_%s_%s.png' %(specie, i)), transparent=True)

    # Show the figure now and close it, so that open figures do not pile up
    # (one per split) until the end of the cell
    display(fig)
    plt.close(fig)
    

Load host data (Eucalyptus) from Abbasi et al. (2023):
https://www.nature.com/articles/s41597-023-02383-w

In [None]:
# =============================================================================
# LOAD EUCALYPTUS FOREST DATA FOR EAST ASIA REGION
# =============================================================================

# Load the Eucalyptus records of the planted-forest dataset, only when the
# training region is East Asia (for any other region `eucalyptus_forest` is
# not defined by this cell)
if training == 'east-asia':
    # Filesystem path to the zipped shapefile.
    # Do not prepend 'zip://' through os.path.join: `data_path` is absolute, so
    # join would silently discard every component placed before it.
    planted_forest_file = os.path.join(data_path, 'planted-forest-east-asia', 'planted-forest-east-asia.zip')
    # The 'where' filter keeps only the rows whose Genus attribute is 'Eucalyptus'
    eucalyptus_forest = gpd.read_file(planted_forest_file, where="Genus='Eucalyptus'")