# Occurrence data preparation

In [None]:
# =============================================================================
# CONFIGURATION SECTION - MODIFY THESE SETTINGS AS NEEDED
# =============================================================================

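# Species selection for analysis
# Uncomment the assignment below and set `specie` to the target species.
# Supported values: 'leptocybe-invasa', 'thaumastocoris-peregrinus'.
# NOTE: `specie` must be defined (here or by the calling environment) before
# the following cells are run; they fail with a NameError otherwise.

#specie = 'leptocybe-invasa' # or 'thaumastocoris-peregrinus'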

In [None]:
# =============================================================================
# IMPORT REQUIRED LIBRARIES
# =============================================================================

import os
import pandas as pd 

In [None]:
# =============================================================================
# SET UP FILE PATHS
# =============================================================================
# Define the main data directory path (one level up from current working directory)

# Parent directory of the current working directory, extended with 'data'
data_path = os.path.join(os.path.split(os.getcwd())[0], 'data')
# Species-specific folder: <data_path>/species/<specie>
specie_path = os.path.join(os.path.join(data_path, 'species'), specie)

In [None]:
# =============================================================================
# DATA LOADING AND PREPROCESSING
# =============================================================================
# The following cells load the data sources that are specific to each species.
# Each source uses different column names and formats, so they are standardized before merging.

In [None]:
if specie == 'leptocybe-invasa':

    # -------------------------------------------------------------------------
    # Source files (all located in the species-specific directory)
    # -------------------------------------------------------------------------
    file_name_gbif = 'Gbif_L-invasa_0120814-230530130749713.csv'
    file_name_cabi = 'Cabi_2017_L-invasa-108923.csv'
    file_name_eppo = 'Eppo_2010_L-invasa.csv'  # presence only on state level, no coordinates
    file_name_otieno = 'Otieno_2019_L-invasa.csv'
    file_name_peng = 'Peng_2021_L-invasa.csv'
    file_name_mcam = 'Leptocybe Overseas collections SEP 2017 from Madeline_cambodia.csv'
    file_name_mlaos = 'Leptocybe Overseas collections SEP 2017 from Madeline_Laos.csv'
    file_name_mthai = 'Leptocybe Overseas collections SEP 2017 from Madeline_Thailand.csv'
    file_name_sea = 'SE_Asia_Research_L-Invasa.csv'

    # -------------------------------------------------------------------------
    # Load only the columns that are needed from each source
    # -------------------------------------------------------------------------
    gbif = pd.read_csv(
        os.path.join(specie_path, file_name_gbif),
        sep='\t',  # this file is tab-separated
        usecols=['decimalLatitude', 'decimalLongitude', 'eventDate'],
    )
    cabi = pd.read_csv(
        os.path.join(specie_path, file_name_cabi),
        skiprows=2,  # the header row is preceded by two lines that are skipped
        usecols=['Latitude', 'Longitude', 'First Reported', 'References'],
        na_values='Not recorded',  # treat this placeholder as missing
    )
    # EPPO has no coordinates, so it is loaded but not added to the combined
    # dataset below. Uses specie_path like every other source (same location).
    eppo = pd.read_csv(
        os.path.join(specie_path, file_name_eppo),
        usecols=['country', 'state', 'country code', 'state code', 'Status', 'continent'],
    )
    otieno = pd.read_csv(
        os.path.join(specie_path, file_name_otieno),
        usecols=['Lat', 'Lon'],
    )
    peng = pd.read_csv(
        os.path.join(specie_path, file_name_peng),
        usecols=['Lat', 'Lon', 'Date', 'Elevation (m)'],
    )
    mcam = pd.read_csv(
        os.path.join(specie_path, file_name_mcam),
        usecols=['Date', 'Lat.', 'Long'],
    )
    mlaos = pd.read_csv(
        os.path.join(specie_path, file_name_mlaos),
        usecols=['Date', 'Lat.', 'Long'],
    )
    mthai = pd.read_csv(
        os.path.join(specie_path, file_name_mthai),
        usecols=['Date', 'Lat.', 'Long'],
    )
    sea = pd.read_csv(
        os.path.join(specie_path, file_name_sea),
        usecols=['Lat ', 'Long'],  # 'Lat ' carries a trailing space in this file
    )

    # =============================================================================
    # DATA CLEANING - REMOVE MISSING VALUES
    # =============================================================================
    # Drop every row that has a missing value in any of the loaded columns
    # (date or coordinates) for the three Madeline collection files.

    mcam = mcam.dropna()
    mlaos = mlaos.dropna()
    mthai = mthai.dropna()

    # =============================================================================
    # STANDARDIZE COLUMN NAMES AND ADD SOURCE INFORMATION
    # =============================================================================
    # Rename columns to the common 'lat' / 'lon' / 'date' schema and tag each
    # record with the source it came from.

    gbif_renamed = gbif.rename(columns={'decimalLatitude': 'lat', 'decimalLongitude': 'lon', 'eventDate': 'date'})
    gbif_renamed['source'] = 'GBIF'

    cabi_renamed = cabi.rename(columns={'Latitude': 'lat', 'Longitude': 'lon', 'First Reported': 'date'})
    cabi_renamed['source'] = 'CABI'

    # NOTE(review): 'Lat' is mapped to 'lon' and 'Lon' to 'lat' here, unlike
    # every other source. Presumably the two columns are mislabeled in the
    # Otieno file - confirm against the raw data before changing this.
    otieno_renamed = otieno.rename(columns={'Lat': 'lon', 'Lon': 'lat'})
    otieno_renamed['source'] = 'Otieno et al. 2019'

    peng_renamed = peng.rename(columns={'Lat': 'lat', 'Lon': 'lon', 'Date': 'date'})
    peng_renamed['source'] = 'Peng et al. 2021'

    mcam_renamed = mcam.rename(columns={'Lat.': 'lat', 'Long': 'lon', 'Date': 'date'})
    mcam_renamed['source'] = 'Madeline overseas collections - Cambodia'

    mlaos_renamed = mlaos.rename(columns={'Lat.': 'lat', 'Long': 'lon', 'Date': 'date'})
    mlaos_renamed['source'] = 'Madeline overseas collections - Laos'

    mthai_renamed = mthai.rename(columns={'Lat.': 'lat', 'Long': 'lon', 'Date': 'date'})
    mthai_renamed['source'] = 'Madeline overseas collections - Thailand'

    sea_renamed = sea.rename(columns={'Lat ': 'lat', 'Long': 'lon'})
    sea_renamed['source'] = 'SE Asia'

    # =============================================================================
    # COMBINE ALL DATA SOURCES INTO SINGLE DATASET
    # =============================================================================
    # Concatenate all standardized datasets into one occurrence dataset.
    # ignore_index=True gives the result a fresh 0..n-1 index; without it each
    # source keeps its own row labels, so labels repeat across sources and
    # label-based selection (and the index column of the exported CSV) becomes
    # ambiguous.

    occurences_global = pd.concat(
        [
            gbif_renamed,
            cabi_renamed,
            otieno_renamed,
            peng_renamed,
            mcam_renamed,
            mlaos_renamed,
            mthai_renamed,
            sea_renamed,
        ],
        ignore_index=True,
    )

In [None]:
if specie == 'thaumastocoris-peregrinus':

    # -------------------------------------------------------------------------
    # Source files (all located in the species-specific directory)
    # -------------------------------------------------------------------------
    file_name_montemayor = 'Montemayor_2015_Thaumastocoris_peregrinus_bronze_bug_transcribed_from_SUPP2.csv'
    file_name_cabi = 'CABI_T-peregrinus.csv'
    file_name_gbif = 'GBIF_T-peregrinus_0026124-240321170329656.csv'

    # -------------------------------------------------------------------------
    # Load only the columns that are needed from each source
    # -------------------------------------------------------------------------
    montemayor = pd.read_csv(
        os.path.join(specie_path, file_name_montemayor),
        usecols=['LATITUDE', 'LONGITUDE', 'REFERENCE'],
        encoding='latin1',  # this file is read as Latin-1 rather than the default encoding
    )
    # NOTE(review): the L. invasa CABI file is read with
    # na_values='Not recorded'; confirm whether this file needs the same.
    cabi = pd.read_csv(
        os.path.join(specie_path, file_name_cabi),
        usecols=['Latitude', 'Longitude', 'References'],
        skiprows=2,  # the header row is preceded by two lines that are skipped
    )
    gbif = pd.read_csv(
        os.path.join(specie_path, file_name_gbif),
        sep='\t',  # this file is tab-separated
        usecols=['decimalLatitude', 'decimalLongitude', 'eventDate'],
    )

    # =============================================================================
    # STANDARDIZE COLUMN NAMES AND ADD SOURCE INFORMATION
    # =============================================================================
    # Rename coordinate (and, for GBIF, date) columns to the common schema and
    # tag each record with the source it came from. The reference columns keep
    # their original names.

    montemayor_renamed = montemayor.rename(columns={'LATITUDE': 'lat', 'LONGITUDE': 'lon'})
    montemayor_renamed['source'] = 'Montemayor et al. 2015'

    cabi_renamed = cabi.rename(columns={'Latitude': 'lat', 'Longitude': 'lon'})
    cabi_renamed['source'] = 'CABI'

    gbif_renamed = gbif.rename(columns={'decimalLatitude': 'lat', 'decimalLongitude': 'lon', 'eventDate': 'date'})
    gbif_renamed['source'] = 'GBIF'

    # =============================================================================
    # COMBINE ALL DATA SOURCES INTO SINGLE DATASET
    # =============================================================================
    # Concatenate all standardized datasets into one occurrence dataset.
    # ignore_index=True gives the result a fresh 0..n-1 index; without it each
    # source keeps its own row labels, so labels repeat across sources and
    # label-based selection (and the index column of the exported CSV) becomes
    # ambiguous.

    occurences_global = pd.concat(
        [montemayor_renamed, cabi_renamed, gbif_renamed],
        ignore_index=True,
    )

In [None]:
# =============================================================================
# DATA EXPLORATION AND EXPORT
# =============================================================================
# Display the first few rows of the aggregated dataset to verify structure and content

# Preview the top rows of the combined dataset
print(occurences_global.head())

# Report how many occurrence records were collected across all sources
print('Number of occurences globally is: %s' % (occurences_global.shape[0],))

# =============================================================================
# SAVE AGGREGATED DATA TO FILE
# =============================================================================
# Write the combined dataset as '<specie>_aggregated.csv' inside the
# species-specific directory, so later analysis steps can read it from there.

occurences_global.to_csv(
    path_or_buf=os.path.join(specie_path, '%s_aggregated.csv' % (specie,)),
)