# Abundance Predict

**Written by Timm Nawrocki**

*Last updated Sunday, March 21, 2021.*

In [None]:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Predict Vegetation Abundance
# Author: Timm Nawrocki, Alaska Center for Conservation Science
# Created on: 2021-03-21
# Usage: Must be executed as a Jupyter Notebook in an Anaconda 3 installation.
# Description: "Predict Vegetation Abundance" applies the trained classifier and regressor to data in regular point grid
# format stored in csv files to create a composite prediction representing the distribution and proportional abundance of the
# target species.
# ---------------------------------------------------------------------------

In [None]:
# Import os
import os

In [None]:
# Set the root directory that the grid, model, and prediction folders are built from
root_folder = '/home/twnawrocki'
# Define the species, genus, or aggregate name; also used as the model and prediction subfolder name
map_group = 'alnus'
# Enter the machine number (starting at 1); it selects which share of the grid files this machine predicts
machine_number = 1

In [None]:
# Folder holding the input grid csv files
grid_folder = os.path.join(root_folder, 'extracted_grids')
# Folder holding the trained models and threshold file for the map group
model_folder = os.path.join(root_folder, 'model_results', map_group)
# Folder receiving the predicted tables for the map group
prediction_folder = os.path.join(root_folder, 'predicted_tables', map_group)

In [None]:
# Make the output directory, including any missing parent folders; does nothing if it already exists
os.makedirs(prediction_folder, exist_ok=True)

In [None]:
# Define variable sets
# The predictor list is assembled in three parts and its order is kept exactly:
# twelve unsuffixed covariates, then the same sixteen band/index names repeated for
# each of the suffixes 06, 07, and 08, then four further unsuffixed covariates
predictor_all = (
    ['aspect', 'wetness', 'elevation', 'slope', 'roughness', 'exposure', 'area', 'relief', 'position',
     'radiation', 'vh', 'vv']
    + [f'{band}_{suffix}'
       for suffix in ('06', '07', '08')
       for band in ('shortIR1', 'shortIR2', 'blue', 'green', 'red', 'redge1', 'redge2', 'redge3',
                    'nearIR', 'redge4', 'evi2', 'nbr', 'ndmi', 'ndsi', 'ndvi', 'ndwi')]
    + ['lstWarmth', 'precip', 'summerWarmth', 'januaryMin']
)
# Single-name column lists used to address coordinate, model output, and result columns
coordinates = ['x', 'y']
absence = ['absence']
presence = ['presence']
response = ['response']
prediction = ['prediction']
# Columns written to the output tables: the coordinates followed by the composite prediction
output_columns = [*coordinates, *prediction]

In [None]:
# Import packages for file manipulation, data manipulation, and plotting
import glob
import numpy as np
import pandas as pd
# Import LightGBM gradient boosting implementations
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
# Import joblib
import joblib
# Import timing packages
import time
import datetime

In [None]:
# Define a function to read threshold values from text file
def readThreshold(inFile):
    """Read a numeric threshold from the first line of a text file.

    Parameters
    ----------
    inFile : str or path-like
        Path of the text file whose first line holds the threshold value.

    Returns
    -------
    float
        The first line of the file converted to a float. Any further lines
        are ignored.

    Raises
    ------
    IndexError
        If the file contains no lines.
    ValueError
        If the first line cannot be converted to a float.
    """
    # The context manager closes the file even if reading raises
    with open(inFile, "r") as threshold_reader:
        threshold_lines = threshold_reader.readlines()
    return float(threshold_lines[0])

In [None]:
# Create a function to composite model results
def compositePrediction(input_data, presence, response, threshold):
    """Combine the presence and response columns into a 'prediction' column.

    A row keeps its response value, rounded to zero decimals, only when its
    presence value is at least the threshold and its response value is greater
    than zero. Every other row is set to 0, including rows where either value
    is missing.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Table containing the presence and response columns. It is modified in
        place by adding (or overwriting) the 'prediction' column.
    presence : list of str
        List whose first element is the name of the presence column.
    response : list of str
        List whose first element is the name of the response column.
    threshold : float
        Minimum presence value for a row to be treated as a presence.

    Returns
    -------
    pandas.DataFrame
        The same data frame object that was passed in, with the 'prediction'
        column stored as floats.
    """
    presence_values = input_data[presence[0]]
    response_values = input_data[response[0]]
    # Evaluate all rows at once rather than calling a Python function per row;
    # this also works for a data frame with zero rows
    keep_response = (presence_values >= threshold) & (response_values > 0)
    input_data['prediction'] = np.where(keep_response, response_values.round(0), 0.0)
    # Return the data frame with composited results
    return input_data

In [None]:
# Load the trained classifier and regressor from the model folder
# NOTE: joblib files are pickle-based, so only load model files from a trusted source
classifier, regressor = (
    joblib.load(os.path.join(model_folder, model_file))
    for model_file in ('classifier.joblib', 'regressor.joblib')
)

In [None]:
# Read the threshold value from the text file in the model folder and store it as a variable
threshold = readThreshold(
    os.path.join(model_folder, 'threshold.txt')
)

In [None]:
# Create a list of input files for the prediction step
os.chdir(grid_folder)
# Sort the file names because glob returns them in arbitrary order; every machine must see the
# same ordering for the per-machine subsets to be disjoint and complete
grid_files = sorted(glob.glob('*.csv'))
# Subset the grid files to the share assigned to this machine
grids_per_machine = 7930
subset_start = (machine_number - 1) * grids_per_machine
# Slice ends are exclusive, so the end index is the start plus the share size; subtracting 1 from
# the end would drop the last file of every share
grid_files = grid_files[subset_start:subset_start + grids_per_machine]
grid_length = len(grid_files)

In [None]:
# Define a helper to print the completion time and elapsed time of one step within an iteration
def _report_step(step_start):
    step_elapsed = int(time.time() - step_start)
    step_success_time = datetime.datetime.now()
    print(f'\tCompleted at {step_success_time.strftime("%Y-%m-%d %H:%M")} (Elapsed time: {datetime.timedelta(seconds=step_elapsed)})')
    print('\t----------')


# Loop through the prediction steps for all input files
for count, grid in enumerate(grid_files, start=1):
    # Define the output csv file
    output_csv = os.path.join(prediction_folder, grid)

    # Skip grids whose output table already exists
    if os.path.exists(output_csv):
        print(f'Grid {count} of {grid_length} already predicted.')
        print('----------')
        continue

    total_start = time.time()
    print(f'Predicting grid {count} of {grid_length}...')

    # Load the input data
    print('\tLoading grid data into memory...')
    iteration_start = time.time()
    input_csv = os.path.join(grid_folder, grid)
    input_data = pd.read_csv(input_csv)
    input_data = input_data.rename(columns={'janMin': 'januaryMin'})
    # Drop every row that has a missing value in any column
    # NOTE(review): a grid whose rows all contain missing values is empty after this step;
    # confirm how the models and downstream steps should treat that case
    input_data = input_data.dropna(axis=0, how='any')
    input_data[predictor_all] = input_data[predictor_all].astype(float)
    # Define the X data; the predictor columns were already converted to float above
    X_data = input_data[predictor_all]
    _report_step(iteration_start)

    # Predict the classifier
    print('\tClassifying presence-absence...')
    iteration_start = time.time()
    classification = classifier.predict_proba(X_data)
    # Concatenate predicted values to input data frame
    input_data['absence'] = classification[:, 0]
    input_data['presence'] = classification[:, 1]
    _report_step(iteration_start)

    # Predict the regressor
    print('\tPredicting foliar cover...')
    iteration_start = time.time()
    regression = regressor.predict(X_data)
    # Concatenate predicted values to input data frame
    input_data['response'] = regression
    _report_step(iteration_start)

    # Composite the classifier and regressor predictions
    print('\tExporting results...')
    iteration_start = time.time()
    input_data = compositePrediction(input_data, presence, response, threshold)
    # Export prediction to csv; write to a temporary name first and then move it into place so
    # an interrupted run never leaves a partial table that a later run would skip as predicted
    output_data = input_data[output_columns]
    partial_csv = output_csv + '.partial'
    output_data.to_csv(partial_csv, header=True, index=False, sep=',', encoding='utf-8')
    os.replace(partial_csv, output_csv)
    _report_step(iteration_start)

    # Report the total time for this grid
    total_end = time.time()
    total_elapsed = int(total_end - total_start)
    total_success_time = datetime.datetime.now()
    print(f'Iteration completed at {total_success_time.strftime("%Y-%m-%d %H:%M")} (Elapsed time: {datetime.timedelta(seconds=total_elapsed)})')
    print('----------')