# Abundance Predict

**Written by Timm Nawrocki**

*Last updated Thursday June 4, 2020.*

In [1]:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Predict Vegetation Abundance
# Author: Timm Nawrocki, Alaska Center for Conservation Science
# Created on: 2020-06-04
# Usage: Must be executed as a Jupyter Notebook in an Anaconda 3 installation.
# Description: "Predict Vegetation Abundance" applies the trained classifier and regressor to data in regular point grid
# format stored in csv files to create a composite prediction representing the distribution and proportional abundance of the
# target species.
# ---------------------------------------------------------------------------

In [2]:
# Import os
import os

In [3]:
# Root directory from which the input and output data folders are derived
root_folder = 'K:/ACCS_Work'
# Build the input and output data folders beneath the root directory
data_input, data_output = (
    os.path.join(root_folder, relative_path)
    for relative_path in (
        'Projects/VegetationEcology/AKVEG_QuantitativeMap/Project_GIS/Data_Input',
        'Projects/VegetationEcology/AKVEG_QuantitativeMap/Project_GIS/Data_Output',
    )
)

In [4]:
# Folder of input grids, located beneath the input data folder
grid_folder = os.path.join(data_input, 'grids')
# Folders for the trained model results and the exported predictions,
# both located beneath the output data folder
model_folder, prediction_folder = (
    os.path.join(data_output, subfolder)
    for subfolder in ('model_results/final/alnus_r2', 'predicted_grids/alnus')
)

In [5]:
# Make output directory if it does not already exist.
# makedirs also creates any missing parent folder (os.mkdir would raise if the
# parent were absent) and exist_ok=True makes re-running this cell a no-op.
os.makedirs(prediction_folder, exist_ok=True)

In [6]:
# Define variable sets
# predictor_all lists the predictor columns in a fixed order: twelve unsuffixed
# columns, then sixteen band/index names repeated for each of the suffixes
# 05 through 09 (presumably calendar months -- confirm), then three climate columns.
predictor_all = (
    ['aspect', 'wetness', 'elevation', 'slope', 'roughness', 'exposure', 'area', 'relief', 'position',
     'radiation', 'vh', 'vv']
    + [f'{band}_{suffix}'
       for suffix in ('05', '06', '07', '08', '09')
       for band in ('shortIR1', 'shortIR2', 'blue', 'green', 'red', 'redge1', 'redge2', 'redge3',
                    'nearIR', 'redge4', 'evi2', 'nbr', 'ndmi', 'ndsi', 'ndvi', 'ndwi')]
    + ['lstWarmth', 'precip', 'summerWarmth']
)
# Single-element lists naming individual columns used downstream
coordinates = ['x', 'y']
absence = ['absence']
presence = ['presence']
response = ['response']
prediction = ['prediction']
# Columns written to the exported prediction files
output_columns = [*coordinates, *prediction]

In [7]:
# Import packages for file discovery and data manipulation
import glob
import numpy as np
import pandas as pd
# Import LightGBM gradient boosting implementations
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
# Import joblib
import joblib
# Import timing packages
import time
import datetime

In [8]:
# Define a function to read threshold values from text file
def readThreshold(inFile):
    """
    Read a numeric threshold from the first line of a text file.

    Parameters:
        inFile: path to a text file whose first line holds the threshold value.

    Returns:
        The first line of the file converted to a float; any further lines are ignored.

    Raises:
        ValueError: if the file is empty or its first line cannot be converted to a float.
    """
    # The context manager closes the file even if reading raises
    with open(inFile, "r") as threshold_reader:
        first_line = threshold_reader.readline()
    # readline returns '' only at end of file, so this detects an empty file and
    # reports it explicitly instead of failing with an opaque IndexError
    if first_line == '':
        raise ValueError(f'Threshold file is empty: {inFile}')
    outThreshold = float(first_line)
    return outThreshold

In [9]:
# Create a function to composite model results
def compositePrediction(input_data, presence, response, threshold):
    """
    Combine the presence values and the regression response into one 'prediction' column.

    For each row:
        presence value <  threshold                       -> -1
        presence value >= threshold and response <= 0     ->  0
        presence value >= threshold and response >  0     ->  response rounded to 0 decimals
        presence value or required response is NaN        ->  NaN

    The computation is vectorized with NumPy rather than applied row by row, so the
    cost does not grow with a Python-level function call per row. The 'prediction'
    column is always floating point.

    Parameters:
        input_data: data frame containing the presence and response columns; it is
            modified in place by adding (or overwriting) the 'prediction' column.
        presence: list whose first element is the name of the presence column.
        response: list whose first element is the name of the response column.
        threshold: numeric cut-off compared against the presence column.

    Returns:
        The same data frame object that was passed in, with the 'prediction' column set.
    """
    presence_values = input_data[presence[0]].to_numpy(dtype=float)
    response_values = input_data[response[0]].to_numpy(dtype=float)
    meets_threshold = presence_values >= threshold
    # Comparisons against NaN are all False, so NaN rows match no condition
    # and fall through to the NaN default below
    conditions = [
        presence_values < threshold,
        meets_threshold & (response_values <= 0),
        meets_threshold & (response_values > 0),
    ]
    outcomes = [-1, 0, np.round(response_values, 0)]
    input_data['prediction'] = np.select(conditions, outcomes, default=np.nan)
    # Return the data frame with composited results
    return input_data

In [10]:
# Load the saved classifier and regressor from the model folder (classifier first)
classifier, regressor = (
    joblib.load(os.path.join(model_folder, model_file))
    for model_file in ('classifier.joblib', 'regressor.joblib')
)

In [11]:
# Read the threshold value stored in the model folder and keep it as a float
threshold_file = os.path.join(model_folder, 'threshold.txt')
threshold = readThreshold(threshold_file)

In [12]:
# Create a list of input files for the prediction step
# The working directory is switched to the grid folder so that the matched
# names are bare file names, which the loop joins to both the grid folder
# and the prediction folder.
os.chdir(grid_folder)
# glob returns matches in arbitrary order; sorting makes the processing order
# (and the "watershed i of N" numbering) the same on every run
grid_files = sorted(glob.glob('*.csv'))

In [13]:
# Define a helper to report when a step finished and how long it took
def reportCompletion(start_time, message='\tCompleted', divider='\t----------'):
    """
    Print a completion timestamp and the elapsed time since start_time, then a divider line.

    Parameters:
        start_time: value of time.time() captured when the step began.
        message: text printed before the timestamp.
        divider: line printed after the completion message.
    """
    # Elapsed time is truncated to whole seconds before formatting
    elapsed_seconds = int(time.time() - start_time)
    success_time = datetime.datetime.now()
    print(f'{message} at {success_time.strftime("%Y-%m-%d %H:%M")} (Elapsed time: {datetime.timedelta(seconds=elapsed_seconds)})')
    print(divider)

# Loop through the prediction function for all input files
for count, grid in enumerate(grid_files, start=1):
    total_start = time.time()
    print(f'Predicting watershed {count} of {len(grid_files)}...')

    # Load the input grid
    print('\tLoading grid data into memory...')
    iteration_start = time.time()
    # Identify file path to the input csv file
    input_csv = os.path.join(grid_folder, grid)
    # The output csv file keeps the input file name but lives in the prediction folder
    output_csv = os.path.join(prediction_folder, grid)
    # Load the input data and cast every predictor column to integer
    input_data = pd.read_csv(input_csv)
    input_data[predictor_all] = input_data[predictor_all].astype(int)
    # Define the X data
    X_data = input_data[predictor_all]
    reportCompletion(iteration_start)

    # Predict the classifier
    print('\tClassifying presence-absence...')
    iteration_start = time.time()
    classification = classifier.predict_proba(X_data)
    # Store the first output column as 'absence' and the second as 'presence'
    input_data['absence'] = classification[:,0]
    input_data['presence'] = classification[:,1]
    reportCompletion(iteration_start)

    # Predict the regressor
    print('\tPredicting foliar cover...')
    iteration_start = time.time()
    regression = regressor.predict(X_data)
    # Concatenate predicted values to input data frame
    input_data['response'] = regression
    reportCompletion(iteration_start)

    # Composite the classifier and regressor predictions
    print('\tExporting results...')
    iteration_start = time.time()
    input_data = compositePrediction(input_data, presence, response, threshold)
    # Export only the coordinate and prediction columns to csv
    output_data = input_data[output_columns]
    output_data.to_csv(output_csv, header=True, index=False, sep=',', encoding='utf-8')
    reportCompletion(iteration_start)

    # Report the total time spent on this grid
    reportCompletion(total_start, message='Iteration completed', divider='----------')

Predicting watershed 1 of 7...
	Loading grid data into memory...
	Completed at 2020-06-04 14:53 (Elapsed time: 0:00:23)
	----------
	Classifying presence-absence...
	Completed at 2020-06-04 14:53 (Elapsed time: 0:00:09)
	----------
	Predicting foliar cover...
	Completed at 2020-06-04 14:53 (Elapsed time: 0:00:14)
	----------
	Exporting results...
	Completed at 2020-06-04 14:54 (Elapsed time: 0:00:46)
	----------
Iteration completed at 2020-06-04 14:54 (Elapsed time: 0:01:34)
----------
Predicting watershed 2 of 7...
	Loading grid data into memory...
	Completed at 2020-06-04 14:54 (Elapsed time: 0:00:28)
	----------
	Classifying presence-absence...
	Completed at 2020-06-04 14:55 (Elapsed time: 0:00:11)
	----------
	Predicting foliar cover...
	Completed at 2020-06-04 14:55 (Elapsed time: 0:00:15)
	----------
	Exporting results...
	Completed at 2020-06-04 14:56 (Elapsed time: 0:00:51)
	----------
Iteration completed at 2020-06-04 14:56 (Elapsed time: 0:01:47)
----------
Predicting watersh