In [None]:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Classifiers Predict
# Author: Timm Nawrocki, Alaska Center for Conservation Science
# Created on: 2018-08-18
# Usage: Must be executed as a Jupyter Notebook in an Anaconda 3 installation on a Google Cloud virtual machine with 64 vCPUs and 57.6 GB of CPU memory with an Ubuntu operating system (18.04 LTS).
# Description: "Classifiers Predict" imports three classifiers trained in an external script and uses them to create a composite prediction for each watershed in a set of watershed input files.
# ---------------------------------------------------------------------------

This script runs the model prediction steps for all watersheds in a set of watershed input files. The script is formatted as a Jupyter Notebook and is intended to be run on a Google Cloud virtual machine with 64 vCPUs and 57.6 GB of CPU memory running an Ubuntu operating system (18.04 LTS). The Random Forest classifier in this script is set to use 16 cores and may work inefficiently or not at all on a machine that has fewer than 64 cores. For information on generating inputs for this script or on setting up Google Cloud virtual machines, see the [project readme](https://github.com/accs-uaa/vegetation-cover-modeling).

In [None]:
# Import modules

import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
print("All modules successfully imported.")

In [None]:
# Define user input variables
# Each value is read interactively, one prompt per variable, in the order shown.
# root_folder: directory that contains the 'watershedData' subfolder of input files.
# output_folder: used exactly as entered as the directory holding the classifier and threshold files.
# prediction_folder: used exactly as entered as the directory to which prediction files are written.
# NOTE: output_folder and prediction_folder are never joined to root_folder later in the notebook,
# so enter complete paths (or paths relative to the current working directory).
print('Enter root directory:')
root_folder = input()
print('Enter name of output folder:')
output_folder = input()
print('Enter name of predict folder:')
prediction_folder = input()
print('All user-defined variables input.')

In [None]:
# Define variable sets
# Predictor columns, kept in the exact order in which they are selected for the classifiers
predictor_variables = [
    'compoundTopographic',
    'dateFreeze_2000s',
    'dateThaw_2000s',
    'elevation',
    'floodplainsDist',
    'growingSeason_2000s',
    'heatLoad',
    'integratedMoisture',
    'precipAnnual_2000s',
    'roughness',
    'siteExposure',
    'slope',
    'streamLargeDist',
    'streamSmallDist',
    'summerWarmth_2000s',
    'surfaceArea',
    'surfaceRelief',
    'aspect',
    'l8_evi2',
    'l8_green',
    'l8_nbr',
    'l8_ndmi',
    'l8_ndsi',
    'l8_ndvi',
    'l8_ndwi',
    'l8_nearInfrared',
    'l8_red',
    'l8_shortInfrared1',
    'l8_shortInfrared2',
    'l8_ultrablue',
    'l8_blue',
]
# Coordinate columns carried through to the exported table
coordinates = ['POINT_X', 'POINT_Y']
# Columns exported for each watershed: coordinates, the three binary predictions, and the composite class
output_variables = [*coordinates, 'predict_0', 'predict_10', 'predict_25', 'classification']
print('Variable sets loaded.')

In [None]:
# Define a function to read threshold values from text file
def readThreshold(inFile):
    """Read an integer threshold from the first line of a text file.

    Parameters
    ----------
    inFile : str
        Path to a text file whose first line holds the threshold as an integer.
        Any further lines are ignored.

    Returns
    -------
    int
        The threshold parsed from the first line.

    Raises
    ------
    ValueError
        If the file is empty (or its first line is blank), or if the first line
        cannot be parsed as an integer.
    """
    # The context manager closes the file even if reading fails
    with open(inFile, "r") as threshold_reader:
        first_line = threshold_reader.readline()
    if not first_line.strip():
        raise ValueError('Threshold file contains no value: ' + str(inFile))
    # int() tolerates surrounding whitespace and the trailing newline
    return int(first_line)

print('Function "readThreshold" loaded.')

In [None]:
# Define a function to use a random forest classifier to make a probability prediction, threshold the prediction, and output to dataframe
def predictModel(inModel, inThreshold, inDataframe, variable, predictors=None):
    """Predict class probabilities, threshold them, and append the result as a column.

    Parameters
    ----------
    inModel : object
        Fitted classifier exposing ``predict_proba``; the second probability
        column is used as the predicted probability.
    inThreshold : int
        Threshold on the 0-1000 probability index. Rows whose index is strictly
        greater than the threshold receive 1, all other rows receive 0.
    inDataframe : pandas.DataFrame
        Table containing the predictor columns. It is not modified.
    variable : str
        Name of the column that receives the thresholded prediction.
    predictors : list of str, optional
        Predictor column names, in the order expected by the classifier.
        Defaults to the notebook-level ``predictor_variables`` list.

    Returns
    -------
    pandas.DataFrame
        Copy of ``inDataframe`` (same rows, same index) with the extra column
        ``variable`` holding 0.0 or 1.0 for each row.
    """
    if predictors is None:
        predictors = predictor_variables
    probabilities = np.asarray(inModel.predict_proba(inDataframe[predictors]))
    # Scale the probability to an integer index from 0 to 1000, rounding half up
    predict_index = np.floor(probabilities[:, 1] * 1000 + 0.5).astype(int)
    # Stored as floats (0.0 / 1.0) so that exported values keep their existing format
    outThresholded = (predict_index > inThreshold).astype(float)
    # Assign by position: predictions are in row order, so they must not be
    # aligned on index labels (which would add NaN rows for a non-default index)
    outDataframe = inDataframe.copy()
    outDataframe[variable] = outThresholded
    return outDataframe

print('Function "predictModel" loaded.')

In [None]:
# Define a function to create composite classification
def compositeClassification(row):
    """Combine the three binary predictions of one row into a composite class.

    Parameters
    ----------
    row : mapping
        Row providing 'predict_0', 'predict_10' and 'predict_25' values.

    Returns
    -------
    int or None
        0 when 'predict_0' is 0; 1 when 'predict_0' is 1 and 'predict_10' is 0;
        2 when both are 1 and 'predict_25' is 0; 3 when all three are 1.
        None for any other combination of values.
    """
    first = row['predict_0']
    if first == 0:
        return 0
    if first == 1:
        second = row['predict_10']
        if second == 0:
            return 1
        if second == 1:
            third = row['predict_25']
            if third == 0:
                return 2
            if third == 1:
                return 3
    # Any value other than 0 or 1 at a level that is reached yields no class
    return None

print('Function "compositeClassification" loaded.')

In [None]:
# Define a function to predict and export a predict dataset using input classifier models and thresholds	
def compositeModel(inModel_0, inModel_10, inModel_25, inThreshold_0, inThreshold_10, inThreshold_25, inData, outCSV):
    """Predict with three classifiers, derive the composite class, and export to CSV.

    Parameters
    ----------
    inModel_0, inModel_10, inModel_25 : object
        Classifiers passed to ``predictModel`` for the 'predict_0', 'predict_10'
        and 'predict_25' columns respectively.
    inThreshold_0, inThreshold_10, inThreshold_25 : int
        Threshold passed to ``predictModel`` together with the matching classifier.
    inData : pandas.DataFrame
        Table of rows to predict; it must provide the predictor columns and the
        columns listed in ``output_variables`` that are not created here.
    outCSV : str
        Path of the CSV file to write.

    Returns
    -------
    None
        The result is written to ``outCSV``; only the ``output_variables``
        columns are exported, with a header row and without the index.
    """
    # The caller passes the table itself, so no feature conversion step is needed.
    # A fresh positional index keeps row-wise appends of prediction arrays lined up.
    predict_df = pd.DataFrame(inData).reset_index(drop=True)
    # Append one binary prediction column per classifier, in order 0, 10, 25
    classifier_steps = (
        (inModel_0, inThreshold_0, "predict_0"),
        (inModel_10, inThreshold_10, "predict_10"),
        (inModel_25, inThreshold_25, "predict_25"),
    )
    for model, threshold, column in classifier_steps:
        predict_df = predictModel(model, threshold, predict_df, column)
    # Apply composite classification function to the three prediction columns of each row
    predict_df['classification'] = predict_df.apply(compositeClassification, axis=1)
    output_df = predict_df[output_variables]
    # Export the output dataframe to the output csv
    output_df.to_csv(outCSV, header=True, index=False, sep=',', encoding='utf-8')

print('Function "compositeModel" loaded.')

In [None]:
# Import the zero, ten, and twenty-five classifiers
# output_folder is used exactly as entered as the directory that holds the three classifier files.
# NOTE(review): joblib.load unpickles the file contents, which can execute arbitrary code;
# only load classifier files that come from a trusted source.
model_0 = joblib.load(os.path.join(output_folder, 'classifier_0.joblib'))
model_10 = joblib.load(os.path.join(output_folder, 'classifier_10.joblib'))
model_25 = joblib.load(os.path.join(output_folder, 'classifier_25.joblib'))
# Print each loaded object so the cell output shows what was loaded
print(model_0)
print(model_10)
print(model_25)

In [None]:
# Read the three threshold text files from the output folder and store each value as a variable
threshold_0, threshold_10, threshold_25 = [
    readThreshold(os.path.join(output_folder, threshold_file))
    for threshold_file in ('threshold_0.txt', 'threshold_10.txt', 'threshold_25.txt')
]
# Report the loaded thresholds, one per line
print(
    'Threshold 0: ' + str(threshold_0),
    'Threshold 10: ' + str(threshold_10),
    'Threshold 25: ' + str(threshold_25),
    sep='\n',
)

In [None]:
# Create a list of input files for the prediction step
# Only regular, non-hidden files are kept so that entries such as a Jupyter
# '.ipynb_checkpoints' folder are never handed to the CSV reader. The list is
# sorted because os.listdir returns entries in arbitrary order; sorting makes
# the processing order repeatable between runs.
watershed_folder = os.path.join(root_folder, 'watershedData')
input_files = sorted(
    entry
    for entry in os.listdir(watershed_folder)
    if not entry.startswith('.') and os.path.isfile(os.path.join(watershed_folder, entry))
)
print(input_files)

In [None]:
# Loop through the prediction function for all input files
watershed_folder = os.path.join(root_folder, 'watershedData')
# Create the prediction folder up front so that the export of the first watershed
# cannot fail on a missing folder after the prediction work has already been done
if prediction_folder:
    os.makedirs(prediction_folder, exist_ok=True)
file_count = len(input_files)
# enumerate supplies the iteration number directly instead of searching the list on every pass
for iteration, watershed_data in enumerate(input_files, start=1):
    predict_csv = os.path.join(watershed_folder, watershed_data)
    # Each output file keeps the name of its input file
    output_csv = os.path.join(prediction_folder, watershed_data)
    predict_df = pd.read_csv(predict_csv)
    # Predictor columns are cast to integers before prediction
    # NOTE(review): astype(int) raises on missing values and truncates decimals - confirm inputs are complete integer grids
    predict_df[predictor_variables] = predict_df[predictor_variables].astype(int)
    compositeModel(model_0, model_10, model_25, threshold_0, threshold_10, threshold_25, predict_df, output_csv)
    print('Prediction iteration ' + str(iteration) + ' out of ' + str(file_count) + ' complete...')
print('All predictions complete.')