In [None]:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Classifiers Performance Test
# Author: Timm Nawrocki, Alaska Center for Conservation Science
# Created on: 2018-08-18
# Usage: Must be executed as a Jupyter Notebook in an Anaconda 3 installation on a Google Cloud virtual machine with 64 vCPUs and 57.6 GB of CPU memory with an Ubuntu operating system (18.04 LTS).
# Description: "Classifiers Performance Test" runs a single classifier to test the prediction speed on a Google Cloud virtual machine. This script is intended only for infrastructure testing and is not necessary to the analyses.
# ---------------------------------------------------------------------------

This script is a resource test that verifies models are performing optimally on Google Cloud virtual machines. The script and its outputs are not necessary to the analyses and can be ignored once prediction performance has been tested.

In [None]:
# Import modules

import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# scikit-learn deprecated its vendored copy of joblib in 0.21 and removed it in 0.23.
# Keep the original import wherever it still exists (so behavior is unchanged on the
# environment this notebook was written for) and fall back to the standalone joblib
# package on newer installs.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
print("All modules successfully imported.")

In [None]:
# Define user input variables
def _ask(prompt_text):
    """Print a prompt on its own line and return the line the user types in response."""
    print(prompt_text)
    return input()

# NOTE(review): later cells join output_folder and prediction_folder directly with file
# names (they are not joined to root_folder), so they presumably need to be full paths
# or paths relative to the working directory - confirm.
root_folder = _ask('Enter root directory:')
output_folder = _ask('Enter name of output folder:')
prediction_folder = _ask('Enter name of predict folder:')
classifier_name = _ask('Enter classifier to be tested:')
threshold_name = _ask('Enter threshold file name:')
print('All user-defined variables input.')

In [None]:
# Define variable sets
# Predictor columns, listed in the order in which they are selected from the dataframe
# and passed to the classifier. Do not reorder.
predictor_variables = [
    'compoundTopographic',
    'dateFreeze_2000s',
    'dateThaw_2000s',
    'elevation',
    'floodplainsDist',
    'growingSeason_2000s',
    'heatLoad',
    'integratedMoisture',
    'precipAnnual_2000s',
    'roughness',
    'siteExposure',
    'slope',
    'streamLargeDist',
    'streamSmallDist',
    'summerWarmth_2000s',
    'surfaceArea',
    'surfaceRelief',
    'aspect',
    'l8_evi2',
    'l8_green',
    'l8_nbr',
    'l8_ndmi',
    'l8_ndsi',
    'l8_ndvi',
    'l8_ndwi',
    'l8_nearInfrared',
    'l8_red',
    'l8_shortInfrared1',
    'l8_shortInfrared2',
    'l8_ultrablue',
    'l8_blue',
]
# Coordinate columns
coordinates = ['POINT_X', 'POINT_Y']
# Output columns: the coordinates followed by the prediction and classification fields
output_variables = [
    *coordinates,
    'predict_0',
    'predict_10',
    'predict_25',
    'classification',
]
print('Variable sets loaded.')

In [None]:
# Define a function to read threshold values from text file
def readThreshold(inFile):
    """Read an integer threshold from the first line of a text file.

    Parameters
    ----------
    inFile : str
        Path to a text file whose first line holds the threshold as an integer
        (surrounding whitespace is ignored by ``int``).

    Returns
    -------
    int
        The threshold parsed from the first line. Any later lines are ignored.

    Raises
    ------
    ValueError
        If the first line is empty/blank or is not a valid integer.
    OSError
        If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(inFile, "r") as threshold_reader:
        first_line = threshold_reader.readline()
    # Fail with an explicit message instead of an opaque error on an empty file.
    if not first_line.strip():
        raise ValueError('Threshold file "{}" has no value on its first line.'.format(inFile))
    return int(first_line)

print('Function "readThreshold" loaded.')

In [None]:
# Define a function to use a random forest classifier to make a probability prediction, threshold the prediction, and output to dataframe
def predictModel(inModel, inThreshold, inDataframe, variable, predictors=None):
    """Predict probabilities, threshold them, and return the data with the result appended.

    Parameters
    ----------
    inModel : object
        Fitted classifier exposing ``predict_proba``.
    inThreshold : int or float
        Cut-off on the probability index (probability * 1000, rounded half up).
        Rows whose index is strictly greater than the cut-off are set to 1.
    inDataframe : pandas.DataFrame
        Data containing the predictor columns. It is not modified.
    variable : str
        Name of the column that receives the thresholded prediction (0.0 or 1.0).
    predictors : list of str, optional
        Columns passed to the model. Defaults to the module-level
        ``predictor_variables`` list.

    Returns
    -------
    tuple
        ``(prediction, outDataframe)`` where ``prediction`` is the unmodified
        output of ``predict_proba`` and ``outDataframe`` is a copy of
        ``inDataframe`` with the ``variable`` column added.
    """
    if predictors is None:
        predictors = predictor_variables
    prediction = inModel.predict_proba(inDataframe[predictors])
    # Use the second probability column (index 1) and scale it to an integer
    # index; adding 0.5 before truncation rounds half up.
    predict_index = (np.asarray(prediction)[:, 1] * 1000 + 0.5).astype(int)
    # Keep the float 0.0/1.0 encoding of the result column.
    outThresholded = (predict_index > inThreshold).astype(float)
    # Assign by position rather than concatenating on index labels: concat with
    # axis=1 aligns on the index and yields misaligned, NaN-padded rows whenever
    # the input frame does not carry a default 0..n-1 index.
    outDataframe = inDataframe.copy()
    outDataframe[variable] = outThresholded
    return prediction, outDataframe

print('Function "predictModel" loaded.')

In [None]:
# Load the single serialized classifier selected for the performance test
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only load model files from a trusted source.
model_path = os.path.join(output_folder, classifier_name)
model = joblib.load(model_path)
print(model)

In [None]:
# Read the threshold for the tested classifier from its text file in the output folder
threshold_path = os.path.join(output_folder, threshold_name)
threshold = readThreshold(threshold_path)
print('Threshold: ' + str(threshold))

In [None]:
# Build the input and output paths for test file T1906050111 and load it as the prediction dataframe
test_file_name = 'T1906050111.csv'
predict_csv = os.path.join(root_folder, 'watershedData', test_file_name)
# NOTE(review): output_csv is not used by any cell visible here - confirm whether it is still needed.
output_csv = os.path.join(prediction_folder, test_file_name)
predict_df = pd.read_csv(predict_csv)
# Cast every predictor column to integer before it is passed to the classifier
predictor_values = predict_df[predictor_variables]
predict_df[predictor_variables] = predictor_values.astype(int)
print(predict_df)

In [None]:
# Run the classifier on the test data and store the thresholded result in the 'predict_test' column.
# NOTE: this cell reassigns predict_df, so re-running it on its own passes the already
# augmented frame back into predictModel; re-run the cell that loads predict_df first.
prediction, predict_df = predictModel(
    inModel=model,
    inThreshold=threshold,
    inDataframe=predict_df,
    variable='predict_test',
)
print(prediction)

In [None]:
# Show the prediction dataframe returned by predictModel, which includes the thresholded 'predict_test' column
print(predict_df)