In [1]:
# Import modules

import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# scikit-learn deprecated its vendored copy of joblib in 0.21 and removed it in 0.23,
# so prefer the standalone package and fall back to the vendored copy only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
print("All modules successfully imported.")

All modules successfully imported.


In [2]:
# Define user input variables
def _ask(message):
    """Print the prompt on its own line and return the line the user types in reply."""
    print(message)
    return input()

# Prompts are issued in the same order as the values are consumed further down.
root_folder = _ask('Enter root directory:')
output_folder = _ask('Enter name of output folder:')
prediction_folder = _ask('Enter name of predict folder:')
classifier_name = _ask('Enter classifier to be tested:')
threshold_name = _ask('Enter threshold file name:')
print('All user-defined variables input.')

Enter root directory:
E:\VegetationEcology\Data_Harmonization\GoogleCloud
Enter name of output folder:
E:\VegetationEcology\Data_Harmonization\GoogleCloud\output_SalixPulchra
Enter name of predict folder:
E:\VegetationEcology\Data_Harmonization\GoogleCloud\prediction_SalixPulchra
Enter classifier to be tested:
classifier_0.joblib
Enter threshold file name:
threshold_0.txt
All user-defined variables input.


In [3]:
# Define variable sets
# Predictor column names, one per line; the order is kept exactly as originally listed
# because the list is used to select the columns handed to the classifier.
predictor_variables = [
    'compoundTopographic',
    'dateFreeze_2000s',
    'dateThaw_2000s',
    'elevation',
    'floodplainsDist',
    'growingSeason_2000s',
    'heatLoad',
    'integratedMoisture',
    'precipAnnual_2000s',
    'roughness',
    'siteExposure',
    'slope',
    'streamLargeDist',
    'streamSmallDist',
    'summerWarmth_2000s',
    'surfaceArea',
    'surfaceRelief',
    'aspect',
    'l8_evi2',
    'l8_green',
    'l8_nbr',
    'l8_ndmi',
    'l8_ndsi',
    'l8_ndvi',
    'l8_ndwi',
    'l8_nearInfrared',
    'l8_red',
    'l8_shortInfrared1',
    'l8_shortInfrared2',
    'l8_ultrablue',
    'l8_blue',
]
# Coordinate columns lead the output column set, followed by the prediction columns.
coordinates = ['POINT_X', 'POINT_Y']
output_variables = [*coordinates, 'predict_0', 'predict_10', 'predict_25', 'classification']
print('Variable sets loaded.')

Variable sets loaded.


In [4]:
# Define a function to read threshold values from text file
def readThreshold(inFile):
    """Read an integer threshold from the first line of a text file.

    Parameters
    ----------
    inFile : str
        Path to a text file whose first line holds the threshold as an integer.

    Returns
    -------
    int
        The value parsed from the first line; surrounding whitespace is ignored.

    Raises
    ------
    ValueError
        If the file is empty, its first line is blank, or the first line is
        not a valid integer.
    """
    # The context manager closes the file even when reading raises.
    with open(inFile, "r") as threshold_reader:
        # Only the first line is used, so there is no need to read the rest.
        first_line = threshold_reader.readline()
    if not first_line.strip():
        raise ValueError('Threshold file "' + str(inFile) + '" has no value on its first line.')
    return int(first_line)

print('Function "readThreshold" loaded.')

Function "readThreshold" loaded.


In [5]:
# Define a function to use a random forest classifier to make a probability prediction, threshold the prediction, and output to dataframe
def predictModel(inModel, inThreshold, inDataframe, variable, inPredictors=None):
    """Predict probabilities, threshold them, and append the result as a column.

    Parameters
    ----------
    inModel : object
        Fitted classifier exposing ``predict_proba``.
    inThreshold : int
        Cut-off on the 0-1000 scaled probability; rows strictly above it are flagged 1.
    inDataframe : pandas.DataFrame
        Data containing the predictor columns. It is not modified.
    variable : hashable
        Label of the column that receives the thresholded prediction. An existing
        column with the same label is overwritten in the returned copy.
    inPredictors : list, optional
        Predictor column labels passed to the model. Defaults to the notebook-level
        ``predictor_variables`` list.

    Returns
    -------
    tuple
        ``(prediction, outDataframe)``: the unmodified ``predict_proba`` output and a
        copy of ``inDataframe`` with the extra column of 0.0/1.0 values.
    """
    if inPredictors is None:
        inPredictors = predictor_variables
    prediction = inModel.predict_proba(inDataframe[inPredictors])
    # Scale the second probability column to 0-1000 and round half up; probabilities
    # are non-negative, so floor(x + 0.5) equals the int(x + 0.5) truncation used before.
    predict_index = np.floor(np.asarray(prediction)[:, 1] * 1000 + 0.5).astype(int)
    outThresholded = (predict_index > inThreshold).astype(float)
    # Assign by position on a copy: concatenating a fresh frame along axis=1 aligns on
    # index labels and would misalign rows whenever the input index is not 0..n-1.
    outDataframe = inDataframe.copy()
    outDataframe[variable] = outThresholded
    return prediction, outDataframe

print('Function "predictModel" loaded.')

Function "predictModel" loaded.


In [6]:
# Load the classifier named by the user from the output folder
# NOTE: joblib.load unpickles the file, which can run arbitrary code; only load
# classifier files that come from a trusted source.
classifier_path = os.path.join(output_folder, classifier_name)
model = joblib.load(classifier_path)
print(model)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=5000, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)


In [7]:
# Read the threshold from its text file in the output folder and store it as a variable
threshold_path = os.path.join(output_folder, threshold_name)
threshold = readThreshold(threshold_path)
print('Threshold: ' + str(threshold))

Threshold: 347


In [12]:
# Create a prediction dataframe for the test using T1906050111
# The same file name is used for the input table and for the output path.
tile_csv_name = 'T1906050111.csv'
predict_csv = os.path.join(root_folder, 'watershedData', tile_csv_name)
output_csv = os.path.join(prediction_folder, tile_csv_name)
predict_df = pd.read_csv(predict_csv)
# Cast every predictor column to integer before it is passed to the classifier.
predict_df[predictor_variables] = predict_df[predictor_variables].astype(int)
print(predict_df)

        compoundTopographic  dateFreeze_2000s  dateThaw_2000s  elevation  \
0                      1111               268             150          1   
1                      1100               268             150          1   
2                      1531               268             150          1   
3                      1294               268             150          1   
4                      1271               268             150          1   
5                      1497               268             150          1   
6                      1357               268             150          1   
7                      1357               268             150          1   
8                      1357               268             150          1   
9                      1340               268             150          1   
10                     1320               268             150          1   
11                     1340               268             150          1   
12          

In [14]:
# Run the classifier on the test tile; predict_df is rebound to the frame returned
# by predictModel, which carries the thresholded 'predict_test' column.
prediction, predict_df = predictModel(
    inModel=model,
    inThreshold=threshold,
    inDataframe=predict_df,
    variable='predict_test',
)
print(prediction)

[[0.8494 0.1506]
 [0.9472 0.0528]
 [0.8506 0.1494]
 ...
 [0.158  0.842 ]
 [0.2594 0.7406]
 [0.2358 0.7642]]


In [13]:
# Show the prediction dataframe.
# NOTE(review): this cell's execution count (In [13]) is lower than that of the
# predictModel cell placed before it (In [14]), so the saved output may predate the
# 'predict_test' column — re-run the cells in document order to confirm.
print(predict_df)

        compoundTopographic  dateFreeze_2000s  dateThaw_2000s  elevation  \
0                      1111               268             150          1   
1                      1100               268             150          1   
2                      1531               268             150          1   
3                      1294               268             150          1   
4                      1271               268             150          1   
5                      1497               268             150          1   
6                      1357               268             150          1   
7                      1357               268             150          1   
8                      1357               268             150          1   
9                      1340               268             150          1   
10                     1320               268             150          1   
11                     1340               268             150          1   
12          