In [97]:
# Import modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
from sklearn.metrics import r2_score
print('All modules successfully imported.')

All modules successfully imported.


In [2]:
# Define user input variables
# Windows paths are written as raw strings so that a backslash is never parsed as the start of an
# escape sequence (e.g. a folder starting with "t" or "n" would silently become a tab or newline).
print('Enter root directory:')
root_folder = r'E:\VegetationEcology\Data_Harmonization\GoogleCloud'
print('Enter name of output folder:')
output_folder = r'E:\VegetationEcology\Data_Harmonization\GoogleCloud\output_SalixPulchra'
print('Enter name of predict folder:')
# File name of the input csv; it is joined onto root_folder/speciesData when the data are loaded
input_data_name = 'salix_pulchra_regress.csv'
print('Enter name of output report file:')

Enter root directory:
Enter name of output folder:
Enter name of predict folder:
Enter name of output report file:


In [78]:
# Define variable sets
# Predictor columns: topographic/climatic covariates first, then the Landsat 8 ("l8_") bands and indices
predictor_variables = [
    'compoundTopographic', 'dateFreeze_2000s', 'dateThaw_2000s', 'elevation',
    'floodplainsDist', 'growingSeason_2000s', 'heatLoad', 'integratedMoisture',
    'precipAnnual_2000s', 'roughness', 'siteExposure', 'slope',
    'streamLargeDist', 'streamSmallDist', 'summerWarmth_2000s', 'surfaceArea',
    'surfaceRelief', 'aspect',
    'l8_evi2', 'l8_green', 'l8_nbr', 'l8_ndmi', 'l8_ndsi', 'l8_ndvi', 'l8_ndwi',
    'l8_nearInfrared', 'l8_red', 'l8_shortInfrared1', 'l8_shortInfrared2',
    'l8_ultrablue', 'l8_blue',
]
# Single-column sets are kept as lists so they can be concatenated with the other sets
zero_variable = ['zero']
ten_variable = ['ten']
twentyfive_variable = ['twentyfive']
cover = ['cover']
cover10 = ['coverTen']
strata = ['strata']
retain_variables = ['project', 'siteID', 'siteCode', 'methodSurvey', 'methodCover']
coordinates = ['POINT_X', 'POINT_Y']
# Every column carried through the train/test split, in a fixed order
all_variables = [
    name
    for group in (
        predictor_variables, zero_variable, ten_variable, twentyfive_variable,
        cover, cover10, strata, retain_variables, coordinates,
    )
    for name in group
]
print('Variable sets loaded.')

Variable sets loaded.


In [69]:
# Define a function to plot variable importances
def plotVariableImportances(inModel, x_train, outVariableFile):
    """Save a bar chart of a fitted model's variable importances to a file.

    Parameters
    ----------
    inModel : fitted estimator exposing ``feature_importances_``.
    x_train : object whose ``columns`` supply the bar labels. The labels are paired with the
        importances by position, so this is assumed to be the frame the model was fitted on
        (verify against caller).
    outVariableFile : destination passed to ``Figure.savefig``.

    Returns
    -------
    None. As side effects the 'fivethirtyeight' style is activated and the default figure size
    in ``rcParams`` is set to 12 x 9.
    """
    # Get numerical feature importances and the matching variable names (model order, unsorted)
    importances = list(inModel.feature_importances_)
    feature_list = list(x_train.columns)
    # Configure style and size BEFORE the figure exists; settings applied afterwards do not
    # resize a figure that has already been created
    plot.style.use('fivethirtyeight')
    plot.rcParams["figure.figsize"] = [12, 9]
    variable_figure = plot.figure(figsize=(12, 9))
    try:
        axes = variable_figure.add_subplot(111)
        # Create list of x locations for plotting
        x_values = list(range(len(importances)))
        # Make a bar chart of the variable importances
        axes.bar(x_values, importances)
        # Tick labels for x axis
        axes.set_xticks(x_values)
        axes.set_xticklabels(feature_list, rotation='vertical')
        # Axis labels and title
        axes.set_ylabel('Importance')
        axes.set_xlabel('Variable')
        axes.set_title('Variable Importances')
        # Export
        variable_figure.savefig(outVariableFile, bbox_inches="tight", dpi=300)
    finally:
        # Release this specific figure even if drawing or saving failed
        plot.close(variable_figure)

print('Function "plotVariableImportances" loaded.')

Function "plotVariableImportances" loaded.


In [70]:
# Define a function to calculate performance metrics based on a specified threshold value
def thresholdMetrics(inIndex, inProbability, inValue, y_test):
    """Binarize an index at a threshold and score the result against observed classes.

    Parameters
    ----------
    inIndex : array-like of numeric index values, one per test row.
    inProbability : positive-class scores, one per test row; used only for the AUC.
    inValue : threshold; rows with ``inIndex > inValue`` are assigned 1, all others 0.
    y_test : observed classes. The confusion-matrix lookups treat 0 as negative and 1 as
        positive (assumes the response is coded 0/1 -- verify against caller).

    Returns
    -------
    tuple ``(outThresholded, outSensitivity, outSpecificity, outAUC, outAccuracy)`` where
    ``outThresholded`` is a float array of 0/1 values with the shape of ``inIndex``.
    Sensitivity or specificity is undefined (division by zero) when ``y_test`` has no
    positive or no negative rows respectively.
    """
    # Accept lists as well as arrays; the boolean mask below needs array semantics
    inIndex = np.asarray(inIndex)
    outThresholded = np.zeros(inIndex.shape)
    outThresholded[inIndex > inValue] = 1
    # Pin the label order so the matrix is always 2 x 2, even when a class is absent from
    # both the observations and the thresholded predictions
    confusion_test = confusion_matrix(y_test, outThresholded, labels=[0, 1])
    # Rows are observed classes, columns are predicted classes
    true_negative = confusion_test[0,0]
    false_negative = confusion_test[1,0]
    true_positive = confusion_test[1,1]
    false_positive = confusion_test[0,1]
    outSensitivity = true_positive / (true_positive + false_negative)
    outSpecificity = true_negative / (true_negative + false_positive)
    # AUC depends only on the continuous scores, not on the threshold
    outAUC = roc_auc_score(y_test, inProbability)
    outAccuracy = (true_negative + true_positive) / (true_negative + false_positive + false_negative + true_positive)
    return (outThresholded, outSensitivity, outSpecificity, outAUC, outAccuracy)

print('Function "thresholdMetrics" loaded.')

Function "thresholdMetrics" loaded.


In [71]:
# Define a function to fit a classifier using training data and determine a best classification threshold using the test data
def trainTestModel(X_train, y_train, X_test, y_test, testData, variable):
    """Fit a random forest classifier and pick the threshold that balances sensitivity and specificity.

    Parameters
    ----------
    X_train, y_train : predictors and binary response used to fit the classifier.
    X_test, y_test : predictors and binary response used to choose and score the threshold.
    testData : data frame that receives the thresholded predictions as a new column. The
        predictions carry a default 0..n-1 index and are joined on index labels, so
        ``testData`` is expected to have the same row order and a 0..n-1 index.
    variable : name of the new prediction column.

    Returns
    -------
    list ``[threshold, sensitivity, specificity, auc, accuracy, oob_score, classifier, testData]``.
    ``threshold`` is on the 0-1000 index scale; a row is classified 1 when its index is
    greater than the threshold.

    Raises
    ------
    ValueError
        If sensitivity or specificity is undefined at every candidate threshold.

    Notes
    -----
    The threshold is tuned on the same test rows that the returned metrics are computed on.
    """
    # Fit a random forest classifier to the training dataset
    rf_classify = RandomForestClassifier(n_estimators = 5000, bootstrap = True, oob_score = True, n_jobs=1, class_weight = "balanced")
    rf_classify.fit(X_train, y_train)
    # Use the random forest classifier to predict probabilities for the test dataset
    test_prediction = rf_classify.predict_proba(X_test)
    # Convert the positive class probabilities to a list of probabilities
    test_probability = [p[1] for p in test_prediction]
    # Convert the positive class probabilities to a rounded index between 0 and 1000
    test_index = np.asarray([int((p * 1000) + 0.5) for p in test_probability])
    # Evaluate sensitivity minus specificity at every candidate threshold from 1 to 1000
    candidate_thresholds = range(1, 1001)
    difference_list = []
    for candidate in candidate_thresholds:
        _, sensitivity_test, specificity_test, _, _ = thresholdMetrics(test_index, test_probability, candidate, y_test)
        difference_list.append(sensitivity_test - specificity_test)
    # Pair every difference with the threshold VALUE that produced it. Pairing with the list
    # position instead would report a threshold one lower than the one that was evaluated.
    scored = list(zip(difference_list, candidate_thresholds))
    # Preferred optimum: the smallest non-negative difference (ties go to the lower threshold)
    non_negative = [(value, candidate) for value, candidate in scored if value >= 0]
    if non_negative:
        value, threshold = min(non_negative)
    else:
        # Specificity exceeds sensitivity everywhere: fall back to the closest balance.
        # `value == value` drops NaN entries produced by undefined metrics.
        defined = [(abs(value), candidate) for value, candidate in scored if value == value]
        if not defined:
            raise ValueError('Sensitivity and specificity are undefined at every threshold; check that y_test contains both classes.')
        value, threshold = min(defined)
    # Calculate the prediction index to a binary 0 or 1 output using the optimal threshold
    test_thresholded, sensitivity_test, specificity_test, auc_test, accuracy_test = thresholdMetrics(test_index, test_probability, threshold, y_test)
    # Concatenate thresholded predictions to test data frame
    testData = pd.concat([testData, pd.DataFrame(test_thresholded)], axis=1)
    testData = testData.rename(index=int, columns={0: variable})
    return [threshold, sensitivity_test, specificity_test, auc_test, accuracy_test, rf_classify.oob_score_, rf_classify, testData]

print('Function "trainTestModel" loaded.')

Function "trainTestModel" loaded.


In [72]:
# Set initial plot size
# Width and height (first and second entries) of the default figure size for later figures
fig_size = plot.rcParams["figure.figsize"]
fig_size[0], fig_size[1] = 12, 9
plot.rcParams["figure.figsize"] = fig_size
print('Plot size parameters configured.')

Plot size parameters configured.


In [79]:
# Import input data from csv file
# The csv is read from the "speciesData" folder under the root directory
input_file = os.path.join(root_folder, "speciesData", input_data_name)
input_df = pd.read_csv(input_file)
# Convert numerical data to integers
# (predictors, the three binary responses, both cover responses and the strata column)
numeric_columns = predictor_variables + zero_variable + ten_variable + twentyfive_variable + cover + cover10 + strata
input_df[numeric_columns] = input_df[numeric_columns].astype(int)
print(input_df)

      cover        project  siteID          siteCode          methodSurvey  \
0         0        NSSI LC    1975      NSSI11_04_01       Visual Estimate   
1         0        NSSI LC    1976      NSSI11_04_02       Visual Estimate   
2         0        NSSI LC    2034      NSSI11_04_04       Visual Estimate   
3         0        NSSI LC    1987      NSSI11_04_03       Visual Estimate   
4         0      AIM NPR-A     180           TMCW-64  Line-Point Intercept   
5         1      AIM NPR-A     182           TMCW-68  Line-Point Intercept   
6         0      AIM NPR-A      64          CPBWM-69  Line-Point Intercept   
7         0      AIM NPR-A     178           TMCW-61  Line-Point Intercept   
8         0      AIM NPR-A      65          CPBWM-70  Line-Point Intercept   
9         5      AIM NPR-A      82          CPHCP-59  Line-Point Intercept   
10        0      AIM NPR-A     181           TMCW-67  Line-Point Intercept   
11        0      AIM NPR-A     179           TMCW-63  Line-Point

In [9]:
# Create a plots folder if it does not exist
plots_folder = os.path.join(output_folder, "plots")
if os.path.exists(plots_folder):
    print('Plots folder already exists.')
else:
    # makedirs also creates any missing parent folders
    os.makedirs(plots_folder)
    print('Plots folder created.')

Plots folder created.


In [108]:
# Create train and test splits
X = input_df[all_variables]
y = np.array(input_df[zero_variable]).ravel()
# Keep the per-row strata values under their own name. Rebinding `strata` here would replace
# the `strata = ['strata']` column list from the variable-set cell with an array and break
# any cell that concatenates the column lists if it is re-run afterwards.
strata_values = np.array(input_df['strata']).ravel()
# 70/30 split stratified by the strata column.
# NOTE(review): random_state is None, so every run produces a different split.
All_train, All_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size = 0.7, random_state = None, shuffle = True, stratify = strata_values)
# reset_index gives testData a 0..n-1 index and keeps the original row labels in an 'index' column
testData = All_test.reset_index()
print(testData)

     index  compoundTopographic  dateFreeze_2000s  dateThaw_2000s  elevation  \
0     1927                 1076               274             115        250   
1      744                  897               267             144        129   
2      458                 1100               268             149         24   
3     2421                  828               277             127        303   
4      414                 1170               269             150          3   
5       85                  996               269             150          4   
6     1567                 1042               272             120        461   
7     1266                  675               271             132        247   
8     2114                  952               277             122         78   
9     2065                 1330               279             130          1   
10     322                  943               269             150         11   
11     802                 1326         

In [109]:
# Classify and predict the zero class
# Train on all training rows and evaluate on all test rows
X_train, y_train = All_train[predictor_variables], All_train['zero']
X_test, y_test = All_test[predictor_variables], All_test['zero']
zero_results = trainTestModel(X_train, y_train, X_test, y_test, testData, 'cover_0')
# Unpack in the order returned by trainTestModel; testData now carries the 'cover_0' column
(threshold_0, sensitivity_0, specificity_0, auc_0,
 accuracy_0, oob_score_0, classifier_0, testData) = zero_results
print(classifier_0)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=5000, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)


In [123]:
# Subset the train data
# Keep only rows whose strata value is at least 1, in both the train and the test partition
train_mask = All_train['strata'] >= 1
test_mask = All_test['strata'] >= 1
Subset_train = All_train[train_mask]
Subset_test = All_test[test_mask]
print(Subset_train)

      compoundTopographic  dateFreeze_2000s  dateThaw_2000s  elevation  \
619                  1066               267             143         86   
1770                 1076               276             122         99   
876                   983               267             142         69   
589                  1290               268             145         74   
105                   943               268             151         12   
2493                  997               277             126         73   
958                   949               267             142        163   
646                   983               267             144         88   
2112                  786               277             122         66   
1035                 1111               264             139        284   
303                  1202               268             150          2   
9                    1210               268             154         18   
2205                 1111             

In [111]:
# Classify and predict the ten class
# Train on the strata subset but evaluate on all test rows
X_train, y_train = Subset_train[predictor_variables], Subset_train['ten']
X_test, y_test = All_test[predictor_variables], All_test['ten']
ten_results = trainTestModel(X_train, y_train, X_test, y_test, testData, 'cover_10')
# Unpack in the order returned by trainTestModel; testData now carries the 'cover_10' column
(threshold_10, sensitivity_10, specificity_10, auc_10,
 accuracy_10, oob_score_10, classifier_10, testData) = ten_results
print(classifier_10)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=5000, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)


In [112]:
# Classify and predict the twentyfive class
# Train on the strata subset but evaluate on all test rows
X_train, y_train = Subset_train[predictor_variables], Subset_train['twentyfive']
X_test, y_test = All_test[predictor_variables], All_test['twentyfive']
twentyfive_results = trainTestModel(X_train, y_train, X_test, y_test, testData, 'cover_25')
# Unpack in the order returned by trainTestModel; testData now carries the 'cover_25' column
(threshold_25, sensitivity_25, specificity_25, auc_25,
 accuracy_25, oob_score_25, classifier_25, testData) = twentyfive_results
print(classifier_25)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=5000, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)


In [113]:
# Define a function to create composite classification
def compositeClassification (row):
    """Combine the three binary predictions of one row into a single ordinal class.

    ``row`` is indexed by the keys 'cover_0', 'cover_10' and 'cover_25'. The result is
    0 when 'cover_0' is 0; otherwise 1, 2 or 3 according to the first of 'cover_10' and
    'cover_25' that is 0 (3 when all three are 1). Any value other than 0 or 1 at the
    level being inspected yields None. Deeper keys are read only when the level above is 1.
    """
    level_0 = row['cover_0']
    if level_0 == 0:
        return 0
    if not level_0 == 1:
        return None
    level_10 = row['cover_10']
    if level_10 == 0:
        return 1
    if not level_10 == 1:
        return None
    level_25 = row['cover_25']
    if level_25 == 0:
        return 2
    if level_25 == 1:
        return 3
    return None

print('Function "compositeClassification" loaded.')

Function "compositeClassification" loaded.


In [114]:
# Derive the composite class for every test row from its cover_0 / cover_10 / cover_25 columns
testData['classification'] = testData.apply(compositeClassification, axis=1)

In [115]:
# Print the full test data frame for inspection
print(testData)

     index  compoundTopographic  dateFreeze_2000s  dateThaw_2000s  elevation  \
0     1927                 1076               274             115        250   
1      744                  897               267             144        129   
2      458                 1100               268             149         24   
3     2421                  828               277             127        303   
4      414                 1170               269             150          3   
5       85                  996               269             150          4   
6     1567                 1042               272             120        461   
7     1266                  675               271             132        247   
8     2114                  952               277             122         78   
9     2065                 1330               279             130          1   
10     322                  943               269             150         11   
11     802                 1326         

In [116]:
# Define regression function
def regressionModel(X_train, y_train, X_test, y_test, testData, variable):
    """Fit a random forest regressor, score it on the test rows and attach its predictions.

    Parameters
    ----------
    X_train, y_train : predictors and continuous response used to fit the regressor.
    X_test, y_test : predictors and response used to compute the R-squared score.
    testData : data frame that receives the predictions as a new column.
    variable : name of the new prediction column.

    Returns
    -------
    list ``[r_score, rf_regress, testData]`` where ``r_score`` is the R-squared of the test
    predictions and ``testData`` has the predictions in column ``variable``.

    Notes
    -----
    When ``X_test`` has uniquely labelled rows and ``testData`` has an 'index' column, the
    predictions are matched to ``testData`` through that column, and rows without a
    prediction get NaN. This assumes the 'index' column holds the original row labels, as
    produced by ``reset_index()`` -- verify against caller. Otherwise the predictions are
    attached by position, which is only correct when ``X_test`` and ``testData`` hold the
    same rows in the same order.
    """
    # Fit a random forest regressor to the training dataset. The split criterion is left at
    # its default (mean squared error) because the literal name of that option differs
    # between scikit-learn releases.
    rf_regress = RandomForestRegressor(n_estimators = 5000, bootstrap = True, oob_score = True, n_jobs=1)
    rf_regress.fit(X_train, y_train)
    # Use the random forest regressor to predict the response for the test dataset
    test_prediction = rf_regress.predict(X_test)
    # Calculate the coefficient of determination (R squared)
    r_score = r2_score(y_test, test_prediction, sample_weight=None, multioutput='uniform_average')
    # Attach predictions to the test data frame, matched to the rows they were made for
    row_labels = getattr(X_test, 'index', None)
    if row_labels is not None and 'index' in testData.columns and row_labels.is_unique:
        # Look each testData row up by its original label; a positional join would place
        # the predictions on the first len(X_test) rows whenever X_test is a subset
        predictions = pd.Series(test_prediction, index=row_labels)
        testData = testData.assign(**{variable: testData['index'].map(predictions)})
    else:
        testData = pd.concat([testData, pd.DataFrame(test_prediction)], axis=1)
        testData = testData.rename(index=int, columns={0: variable})
    return [r_score, rf_regress, testData]

print('Function "regressionModel" loaded.')

Function "trainTestModel" loaded.


In [124]:
# Regress the 'cover' response, training and evaluating on the strata subsets
X_train, y_train = Subset_train[predictor_variables], Subset_train['cover']
X_test, y_test = Subset_test[predictor_variables], Subset_test['cover']
cover_results = regressionModel(X_train, y_train, X_test, y_test, testData, 'coverRegress_1')
r_score_1, rf_regress_1, testData = cover_results

In [125]:
# Report the R-squared score returned by regressionModel for the 'cover' response
print(r_score_1)

0.0626613786590815


In [126]:
# Regress the 'coverTen' response, training and evaluating on the strata subsets
X_train, y_train = Subset_train[predictor_variables], Subset_train['coverTen']
X_test, y_test = Subset_test[predictor_variables], Subset_test['coverTen']
cover_ten_results = regressionModel(X_train, y_train, X_test, y_test, testData, 'coverRegress_10')
r_score_10, rf_regress_10, testData = cover_ten_results

In [120]:
# Report the R-squared score returned by regressionModel for the 'coverTen' response
print(r_score_10)

0.13089753424091277


In [121]:
# Export data frame to excel
# The workbook is written directly into the root folder
export_path = os.path.join(root_folder, 'test.xls')
testData.to_excel(export_path)

In [107]:
# Print the full test data frame for inspection
print(testData)

     index  compoundTopographic  dateFreeze_2000s  dateThaw_2000s  elevation  \
0      875                 1143               266             143         79   
1      276                 1111               268             149         15   
2     1794                  751               275             115        613   
3     1065                 1296               280             140          2   
4     1415                 1400               274             126        128   
5     1161                  613               261             137       1143   
6     1694                  861               274             120        489   
7      764                  915               267             143        131   
8      753                  724               267             143        267   
9     2335                 1048               278             125         14   
10     162                 1747               269             150          2   
11    1475                  889         