In [1]:
# Import modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.linear_model import ARDRegression, BayesianRidge
from collections import Counter
print('All modules successfully imported.')

All modules successfully imported.


In [2]:
# Define user input variables
# root_folder holds the input tables; output_folder receives the generated files.
root_folder = 'E:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/testRegression'
output_folder = 'E:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/testRegression'
# One input table per cover class (1-10%, 11-25%, 26-50%, 51-100%).
# The paths are derived from root_folder so the data can be relocated by editing a single line.
file_10 = root_folder + '/betula_nana_10.csv'
file_25 = root_folder + '/betula_nana_25.csv'
file_50 = root_folder + '/betula_nana_50.csv'
file_100 = root_folder + '/betula_nana_100.csv'

In [3]:
# Define variable sets
# Predictors that are not tied to a month
_static_predictors = [
    'compoundTopographic', 'dateFreeze_2000s', 'dateThaw_2000s', 'elevation',
    'floodplainsDist', 'growingSeason_2000s', 'heatLoad', 'integratedMoisture',
    'precipAnnual_2000s', 'roughness', 'siteExposure', 'slope',
    'streamLargeDist', 'streamSmallDist', 'summerWarmth_2000s', 'surfaceArea',
    'surfaceRelief', 'aspect',
]
# Monthly predictors: every month/metric combination, grouped by month
_months = ['may', 'june', 'july', 'august', 'september']
_monthly_metrics = ['2_blue', 'evi2', 'nbr', 'ndmi', 'ndsi', 'ndvi', 'ndwi']
_monthly_predictors = [month + '_' + metric for month in _months for metric in _monthly_metrics]
predictor_metrics = _static_predictors + _monthly_predictors
# Single-column sets are kept as lists so they can be concatenated with the other sets
zero_variable = ['zero']
cover = ['cover']
coverLog = ['coverLog']
strata = ['strata']
retain_variables = ['project', 'siteID', 'siteCode', 'methodSurvey', 'methodCover']
coordinates = ['POINT_X', 'POINT_Y']
# Column order: identifiers, coordinates, predictors, then the response columns
all_variables = [
    name
    for group in (retain_variables, coordinates, predictor_metrics, zero_variable, strata, cover, coverLog)
    for name in group
]
# The scaled columns are exactly the predictors (same list object)
scale_variables = predictor_metrics
print('Variable sets loaded.')

Variable sets loaded.


In [4]:
# Import input data from csv file
def _read_cover_table(csv_file):
    """Read one cover-class csv file and cast its numeric columns to float.

    csv_file: path of the csv file to read.
    Returns the loaded data frame; the predictor, cover, and coverLog columns
    are stored as floating point values, all other columns are left as read.
    """
    table = pd.read_csv(csv_file)
    numeric_columns = predictor_metrics + cover + coverLog
    # Convert numerical data to floating point
    table[numeric_columns] = table[numeric_columns].astype(float)
    return table

data_10 = _read_cover_table(file_10)
data_25 = _read_cover_table(file_25)
data_50 = _read_cover_table(file_50)
data_100 = _read_cover_table(file_100)

In [5]:
# Determine sample size for each class (number of rows in each class table)
n_10, n_25, n_50, n_100 = (
    len(class_table[cover]) for class_table in (data_10, data_25, data_50, data_100)
)
# Determine minimum sample size for a 70% training ratio:
# 70% of the smallest class, truncated to a whole number of rows
n_min = int(min(class_size * 0.7 for class_size in (n_10, n_25, n_50, n_100)))
print(n_min)

36


In [6]:
# Determine sampling ratio for each class: the fraction of the class that yields
# about n_min training rows, rounded to two decimals
p_10, p_25, p_50, p_100 = (
    round(n_min / class_size, 2) for class_size in (n_10, n_25, n_50, n_100)
)
print(p_10, p_25, p_50, p_100)

0.06 0.14 0.29 0.69


In [7]:
# Shared split routine for the four cover classes
def _split_cover_class(X, y, train_ratio):
    """Randomly split one cover class so that train_ratio of its rows go to training.

    Returns the four-part result of train_test_split: training rows, test rows,
    training response, test response. No seed is set, so every call draws a
    different split.
    """
    return train_test_split(
        X,
        y,
        test_size=1 - train_ratio,
        train_size=train_ratio,
        random_state=None,
        shuffle=True,
        stratify=None,
    )

# Create train and test splits for the 1-10% data
X, y = data_10[all_variables], data_10[coverLog[0]]
all_train_10, all_test_10, y_train_10, y_test_10 = _split_cover_class(X, y, p_10)
# Create train and test splits for the 11-25% data
X, y = data_25[all_variables], data_25[coverLog[0]]
all_train_25, all_test_25, y_train_25, y_test_25 = _split_cover_class(X, y, p_25)
# Create train and test splits for the 26-50% data
X, y = data_50[all_variables], data_50[coverLog[0]]
all_train_50, all_test_50, y_train_50, y_test_50 = _split_cover_class(X, y, p_50)
# Create train and test splits for the 51-100% data
X, y = data_100[all_variables], data_100[coverLog[0]]
all_train_100, all_test_100, y_train_100, y_test_100 = _split_cover_class(X, y, p_100)

In [8]:
# Build training sample
# DataFrame.append was removed in pandas 2.0; one pd.concat stacks the four class
# samples row-wise with a fresh 0..n-1 index, exactly as the chained appends did.
all_train_raw = pd.concat(
    [all_train_10, all_train_25, all_train_50, all_train_100],
    ignore_index=True,
    sort=True,
)

# Build test sample
all_test_raw = pd.concat(
    [all_test_10, all_test_25, all_test_50, all_test_100],
    ignore_index=True,
    sort=True,
)

In [9]:
# Output raw train and test data
# The target paths are derived from output_folder so the outputs follow the user-input cell.
train_raw = output_folder + '/train_raw.csv'
test_raw = output_folder + '/test_raw.csv'
all_train_raw.to_csv(train_raw, header=True, index=False, sep=',', encoding='utf-8')
all_test_raw.to_csv(test_raw, header=True, index=False, sep=',', encoding='utf-8')

In [10]:
# Create a standard scaler that centers each predictor on mean = 0 and scales it to unit variance.
# Standardizing shifts and rescales the values only; it does not make a variable Gaussian.
# The scaler is fit on the training sample alone; the fitted scaler is what later cells apply to the test sample.
scaler = StandardScaler()
scaler.fit(all_train_raw[scale_variables])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [11]:
# Scale the training data
train_scaled = scaler.transform(all_train_raw[scale_variables])
# Replace the raw predictor columns with the scaled ones. The scaled columns are
# appended on the right, so the column order differs from all_train_raw.
all_train_scaled = pd.concat(
    [
        all_train_raw.drop(columns=scale_variables),
        pd.DataFrame(data=train_scaled, columns=scale_variables),
    ],
    axis=1,
)

In [12]:
# Scale the test data with the scaler that was fit on the training sample
test_scaled = scaler.transform(all_test_raw[scale_variables])
# Replace the raw predictor columns with the scaled ones. The scaled columns are
# appended on the right, so the column order differs from all_test_raw.
all_test_scaled = pd.concat(
    [
        all_test_raw.drop(columns=scale_variables),
        pd.DataFrame(data=test_scaled, columns=scale_variables),
    ],
    axis=1,
)

In [13]:
# Output scaled train and test data
# The target paths are derived from output_folder so the outputs follow the user-input cell.
# From here on train_scaled and test_scaled hold file paths, no longer the scaled arrays.
train_scaled = output_folder + '/train_scaled.csv'
test_scaled = output_folder + '/test_scaled.csv'
all_train_scaled.to_csv(train_scaled, header=True, index=False, sep=',', encoding='utf-8')
all_test_scaled.to_csv(test_scaled, header=True, index=False, sep=',', encoding='utf-8')

In [None]:
# Define a function to calculate performance metrics based on a specified threshold value
def thresholdMetrics(inIndex, inProbability, inValue, y_test):
    """Binarize an index at a threshold and score the result against y_test.

    inIndex: array of index values; entries strictly greater than inValue become 1, all others 0.
    inProbability: scores passed unchanged to roc_auc_score (independent of the threshold).
    inValue: threshold applied to inIndex.
    y_test: observed classes; assumes the classes are 0 and 1 - TODO confirm.

    Returns the tuple (outThresholded, outSensitivity, outSpecificity, outAUC, outAccuracy).
    """
    # 1.0 where the index exceeds the threshold, 0.0 everywhere else
    outThresholded = np.where(inIndex > inValue, 1.0, 0.0)
    # Rows of the confusion matrix follow y_test, columns follow the thresholded prediction
    counts = confusion_matrix(y_test, outThresholded)
    true_negative, false_positive = counts[0, 0], counts[0, 1]
    false_negative, true_positive = counts[1, 0], counts[1, 1]
    observed_positive = true_positive + false_negative
    observed_negative = true_negative + false_positive
    # Share of observed positives (sensitivity) and observed negatives (specificity) predicted correctly
    outSensitivity = true_positive / observed_positive
    outSpecificity = true_negative / observed_negative
    outAUC = roc_auc_score(y_test, inProbability)
    outAccuracy = (true_negative + true_positive) / (observed_negative + observed_positive)
    return (outThresholded, outSensitivity, outSpecificity, outAUC, outAccuracy)

print('Function "thresholdMetrics" loaded.')

In [None]:
# Define a function to fit a classifier using training data and determine performance using the test data
def trainTestClassifier(X_train, y_train, X_test, y_test, testData):
    """Fit a random forest classifier, choose a decision threshold, and score it on the test data.

    The second column of predict_proba is converted to an integer index between
    0 and 1000. Every threshold from 1 to 1000 is evaluated with thresholdMetrics,
    and the threshold with the smallest non-negative (sensitivity - specificity)
    is selected; ties go to the lowest threshold.

    X_train, y_train: predictors and classes used to fit the classifier.
    X_test, y_test: predictors and observed classes used to choose the threshold and report the metrics.
    testData: data frame that receives the thresholded predictions as column 'classify';
        assumes its index is 0..n-1 in the same row order as X_test - TODO confirm against callers.

    Returns the list [threshold, sensitivity_test, specificity_test, auc_test, accuracy_test, testData].
    Raises ValueError if no threshold gives sensitivity >= specificity.

    NOTE(review): the threshold is chosen on the same test data the metrics are reported on.
    """
    # Fit a classifier to the training dataset
    classifier = RandomForestClassifier(n_estimators=50, criterion='entropy', max_features='log2', bootstrap=True, oob_score=False, n_jobs=16, class_weight='balanced')
    classifier.fit(X_train, y_train)
    # Use the random forest classifier to predict probabilities for the test dataset
    test_prediction = classifier.predict_proba(X_test)
    # Convert the positive class probabilities (second column) to a list of probabilities
    test_probability = [p[1] for p in test_prediction]
    # Convert the positive class probabilities to an integer index between 0 and 1000 (rounded half up)
    test_index = np.asarray([int((p * 1000) + 0.5) for p in test_probability])
    # Evaluate every candidate threshold from 1 to 1000 and record its sensitivity and specificity
    candidate_thresholds = range(1, 1001)
    sensitivity_list = []
    specificity_list = []
    for candidate in candidate_thresholds:
        test_thresholded, sensitivity_test, specificity_test, auc_test, accuracy_test = thresholdMetrics(test_index, test_probability, candidate, y_test)
        sensitivity_list.append(sensitivity_test)
        specificity_list.append(specificity_test)
    # Signed difference between sensitivity and specificity for each candidate threshold
    difference_list = [a - b for a, b in zip(sensitivity_list, specificity_list)]
    # Pair each difference with the threshold that produced it. The previous version
    # paired it with the 0-based list position, which is one less than the threshold
    # actually evaluated, so the reported threshold and metrics were off by one.
    balanced_candidates = [(value, candidate) for candidate, value in zip(candidate_thresholds, difference_list) if value >= 0]
    if not balanced_candidates:
        raise ValueError('No threshold between 1 and 1000 gives a sensitivity greater than or equal to the specificity.')
    _, threshold = min(balanced_candidates)
    # Convert the prediction index to a binary 0 or 1 output using the optimal threshold
    test_thresholded, sensitivity_test, specificity_test, auc_test, accuracy_test = thresholdMetrics(test_index, test_probability, threshold, y_test)
    # Concatenate thresholded predictions to test data frame
    testData = pd.concat([testData, pd.DataFrame(test_thresholded)], axis=1)
    testData = testData.rename(index=int, columns={0: 'classify'})
    return [threshold, sensitivity_test, specificity_test, auc_test, accuracy_test, testData]

print('Function "trainTestClassifier" loaded.')

In [None]:
# Define a function to fit a regressor using training data and determine performance using the test data
def trainTestRegressor(X_train, y_train, X_test, y_test, testData):
    """Fit a random forest regressor on the training data and score it on the test data.

    X_train, y_train: predictors and response used to fit the regressor.
    X_test, y_test: predictors and observed response used to compute the scores.
    testData: data frame that receives the predictions as column 'regress';
        assumes its index is 0..n-1 in the same row order as X_test - TODO confirm against callers.

    Returns the list [r_score, mae, mse, rmse, testData], where r_score is the
    coefficient of determination and rmse is the square root of mse.
    """
    # Fit a regressor to the training dataset.
    # The split criterion is left at the library default (mean squared error): the
    # explicit name 'mse' is rejected by scikit-learn 1.2 and later, while the
    # default selects the same criterion on every version.
    regressor = RandomForestRegressor(n_estimators=10, bootstrap=True, oob_score=False, n_jobs=16)
    regressor.fit(X_train, y_train)
    # Use the regressor to predict values for the test dataset
    test_prediction = regressor.predict(X_test)
    # Calculate the r^2
    r_score = r2_score(y_test, test_prediction, sample_weight=None, multioutput='uniform_average')
    # Calculate error
    mae = mean_absolute_error(y_test, test_prediction)
    mse = mean_squared_error(y_test, test_prediction)
    rmse = np.sqrt(mse)
    # Concatenate predicted values to test data frame
    testData = pd.concat([testData, pd.DataFrame(test_prediction)], axis=1)
    testData = testData.rename(index=int, columns={0: 'regress'})
    return [r_score, mae, mse, rmse, testData]

print('Function "trainTestRegressor" loaded.')

In [None]:
# Define a function to fit a support vector regressor using training data and determine performance using the test data
def trainTestSVRegressor(X_train, y_train, X_test, y_test, testData, variable):
    """Fit an RBF-kernel support vector regressor on the training data and score it on the test data.

    X_train, y_train: predictors and response used to fit the regressor.
    X_test, y_test: predictors and observed response used to compute the scores.
    testData: data frame that receives the predictions as a new column;
        assumes its index is 0..n-1 in the same row order as X_test - TODO confirm against callers.
    variable: name given to the prediction column added to testData.

    Returns the list [r_score, mae, mse, rmse, sv_regress, testData], where
    sv_regress is the fitted regressor and rmse is the square root of mse.
    """
    # Fit a support vector regressor to the training dataset
    sv_regress = SVR(kernel='rbf')
    sv_regress.fit(X_train, y_train)
    # Use the support vector regressor to predict values for the test dataset
    test_prediction = sv_regress.predict(X_test)
    # Calculate the r^2
    r_score = r2_score(y_test, test_prediction, sample_weight=None, multioutput='uniform_average')
    # Calculate error
    mae = mean_absolute_error(y_test, test_prediction)
    mse = mean_squared_error(y_test, test_prediction)
    rmse = np.sqrt(mse)
    # Concatenate predicted values to test data frame
    testData = pd.concat([testData, pd.DataFrame(test_prediction)], axis=1)
    testData = testData.rename(index=int, columns={0: variable})
    return [r_score, mae, mse, rmse, sv_regress, testData]

print('Function "trainTestSVRegressor" loaded.')

In [None]:
# Import input data from csv file
# NOTE(review): input_data_name, ten_variable, and twentyfive_variable are not defined
# in any cell shown in this notebook; define them in the user-input and variable-set
# cells before running this cell.
input_file = os.path.join(root_folder, "speciesData", input_data_name)
inData = pd.read_csv(input_file)
# Convert numerical data to floating point.
# The values are read from inData itself; the previous right-hand side used input_df,
# a name that is never assigned.
numeric_columns = predictor_metrics + zero_variable + ten_variable + twentyfive_variable + cover + strata
inData[numeric_columns] = inData[numeric_columns].astype(float)

In [None]:
# Select the inputs for the train/test split.
# X keeps every column listed in all_variables (identifiers, coordinates, predictors,
# and response columns), not only the predictors, so the split tables stay complete.
X = inData[all_variables]
# y is the column named by zero_variable, the response passed to the classifier
y = inData[zero_variable[0]]
# The strata column is used to stratify the split in a later cell
stratify = inData[strata[0]]

In [None]:
# Empty accumulators for the classifier results; a later cell appends one value per run
threshold_list, sensitivity_list, specificity_list, auc_list, accuracy_list = (
    [] for _ in range(5)
)
# Empty accumulators for the regressor results
r2_list, mae_list, mse_list, rmse_list = ([] for _ in range(4))

In [None]:
# Empty table with one column per retained variable plus the original row index and the two prediction columns.
# The column names are passed as one flat list; wrapping that list in a second list makes
# pandas build MultiIndex columns instead of plain column labels.
test_output = pd.DataFrame(columns=['index'] + all_variables + ['classify', 'regress'])

In [None]:
# Split 70% / 30% into train and test rows, stratified by the strata column; no seed is set
all_train, all_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    train_size=0.7,
    random_state=None,
    shuffle=True,
    stratify=stratify,
)
# Renumber the rows 0..n-1 and keep the original row labels in a new 'index' column;
# the later column-wise concatenations rely on the fresh numbering
all_train, all_test = all_train.reset_index(), all_test.reset_index()

In [None]:
# Fit a new standard scaler on the predictor columns of the training rows only.
# This rebinds the name scaler, replacing the scaler fit earlier in the notebook.
scaler = StandardScaler()
scaler.fit(all_train[scale_variables])

In [None]:
# Scale the training predictors with the fitted scaler
train_scaled = scaler.transform(all_train[scale_variables])
# Replace the raw predictor columns with the scaled ones (appended on the right).
# Re-running this cell would scale the already scaled values again.
all_train = pd.concat(
    [
        all_train.drop(columns=scale_variables),
        pd.DataFrame(data=train_scaled, columns=scale_variables),
    ],
    axis=1,
)

In [None]:
# Scale the test predictors with the scaler that was fit on the training rows
test_scaled = scaler.transform(all_test[scale_variables])
# Replace the raw predictor columns with the scaled ones (appended on the right).
# Re-running this cell would scale the already scaled values again.
all_test = pd.concat(
    [
        all_test.drop(columns=scale_variables),
        pd.DataFrame(data=test_scaled, columns=scale_variables),
    ],
    axis=1,
)

In [None]:
# Classification step: predictors are the scaled predictor columns, the response is the zero_variable column
X_train = all_train[predictor_metrics]
y_train = all_train[zero_variable[0]]
X_test = all_test[predictor_metrics]
y_test = all_test[zero_variable[0]]
# all_test is replaced by the copy that carries the thresholded predictions in column 'classify'.
# NOTE(review): the result name auc rebinds the auc function imported from sklearn.metrics
# at the top of the notebook; rename it together with the cell that appends it to auc_list.
threshold, sensitivity, specificity, auc, accuracy, all_test = trainTestClassifier(X_train, y_train, X_test, y_test, all_test)

In [None]:
# Implement naive random oversampling to balance dataset
oversampler = RandomOverSampler(random_state=0)
# imbalanced-learn renamed fit_sample to fit_resample and later removed the old name;
# use the current method when it exists and fall back to the old one otherwise
resample_method = getattr(oversampler, 'fit_resample', None) or oversampler.fit_sample
# Whole training rows are resampled so that every stratum occurs equally often
resample_train, resample_strata = resample_method(all_train, all_train[strata[0]])
# Convert resampled data back to data frame
all_column_names = np.array(all_train.columns.values)
resample_train = pd.DataFrame(data=resample_train, columns=all_column_names)
# Print resampled strata summary
print(sorted(Counter(resample_strata).items()))

In [None]:
X_train = resample_train[predictor_metrics]
y_train = resample_train[coverLog[0]]
X_test = all_test[predictor_metrics]
y_test = all_test[coverLog[0]]
r_score, mae, mse, rmse, all_test = trainTestRegressor(X_train, y_train, X_test, y_test, all_test)

In [None]:
# Record the results of this run: append each metric to its accumulator list
for metric_list, metric_value in (
    (threshold_list, threshold),
    (sensitivity_list, sensitivity),
    (specificity_list, specificity),
    (auc_list, auc),
    (accuracy_list, accuracy),
    (r2_list, r_score),
    (mae_list, mae),
    (mse_list, mse),
    (rmse_list, rmse),
):
    metric_list.append(metric_value)

In [None]:
# Map the scaled test predictors back to their original scale with the fitted scaler.
# inverse_scaled was not assigned by any active cell before this one, so it is computed here.
inverse_scaled = scaler.inverse_transform(all_test[scale_variables])
# Write the restored predictor values to a csv file for inspection
test = pd.DataFrame(data=inverse_scaled, columns=scale_variables)
test.to_csv('K:/VegetationEcology/test.csv', header=True, index=False, sep=',', encoding='utf-8')

In [None]:
# Display inverse_scaled as the cell output.
# NOTE(review): no active cell above assigns inverse_scaled; presumably it is the result of
# scaler.inverse_transform on the scaled test predictors - confirm before running.
inverse_scaled

In [None]:
#all_test = all_test.drop(coverLog[0], axis=1)
#all_test = all_test.rename(index=str, columns={'regress': 'coverLog'})
# Map the scaled test predictors back to their original scale with the fitted scaler
inverse_scaled = scaler.inverse_transform(all_test[scale_variables])
# Drop the scaled predictor columns before attaching the restored ones; otherwise every
# predictor name would occur twice in all_test
all_test = pd.concat(
    [
        all_test.drop(columns=scale_variables),
        pd.DataFrame(data=inverse_scaled, columns=scale_variables),
    ],
    axis=1,
)