In [1]:
# Import modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.linear_model import ARDRegression, BayesianRidge
from collections import Counter
print('All modules successfully imported.')

All modules successfully imported.


In [2]:
# Column-name sets used to select fields from the train and test tables
predictor_metrics = [
    # Metrics without a month prefix
    'compoundTopographic', 'dateFreeze_2000s', 'dateThaw_2000s', 'elevation',
    'floodplainsDist', 'growingSeason_2000s', 'heatLoad', 'integratedMoisture',
    'precipAnnual_2000s', 'roughness', 'siteExposure', 'slope',
    'streamLargeDist', 'streamSmallDist', 'summerWarmth_2000s', 'surfaceArea',
    'surfaceRelief', 'aspect',
    # Metrics prefixed by month, seven per month from may through september
    'may_2_blue', 'may_evi2', 'may_nbr', 'may_ndmi',
    'may_ndsi', 'may_ndvi', 'may_ndwi',
    'june_2_blue', 'june_evi2', 'june_nbr', 'june_ndmi',
    'june_ndsi', 'june_ndvi', 'june_ndwi',
    'july_2_blue', 'july_evi2', 'july_nbr', 'july_ndmi',
    'july_ndsi', 'july_ndvi', 'july_ndwi',
    'august_2_blue', 'august_evi2', 'august_nbr', 'august_ndmi',
    'august_ndsi', 'august_ndvi', 'august_ndwi',
    'september_2_blue', 'september_evi2', 'september_nbr', 'september_ndmi',
    'september_ndsi', 'september_ndvi', 'september_ndwi',
]
# Single-column sets are kept as lists so they can be concatenated with the other sets
zero_variable, cover, coverLog, strata = ['zero'], ['cover'], ['coverLog'], ['strata']
retain_variables = [
    'project',
    'siteID',
    'siteCode',
    'methodSurvey',
    'methodCover',
]
coordinates = ['POINT_X', 'POINT_Y']
# Full column list in the order: retained fields, coordinates, predictors, zero, strata, responses
all_variables = (
    retain_variables
    + coordinates
    + predictor_metrics
    + zero_variable
    + strata
    + cover
    + coverLog
)
# Alias of the predictor list (same list object, not a copy)
scale_variables = predictor_metrics
print('Variable sets loaded.')

Variable sets loaded.


In [3]:
# Locations of the train and test csv tables (absolute paths; edit to match the local data folder)
train_file, test_file = (
    'E:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/testRegression/train_scaled.csv',
    'E:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/testRegression/test_scaled.csv',
)

In [4]:
# Read the train and test tables into data frames
all_train = pd.read_csv(train_file)
all_test = pd.read_csv(test_file)

# Columns that must be floats: predictors, both responses, and the point coordinates
float_columns = predictor_metrics + cover + coverLog + coordinates
for frame in (all_train, all_test):
    frame[float_columns] = frame[float_columns].astype(float)

# Columns that must be integers: the strata label and the zero indicator
integer_columns = strata + zero_variable
for frame in (all_train, all_test):
    frame[integer_columns] = frame[integer_columns].astype(int)

In [5]:
# Standardize the response variable with a scaler fit on the training data only
# (StandardScaler rescales the values; it does not change the shape of their distribution)
y_scaler = StandardScaler()
y_scaler.fit(all_train[coverLog])
# Apply the training-data scaling to both the train and test responses
train_scaled = y_scaler.transform(all_train[coverLog])
test_scaled = y_scaler.transform(all_test[coverLog])
# Replace the original response column with the scaled values (the column moves to the end of the frame)
all_train = pd.concat(
    [all_train.drop(columns=coverLog), pd.DataFrame(data=train_scaled, columns=coverLog)],
    axis=1,
)
all_test = pd.concat(
    [all_test.drop(columns=coverLog), pd.DataFrame(data=test_scaled, columns=coverLog)],
    axis=1,
)

In [6]:
# Define a function to create stratified training partitions
def stratifyTrainingData(inData):
    """Split a table into strata 1, 2, and 3 and compute a sampling ratio for each.

    The strata column name is read from the module-level ``strata`` list and the
    row counts are taken from the module-level ``cover`` column selection.

    Parameters:
        inData: data frame containing the strata column and the cover column.

    Returns:
        A 6-tuple ``(train_1, train_2, train_3, p_1, p_2, p_3)`` holding the rows
        of each stratum followed by the ratio of the smallest stratum size to each
        stratum size, rounded to two decimals.

    Raises:
        ZeroDivisionError: if any stratum has no rows.
    """
    strata_column = strata[0]
    # One partition per stratum label, in label order
    partitions = [inData[inData[strata_column] == level] for level in (1, 2, 3)]
    # Row count of each partition
    sizes = [len(partition[cover]) for partition in partitions]
    # The smallest stratum sets the target sample size for every stratum
    n_min = int(min(sizes))
    print('Minimum sample size is ' + str(n_min))
    # Fraction of each stratum needed to draw roughly n_min rows from it
    ratios = [round(n_min/size, 2) for size in sizes]
    return tuple(partitions) + tuple(ratios)

In [7]:
# Partition the training data by stratum and get the sampling ratio for each partition
(train_10, train_25, train_100,
 p_10, p_25, p_100) = stratifyTrainingData(inData=all_train)

Minimum sample size is 80


In [8]:
# Define a function to create a training sample
def _sampleStratum(stratum, proportion):
    """Return a random subset of one stratum, or the whole stratum if proportion >= 1."""
    if proportion >= 1:
        # Nothing to subsample; the stratum is used as-is with all of its columns
        return stratum
    # Only the training half of the split is needed; the remainder is discarded
    sample, _ = train_test_split(stratum[all_variables], test_size = 1-proportion, train_size = proportion, random_state = None, shuffle = True, stratify = None)
    return sample

def createTrainingSample(strata_1, strata_2, strata_3, p_1, p_2, p_3):
    """Draw a random sample from each of three strata and stack them into one frame.

    Parameters:
        strata_1, strata_2, strata_3: data frames holding the rows of each stratum.
        p_1, p_2, p_3: fraction of the matching stratum to draw. A stratum with a
            fraction below 1 is randomly subsampled (unseeded, so every call gives
            a different sample) and restricted to the module-level ``all_variables``
            columns; a fraction of 1 or more keeps the whole stratum unchanged.

    Returns:
        A data frame with the three samples stacked in stratum order and a new
        0..n-1 index. Columns are sorted when the samples do not share the same
        columns.
    """
    samples = [
        _sampleStratum(strata_1, p_1),
        _sampleStratum(strata_2, p_2),
        _sampleStratum(strata_3, p_3),
    ]
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0+
    return pd.concat(samples, ignore_index = True, sort = True)

In [9]:
# Define a function to create 10 trees
def trainRegressor(train_sample, predictors, response):
    """Fit a 10-tree random forest regressor to one training sample.

    Parameters:
        train_sample: data frame containing the predictor and response columns.
        predictors: list of predictor column names.
        response: list whose first element is the response column name.

    Returns:
        The fitted RandomForestRegressor.
    """
    # Define the X and y values
    X_train = train_sample[predictors]
    y_train = train_sample[response[0]]
    # The split criterion is left at its default (mean squared error): the explicit
    # name 'mse' is rejected by scikit-learn 1.2+, while the default works on every version
    regressor = RandomForestRegressor(n_estimators=10, bootstrap=True, oob_score=False, n_jobs=1)
    regressor.fit(X_train, y_train)
    return regressor

In [10]:
# Define a function to combine random forest estimators
def combineRegressors(meta_regressor, regressor):
    """Add the fitted trees of ``regressor`` to ``meta_regressor``.

    ``meta_regressor`` is modified in place: its ``estimators_`` collection gains
    every estimator of ``regressor`` and its ``n_estimators`` is updated to the new
    total. ``regressor`` itself is left unchanged.

    Returns:
        The same ``meta_regressor`` object that was passed in.
    """
    merged_trees = meta_regressor.estimators_
    merged_trees += regressor.estimators_
    meta_regressor.estimators_ = merged_trees
    # Keep the declared tree count in step with the merged collection
    meta_regressor.n_estimators = len(merged_trees)
    return meta_regressor

In [11]:
# Define a function to train a meta regressor
def trainMetaRegressor(strata_1, strata_2, strata_3, p_1, p_2, p_3, predictors, response, test_data, iterations=100):
    """Train a meta regressor from repeated stratified samples and predict the test data.

    Each iteration draws a fresh training sample with ``createTrainingSample``,
    fits a regressor with ``trainRegressor``, and merges its trees into the meta
    regressor with ``combineRegressors``. The first fitted regressor becomes the
    meta regressor itself.

    Parameters:
        strata_1, strata_2, strata_3: data frames holding the rows of each stratum.
        p_1, p_2, p_3: sampling fraction for the matching stratum.
        predictors: list of predictor column names.
        response: list whose first element is the response column name.
        test_data: data frame containing the predictor columns.
        iterations: number of sample-and-train rounds (default 100).

    Returns:
        A tuple ``(meta_regressor, test_data)`` where ``test_data`` is a copy of the
        input with the predictions in a ``regress`` column (an existing column of
        that name is replaced). The input frame is not modified.

    Raises:
        ValueError: if ``iterations`` is less than 1.
    """
    if iterations < 1:
        raise ValueError('iterations must be at least 1')

    meta_regressor = None
    for i in range(1, iterations + 1):
        # Create a training sample
        train_sample = createTrainingSample(strata_1, strata_2, strata_3, p_1, p_2, p_3)
        # Train a regressor
        regressor = trainRegressor(train_sample, predictors, response)
        # The first regressor seeds the meta regressor; later ones contribute their trees
        if meta_regressor is None:
            meta_regressor = regressor
        else:
            meta_regressor = combineRegressors(meta_regressor, regressor)
        print('Model iteration ' + str(i) + ' out of ' + str(iterations) + ' trained and tested...')

    # Use the meta regressor to predict values for the test dataset
    test_prediction = meta_regressor.predict(test_data[predictors])
    # Attach predictions by row position so a non-default index on test_data cannot
    # misalign them (concatenating a freshly indexed frame would produce NaN rows)
    test_data = test_data.assign(regress=test_prediction)
    return meta_regressor, test_data

In [12]:
# Train the meta regressor on the scaled log cover response and predict the test data
meta_regressor_1, test_data_1 = trainMetaRegressor(
    strata_1=train_10,
    strata_2=train_25,
    strata_3=train_100,
    p_1=p_10,
    p_2=p_25,
    p_3=p_100,
    predictors=predictor_metrics,
    response=coverLog,
    test_data=all_test,
)

Model iteration 1 out of 100 trained and tested...
Model iteration 2 out of 100 trained and tested...
Model iteration 3 out of 100 trained and tested...
Model iteration 4 out of 100 trained and tested...
Model iteration 5 out of 100 trained and tested...
Model iteration 6 out of 100 trained and tested...
Model iteration 7 out of 100 trained and tested...
Model iteration 8 out of 100 trained and tested...
Model iteration 9 out of 100 trained and tested...
Model iteration 10 out of 100 trained and tested...
Model iteration 11 out of 100 trained and tested...
Model iteration 12 out of 100 trained and tested...
Model iteration 13 out of 100 trained and tested...
Model iteration 14 out of 100 trained and tested...
Model iteration 15 out of 100 trained and tested...
Model iteration 16 out of 100 trained and tested...
Model iteration 17 out of 100 trained and tested...
Model iteration 18 out of 100 trained and tested...
Model iteration 19 out of 100 trained and tested...
Model iteration 20 ou

In [13]:
# Name of the prediction column and the set of scaled columns to replace
response = ['regress']
remove_variables = coverLog + response
# Undo the response standardization for the observed values and for the predictions
cover_unscaled = y_scaler.inverse_transform(test_data_1[coverLog])
predict_unscaled = y_scaler.inverse_transform(test_data_1[response])
# Swap the scaled columns for their unscaled versions (both end up as the last two columns)
test_data_1 = pd.concat(
    [
        test_data_1.drop(columns=remove_variables),
        pd.DataFrame(data=cover_unscaled, columns=coverLog),
        pd.DataFrame(data=predict_unscaled, columns=response),
    ],
    axis=1,
)

In [14]:
# Write the test data with unscaled observations and predictions to a comma-delimited csv file
output_file = 'E:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/testRegression/predicted.csv'
test_data_1.to_csv(
    output_file,
    sep=',',
    header=True,
    index=False,
    encoding='utf-8',
)