In [1]:
# Import modules
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.linear_model import ARDRegression, BayesianRidge
from collections import Counter
print('All modules successfully imported.')

All modules successfully imported.


In [8]:
# Define variable sets
# Predictors that carry no month prefix
static_metrics = [
    'compoundTopographic', 'dateFreeze_2000s', 'dateThaw_2000s', 'elevation',
    'floodplainsDist', 'growingSeason_2000s', 'heatLoad', 'integratedMoisture',
    'precipAnnual_2000s', 'roughness', 'siteExposure', 'slope',
    'streamLargeDist', 'streamSmallDist', 'summerWarmth_2000s', 'surfaceArea',
    'surfaceRelief', 'aspect',
]
# Month-prefixed predictors: every month/suffix pair, grouped by month
monthly_prefixes = ['may', 'june', 'july', 'august', 'september']
monthly_suffixes = ['2_blue', 'evi2', 'nbr', 'ndmi', 'ndsi', 'ndvi', 'ndwi']
monthly_metrics = [
    prefix + '_' + suffix
    for prefix in monthly_prefixes
    for suffix in monthly_suffixes
]
predictor_metrics = static_metrics + monthly_metrics

# Single-column sets are kept as lists so they can be concatenated with the others
zero_variable = ['zero']
cover = ['cover']
coverLog = ['coverLog']
strata = ['strata']
retain_variables = ['project', 'siteID', 'siteCode', 'methodSurvey', 'methodCover']
coordinates = ['POINT_X', 'POINT_Y']

# Full column selection, in the order the columns are pulled from the input table
all_variables = sum(
    [retain_variables, coordinates, predictor_metrics, zero_variable, strata, cover, coverLog],
    [],
)
# Same list object as predictor_metrics (an alias, not a copy)
scale_variables = predictor_metrics
print('Variable sets loaded.')

Variable sets loaded.


In [69]:
# Location of the input table
input_file = 'E:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/testRegression/betula_nana_mod.csv'
# Read the table into a data frame
input_data = pd.read_csv(input_file)
# Column groups that share a target dtype
float_columns = predictor_metrics + cover + coordinates
integer_columns = strata + zero_variable + coverLog
# Cast predictors, cover and coordinates to float
input_data[float_columns] = input_data[float_columns].astype(float)
# Cast strata, zero flag and coverLog to int
input_data[integer_columns] = input_data[integer_columns].astype(int)

In [70]:
# Create train and test splits (70% train / 30% test, shuffled and stratified
# on the coverLog column)
X = input_data[all_variables]
y = input_data[coverLog[0]]
stratify = input_data[coverLog[0]]
# NOTE: random_state is None, so every run of this cell draws a different split
split_parts = train_test_split(
    X,
    y,
    train_size = 0.7,
    test_size = 0.3,
    shuffle = True,
    stratify = stratify,
    random_state = None,
)
all_train, all_test, y_train, y_test = split_parts
# reset_index() without drop=True keeps the previous row labels as an 'index' column
all_train, all_test = all_train.reset_index(), all_test.reset_index()

In [71]:
# Standardize the predictors: StandardScaler centers each column on mean 0 and
# rescales it to unit variance. The scaler is fit on the training split only and
# the same fitted transform is applied to both splits. (Standardizing does not
# change the shape of a distribution, i.e. it does not make the data Gaussian.)
scaler = StandardScaler()
scaler.fit(all_train[predictor_metrics])
# Scale the train data
train_scaled = scaler.transform(all_train[predictor_metrics])
# DataFrame.drop returns a new frame, so its result must be used. If the unscaled
# columns are left in place, the concat produces duplicate column names and
# all_train[predictor_metrics] returns both the unscaled and scaled copy of
# every predictor.
all_train = pd.concat(
    [
        all_train.drop(columns=predictor_metrics),
        # Reuse the frame's own index so rows are paired one-to-one on concat
        pd.DataFrame(data=train_scaled, columns=predictor_metrics, index=all_train.index),
    ],
    axis=1,
)
# Scale the test data with the scaler fitted on the train data
test_scaled = scaler.transform(all_test[predictor_metrics])
all_test = pd.concat(
    [
        all_test.drop(columns=predictor_metrics),
        pd.DataFrame(data=test_scaled, columns=predictor_metrics, index=all_test.index),
    ],
    axis=1,
)

In [72]:
# Define X (predictor columns) and y (the coverLog column) for each split
X_train, y_train = all_train[predictor_metrics], all_train[coverLog[0]]
X_test, y_test = all_test[predictor_metrics], all_test[coverLog[0]]

In [73]:
# Fit a random forest model
forest_settings = dict(
    n_estimators = 1000,
    criterion = 'entropy',
    max_features = 'log2',
    bootstrap = True,
    oob_score = False,
    n_jobs = 1,
    class_weight = 'balanced',
)
classifier = RandomForestClassifier(**forest_settings)
classifier.fit(X_train, y_train)

# Use the classifier to predict class probabilities for the test rows
test_prediction = classifier.predict_proba(X_test)
# Append the probability columns to the test data frame
probability_frame = pd.DataFrame(test_prediction)
all_test = pd.concat([all_test, probability_frame], axis=1)
# Only probability column 0 is renamed to 'predict'; any further probability
# columns keep their integer labels
all_test = all_test.rename(index=int, columns={0: 'predict'})

In [74]:
# Write the test rows, including the appended prediction columns, to CSV
output_file = 'E:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/testRegression/predicted.csv'
export_options = dict(sep=',', header=True, index=False, encoding='utf-8')
all_test.to_csv(output_file, **export_options)