In [1]:
# Import modules
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
print('All modules successfully imported.')

All modules successfully imported.


In [2]:
# Define user input variables
# root_folder is the single place to edit when the project data moves; input_file is derived
# from it so the folder and the file path cannot drift apart.
root_folder = 'E:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/testRegression'
input_file = os.path.join(root_folder, 'betula_nana_mod.csv')

In [3]:
# Define variable sets
# Single-column groups (kept as lists so they can be concatenated with the other groups)
cover = ['cover']
coverLog = ['coverLog']
strata = ['strata']
zero_variable = ['zero']

# Identifier and location columns that are carried through unchanged
retain_variables = ['project', 'siteID', 'siteCode', 'methodSurvey', 'methodCover']
coordinates = ['POINT_X', 'POINT_Y']

# Predictors whose names carry no month prefix
_static_metrics = [
    'compoundTopographic', 'dateFreeze_2000s', 'dateThaw_2000s', 'elevation',
    'floodplainsDist', 'growingSeason_2000s', 'heatLoad', 'integratedMoisture',
    'precipAnnual_2000s', 'roughness', 'siteExposure', 'slope', 'streamLargeDist',
    'streamSmallDist', 'summerWarmth_2000s', 'surfaceArea', 'surfaceRelief', 'aspect',
]

# Monthly predictors are named '<month>_<metric>'; they are generated month by month,
# with the metrics in the same order inside every month.
_months = ['may', 'june', 'july', 'august', 'september']
_monthly_metrics = ['2_blue', 'evi2', 'nbr', 'ndmi', 'ndsi', 'ndvi', 'ndwi']
predictor_metrics = _static_metrics + [
    month + '_' + metric for month in _months for metric in _monthly_metrics
]

# Full column selection, in output order
all_variables = (
    retain_variables
    + coordinates
    + predictor_metrics
    + zero_variable
    + strata
    + cover
    + coverLog
)

# Every predictor is scaled; this is the same list object as predictor_metrics
scale_variables = predictor_metrics
print('Variable sets loaded.')

Variable sets loaded.


In [4]:
# Create data frame of input file
# Column groups that receive an explicit dtype after loading
float_columns = predictor_metrics + cover + coverLog + coordinates
integer_columns = strata + zero_variable

input_data = pd.read_csv(input_file)
# Predictors, both cover columns and the coordinates become float
input_data[float_columns] = input_data[float_columns].astype(float)
# The strata and zero columns become int
input_data[integer_columns] = input_data[integer_columns].astype(int)

In [5]:
# Create train and test split
# Fixed seed so that re-running the notebook reproduces the same partition (and therefore the
# same train/test files and scaler statistics). Change the value to draw a different split.
split_seed = 314

X = input_data[all_variables]
y = input_data[cover[0]]
stratify = input_data[strata[0]]

# Shuffled 70/30 split, stratified on the strata column so both parts keep its proportions
all_train_raw, all_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    train_size=0.7,
    random_state=split_seed,
    shuffle=True,
    stratify=stratify
)

# Renumber the rows of each part from 0; reset_index() keeps the previous row labels
# as a leading 'index' column, so each row can still be traced back to input_data.
all_train_raw = all_train_raw.reset_index()
all_test_raw = all_test_raw.reset_index()

In [6]:
# Output raw train and test data
# Paths are built from root_folder so the outputs follow the configured project folder
# instead of a second, hard-coded copy of it.
train_raw = os.path.join(root_folder, 'train_raw.csv')
test_raw = os.path.join(root_folder, 'test_raw.csv')
# index=False: the DataFrame row labels are not written as an extra column
all_train_raw.to_csv(train_raw, header=True, index=False, sep=',', encoding='utf-8')
all_test_raw.to_csv(test_raw, header=True, index=False, sep=',', encoding='utf-8')

In [7]:
# Create a standard scaler: each scaled variable is centred on its mean and divided by its
# standard deviation (mean = 0, unit variance). This rescales the values only; it does not
# change the shape of their distribution.
# The scaler is fitted on the training rows only.
train_predictors = all_train_raw[scale_variables]
scaler = StandardScaler()
scaler.fit(train_predictors)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [8]:
# Scale the training data
# Apply the already-fitted scaler to the training predictors
train_scaled = scaler.transform(all_train_raw[scale_variables])
# Give the scaled frame the row labels of all_train_raw explicitly: concat(axis=1) aligns rows
# on the index, and a default 0..n-1 index would only line up if all_train_raw had been reset.
train_scaled_frame = pd.DataFrame(data=train_scaled, columns=scale_variables, index=all_train_raw.index)
# Unscaled columns first, scaled predictors appended on the right
all_train_scaled = pd.concat([all_train_raw.drop(columns=scale_variables), train_scaled_frame], axis=1)

In [9]:
# Scale the test data
# Apply the same fitted scaler to the test predictors (it is not refitted here)
test_scaled = scaler.transform(all_test_raw[scale_variables])
# Give the scaled frame the row labels of all_test_raw explicitly: concat(axis=1) aligns rows
# on the index, and a default 0..n-1 index would only line up if all_test_raw had been reset.
test_scaled_frame = pd.DataFrame(data=test_scaled, columns=scale_variables, index=all_test_raw.index)
# Unscaled columns first, scaled predictors appended on the right
all_test_scaled = pd.concat([all_test_raw.drop(columns=scale_variables), test_scaled_frame], axis=1)

In [10]:
# Output scaled train and test data
# Paths are built from root_folder so the outputs follow the configured project folder.
# NOTE: train_scaled / test_scaled are rebound here from the scaled arrays to path strings;
# the names are kept unchanged so anything that reads them after this cell still works.
train_scaled = os.path.join(root_folder, 'train_scaled.csv')
test_scaled = os.path.join(root_folder, 'test_scaled.csv')
# index=False: the DataFrame row labels are not written as an extra column
all_train_scaled.to_csv(train_scaled, header=True, index=False, sep=',', encoding='utf-8')
all_test_scaled.to_csv(test_scaled, header=True, index=False, sep=',', encoding='utf-8')