In [1]:
# Import packages for file manipulation, data manipulation, and plotting
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot
# Import module for altering output display
from IPython.display import clear_output
# Import modules for preprocessing, model selection, linear regression, and performance from Scikit Learn
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [2]:
# Define input file (CSV that is loaded with pandas further down the notebook)
# NOTE(review): this is an absolute path on a local E: drive — adjust before running on another machine
input_file = 'E:/VegetationEcology/Data_Harmonization/Supplemental/discrete_vaccinium_vitisidaea.csv'
# Define output metrics file (CSV that receives the per-iteration performance metrics)
metrics_file = 'E:/VegetationEcology/Data_Harmonization/Supplemental/discrete_vaccinium_vitisidaea_metrics.csv'

In [4]:
# Create a function to calculate pseudo r-squared, MAE, and RMSE for the composited prediction
def calculatePerformance(test, cover, prediction):
    """Score a column of predictions against a column of observations.

    Parameters
    ----------
    test : table indexable by column name (a pandas DataFrame in this notebook)
        Holds both the observed and the predicted values.
    cover : sequence of str
        Its first element names the column of observed values.
    prediction : sequence of str
        Its first element names the column of predicted values.

    Returns
    -------
    tuple
        (r-squared, mean absolute error, root mean squared error), in that order.
    """
    # Pull the observed and predicted columns out of the table
    observed = test[cover[0]]
    predicted = test[prediction[0]]
    # Error metrics: RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(observed, predicted))
    mae = mean_absolute_error(observed, predicted)
    # Coefficient of determination
    r_score = r2_score(observed, predicted, sample_weight=None, multioutput='uniform_average')
    return r_score, mae, rmse

In [3]:
# Define a function to conduct a train test iteration
def trainTest(X_array, y, stratify=None, random_state=None):
    """Run one stratified 70/30 train-test iteration of a linear regression.

    Parameters
    ----------
    X_array : array-like or sparse matrix
        Predictor matrix with one row per observation.
    y : pandas.Series
        Response values aligned row-for-row with X_array.
    stratify : array-like, optional
        Class labels used to stratify the split; must have one entry per row
        of X_array. When omitted, the notebook-level aim_data[strata[0]]
        column is used, which is what this function always did before the
        parameter existed. Passing None therefore does NOT disable
        stratification.
    random_state : int or None, optional
        Seed forwarded to train_test_split. The default (None) keeps the
        previous behavior of drawing a different split on every call.

    Returns
    -------
    tuple
        (r-squared, mean absolute error, root mean squared error) computed on
        the test partition by calculatePerformance.
    """
    # Fall back to the notebook-level strata column so existing calls keep working
    if stratify is None:
        stratify = aim_data[strata[0]]
    # Split the data into train and test partitions
    X_train, X_test, y_train, y_test = train_test_split(X_array,
                                                        y,
                                                        test_size = 0.3,
                                                        train_size = 0.7,
                                                        random_state = random_state,
                                                        shuffle = True,
                                                        stratify = stratify)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    # Fit and predict a linear regression
    regression = LinearRegression()
    regression.fit(X_train, y_train)
    prediction = regression.predict(X_test)
    # Pair observed and predicted values, labelling both columns from the
    # notebook-level cover/response settings so the lookup inside
    # calculatePerformance cannot drift from a hard-coded column name
    result = pd.concat([y_test.rename(cover[0]),
                        pd.Series(prediction, name=response[0])],
                       axis=1)
    # The third metric is the ROOT mean squared error
    r2, mae, rmse = calculatePerformance(result, cover, response)
    return r2, mae, rmse

In [5]:
# Column names and settings shared by the other cells
# Column holding the observed cover values (the regression response)
cover = ['cover']
# Column holding the discrete map classes that are one-hot encoded
discrete = ['NSSI']
# Column used to stratify the train-test split
strata = ['strata']
# Integer predictor indices 0 through 12 (not referenced by the cells visible here)
predictors = list(range(13))
# Column name under which model predictions are stored
response = ['prediction']

In [6]:
# Read the input table, store the cover column as floats, and shuffle the rows
input_data = shuffle(pd.read_csv(input_file).astype({cover[0]: float}))

In [7]:
# Keep only the rows of the AIM NPR-A project, then renumber them from zero
# (reset_index() without drop=True also keeps the previous index as a column)
aim_data = input_data.loc[input_data['project'] == 'AIM NPR-A'].reset_index()

In [8]:
# Response: the observed cover column
y = aim_data[cover[0]]
# Predictor: the discrete map classes as a single-column 2-D numpy array
X = np.asarray(aim_data[discrete[0]]).reshape(-1, 1)

In [9]:
# Create a one-hot encoder for the discrete map classes and fit it to X in one step
encoder = OneHotEncoder(handle_unknown='ignore').fit(X)
# Leave the fitted encoder as the last expression so the cell still displays it
encoder

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='ignore', n_values='auto', sparse=True)

In [10]:
# Transform X data using the fitted one-hot encoder
# NOTE(review): the encoder repr shown in the previous cell output reports sparse=True,
# so X_array is presumably a SciPy sparse matrix rather than a dense array — confirm
X_array = encoder.transform(X)

In [None]:
# Conduct repeated train test iterations
# Number of iterations; used for both the loop bound and the status message
n_iterations = 100
r2_list = []
mae_list = []
# NOTE: the third value returned by trainTest is the root mean squared error;
# the names mse/mse_list are kept because later cells read mse_list
mse_list = []
for i in range(1, n_iterations + 1):
    # Set output display to show one message with replacement
    clear_output(wait=True)
    # Run train test iteration
    r2, mae, mse = trainTest(X_array, y)
    # Append performance metrics to list
    r2_list.append(r2)
    mae_list.append(mae)
    mse_list.append(mse)
    # Print status
    print('Model train-test iteration ' + str(i) + ' out of ' + str(n_iterations) + ' completed...')

Model train-test iteration 34 out of 100 completed...


In [None]:
# Summarize each metric across the iterations: mean and standard deviation
# (np.std is called with its defaults, i.e. the population standard deviation)
r2_mean, r2_sd = np.mean(r2_list), np.std(r2_list)
mae_mean, mae_sd = np.mean(mae_list), np.std(mae_list)
mse_mean, mse_sd = np.mean(mse_list), np.std(mse_list)

In [None]:
# Collect the per-iteration performance metrics in a table, one row per iteration
metrics_dataframe = pd.DataFrame({'r2': r2_list, 'mae': mae_list, 'mse': mse_list})
# Write the table to the metrics file as comma-separated UTF-8 text with a header row and no index column
metrics_dataframe.to_csv(metrics_file, sep=',', header=True, index=False, encoding='utf-8')

In [None]:
# Report the mean r2 followed by its standard deviation, one value per line
print(r2_mean, r2_sd, sep='\n')