# Map Performance Discrete Random

**Written by Timm Nawrocki**

Last Updated Sunday November 17, 2019

In [1]:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Map Performance Discrete NSSI
# Author: Timm Nawrocki, Alaska Center for Conservation Science
# Created on: 2018-10-22
# Usage: Must be executed as a Jupyter Notebook in an Anaconda 3 installation.
# Description: "Map Performance Discrete NSSI" estimates the amount of observed spatial heterogeneity in species foliar cover predicted by a discrete type vegetation map, the North Slope land cover map. All model performance metrics are calculated on independent test partitions.
# ---------------------------------------------------------------------------

In [2]:
# Import packages for file manipulation, data manipulation, and plotting
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot
# Import module for altering output display
from IPython.display import clear_output
# Import modules for preprocessing, model selection, linear regression, and performance from Scikit Learn
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
# Import timing packages
import time
import datetime

In [3]:
# Define input file
input_file = 'N:/ACCS_Work/Projects/VegetationEcology/Data_Harmonization/Supplemental/discrete_vaccinium_vitisidaea.csv'

In [4]:
# Define variables
cover = ['cover']
discrete = ['random']
strata = ['strata']
response = ['response']

In [5]:
# Create data frame of input data
input_data = pd.read_csv(input_file)
# Convert values to floats
input_data[cover[0]] = input_data[cover[0]].astype(float)
# Shuffle data
input_data = shuffle(input_data, random_state=21)

In [6]:
# Subset input data to AIM data
aim_data = input_data[input_data['project'] == 'AIM NPR-A']
aim_data = aim_data.reset_index()

In [7]:
# Set the discrete X data
X_discrete = aim_data[discrete[0]]
# Convert the X data to numpy array
X_discrete_array = np.asarray(X_discrete)
X_discrete_array = np.reshape(X_discrete_array, (-1,1))

# Fit a one-hot encoder to the discrete map classes
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_discrete_array)
# Transform X data using one-hot encoder
X_transformed_array = encoder.transform(X_discrete_array)

In [8]:
# Define 10-fold cross validation split methods
outer_cv_splits = KFold(n_splits=10, shuffle=True, random_state=314)

In [9]:
# Create an empty data frame to store the outer test results
outer_results = pd.DataFrame(columns=cover + discrete + strata + response + ['iteration'])

In [10]:
# Iterate through outer cross-validation splits
i = 1
for train_index, test_index in outer_cv_splits.split(aim_data):
    
    
    #### CONDUCT MODEL TRAIN
    
    
    # Partition the outer train split by iteration number
    print(f'Conducting outer cross-validation iteration {i} of 10...')
    iteration_start = time.time()
    train_iteration = aim_data.iloc[train_index]
    
    # Identify X and y train splits for the regressor
    X_train_regress = X_transformed_array[train_index]
    y_train_regress = train_iteration[cover[0]]

    # Fit and predict a linear regression
    regression = LinearRegression()
    regression.fit(X_train_regress, y_train_regress)
    
    
    #### CONDUCT MODEL TEST

    
    # Partition the outer test split by iteration number
    test_iteration = aim_data.iloc[test_index]
    
    # Identify X test split
    X_test = X_transformed_array[test_index]
    
    # Use the regressor to predict foliar cover response
    response_prediction = regression.predict(X_test)
    # Concatenate predicted values to test data frame
    test_iteration['response'] = response_prediction
    
    # Add iteration number to test iteration
    test_iteration['iteration'] = i
    
    # Add the test results to output data frame
    outer_results = outer_results.append(test_iteration, ignore_index=True, sort=True)
    iteration_end = time.time()
    iteration_elapsed = int(iteration_end - iteration_start)
    iteration_success_time = datetime.datetime.now()
    print(f'Completed at {iteration_success_time.strftime("%Y-%m-%d %H:%M")} (Elapsed time: {datetime.timedelta(seconds=iteration_elapsed)})')
    print('----------')
          
    # Increase iteration number
    i += 1

Conducting outer cross-validation iteration 1 of 10...
Completed at 2019-11-21 17:34 (Elapsed time: 0:00:00)
----------
Conducting outer cross-validation iteration 2 of 10...
Completed at 2019-11-21 17:34 (Elapsed time: 0:00:00)
----------
Conducting outer cross-validation iteration 3 of 10...
Completed at 2019-11-21 17:34 (Elapsed time: 0:00:00)
----------
Conducting outer cross-validation iteration 4 of 10...
Completed at 2019-11-21 17:34 (Elapsed time: 0:00:00)
----------
Conducting outer cross-validation iteration 5 of 10...
Completed at 2019-11-21 17:34 (Elapsed time: 0:00:00)
----------
Conducting outer cross-validation iteration 6 of 10...
Completed at 2019-11-21 17:34 (Elapsed time: 0:00:00)
----------
Conducting outer cross-validation iteration 7 of 10...
Completed at 2019-11-21 17:34 (Elapsed time: 0:00:00)
----------
Conducting outer cross-validation iteration 8 of 10...
Completed at 2019-11-21 17:34 (Elapsed time: 0:00:00)
----------
Conducting outer cross-validation iterat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

In [11]:
print(len(outer_results))

183


In [12]:
print(outer_results[response])

      response
0     4.692308
1     7.125000
2     4.200000
3     0.500000
4    11.000000
..         ...
178   5.000000
179   5.666667
180   5.600000
181   4.571429
182   7.800000

[183 rows x 1 columns]


In [13]:
# Partition output results to foliar cover observed and predicted
y_regress_observed = outer_results[cover[0]]
y_regress_predicted = outer_results[response[0]]

# Calculate performance metrics from output_results
r_score = r2_score(y_regress_observed, y_regress_predicted, sample_weight=None, multioutput='uniform_average')
mae = mean_absolute_error(y_regress_observed, y_regress_predicted)
rmse = np.sqrt(mean_squared_error(y_regress_observed, y_regress_predicted))

# Print performance results
print(f'Final R^2 = {r_score}')
print(f'Final MAE = {mae}')
print(f'Final RMSE = {rmse}')

Final R^2 = -0.14103771100495255
Final MAE = 7.241147555346228
Final RMSE = 9.368658358832633
