# Delineate Sample Representation Area

**Written by Timm Nawrocki**

*Last updated Saturday, October 20, 2018*

In [1]:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Delineate Prediction Area
# Author: Timm Nawrocki, Alaska Center for Conservation Science
# Created on: 2018-10-20
# Usage: Must be executed as a Jupyter Notebook in an Anaconda 3 installation. Created using Anaconda 3 version 5.2.0.
# Description: "Delineate Prediction Area" applies a trained one-class outlier detection model to watershed data to determine the sample coverage of the watershed.
# ---------------------------------------------------------------------------

In [2]:
# Folder containing the saved scaler and outlier detector (joblib files)
model_folder = 'K:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/modelResults/area_prediction/'
# Folder containing the watershed csv files that are read for prediction
watershed_folder = 'K:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/watershedData/'
# Folder that receives one prediction csv per processed watershed file
output_folder = 'K:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/predictions/study_area/'
# Zero-based positions in the watershed file list that this notebook instance
# processes; give each parallel instance a different range
subset = [position for position in range(1, 2)]
print(subset)

[1]


In [3]:
# Predictor columns passed to the scaler and outlier detector. The order of
# this list is the column order of the X data, so it must not be rearranged.
predictor_all = [
    # Non-monthly predictors
    'compoundTopographic', 'dateFreeze_2000s', 'dateThaw_2000s', 'elevation',
    'floodplainsDist', 'growingSeason_2000s', 'heatLoad', 'integratedMoisture',
    'precipAnnual_2000s', 'roughness', 'siteExposure', 'slope',
    'streamLargeDist', 'streamSmallDist', 'summerWarmth_2000s', 'surfaceArea',
    'surfaceRelief', 'aspect',
    # May
    'may_1_ultraBlue', 'may_2_blue', 'may_3_green', 'may_4_red',
    'may_5_nearInfrared', 'may_6_shortInfrared1', 'may_7_shortInfrared2',
    'may_evi2', 'may_nbr', 'may_ndmi', 'may_ndsi', 'may_ndvi', 'may_ndwi',
    # June
    'june_1_ultraBlue', 'june_2_blue', 'june_3_green', 'june_4_red',
    'june_5_nearInfrared', 'june_6_shortInfrared1', 'june_7_shortInfrared2',
    'june_evi2', 'june_nbr', 'june_ndmi', 'june_ndsi', 'june_ndvi', 'june_ndwi',
    # July
    'july_1_ultraBlue', 'july_2_blue', 'july_3_green', 'july_4_red',
    'july_5_nearInfrared', 'july_6_shortInfrared1', 'july_7_shortInfrared2',
    'july_evi2', 'july_nbr', 'july_ndmi', 'july_ndsi', 'july_ndvi', 'july_ndwi',
    # August
    'august_1_ultraBlue', 'august_2_blue', 'august_3_green', 'august_4_red',
    'august_5_nearInfrared', 'august_6_shortInfrared1', 'august_7_shortInfrared2',
    'august_evi2', 'august_nbr', 'august_ndmi', 'august_ndsi', 'august_ndvi',
    'august_ndwi',
    # September
    'september_1_ultraBlue', 'september_2_blue', 'september_3_green',
    'september_4_red', 'september_5_nearInfrared', 'september_6_shortInfrared1',
    'september_7_shortInfrared2', 'september_evi2', 'september_nbr',
    'september_ndmi', 'september_ndsi', 'september_ndvi', 'september_ndwi',
]
# Coordinate columns carried through to the exported csv
coordinates = ['POINT_X', 'POINT_Y']
# Name of the column that holds the prediction
outlier = ['outlier']
# Columns written to each output csv: coordinates followed by the prediction
output_columns = [*coordinates, *outlier]

In [4]:
# Import packages for file manipulation, data manipulation, and plotting
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot
# Import module for altering output display
from IPython.display import clear_output
# Import modules for feature pre-processing and novelty detection from Scikit Learn
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.externals import joblib

In [5]:
# Build the paths of the saved scaler and outlier detector inside the model folder
scaler_file, outlier_file = (
    os.path.join(model_folder, file_name)
    for file_name in ('scaler.joblib', 'outlier_detector.joblib')
)

In [6]:
# Read the saved scaler and outlier detector from their joblib files, in that order
scaler, outlier_detector = map(joblib.load, (scaler_file, outlier_file))

In [7]:
# Create a list of input files for the prediction step. os.listdir returns
# entries in arbitrary order, so the listing is sorted to keep the positions
# used by `subset` stable between runs and between parallel notebook instances.
# Entries that are not csv files are dropped so they cannot shift positions or
# reach pd.read_csv.
input_files = sorted(
    file_name
    for file_name in os.listdir(watershed_folder)
    if file_name.lower().endswith('.csv')
)
# Define the subset list of files (positions in `subset` are zero-based)
subset_files = [input_files[n] for n in subset]
subset_files

['T1908010402.csv']

In [8]:
# Define a function to predict outliers
def detectOutliers(input_data, predictors, scaler, outlier_detector):
    """Append outlier predictions to a copy of the input data frame.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Rows to classify; must contain every column named in `predictors`.
    predictors : list of str
        Names of the columns passed to the scaler, in that order.
    scaler : object
        Fitted transformer exposing `transform(X)`.
    outlier_detector : object
        Fitted model exposing `predict(X)` that returns one value per row.

    Returns
    -------
    pandas.DataFrame
        A copy of `input_data` (same rows, same index) with the predictions
        stored in a column named 'outlier'. `input_data` is not modified.
    """
    # Create X data from the predictors
    X = input_data[predictors]
    # Scale the X data
    X_scaled = scaler.transform(X)
    # Predict outliers in the scaled X data
    prediction = outlier_detector.predict(X_scaled)
    # Attach the predictions by row position. Concatenating a new data frame
    # along axis=1 aligns on index labels, which misplaces predictions and adds
    # NaN rows whenever input_data does not carry a default 0..n-1 index.
    output_data = input_data.copy()
    output_data['outlier'] = np.asarray(prediction)
    # Return the output data
    return output_data

In [9]:
# Create the output folder if it is missing so the csv export cannot fail on
# a non-existent directory
os.makedirs(output_folder, exist_ok=True)
# Columns that must be numeric, and the file total shown in the status message
# (both are the same for every iteration)
numeric_columns = predictor_all + coordinates
file_count = len(input_files)
# Loop through the prediction function for all input files in the subset
for watershed_data in subset_files:
    # Set output display to show one message with replacement
    clear_output(wait=True)
    # Identify input and output csv files; the output keeps the input file name
    predict_csv = os.path.join(watershed_folder, watershed_data)
    output_csv = os.path.join(output_folder, watershed_data)
    # Read input data to data frame and force predictors and coordinates to float
    predict_data = pd.read_csv(predict_csv)
    predict_data[numeric_columns] = predict_data[numeric_columns].astype(float)
    # Predict outliers in the data frame
    output_data = detectOutliers(predict_data, predictor_all, scaler, outlier_detector)
    # Export only the coordinates and the prediction to csv
    output_data = output_data[output_columns]
    output_data.to_csv(output_csv, header=True, index=False, sep=',', encoding='utf-8')
    # Print loop status; the position and total refer to the full file list,
    # not to the subset
    print('Prediction iteration ' + str(input_files.index(watershed_data) + 1) + ' out of ' + str(file_count) + ' complete...')

Prediction iteration 2 out of 2 complete...
