# Create Sample Outlier Detector

**Written by Timm Nawrocki**

*Last updated Saturday, October 20, 2018*

In [None]:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Create Sample Outlier Detector
# Author: Timm Nawrocki, Alaska Center for Conservation Science
# Created on: 2018-10-20
# Usage: Must be executed as a Jupyter Notebook in an Anaconda 3 installation. Created using Anaconda 3 version 5.2.0.
# Description: "Create Sample Outlier Detector" trains a one-class outlier detection model to determine the landscape coverage of the sampled points.
# ---------------------------------------------------------------------------

In [None]:
# Folder that receives the exported model files
output_folder = 'K:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/modelResults/area_prediction/'
# CSV table that is read as the input data
input_file = 'K:/VegetationEcology/Data_Harmonization/Project_GIS/Data_Output/speciesData/carex_aquatilis.csv'

In [None]:
# Column names of every predictor variable. The first group lists the predictors that
# are not month-specific; the second group is generated as '<month>_<variable>' for each
# of five months and thirteen per-month variables, month-major, in the order shown.
predictor_all = [
    'compoundTopographic', 'dateFreeze_2000s', 'dateThaw_2000s', 'elevation', 'floodplainsDist',
    'growingSeason_2000s', 'heatLoad', 'integratedMoisture', 'precipAnnual_2000s', 'roughness',
    'siteExposure', 'slope', 'streamLargeDist', 'streamSmallDist', 'summerWarmth_2000s',
    'surfaceArea', 'surfaceRelief', 'aspect',
] + [
    month + '_' + variable
    for month in ('may', 'june', 'july', 'august', 'september')
    for variable in (
        '1_ultraBlue', '2_blue', '3_green', '4_red', '5_nearInfrared', '6_shortInfrared1',
        '7_shortInfrared2', 'evi2', 'nbr', 'ndmi', 'ndsi', 'ndvi', 'ndwi',
    )
]

In [None]:
# Import packages for file manipulation, data manipulation, and plotting
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot
# Import module for altering output display
from IPython.display import clear_output
# Import modules for feature pre-processing and novelty detection from Scikit Learn
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.externals import joblib

In [None]:
# File that will hold the exported outlier detector
outlier_file = os.path.join(output_folder, 'outlier_detector.joblib')
# File that will hold the exported standard scaler
scaler_file = os.path.join(output_folder, 'scaler.joblib')

In [None]:
# Fixed seed for the row shuffle so that Restart & Run All reproduces the same row
# order (and therefore the same exported model files) on every run
shuffle_seed = 314
# Create data frame of input data
input_data = pd.read_csv(input_file)
# Convert the predictor columns to floats
input_data[predictor_all] = input_data[predictor_all].astype(float)
# Shuffle the rows reproducibly
input_data = shuffle(input_data, random_state=shuffle_seed)

In [None]:
# Subset the input data to samples from the AIM NPR-A project
aim_data = input_data[input_data['project'] == 'AIM NPR-A']
# Fail early with a clear message instead of handing an empty table to the scaler
if aim_data.empty:
    raise ValueError("No rows in the input data have project == 'AIM NPR-A'.")
# Select the predictor columns as the X data (there is no y: the model is one-class)
X = aim_data[predictor_all]

In [None]:
# Standardize the X data: fit_transform learns the per-column scaling from X and
# applies it to X in a single call (equivalent to fit followed by transform).
# Note that standardizing rescales the columns; it does not make them Gaussian.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Save the fitted scaler to file
joblib.dump(scaler, scaler_file)

In [None]:
# Kernel coefficient: inverse of (number of predictors x standard deviation of all scaled values)
gamma = 1 / (len(predictor_all) * X_scaled.std())
# Build the RBF one-class SVM and fit it to the scaled AIM NPR-A sample in one step
# (fit returns the fitted estimator). Per the scikit-learn documentation, nu=0.05 is an
# upper bound on the fraction of training errors and a lower bound on the fraction of
# support vectors, i.e. roughly 5% of the sample may fall outside the learned boundary.
outlier_detector = OneClassSVM(nu=0.05, gamma=gamma, kernel='rbf').fit(X_scaled)
# Save the fitted outlier detector to file
joblib.dump(outlier_detector, outlier_file)