# Sample Area Environmental Clustering

**Timm Nawrocki**  
Alaska Center for Conservation Science  
2019-04-16

In [1]:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Sample Area Environmental Clustering
# Author: Timm Nawrocki
# Created on: 2019-04-16
# Usage: Must be executed as a Jupyter Notebook in an ArcGIS Pro Python 3 installation.
# Description: "Sample Area Environmental Clustering" clusters the environmental variation in samples from sampling areas.
# ---------------------------------------------------------------------------

## 1. Initialize Environment

In [2]:
# Import packages
import datetime
import numpy as np
import os
import pandas as pd
import time
# Import clustering and data tools from scikit-learn
from sklearn.cluster import AgglomerativeClustering
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# Location of the project output folder (drive letter plus project-relative path)
drive = 'F:/'
project_output_path = 'ACCS_Work/Projects/VegetationEcology/BristolBay_Vegetation/Project_GIS/Data_Output'
root_directory = os.path.join(drive, project_output_path)

# Input: table of environmental samples read by the clustering cells
sample_relative_path = 'sample_environment/EnvironmentalSample_AleknagikNuyakuk.csv'
sample_file = os.path.join(root_directory, sample_relative_path)

# Output: table written after cluster labels have been assigned
output_relative_path = 'sample_environment/EnvironmentalSample_Clustered.csv'
output_csv = os.path.join(root_directory, output_relative_path)

In [3]:
# Define variable sets
# Predictors that are not tied to a month
predictor_env = ['aspect', 'compoundTopographic', 'dateFreeze', 'dateThaw', 'elevation', 'exposure', 'growingSeason', 'heatLoad', 'moisture', 'precipitation', 'roughness', 'slope', 'summerWarmth', 'surfaceArea', 'surfaceRelief']

# Field suffixes shared by every monthly predictor set. Keeping them in one list
# guarantees that all six months use exactly the same fields in the same order.
spectral_fields = ['1_ultraBlue', '2_blue', '3_green', '4_red', '5_redEdge1', '6_redEdge2', '7_redEdge3', '8_nearInfrared', '8a_redEdge4', '11_shortInfrared1', '12_shortInfrared2', 'nbr', 'ndmi', 'ndsi', 'ndvi', 'ndwi']


def month_predictors(month_prefix):
    """Return the predictor column names for one month.

    Each name has the form '<month_prefix>_<field>', with one entry per item
    of `spectral_fields`, in that order.
    """
    return ['{0}_{1}'.format(month_prefix, field) for field in spectral_fields]


# Monthly predictor sets (May through October)
predictor_may = month_predictors('X05May')
predictor_jun = month_predictors('X06June')
predictor_jul = month_predictors('X07July')
predictor_aug = month_predictors('X08August')
predictor_sep = month_predictors('X09September')
predictor_oct = month_predictors('X10October')

# Full predictor set: non-monthly predictors followed by the months in calendar order
predictor_all = predictor_env + predictor_may + predictor_jun + predictor_jul + predictor_aug + predictor_sep + predictor_oct

# Fields written to the output table: point coordinates plus the assigned cluster
coordinates = ['POINT_X', 'POINT_Y']
cluster = ['cluster']
output_fields = coordinates + cluster

## 2. Cluster Data

In [4]:
# Start timing function execution
start = time.time()
# Seed for the row shuffle so that re-running the notebook on a fresh kernel
# reproduces the same row order (and therefore the same exported row order)
shuffle_seed = 0
# Read the input samples, remove every row that has a missing value in any column,
# and convert the predictor columns to floats in a single chained expression
sample_data = (
    pd.read_csv(sample_file)
    .dropna(axis=0, how='any')
    .astype({predictor: float for predictor in predictor_all})
)
# Shuffle data reproducibly
sample_data = shuffle(sample_data, random_state=shuffle_seed)
# Split the X data
X = sample_data[predictor_all]
# End timing function execution and calculate elapsed time
end = time.time()
elapsed = int(end - start)
success_time = datetime.datetime.now()
# Report process success
out_process = 'Succeeded at {0} (Elapsed time: {1})'.format(success_time.strftime("%Y-%m-%d %H:%M"),
                                                            datetime.timedelta(seconds=elapsed))
print(out_process)

Succeeded at 2019-04-17 11:12 (Elapsed time: 0:00:07)


In [5]:
# Start timing function execution
start = time.time()
# Standardize the X data: the scaler learns each column's mean and standard
# deviation and rescales the column to zero mean and unit variance
# (this does not change the shape of the distribution)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# End timing function execution and calculate elapsed time
end = time.time()
elapsed = int(end - start)
success_time = datetime.datetime.now()
# Report process success
timestamp = success_time.strftime("%Y-%m-%d %H:%M")
duration = datetime.timedelta(seconds=elapsed)
out_process = 'Succeeded at {0} (Elapsed time: {1})'.format(timestamp, duration)
print(out_process)

Succeeded at 2019-04-17 11:12 (Elapsed time: 0:00:00)


In [6]:
# Start timing function execution
start = time.time()
# Number of clusters to cut the agglomerative hierarchy into
n_clusters = 64
# Cluster the samples using Agglomerative Clustering.
# The distance argument is intentionally left at its default: the default ward
# linkage only supports Euclidean distance, and the keyword that selects it was
# renamed from `affinity` to `metric` in scikit-learn 1.2 (and `affinity` removed
# in 1.4), so omitting it keeps this cell working on both old and new versions.
agglom = AgglomerativeClustering(n_clusters=n_clusters).fit(X_scaled)
# Assign fitted labels to sample data frame (labels are in the same row order as X_scaled)
sample_data['cluster'] = agglom.labels_
# Export the coordinates and cluster label of every sample to csv
output_data = sample_data[output_fields]
output_data.to_csv(output_csv, header=True, index=False, sep=',', encoding='utf-8')
# End timing function execution and calculate elapsed time
end = time.time()
elapsed = int(end - start)
success_time = datetime.datetime.now()
# Report process success
out_process = 'Succeeded at {0} (Elapsed time: {1})'.format(success_time.strftime("%Y-%m-%d %H:%M"),
                                                            datetime.timedelta(seconds=elapsed))
print(out_process)

Succeeded at 2019-04-17 11:31 (Elapsed time: 0:18:59)
