# Floodplain Predict-Subdivide

**Timm Nawrocki**  
Alaska Center for Conservation Science  
2019-04-21

In [1]:
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------------
# Floodplain Predict-Subdivide
# Author: Timm Nawrocki, Alaska Center for Conservation Science
# Created on: 2019-04-21
# Usage: Must be executed as a Jupyter Notebook in an Anaconda 3 installation.
# Description: "Floodplain Predict-Subdivide" applies the trained classifier to data in regular point grid format stored in csv files to create a prediction representing the distribution of floodplains. This script is meant for subwatershed grids large enough in size to require subdividing into multiple sets.
# ---------------------------------------------------------------------------

## 1. Initialize Environment

In [2]:
# Import packages
import datetime
import matplotlib.pyplot as plot
import numpy as np
import pandas as pd
import os
import seaborn as sns
import time
# Import the random forest classifier and the joblib model persistence utility from Scikit Learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

# Set root directory
drive = 'E:/'
root_directory = os.path.join(drive, 'ACCS_Work/Projects/VegetationEcology/BristolBay_Vegetation/Project_GIS/Data_Output')

# Define inputs
subwatershed_path = os.path.join(root_directory, 'prediction_tables')
subwatershed = os.path.join(subwatershed_path, '190303011505.csv')
#subwatershed = os.path.join(subwatershed_path, '190303040305.csv')
floodplain_classifier = os.path.join(root_directory, 'model_floodplain/classifier_floodplain.joblib')
threshold_file = os.path.join(root_directory, 'model_floodplain/threshold.txt')

# Define output location
output_path = os.path.join(root_directory, 'model_floodplain/output_tables')

In [3]:
# Define variable sets
classifier_features = ['compoundTopographic', 'elevation', 'exposure', 'heatLoad', 'moisture', 'roughness', 'slope', 'summerWarmth', 'surfaceArea', 'surfaceRelief', 'X05May_ndsi', 'X05May_ndvi', 'X05May_ndwi', 'X06June_4_red', 'X06June_ndsi', 'X06June_ndvi', 'X06June_ndwi', 'X07July_4_red', 'X07July_6_redEdge2', 'X07July_7_redEdge3', 'X07July_8_nearInfrared', 'X07July_11_shortInfrared1', 'X07July_12_shortInfrared2', 'X07July_nbr', 'X07July_ndmi', 'X07July_ndvi', 'X07July_ndwi', 'X08August_ndvi', 'X09September_1_ultraBlue', 'X09September_4_red', 'X09September_11_shortInfrared1', 'X09September_ndvi', 'X09September_ndwi', 'X10October_2_blue', 'X10October_3_green', 'X10October_4_red', 'X10October_5_redEdge1', 'X10October_6_redEdge2', 'X10October_11_shortInfrared1', 'X10October_12_shortInfrared2', 'X10October_nbr', 'X10October_ndmi', 'X10October_ndsi', 'X10October_ndvi', 'X10October_ndwi']
coordinates = ['POINT_X', 'POINT_Y']
predict = ['presence']
output_variables = coordinates + predict

## 2. Prepare data and model

In [4]:
# Import the trained classifier
classifier = joblib.load(floodplain_classifier)

In [None]:
# Start timing function execution
start = time.time()
# Load the input data
input_data = pd.read_csv(subwatershed)
total_n = len(input_data['pointid'])
# End timing
end = time.time()
elapsed = int(end - start)
success_time = datetime.datetime.now()
# Report process success
out_process = 'Succeeded at {0} (Elapsed time: {1})'.format(success_time.strftime("%Y-%m-%d %H:%M"),
                                                            datetime.timedelta(seconds=elapsed))
print(out_process)

Succeeded at 2019-04-21 12:24 (Elapsed time: 0:22:31)


In [None]:
# Start a counter
n = 0
# Loop through the prediction function for all input files
while n < 10:
    output_name = os.path.split(subwatershed)[1]
    output_name = os.path.splitext(output_name)[0] + '-' + str(n + 1) + '.csv'
    output_csv = os.path.join(output_path, output_name)
    if os.path.isfile(output_csv) == False:
        # Start timing function execution
        start = time.time()
        print('Predicting grid {0} out of {1}...'.format((n + 1), str(10)))
        # Determine subset start and finish row indices
        start = int((total_n/10) * n)
        finish = int((total_n/10) * (n + 1)) - 1
        # Subset data
        subset_data = input_data.iloc[start:finish]
        subset_data = subset_data.dropna(axis=0, how='any')
        subset_data[classifier_features] = subset_data[classifier_features].astype(float)
        # Define the X data
        X_data = subset_data[classifier_features]
        # Predict the classifier
        classification = classifier.predict_proba(X_data)
        # Concatenate predicted values to input data frame
        subset_data = pd.concat([subset_data, pd.DataFrame(classification)], axis=1)
        subset_data = subset_data.rename(index=int, columns={0: 'absence', 1: 'presence'})
        # Export prediction to csv
        output_data = subset_data[output_variables]
        output_data.to_csv(output_csv, header=True, index=False, sep=',', encoding='utf-8')
        # End timing
        end = time.time()
        elapsed = int(end - start)
        success_time = datetime.datetime.now()
        # Report process success
        out_process = 'Succeeded at {0} (Elapsed time: {1})'.format(success_time.strftime("%Y-%m-%d %H:%M"),
                                                                    datetime.timedelta(seconds=elapsed))
        print(out_process)
        print('----------')
    n += 1

Predicting grid 1 out of 10...
Succeeded at 2019-04-21 12:27 (Elapsed time: 18007 days, 20:27:14)
----------
Predicting grid 2 out of 10...
