# Woolsey Fire Random Forest Model

The random-forest model in this notebook estimates how pre-fire conditions—including vegetation, topography, and climate—contributed to burn severity during the Woolsey Fire. The configuration of each run can be modified under the **Configure model** section; the configuration and results are saved when the run finishes.

In [None]:
import json
import hashlib
import os
import re
import warnings

import earthpy as et
import earthpy.plot as ep
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import rioxarray as rxr
import seaborn as sns
from scipy.stats import mode
from shapely.geometry import box
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import (
    classification_report,
    f1_score,
    mean_squared_error,
    precision_score,
    recall_score
)
import xarray as xr

from ea_drought_burn.config import CRS
from ea_drought_burn.utils import (
    aggregate,
    copy_xr_metadata,
    create_figure,
    create_sampling_mask,
    hist,
    load_nifc_fires,
    open_raster,
    plot_bands,
    plot_rgb,
    plot_regression,
    reproject_match
)



# Configure plotting defaults: constrained layout with equal horizontal and
# vertical padding, and a larger seaborn font on a white background
_layout_pad = 12/72
plt.rc("figure.constrained_layout", use=True, h_pad=_layout_pad, w_pad=_layout_pad)
sns.set(font_scale=1.5, style="white")

# Work from the woolsey-fire folder inside the earthpy data directory so that
# the relative paths used later in the notebook resolve there
_data_dir = os.path.join(et.io.HOME, "earth-analytics", "data", "woolsey-fire")
os.chdir(_data_dir)

In [None]:
def float_format(num):
    """Renders a number as a compact string

    Whole numbers are written without a decimal part. Numbers with a
    fractional component are rounded to three decimal places.

    Parameters
    ----------
    num: int or float
        the number to render

    Returns
    -------
    str
        the formatted number
    """
    # A non-zero remainder means the number has a fractional component
    if num % 1:
        return f"{num:.3f}"
    return str(int(num))


def prep(x, y, mask=None):
    """Prepares data for use in a sklearn model

    Each band of x and the response y are flattened to one dimension, then
    limited to the pixels that are finite in y and in every band of x.

    Parameters
    ----------
    x: xarray.DataArray
        an array containing the explanatory data. Can contain more than one
        band. Iterating over x must yield one band at a time.
    y: xarray.DataArray
        an array with a single band containing the response variable
    mask: numpy.array
        an optional boolean array used to mask x and y. Pixels where the
        mask is False are excluded. Requires x and y to provide a where()
        method.

    Returns
    -------
    tuple of numpy.ndarray
        x with shape (pixels, bands) and y with shape (pixels,), formatted
        for use in a sklearn model
    """

    # Ignore areas within scar that did not burn. where() returns new
    # objects, so the arrays passed by the caller are left untouched.
    if mask is not None:
        x = x.where(mask)
        y = y.where(mask)

    # Convert to 1D arrays
    x = np.array([np.ravel(b) for b in x])
    y = np.ravel(y)

    # Limit to values that are finite in all arrays. Finiteness is tested
    # directly on each array rather than on a running product, which could
    # overflow to inf for large finite values and fails outright when y has
    # an integer dtype.
    keep = np.isfinite(y)
    for band in x:
        keep &= np.isfinite(band)

    x = np.array([b[keep] for b in x])
    y = y[keep]

    return x.transpose(), y


def slugify(val):
    """Converts a string to a form that is safe to use in a filename

    The string is lower-cased and every run of characters other than a-z
    and 0-9 is collapsed into a single underscore.

    Parameters
    ----------
    val: str
        the string to convert

    Returns
    -------
    str
        the converted string
    """
    lowered = val.lower()
    unsafe_run = re.compile(r"[^a-z0-9]+")
    return unsafe_run.sub("_", lowered)

## Configure model

The constants in the cell below—`FEATURES`, `LABELS`, `CLASSIFIER`, `CLASSIFIER_PARAMS`, and `SAMPLING_PARAMS`—configure how the model will run. At the end of the notebook, the content of each constant is saved along with the results of the model.

In [None]:
# Select features. Each uncommented entry names a layer that is used as an
# explanatory variable; comment or uncomment entries to change the inputs.
# The response variable (LABELS) is removed from this list before the model
# is run, so it is safe to leave it uncommented here.
FEATURES = [
    # Vegetation
    "Community",
    #"FAL",
    "Dead",
    #"dFAL",
    #"Years Dead",
    #"Burned (2000-2018)",
    
    # Pre-fire spectral indices
    #"LFMC",
    #"NDVI",
    #"NDMI",
    "NDWI",
    #"SAVI",
    
    # Topography
    "Elevation",
    #"Aspect",
    "Folded Aspect",
    "Slope",
    
    # Climate
    # NOTE(review): the climate-averaging cell later in the notebook looks up
    # "Minimum Temperature", not "Min Temperature" -- confirm the key name
    # before enabling that entry
    #"Days Precipitation",
    #"Max VPD",
    #"Min Temperature",
    #"Heat Days Over 95",
    #"Cumulative Precipitation",
    
    # Burn severity
    #"dNBR",
    #"Classified dNBR",
]

# Response variable. Must be either "Classified dNBR" or "Dead Pixels".
LABELS = "Classified dNBR"

# Class of classifier. The class itself is stored, not an instance, because
# the notebook instantiates it with CLASSIFIER_PARAMS for each run.
CLASSIFIER = RandomForestClassifier

# Keyword arguments passed to classifier. If this dict is empty, the notebook
# will tune the classifier based on a set of reasonable parameters using the
# GridSearchCV function. This is very slow, so provide params if you can.
CLASSIFIER_PARAMS = {
    "n_estimators": 100,
    "max_depth": 5,
    "min_samples_leaf": 10,
    "max_features": "log2",
    # The run loop reads classifier.oob_score_, so keep this enabled
    "oob_score": True,
}

# Keyword arguments passed to create_sampling_mask
SAMPLING_PARAMS = {
    "counts": {"training": 9000, "validation": 3000},
    #"balanced": True,
    "seed": 20210421
}

## Restore and prepare data

In [None]:
# Check for stored ready variable and run load-data.ipynb if not found.
# Presumably the name stays undefined when nothing has been stored, which is
# what the NameError handler below relies on.
%store -r woolsey_data_ready
try:
    woolsey_data_ready
except NameError:
    print("Running load-data.ipynb...")
    # NOTE(review): run_notebook is not imported or defined in the cells
    # visible above -- confirm it is available in this session, otherwise
    # this fallback itself raises NameError
    run_notebook("load-data.ipynb")

# Restore variables using storemagic. Each variable is restored explicitly to
# avoid confusion about where variable names are coming from.
%store -r all_data
%store -r cmap_dnbr
%store -r labels_dnbr
%store -r prism_grid
%store -r reproj_to
%store -r woolsey_fire

In [None]:
# Build the lookup of all layers from a copy of the restored data
datasets = all_data.copy()

# Climate layers hold one layer per year of a four-year set. Replace each
# with its per-pixel mean, taken along the first axis.
_climate_keys = (
    "Days Precipitation",
    "Max VPD",
    "Minimum Temperature",
    "Heat Days Over 95",
    "Cumulative Precipitation"
)
for key in _climate_keys:
    yearly = datasets[key]
    datasets[key] = yearly.mean(axis=0)

In [None]:
# Limit the model to a subset of the available pixels
# FIXME: Subset is not captured when model is saved
dnbr_class = datasets["Classified dNBR"]
is_burned = dnbr_class > 1                    # ignore unburned
not_greener = dnbr_class < 5                  # ignore increased greenness
not_substrate = datasets["Community"] != 6    # ignore substrate

cond = is_burned & not_greener & not_substrate

# Plain boolean numpy array version of the condition
mask = xr.where(cond, True, False).values

In [None]:
# Create the sampling mask from the reference grid clipped to the fire scar
xda = reproj_to.rio.clip(woolsey_fire.geometry)
sampling_mask = create_sampling_mask(xda, **SAMPLING_PARAMS)

# Verify that the sampling mask has the same height and width as the
# PRISM grid
mask_hw = sampling_mask.shape[-2:]
grid_hw = prism_grid.shape[-2:]
if mask_hw != grid_hw:
    raise ValueError("Invalid shape")

# Create training and validation subsets. The first layer of the sampling
# mask selects training pixels and the second selects validation pixels.
training = {}
validation = {}
for _name, _layer in datasets.items():
    training[_name] = _layer.where(sampling_mask[0])
    validation[_name] = _layer.where(sampling_mask[1])

# Create lookup for all subsets
subsets = dict(all=datasets, training=training, validation=validation)

## Run the random-forest classifier

In [None]:
# Tune hyperparameters if no parameters provided for the classifier
if not CLASSIFIER_PARAMS:
    
    param_grid = {
        "min_samples_leaf": [1, 10, 25],
        "max_depth": [5, 10, None],
        "max_features": ["auto", "log2"],
        "n_estimators": [100, 500]
    }

    classifier = CLASSIFIER()

    grid_search = GridSearchCV(
        estimator=classifier,
        param_grid=param_grid,
        cv=3,
        return_train_score=True
    )

    grid_search.fit(tx, ty)
    CLASSIFIER_PARAMS = grid_search.best_params_

In [None]:
# Create feature and label datasets
features = sorted([f for f in FEATURES if f != LABELS])

xdata = {}
for subset, lookup in subsets.items():
    
    bands = []
    for key in features:
        band = lookup[key]
        
        # If more than one layer, use the last one. You can get around this
        # behavior by selecting a layer or aggregating all layers above (for
        # example, climate data uses the mean of the four years).
        if len(band.shape) > 2:
            band = band[-1]
        
        bands.append(band)
        
        try:
            del bands[-1]["band"]
        except KeyError:
            pass

        bands[-1] = bands[-1].squeeze()
        bands[-1]["band"] = len(bands)
        
    xdata[subset] = xr.concat(bands, dim="band")

In [None]:
# Create training and validation splits
tx, ty = prep(xdata["training"], training[LABELS], mask)
vx, vy = prep(xdata["validation"], validation[LABELS], mask)

In [None]:
# Make pair plots of training data, colored by the response value
paired = pd.DataFrame(data=tx, columns=features).assign(response=ty)

# The pair plot function throws a lot of warnings based on the source data.
# They aren't useful, so silence everything for the duration of the call.
pairplot_kwargs = {
    "hue": "response",
    "height": 5,
    "plot_kws": {"s": 100},
}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sns.pairplot(paired, **pairplot_kwargs)

In [None]:
# Run model multiple times to assess performance. Each run fits a fresh
# classifier on the same training data and is scored against the same
# validation data. The classifier from the last run stays bound to
# `classifier` for use by later cells.
severity_classes = [2, 3, 4]  # low, moderate, and high severity

runs = []
for _ in range(20):
    classifier = CLASSIFIER(**CLASSIFIER_PARAMS)
    classifier.fit(tx, ty)

    # Predict the validation set once and reuse it for every metric
    predicted = classifier.predict(vx)

    # Calculate F1 score for low, moderate, and high severity pixels. The
    # scores are computed against the full validation set: restricting the
    # data to pixels of one true class would hide false positives and
    # reduce the score to that class's recall.
    f1_scores = f1_score(
        vy, predicted, labels=severity_classes, average=None
    )

    run = [
        classifier.oob_score_,
        classifier.score(vx, vy),
        mean_squared_error(vy, predicted) ** 0.5,
    ]
    run.extend(f1_scores)
    run.extend(classifier.feature_importances_)

    runs.append(run)

## Summarize and plot results

In [None]:
# Show results from each run. Column order must match the order in which
# values are added to each run: summary metrics, per-class F1 scores, then
# one importance value per feature.
metric_cols = ["oob", "score", "rmse"]
f1_cols = ["f1_low", "f1_mod", "f1_high"]
importance_cols = [slugify(f"fi_{name}") for name in features]

cols = metric_cols + f1_cols + importance_cols
results = pd.DataFrame(runs, columns=cols)

results

In [None]:
# Summarize every column across the separate runs
summary_stats = ["mean", "std", "min", "max", "count"]
agg_results = results.agg(summary_stats)
agg_results

In [None]:
# Create the confusion matrix by pairing the validation labels with the
# predictions of the most recently fitted classifier
conf_mtx = pd.DataFrame({
    "truth": vy,
    "predict": classifier.predict(vx),
})

# Cross-tabulate predictions, including row and column totals
crosstab = pd.crosstab(conf_mtx["truth"], conf_mtx["predict"], margins=True)
crosstab

In [None]:
# Apply the model to the full dataset. Every band is flattened to 1D and
# non-finite cells are replaced with -9999 before predicting.
flat_bands = [np.ravel(band) for band in xdata["all"]]
x = np.array(flat_bands)
not_finite = ~np.isfinite(x)
x[not_finite] = -9999
y = classifier.predict(x.T)

In [None]:
# Plot model prediction compared to original values

# Create a plottable copy of the truth data
truth = datasets[LABELS].copy()
truth = truth.rio.clip(woolsey_fire.geometry)
truth = truth.where(truth != -9999, np.nan)

# Create a plottable copy of the model prediction. The flat prediction is
# reshaped to match the label layer before the metadata is copied over.
prediction = y.copy()
prediction = prediction.reshape(*datasets[LABELS].shape)
prediction = copy_xr_metadata(datasets[LABELS], prediction)
prediction = prediction.rio.clip(woolsey_fire.geometry)

# Show only the pixels that were eligible for the model
mask = xr.where(cond, True, False).values
truth = truth.where(mask)
prediction = prediction.where(mask)

# Use fixed color limits so both panels share the same scale
vmin = 1
vmax = 5

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 10))
fig.suptitle(f"Random Forest - {LABELS}\n{', '.join(features)}")

# Draw each panel, then outline the fire perimeter on top of it
for ax, layer, title in ((ax1, truth, "Measured"), (ax2, prediction, "Predicted")):
    plot_bands(layer,
               ax=ax,
               title=title,
               cmap=cmap_dnbr,
               vmin=vmin,
               vmax=vmax,
               cbar=False,
               scale=False)
    woolsey_fire.plot(ax=ax, facecolor="none", edgecolor="black", linewidth=1)

ep.draw_legend(im_ax=ax1.get_images()[0],
               classes=range(5),
               titles=labels_dnbr,
               bbox=(0.01, 0.99))

## Save model parameters and results

In [None]:
# Collect the configuration of this run in a dict. Only JSON-friendly values
# are stored, so the classifier is recorded by its class name.
run_params = {
    "classifier_params": CLASSIFIER_PARAMS,
    "sampling_params": SAMPLING_PARAMS,
}
model = dict(
    classifier=CLASSIFIER.__name__,
    features=features,
    labels=LABELS,
    params=run_params,
)

In [None]:
# Use the model parameters to create a directory for this run. The name is
# the mean RMSE followed by an MD5 hash of the configuration, so runs with
# the same configuration share a hash. Keys are sorted so that the hash does
# not depend on the order in which the dict was built.
json_model = json.dumps(model, sort_keys=True).encode("utf-8")
md5_hash = hashlib.md5(json_model).hexdigest()

# Strip the leading "0." from RMSE values below one
dirname = f"{agg_results['rmse']['mean']:.3f}_{md5_hash}".replace("0.", "")
outdir = os.path.join("models", dirname)

# Reuse the directory if it already exists, but let any other failure (for
# example, a permissions error) surface here instead of when saving files
os.makedirs(outdir, exist_ok=True)

In [None]:
# Save the comparison map as a PNG
map_path = os.path.join(outdir, "map.png")
fig.savefig(map_path, bbox_inches="tight")

# Save the model configuration as JSON alongside the map
model_path = os.path.join(outdir, "model.json")
with open(model_path, "w", encoding="utf-8") as f:
    json.dump(model, f, indent=2)
  

In [None]:
# Save crosstab and result tables as HTML. Each table is transposed and
# written after a small inline stylesheet.
css = (
    "<style>"
    "table {border-collapse: collapse;}"
    "td, th { text-align: center; padding: 8px; }"
    "th { background-color: #eee; }"
    "</style>"
)

for table, filename in ((crosstab, "crosstab.htm"), (agg_results, "results.htm")):
    html = table.transpose().to_html(float_format=float_format)
    # Write UTF-8 explicitly, matching model.json, so the output does not
    # depend on the platform's default encoding
    with open(os.path.join(outdir, filename), "w", encoding="utf-8") as f:
        f.write(css + "\n" + html)

In [None]:
# Save sampling masks as PNG, with the training and validation masks side by
# side and the fire perimeter outlined on each
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 16))

panels = (
    (ax1, sampling_mask[0], "Training"),
    (ax2, sampling_mask[1], "Validation"),
)
for ax, layer, title in panels:
    plot_bands(layer, ax=ax, cbar=None, title=title)
for ax in (ax1, ax2):
    woolsey_fire.plot(ax=ax, facecolor="none", edgecolor="gray", linewidth=1)

masks_path = os.path.join(outdir, "sampling_masks.png")
fig.savefig(masks_path, bbox_inches="tight")

In [None]:
# Save histograms of labels as PNG. One panel each for the full dataset,
# the training subset, and the validation subset.
response = np.array([a[LABELS] for a in [datasets, training, validation]])

fig, axes = plt.subplots(3, 1, figsize=(8, 12))
for ax, arr, title in zip(axes, response, ["All", "Training", "Validation"]):
    # Count the finite label values in a single pass
    values, counts = np.unique(arr[np.isfinite(arr)], return_counts=True)
    ax.bar(values, counts)
    ax.set(title=title, xlabel="bin", ylabel="counts")

fig.savefig(os.path.join(outdir, "sampling_histograms.png"),
            bbox_inches="tight")