In [None]:
import os, math
import pandas as pd
import geopandas as gpd
import numpy as np
import gdal, osr
import rasterio as rio
import numpy as np
from datetime import datetime
#from sklearn.linear_model import LogisticRegression, BayesianRidge, LinearRegression, LarsCV
from sklearn.ensemble import RandomForestRegressor

# make sure that statsmodel (latest in conda forge) and pytest (conda forge) are installed
import statsmodels.api as sm
import statsmodels.formula.api as smf
import Utilities as utils
import logging

In [None]:
def getRasterNamesList(pdir):
    """Collect every ``.tif`` / ``.img`` file found anywhere below ``pdir``.

    Args:
        pdir: Directory that is walked recursively with ``os.walk``.

    Returns:
        A two-item list ``[names, paths]``. ``names`` holds each file name
        with its last four characters (the extension) removed; ``paths`` holds
        the matching full path with backslashes replaced by forward slashes.
        Files whose lower-cased name contains ``"elev_meters"`` are placed at
        the front of both lists; all others keep the order they were found in.
    """
    names, paths = [], []

    for folder, _subfolders, filenames in os.walk(pdir):
        for filename in filenames:
            if not filename.endswith((".tif", ".img")):
                continue

            full_path = os.path.join(folder, filename).replace("\\", "/")

            # Elevation rasters go to the head of the lists, the rest to the tail.
            position = 0 if "elev_meters" in filename.lower() else len(names)
            names.insert(position, filename[:-4])
            paths.insert(position, full_path)

    return [names, paths]


def createClassifiedFile(rasters, regmodel, loc_classified_file, overwrite=False):
    """Run a fitted model over a stack of predictor rasters and write the result.

    Band 1 of every raster in ``rasters`` is read as float32, the bands are
    stacked so that each pixel becomes one sample with one feature per raster
    (in the order given), and ``regmodel.predict`` is called once on all pixels.
    The prediction is written as a single float32 band using the profile of the
    last raster read.

    Args:
        rasters: Sequence of raster paths. Their order is the feature order
            handed to ``regmodel.predict``.
        regmodel: Object exposing ``predict(X)`` for an ``(n_pixels, n_rasters)``
            array and returning one value per pixel.
        loc_classified_file: Output raster path.
        overwrite: When False and the output already exists, nothing is done.

    Returns:
        ``loc_classified_file``, whether or not the file was (re)written.

    Raises:
        ValueError: If the output must be created but ``rasters`` is empty.
    """
    cl_start = datetime.now()

    if os.path.exists(loc_classified_file) and not overwrite:
        logging.info("The file exists and no overwrite set. Skipping creating %s" % loc_classified_file)
        return loc_classified_file

    if len(rasters) == 0:
        raise ValueError("No predictor rasters supplied; cannot create %s" % loc_classified_file)

    logging.info("Creating classified file: %s..." % loc_classified_file)

    # Read band 1 of every predictor; the profile of the last raster read is
    # reused for the output, as before.
    bands_data = []
    profile = None
    for inras in rasters:
        logging.debug("Reading in raster file as array - %s" % inras)
        with rio.open(inras) as raster:
            profile = raster.profile
            bands_data.append(raster.read(1).astype(rio.float32))

    logging.debug("Creating numpy array stack...")
    bands_data = np.dstack(bands_data)
    rows, cols, n_bands = bands_data.shape

    # One row per pixel, one column per predictor raster.
    logging.debug("Reshaping numpy array to raster shape...")
    flat_pixels = bands_data.reshape((rows * cols, n_bands))

    logging.debug("Predicting Valley Bottoms...")
    result = regmodel.predict(flat_pixels)

    # Fold the per-pixel predictions back into the raster grid.
    classification = result.reshape((rows, cols))

    # Only one band is written, so force count=1 even if the template raster
    # had more bands.
    # NOTE(review): nodata=1 is kept from the original code; confirm that it
    # cannot collide with a legitimate prediction of 1.0.
    profile.update(
        dtype=rio.float32,
        nodata=1,
        count=1
    )

    with rio.open(loc_classified_file, 'w', **profile) as outras:
        outras.write_band(1, classification.astype(rio.float32))

    logging.info("Classification created:\n\t%s in %s", loc_classified_file, datetime.now() - cl_start)

    return loc_classified_file


def _quadrantExtents(minx, miny, maxx, maxy):
    """Split a bounding box into four equal quadrants.

    Returns a dict keyed 1-4 whose values are ``(minx, miny, maxx, maxy)``
    tuples: 1 = upper-left, 2 = upper-right, 3 = lower-left, 4 = lower-right
    (assuming ``maxy`` is the top edge).
    """
    midx = minx + ((maxx - minx) / 2)
    midy = miny + ((maxy - miny) / 2)
    return {
        1: (minx, midy, midx, maxy),
        2: (midx, midy, maxx, maxy),
        3: (minx, miny, midx, midy),
        4: (midx, miny, maxx, midy),
    }


def rasterSubDivide(preds_dir, overwrite=False):
    """Clip every predictor raster into four quadrant GeoTIFFs.

    For each raster returned by ``getRasterNamesList(preds_dir)`` the extent is
    halved in x and y and each quarter is written with ``gdal.Warp`` to
    ``<parent of preds_dir>/predictors_quads/quad<N>/<raster name>_quad<N>.tif``,
    keeping the source projection and pixel size.

    Args:
        preds_dir: Directory holding the predictor rasters.
        overwrite: When False, quadrant files that already exist are kept.

    Returns:
        Path of the ``predictors_quads`` directory.

    Raises:
        IOError: If GDAL cannot open one of the rasters.
        RuntimeError: If ``gdal.Warp`` fails to produce a quadrant.
    """
    parent_dir = utils.getParentDir(preds_dir)
    outdir = os.path.join(parent_dir, "predictors_quads")
    utils.useDirectory(outdir)

    for raster in getRasterNamesList(preds_dir)[1]:
        reference_f = gdal.Open(raster)
        if reference_f is None:
            raise IOError("GDAL could not open raster %s" % raster)

        geo_transform = reference_f.GetGeoTransform()
        proj = reference_f.GetProjectionRef()
        xsize = reference_f.RasterXSize
        ysize = reference_f.RasterYSize
        reference_f = None  # release the GDAL handle as soon as the metadata is read

        resx = geo_transform[1]
        resy = geo_transform[5]
        minx = geo_transform[0]
        maxy = geo_transform[3]
        maxx = minx + (resx * xsize)
        miny = maxy + (resy * ysize)

        quad_extents = _quadrantExtents(minx, miny, maxx, maxy)

        logging.debug("Clipping Quads for %s" % raster)
        for i in range(1, 5):
            quad_name = "quad" + str(i)
            quad_dir = os.path.join(outdir, quad_name)

            utils.useDirectory(quad_dir)

            oname = os.path.splitext(os.path.basename(raster))[0] + "_" + quad_name + ".tif"
            opath = os.path.join(quad_dir, oname)

            if os.path.exists(opath) and not overwrite:
                continue

            extent = [str(value) for value in quad_extents[i]]

            # Pass the arguments as a list: the projection is WKT (spaces and
            # double quotes), which does not survive being re-tokenised from a
            # single command-line string. The y pixel size of a north-up raster
            # is negative, so -tr gets its magnitude.
            warp_args = ["-overwrite",
                         "-t_srs", proj,
                         "-tr", str(resx), str(abs(resy)),
                         "-te_srs", proj,
                         "-te"] + extent

            logging.info("Executing gdal_warp operation on %s with extent %s" % (raster, " ".join(extent)))
            warped = gdal.Warp(opath, raster, options=warp_args)
            if warped is None:
                raise RuntimeError("gdal.Warp failed to create %s" % opath)
            warped = None  # close the dataset so it is flushed to disk

    return outdir

def specifyTrainingRasters(alist, to_remove):
    """Return the sorted entries of ``alist`` that match none of ``to_remove``.

    Args:
        alist: Iterable of strings (raster names or paths). It is not modified.
        to_remove: Iterable of substrings; an entry containing any of them is
            dropped.

    Returns:
        A new, sorted list of the surviving entries.
    """
    # any() excludes an entry exactly once, so an entry matching several
    # substrings no longer triggers a second list.remove() and a ValueError.
    return sorted(
        ras for ras in alist
        if not any(string in ras for string in to_remove)
    )

def pickSampleRaster(directory, fileName):
    """Return the path of the first file below ``directory`` named ``fileName``.

    The tree is searched with ``os.walk``; the name must match exactly.
    ``None`` is returned when no such file exists.
    """
    for current_dir, _subdirs, entries in os.walk(directory):
        if fileName in entries:
            return os.path.join(current_dir, fileName)
    return None

def extractValuesToVBPoints(watersheds_shp, vb_classification_pnts, watershedsDir):
    """Sample every predictor raster at the valley-bottom training points.

    The training points are spatially joined to the watershed polygons,
    reprojected to the coordinate system of the first ``elev_cm.tif`` found
    under ``watershedsDir``, and then, watershed by watershed (grouped on the
    ``HUC4`` column), the value of each raster in that watershed's
    ``Predictors`` directory is written into a column named after the raster.

    Args:
        watersheds_shp: Path to the watershed polygon file (must provide a
            ``HUC4`` attribute).
        vb_classification_pnts: Path to the training point file.
        watershedsDir: Directory with one sub-directory per watershed; each
            is expected to hold a ``*Rasters*`` folder with a ``Predictors``
            sub-folder containing ``elev_meters.tif``.

    Returns:
        dict with keys ``"points"`` (the GeoDataFrame with the sampled
        columns), ``"raster_paths"`` and ``"raster_names"`` (both taken from
        the last watershed processed; empty lists if there were no points).

    Raises:
        FileNotFoundError: If no ``elev_cm.tif`` exists under
            ``watershedsDir``, if a watershed has no matching directory or
            ``*Rasters*`` folder, or if ``elev_meters.tif`` is missing.
        RuntimeError: If the VRT of the float32 predictors cannot be built.
    """
    watersheds_df = gpd.read_file(watersheds_shp)
    class_points_df = gpd.read_file(vb_classification_pnts)

    if not watersheds_df.crs == class_points_df.crs:
        class_points_df.to_crs(watersheds_df.crs, inplace=True)

    vbpoints_ras_extract = gpd.sjoin(class_points_df, watersheds_df, op='within')

    sample_raster = pickSampleRaster(watershedsDir, "elev_cm.tif")
    if sample_raster is None:
        raise FileNotFoundError("No elev_cm.tif found under %s" % watershedsDir)

    ras = gdal.Open(sample_raster)
    ras_proj = ras.GetProjection()
    ras = None  # release the GDAL handle

    osr.UseExceptions()
    spatialRef = osr.SpatialReference()
    # Apparently osr has difficulties identifying albers projections
    prjText = ras_proj.replace('"Albers"', '"Albers_Conic_Equal_Area"')
    spatialRef.ImportFromWkt(prjText)
    ras_proj_proj4 = spatialRef.ExportToProj4()

    logging.debug("Reprojecting training points to cooridinate system of rasters...")
    vbpoints_ras_extract.to_crs(ras_proj_proj4, inplace=True)

    # Defined up front so the return statement works even with zero groups.
    rasters = []
    raster_names = []

    for watershed, group in vbpoints_ras_extract.groupby("HUC4"):
        logging.info("Starting extraction on points in watershed %s" % watershed)

        # FIND RELEVANT WATERSHED DIRECTORY
        # TODO This is a redunant workflow in this and the VBET Script
        w_dir = None
        for wdir in os.listdir(watershedsDir):
            if watershed in wdir:
                logging.debug("--- BEGINNING ON WATERSHED %s ---" % wdir)
                w_dir = os.path.join(watershedsDir, wdir)
                break
        if w_dir is None:
            raise FileNotFoundError("No directory for watershed %s in %s" % (watershed, watershedsDir))

        # Resolve the rasters folder first, and only then derive the predictors
        # folder, so a value from a previous watershed can never leak through.
        rasters_dir = None
        for subdir in os.listdir(w_dir):
            if "Rasters" in subdir:
                rasters_dir = os.path.join(w_dir, subdir)
        if rasters_dir is None:
            raise FileNotFoundError("No 'Rasters' directory found in %s" % w_dir)

        predictors_dir = os.path.join(rasters_dir, "Predictors")

        # simple check to make sure that predictors have been made.
        # TODO - initiate calculation if not
        elev_raster = os.path.join(predictors_dir, "elev_meters.tif")
        if not os.path.exists(elev_raster):
            message = "PROBLEM - %s doesn't exist in directory %s" % ("elev_meters.tif", predictors_dir)
            logging.error(message)
            raise FileNotFoundError(message)

        float32_raster_paths = []
        raster_names = []
        for root, dirs, files in os.walk(predictors_dir):
            for file in files:
                if file.endswith(".tif") or file.endswith(".img"):
                    if file.lower() != "elev_meters.tif":
                        raster_names.append(file[:-4])  # append to list without file extension
                        float32_raster_paths.append(os.path.join(root, file))

        rasters = [elev_raster] + float32_raster_paths
        raster_names = ["elev_meters"] + raster_names

        # Create the value columns only once: re-assigning NaN to an existing
        # column would wipe the values sampled for earlier watersheds.
        for name in raster_names:
            if name not in vbpoints_ras_extract.columns:
                vbpoints_ras_extract[name] = np.nan

        # Build VRT to makes extraction easier/simpler. Can't include elev_meters because different data type
        logging.debug("Building VRT of FLOAT32 Rasters...")
        vrt_of_rasters = os.path.join(predictors_dir, "float32_predictors.vrt")
        # Library call instead of a shell command: no quoting issues with
        # paths that contain spaces, and failures are detectable.
        vrt_ds = gdal.BuildVRT(vrt_of_rasters, float32_raster_paths, separate=True)
        if vrt_ds is None:
            raise RuntimeError("Could not build VRT %s" % vrt_of_rasters)
        vrt_ds = None  # closing the dataset writes the VRT to disk

        with rio.open(elev_raster) as elev_ras, rio.open(vrt_of_rasters) as float32_ras:

            def get_values(geom):
                # One (x, y) pair per point; elevation first, then every VRT
                # band, matching the order of raster_names.
                location = [(geom.centroid.x, geom.centroid.y)]
                values = []
                for dataset in (elev_ras, float32_ras):
                    for val in dataset.sample(location):
                        values += val.tolist()
                return pd.Series(values, index=raster_names)

            in_watershed = vbpoints_ras_extract.HUC4 == watershed
            vbpoints_ras_extract.loc[in_watershed, raster_names] = \
                vbpoints_ras_extract.loc[in_watershed, "geometry"].apply(get_values)

    return {"points": vbpoints_ras_extract, "raster_paths": rasters, "raster_names": raster_names}

In [None]:
def _selectQuadRasters(quad_dir, quad_name, values_to_train_on):
    """Return the rasters of one quadrant, ordered like the training columns.

    Quadrant rasters are looked up by name as ``<training name>_<quad_name>``
    so the feature order used for prediction matches the order used for
    fitting. If that lookup is incomplete, the previous behaviour (sorted
    paths minus the TPI_20/TPI_30 rasters) is used instead.
    """
    raster_paths = sorted(getRasterNamesList(quad_dir)[1])

    by_stem = {}
    for path in raster_paths:
        stem = os.path.splitext(os.path.basename(path))[0].lower()
        by_stem[stem] = path

    wanted = [("%s_%s" % (name, quad_name)).lower() for name in values_to_train_on]
    if all(stem in by_stem for stem in wanted):
        return [by_stem[stem] for stem in wanted]

    logging.warning("Could not match all training variables to rasters by name in %s; "
                    "falling back to sorted order", quad_dir)
    rasters_to_use = sorted(specifyTrainingRasters(raster_paths, ["TPI_20", "TPI_30"]))
    if len(rasters_to_use) != len(values_to_train_on):
        raise ValueError("Found %d rasters in %s but the model was trained on %d variables"
                         % (len(rasters_to_use), quad_dir, len(values_to_train_on)))
    return rasters_to_use


def regressValleyBottoms(vbpoints_ras_extract, values_to_train_on, nhdDir, watershedsDir, overwrite=False):
    """Fit a random-forest regressor on the training points and predict rasters.

    The model is fitted on the ``values_to_train_on`` columns against the
    ``VB`` column. For every watershed present in the points (``HUC4`` column)
    the predictor rasters are split into quadrants, each quadrant is predicted
    into ``<watershed dir>/RSAC_temp``, the quadrants are merged into one
    raster per watershed, and finally all watershed rasters are merged into
    ``<nhdDir>/RSAC_ValleyBottoms.tif``.

    Args:
        vbpoints_ras_extract: DataFrame holding ``HUC4``, ``VB`` and one
            column per entry of ``values_to_train_on``.
        values_to_train_on: List of predictor column names, in model order.
        nhdDir: Directory that receives the state-wide raster.
        watershedsDir: Directory with one sub-directory per watershed.
        overwrite: Controls only whether existing merged rasters are rebuilt.

    Raises:
        FileNotFoundError: If a watershed has no matching directory or no
            ``*Rasters*`` folder.
        ValueError: If a quadrant's rasters cannot be matched to the
            training variables.
    """
    logging.debug("Beginning Regression Training")
    n_job = 2
    msl = 20
    # Defined once up front: it is needed after the loops, even if they are empty.
    modeltype = "RandomForestsReg"

    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
    regressor = RandomForestRegressor(n_jobs=n_job, verbose=True, min_samples_leaf=msl)

    # Drop incomplete rows once so predictors and target always have the same
    # rows (dropping them separately diverges when only VB is missing).
    feature_columns = list(values_to_train_on)
    training = vbpoints_ras_extract[feature_columns + ["VB"]].dropna()
    regressor.fit(training[feature_columns], training["VB"])

    # CREATE CLASSIFIED RASTERS FOR QUARTER QUADS USED IN TRAINING DATA FIRST
    for watershed, group in vbpoints_ras_extract.groupby("HUC4"):

        # FIND RELEVANT WATERSHED DIRECTORY
        ## TODO, this is a redundant workflow in this file and in the VBET process. Create function to replace
        watershedDir = None
        for wdir in os.listdir(watershedsDir):
            if watershed in wdir:
                logging.debug("--- BEGINNING ON WATERSHED %s ---" % wdir)
                watershedDir = os.path.join(watershedsDir, wdir)
                break
        if watershedDir is None:
            raise FileNotFoundError("No directory for watershed %s in %s" % (watershed, watershedsDir))

        rasters_dir = None
        for subdir in os.listdir(watershedDir):
            if "Rasters" in subdir:
                rasters_dir = os.path.join(watershedDir, subdir)
        if rasters_dir is None:
            raise FileNotFoundError("No 'Rasters' directory found in %s" % watershedDir)

        predictors_dir = os.path.join(rasters_dir, "Predictors")

        # This step divide the watershed predictors into 4 quadrants and return the location of the folder
        # NOTE(review): existing quadrant files are always reused here, regardless of `overwrite`.
        quadsDir = rasterSubDivide(predictors_dir, overwrite=False)

        # Set the output directory to write prediction rasters to
        outquad_preds_dir = os.path.join(watershedDir, "RSAC_temp")
        utils.useDirectory(outquad_preds_dir)

        for subdir in os.listdir(quadsDir):

            dirpath = os.path.join(quadsDir, subdir)
            if not os.path.isdir(dirpath):
                continue

            rasters_to_use = _selectQuadRasters(dirpath, subdir, values_to_train_on)

            for i, (name, path) in enumerate(zip(values_to_train_on, rasters_to_use)):
                logging.debug("%d %s %s", i, name, os.path.basename(path))

            logging.debug("Starting on directory : %s" % dirpath)

            output_fname = "VB_" + watershed + "_" + subdir + "_" + modeltype + ".tif"
            loc_classified_file = os.path.join(outquad_preds_dir, output_fname)

            # NOTE(review): quadrant predictions are always regenerated, regardless of `overwrite`.
            createClassifiedFile(rasters_to_use, regressor, loc_classified_file, overwrite=True)

        # NEED TO MERGE QUADS OF WATERSHED BACK TO ONE THE WATERSHED
        watershed_rsac_name = "VB_" + watershed + "_" + modeltype + ".tif"
        watershed_rsac_path = os.path.join(watershedDir, watershed_rsac_name)

        if not os.path.exists(watershed_rsac_path) or overwrite:
            quadfiles = []
            for file in os.listdir(outquad_preds_dir):
                if modeltype in file and file.endswith(".tif"):
                    fpath = os.path.join(outquad_preds_dir, file)
                    quadfiles.append(fpath)

            utils.mergeRasters(quadfiles, watershed_rsac_path)

    # MERGE ALL WATERSHEDS TO ONE RASTER FOR WHOLE STATE
    state_rsac_name = "RSAC_ValleyBottoms.tif"
    state_rsac_path = os.path.join(nhdDir, state_rsac_name)

    if not os.path.exists(state_rsac_path) or overwrite:
        watershedfiles = []
        for w_dir in os.listdir(watershedsDir):
            watershedDir = os.path.join(watershedsDir, w_dir)
            if not os.path.isdir(watershedDir):
                # Stray files next to the watershed folders cannot be listed.
                continue
            for file in os.listdir(watershedDir):
                if modeltype in file and file.endswith(".tif"):
                    fpath = os.path.join(watershedDir, file)
                    watershedfiles.append(fpath)

        utils.mergeRasters(watershedfiles, state_rsac_path)

In [None]:
def valleyBottomRegression(watersheds_shp, vb_classification_pnts, nhd_dir, watersheds_dir):
    """Run the whole workflow: sample predictors at the points, then regress.

    Args:
        watersheds_shp: Path to the watershed polygon file.
        vb_classification_pnts: Path to the training point file; rows are
            used only where its ``Use`` column equals 1.
        nhd_dir: Directory passed on to ``regressValleyBottoms`` as ``nhdDir``.
        watersheds_dir: Directory with one sub-directory per watershed.
    """
    extraction_variables = extractValuesToVBPoints(watersheds_shp, vb_classification_pnts, watersheds_dir)

    vbpoints_raster_values = extraction_variables["points"]
    raster_names = extraction_variables["raster_names"]

    # remove the large TPI rasters. Created in prep, but not useful
    rasters_to_not_use = ["TPI_20", "TPI_30"]

    rasters_to_regress_with = specifyTrainingRasters(raster_names, rasters_to_not_use)

    # The list must go through a %s placeholder; passed as a bare extra
    # argument it makes the logging module report a formatting error.
    logging.debug("Rasters to use in regression: %s", rasters_to_regress_with)

    # Only some training data will be used. Those marked with a 1 in 'Use' column
    vbpoints_raster_values = vbpoints_raster_values[vbpoints_raster_values.Use == 1]

    regressValleyBottoms(vbpoints_raster_values, rasters_to_regress_with, nhd_dir, watersheds_dir)

In [None]:
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Data root. Defaults to the original workstation path; set the
    # VB_NHD_DIR environment variable to run against data stored elsewhere.
    nhd_dir = os.path.abspath(os.environ.get("VB_NHD_DIR", r"M:\Data\NHD"))
    watersheds_dir = os.path.join(nhd_dir, "Watersheds")

    vb_classification_pnts = os.path.join(nhd_dir, "VM_TrainingData_20180619.shp")
    watersheds_shp = os.path.join(nhd_dir, "WBDHU4_Arizona.shp")

    valleyBottomRegression(watersheds_shp, vb_classification_pnts, nhd_dir, watersheds_dir)

    logging.info("Finished creating valley bottoms from regression")

In [None]:
"""
Experimental section (inert: this whole cell is a single string literal, so none of the code below is executed)

### Classical RSAC calculation fitting distribution of training variables to curve

formula = 'VB ~ ' + "+".join(values_to_train_on)

print( formula)
regressor = smf.glm(formula=formula, data=vbpoints_raster_values, family=sm.families.Binomial())
coefficients = regressor.fit().params
#coefficients

intercept = coefficients.Intercept
coeffs = coefficients[1:]

predictors = removeTPIs(getRasterNamesList(r"M:\Data\NHD\Rasters\HRNHDPlusRasters1504\predictors_quads\quad1")[1])
predictors

calc = 0
for i in range(len(coefficients)-1):
    print("Beginning on %s..." % predictors[i])
    with rio.open(predictors[i]) as raster:
        raster_array = raster.read(1).astype(float)
        
    print("\tCalculating on %s..." % predictors[i])
    calc += (float(coeffs[i]) * raster_array)
    
    del raster_array
    
calc += intercept

outfile = r"M:\Data\NHD\RSAC_VB_Preds\glm_test_1505_quad1.tif"
with rio.open(predictors[0]) as ras:
    kwargs = ras.profile

print("Calculating Valley Bottoms...\n")
#fp = eval(string)
lp = 1.0/(1.0 + np.exp(-1.0 * calc))

kwargs.update(dtype=np.float64)

print("Writing calculation to outfile %s...\n" % outfile)
with rio.open(outfile, 'w', **kwargs) as dst:
    dst.write_band(1, lp.astype(np.float64))
"""