In [4]:
import logging, math
import geopandas as gpd
import rasterio as rio
import numpy as np
import fiona
from datetime import datetime
import os
import shutil
import pandas as pd
import gdal
from pyproj import transform, Proj

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.externals import joblib
from shapely.geometry import Point

import Utilities as utils
import Create_Classification_Points
from RasterCalculations import *
from StackGeneration import generateStack

import logging
from joblib import Parallel, delayed


ImportError: No module named 'StackGeneration'

In [None]:
# Notebook-wide logging: show INFO and above for this notebook's own messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Quiet rasterio only. The previous `rio.logging.getLogger()` call passed no
# name and therefore returned the ROOT logger; setting that to ERROR also
# suppressed every logger.info() call made further down in this notebook.
riolog = logging.getLogger("rasterio")
riolog.setLevel(logging.ERROR)

# Wall-clock start; the extraction cell reports elapsed time relative to this.
start = datetime.now()

# NOTE(review): the functions below each take their own `overwrite` argument;
# no visible cell reads this module-level flag - confirm it is still needed.
overwrite = True

In [None]:
def getFullNAIPPath(naip_file, naipdir):
    """Return the path of the first file under ``naipdir`` whose name contains ``naip_file``.

    :param naip_file: substring to look for in file names (a NAIP file name or
                      a quarter-quad identifier)
    :param naipdir: directory that is searched recursively with ``os.walk``
    :return: ``os.path.join(root, file)`` of the first match found
    :raises FileNotFoundError: when no file name contains ``naip_file``. This is
            a subclass of ``Exception``, so callers that caught the previous
            bare ``Exception`` keep working.
    """
    for root, _dirs, files in os.walk(naipdir):
        for candidate in files:
            if naip_file in candidate:
                return os.path.join(root, candidate)

    # Carry the message on the exception as well as in the log, so the failure
    # is understandable wherever it surfaces.
    message = "Unable to find naip file %s in %s. Exiting" % (naip_file, naipdir)
    logging.error(message)
    raise FileNotFoundError(message)


def findSTDDevFile(dir, naip_file, band_num, windowsize):
    """Locate the standard-deviation raster for one NAIP band, generating it if absent.

    The search directory is ``<dir>/StdDev_<windowsize>px/band<band_num>``; both
    levels are passed to ``utilities.useDirectory`` before searching.

    :param dir: base data directory
    :param naip_file: NAIP file name or path; it is also handed to
                      ``standardDeviation`` when the raster has to be generated
    :param band_num: band number (string or integer)
    :param windowsize: window size in pixels used in the directory name
    :return: path of the first matching file
    :raises FileNotFoundError: when no file matches even after generation
    """
    window_dir = os.path.join(dir, "StdDev_" + str(windowsize) + "px")
    utilities.useDirectory(window_dir)
    # str(): the original concatenation failed when band_num was an integer.
    band_dir = os.path.join(window_dir, "band" + str(band_num))
    utilities.useDirectory(band_dir)

    # The original compared against an undefined name `f`. Match on the NAIP
    # file stem instead.
    # NOTE(review): assumes generated rasters embed the NAIP file stem in their
    # name - confirm against standardDeviation's naming.
    stem = os.path.splitext(os.path.basename(naip_file))[0]

    def _search():
        # First file under band_dir whose name contains the NAIP stem, else None.
        for root, _dirs, files in os.walk(band_dir):
            for candidate in files:
                if stem in candidate:
                    return os.path.join(root, candidate)
        return None

    found = _search()
    if found is None:
        # Not on disk yet: build it, then look again instead of failing outright.
        standardDeviation(naip_file, dir, window_size=windowsize, overwrite=False)
        found = _search()

    if found is None:
        message = "Unable to find file %s" % naip_file
        logging.error(message)
        raise FileNotFoundError(message)
    return found


def findVIFile(type, dir, f):
    """Locate the vegetation-index raster for a NAIP file, generating it if absent.

    :param type: vegetation index name passed on to ``vegIndexCalc`` (e.g. "NDVI")
    :param dir: directory searched recursively for the index raster
    :param f: substring (NAIP file name) that the raster's file name must contain
    :return: path of the first matching file, or ``None`` when nothing matches
             even after ``vegIndexCalc`` has run
    """
    def _search():
        # First file under `dir` whose name contains `f`, else None.
        for root, _dirs, files in os.walk(dir):
            for candidate in files:
                if f in candidate:
                    return os.path.join(root, candidate)
        return None

    found = _search()
    if found is None:
        # The original passed the unbound local `fpath` here, which always raised
        # UnboundLocalError. Resolve the NAIP source the same way the sibling
        # helpers (get_GaussianFile, get_STDDev_VRT) do.
        # NOTE(review): assumes vegIndexCalc's first argument is the NAIP raster
        # path - confirm against RasterCalculations.
        naip_path = getFullNAIPPath(f, naip_dir)
        vegIndexCalc(naip_path, dir, [type])
        # Return the freshly generated file rather than None, which the caller
        # cannot pass to os.path.normpath.
        found = _search()
    return found


def createSubSetLandsat(naip_path, landsat_file, opath, overwrite=False):
    """Warp ``landsat_file`` onto the grid and footprint of a NAIP raster with gdalwarp.

    The output is written to ``<opath>/Landsat8_<basename of naip_path>`` and is
    only (re)built when it does not exist or ``overwrite`` is true.

    :param naip_path: NAIP raster whose projection, resolution and extent define the target
    :param landsat_file: source raster to be warped
    :param opath: output directory
    :param overwrite: rebuild the output even if it already exists
    :return: path of the output raster (returned whether or not it was rebuilt)
    :raises IOError: when ``naip_path`` cannot be opened by GDAL
    """
    # Local import: the command is run from an argument list rather than a shell
    # string, so the projection WKT and any path containing spaces arrive at
    # gdalwarp as single arguments.
    import subprocess

    ssl_start = datetime.now()
    landsat_opath = os.path.join(opath, "Landsat8_" + os.path.basename(naip_path))

    if not os.path.exists(landsat_opath) or overwrite:
        reference_f = gdal.Open(naip_path)
        if reference_f is None:
            raise IOError("Unable to open NAIP raster %s" % naip_path)

        geo_transform = reference_f.GetGeoTransform()
        resx = geo_transform[1]
        resy = geo_transform[5]
        proj = reference_f.GetProjectionRef()
        minx = geo_transform[0]
        maxy = geo_transform[3]
        maxx = minx + (resx * reference_f.RasterXSize)
        # resy is added, not subtracted, exactly as before (north-up rasters
        # store a negative row step).
        miny = maxy + (resy * reference_f.RasterYSize)

        # Both branches of the previous NDSI/NDWI check selected "bilinear"
        # ("near" was commented out), so the choice is unconditional.
        resampletype = "bilinear"

        # build landsat tile from naip extent
        gdal_warp = [
            "gdalwarp", "-overwrite", "-tap",
            "-r", resampletype,
            "-t_srs", proj,
            "-tr", str(resx), str(resy),
            "-te_srs", proj,
            "-te", str(minx), str(miny), str(maxx), str(maxy),
            landsat_file, landsat_opath,
        ]
        logging.debug("Executing gdal_warp operation on %s for footprint of naip file %s" % (landsat_file, naip_path))
        try:
            status = subprocess.call(gdal_warp)
        except OSError as exc:
            # Stay best-effort like the old os.system call, but say what happened.
            logging.error("Unable to launch gdalwarp: %s" % exc)
        else:
            if status != 0:
                logging.error("gdalwarp exited with status %s while creating %s" % (status, landsat_opath))

        logging.debug("\tFinished qquad for %s landsat in %s" % (landsat_file, str(datetime.now() - ssl_start)))

    return landsat_opath


# FUNCTION TO WRITE OUT CLASSIFIED RASTER
def write_geotiff(fname, data, geo_transform, projection, classes, COLORS, data_type=gdal.GDT_Byte):
    """
    Write a 2-D array to a single-band GeoTIFF with a colour table and TIFF tags.
    :param fname: Path of the GeoTIFF file to create
    :param data: 2-D array to write; its shape is unpacked as (rows, cols)
    :param geo_transform: Geotransform applied with SetGeoTransform (e.g. the value
                          returned by gdal.Dataset.GetGeoTransform)
    :param projection: Projection definition string applied with SetProjection
                       (e.g. the value returned by gdal.Dataset.GetProjectionRef)
    :param classes: Sequence of class labels; pixel values 0..len(classes) receive
                    a colour-table entry
    :param COLORS: Sequence of "#rrggbb" strings indexed by pixel value
    :param data_type: GDAL data type of the output band
    """
    gtiff_driver = gdal.GetDriverByName('GTiff')
    n_rows, n_cols = data.shape
    out_ds = gtiff_driver.Create(fname, n_cols, n_rows, 1, data_type)
    out_ds.SetGeoTransform(geo_transform)
    out_ds.SetProjection(projection)

    out_band = out_ds.GetRasterBand(1)
    out_band.WriteArray(data)

    # One fully opaque palette entry per pixel value, including 0.
    palette = gdal.ColorTable()
    for value in range(len(classes) + 1):
        hex_code = COLORS[value]
        red, green, blue = (int(hex_code[pos:pos + 2], 16) for pos in (1, 3, 5))
        palette.SetColorEntry(value, (red, green, blue, 255))
    out_band.SetColorTable(palette)

    out_ds.SetMetadata({
        'TIFFTAG_COPYRIGHT': 'CC BY 4.0, AND BEN HICKSON',
        'TIFFTAG_DOCUMENTNAME': 'Land Cover Classification',
        'TIFFTAG_IMAGEDESCRIPTION': 'Random Forests Supervised classification.',
        'TIFFTAG_MAXSAMPLEVALUE': str(len(classes)),
        'TIFFTAG_MINSAMPLEVALUE': '0',
        'TIFFTAG_SOFTWARE': 'Python, GDAL, scikit-learn'
    })

    out_ds = None  # Drop the dataset reference to close the file
    return


def report_and_exit(txt, *args, **kwargs):
    """Log ``txt`` at ERROR level on the notebook logger, then stop execution.

    :param txt: log message (may contain %-style placeholders)
    :param args: positional arguments forwarded to ``logger.error``
    :param kwargs: keyword arguments forwarded to ``logger.error``
    :raises SystemExit: always, with exit status 1
    """
    logger.error(txt, *args, **kwargs)
    # Raise SystemExit directly instead of calling the interactive `exit` helper,
    # so control can never fall through to the caller's next statement.
    raise SystemExit(1)


def get_values(geom):
    """Sample every open predictor raster at the centroid of ``geom``.

    Reads the module-level raster handles rasnaip, rasnaipvis, rasgauss,
    raslandsat, rasNDSI and rasNDWI (in that order) and concatenates the band
    values each one returns.

    :param geom: geometry whose centroid x/y is sampled
    :return: pandas Series of the sampled values, indexed by ``rasters_names``
    """
    x = geom.centroid.x
    y = geom.centroid.y

    # Order matters: it has to line up with the labels in rasters_names.
    open_rasters = (rasnaip, rasnaipvis, rasgauss, raslandsat, rasNDSI, rasNDWI)

    values = []
    for raster in open_rasters:
        for sampled in raster.sample([(x, y)]):
            values.extend(np.ndarray.tolist(sampled))

    return pd.Series(values, index=rasters_names)


def getQQuadFromNAIP(f):
    """Return the quarter-quad id: the 2nd and 3rd underscore-separated fields of ``f``, joined by "_"."""
    fields = f.split("_")
    return "%s_%s" % (fields[1], fields[2])


def get_STDDev_VRT(naip_file):
    """Return the path of the per-quarter-quad standard-deviation VRT, building it if missing.

    ``standardDeviation`` is called for 3, 5 and 10 pixel windows and the rasters
    it returns are stacked (one VRT band each) with ``gdalbuildvrt -separate``.

    :param naip_file: NAIP file name; resolved to a full path under ``naip_dir``
    :return: ``<qquad_vrt_dir>/stddev/<qquad>_stddev.vrt``
    """
    naip_path = getFullNAIPPath(naip_file, naip_dir)
    qquad = getQQuadFromNAIP(naip_file)

    # An earlier approach looked the per-band rasters up in std3px_dir /
    # std5px_dir / std10px_dir; the list returned by standardDeviation is used
    # instead.
    rasters_stddev = []
    for window in (3, 5, 10):
        rasters_stddev += standardDeviation(naip_path, base_datadir, window_size=window, overwrite=False)

    vrt_stddev = os.path.join(qquad_vrt_dir, "stddev", qquad + "_stddev.vrt")
    if os.path.exists(vrt_stddev):
        return vrt_stddev

    build_vrt = "gdalbuildvrt -overwrite -separate %s %s" % (vrt_stddev, " ".join(rasters_stddev))
    logging.debug("BUILDING VRT WITH: \n\t%s" % build_vrt)
    os.system(build_vrt)
    return vrt_stddev


def get_GaussianFile(naip_file):
    """Return the Gaussian-filtered raster for a NAIP file, as produced by ``gaussianCalc``.

    :param naip_file: NAIP file name; resolved to a full path under ``naip_dir``
    :return: whatever ``gaussianCalc(naip_path, base_datadir, sigma=1, overwrite=False)``
             returns (callers open it as a raster path)
    """
    # The quarter-quad id was computed here but never used; that call is dropped.
    naip_path = getFullNAIPPath(naip_file, naip_dir)
    return gaussianCalc(naip_path, base_datadir, sigma=1, overwrite=False)
    
    
def get_VegIndicies_VRT(naip_file):
    """Return the path of the per-quarter-quad vegetation-index VRT, building it if missing.

    The VRT stacks one band per index, in this order: NDVI, SAVI, OSAVI, MSAVI2, EVI2.

    :param naip_file: NAIP file name used to find each index raster
    :return: ``<qquad_vrt_dir>/naipvis/<qquad>_naipvis.vrt``
    """
    qquad = getQQuadFromNAIP(naip_file)

    # (index name, directory holding that index) - the order defines the VRT band order.
    index_sources = (
        ("NDVI", ndvi_dir),
        ("SAVI", savi_dir),
        ("OSAVI", osavi_dir),
        ("MSAVI2", msavi2_dir),
        ("EVI2", evi2_dir),
    )
    rasters_float = [
        os.path.normpath(findVIFile(index_name, index_dir, naip_file)).replace("\\", "/")
        for index_name, index_dir in index_sources
    ]

    vrt_naipvis = os.path.join(qquad_vrt_dir, "naipvis", qquad + "_naipvis.vrt")
    if os.path.exists(vrt_naipvis):
        return vrt_naipvis

    build_vrt = "gdalbuildvrt -overwrite -separate %s %s" % (vrt_naipvis, " ".join(rasters_float))
    os.system(build_vrt)
    return vrt_naipvis


def createClassifiedFile(loc_NAIPFile, mltype, rf_classifier, overwrite=False):
    """Classify one NAIP quarter quad and derive its riparian classification raster.

    The predictor stack is built from, in order: the NAIP raster, the
    vegetation-index VRT, the Gaussian-filtered raster and the NDSI and NDWI
    Landsat subsets. Every band of every raster becomes one feature column.

    :param loc_NAIPFile: path of the NAIP raster to classify
    :param mltype: "RF" or "SVM"; only used to build the output file name
    :param rf_classifier: fitted estimator exposing ``predict``
    :param overwrite: rebuild outputs even when they already exist
    :return: path of the classified raster, or ``None`` when ``predict`` raised
             ``ValueError`` for this file
    :raises ValueError: when ``mltype`` is neither "RF" nor "SVM"
    """
    file = os.path.basename(loc_NAIPFile)
    qquad = getQQuadFromNAIP(file)

    # The output name encodes the model settings, read from module-level variables.
    if mltype == "RF":
        output_fname = mltype + "_D" + str(maxdepth) + "E" + str(n_est) + "MPL" + str(min_per_leaf) + "_" + qquad + ".tif"
    elif mltype == "SVM":
        # str(): the disabled SVM cell defines maxi as an integer, which the
        # original string concatenation could not handle.
        output_fname = mltype + "_" + str(maxi) + "_" + qquad + ".tif"
    else:
        # Previously fell through and failed later on an unbound output_fname.
        raise ValueError("Unsupported mltype %r; expected 'RF' or 'SVM'" % (mltype,))

    loc_classified_file = os.path.join(loc_classifiedQuarterQuads, output_fname)

    if not os.path.exists(loc_classified_file) or overwrite:
        cl_start = datetime.now()
        logging.info("\tClassifying landcover file at %s..." % (loc_classified_file))

        vrt_naipvis = get_VegIndicies_VRT(file)  # All float32
        #vrt_stddev = get_STDDev_VRT(file)  # All 8 bit ("byte")
        gaussf_path = get_GaussianFile(file)

        # The multi-band Landsat subset was removed from the stack for a quick
        # run; only the NDSI/NDWI derivatives are used here.
        #landsat_path = createSubSetLandsat(loc_NAIPFile, landsat_file, landsat_dir).replace("\\", "/")
        landsat_ndsi_path = createSubSetLandsat(loc_NAIPFile, ndsi_file, ndsi_dir).replace("\\", "/")
        landsat_ndwi_path = createSubSetLandsat(loc_NAIPFile, ndwi_file, ndwi_dir).replace("\\", "/")

        # GET RASTER INFO FROM INPUT
        # NEED TO GET BANDS DATA INTO SINGLE ARRAY FOR OUTPUT CLASSIFICATION
        bands_data = []
        for inras in [loc_NAIPFile, vrt_naipvis, gaussf_path, landsat_ndsi_path, landsat_ndwi_path]:
            try:
                raster_dataset = gdal.Open(inras, gdal.GA_ReadOnly)
            except RuntimeError as e:
                report_and_exit(str(e))
                raise  # only reached if report_and_exit returned
            if raster_dataset is None:
                # gdal.Open can hand back None instead of raising; fail here with
                # a clear message rather than on the attribute access below.
                report_and_exit("Unable to open raster %s" % inras)
                raise IOError("Unable to open raster %s" % inras)

            # Overwritten on every pass: the values written to the output are
            # those of the LAST raster in the list.
            geo_transform = raster_dataset.GetGeoTransform()
            proj = raster_dataset.GetProjectionRef()

            for b in range(1, raster_dataset.RasterCount + 1):
                band = raster_dataset.GetRasterBand(b)
                bands_data.append(band.ReadAsArray())

        # CREATE NP DATASTACK FROM ALL RASTERS
        bands_data = np.dstack(bands_data)
        # CREATE VARIABLES OF ROWS, COLUMNS, AND NUMBER OF BANDS
        rows, cols, n_bands = bands_data.shape
        n_samples = rows * cols

        # One row per pixel, one column per band
        flat_pixels = bands_data.reshape((n_samples, n_bands))

        classes = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

        # A list of colors for each class
        COLORS = [
            "#000000",  # 0 EMPTY
            "#00af11",  # 1 - Veg - Thick
            "#00e513",  # 2 - Veg - Sparse
            "#e9ff5a",  # 3 - Herbaceous
            "#f1ac34",  # 4 - Barren - Light
            "#a9852e",  # 5 - Barren - Dark
            "#2759ff",  # 6 - Water
            "#efefef",  # 7 - Roof - White
            "#d65133",  # 8 - Roof - Red
            "#cecece",  # 9 - Roof - Grey
            "#a0a0a0",  # 10 - Impervious - Light
            "#555555",  # 11 - Impervious - Dark
            "#000000"  # 12 - Shadows
        ]

        # Non-finite / NaN predictor values are not repaired here (an earlier
        # clamping experiment was disabled); predict() raises ValueError for
        # them and the file is skipped below.
        try:
            result = rf_classifier.predict(flat_pixels)
            # Reshape the result: split the labeled pixels into rows to create an image
            classification = result.reshape((rows, cols))

            # WRITE OUT THE CLASSIFIED ARRAY TO RASTER BASED ON PROPERTIES OF TRAINING RASTERS
            # TODO - Rewrite this to use rasterio for consistency
            write_geotiff(loc_classified_file, classification, geo_transform, proj, classes, COLORS)
            logging.info("\tCreated classified file in %s" % (str(datetime.now() - cl_start)))
        except (ValueError) as e:
            logging.info("-----------BAD VALUES FOR PREDICTORS. SKIPPING FILE %s\n%s" % (file, str(e)))
            return None

        # Release the large arrays before the riparian step.
        del bands_data
        del flat_pixels

    else:
        logging.info("LandCover file %s exists and no overwrite." % loc_classified_file)

    # Riparian outputs are grouped in sub-folders named after the first five
    # characters of the quarter-quad id.
    o_veg_loc = r"Q:\GoogleDrive\AridRiparianProject\WorkingDirectory\Data\RiparianClass_VBs"
    utilities.useDirectory(o_veg_loc)

    quadrant_loc = os.path.join(o_veg_loc, qquad[:5])
    utilities.useDirectory(quadrant_loc)

    riparian_class_qquad = os.path.join(quadrant_loc, "RiparianClassification_" + qquad + ".tif")

    if not os.path.exists(riparian_class_qquad) or overwrite:
        createRiparianClass(loc_classified_file, riparian_class_qquad, qquad)

    return loc_classified_file

def createRiparianClass(lc_raster, o_file, qquad):
    """Derive a riparian zone raster (0 upland, 1 xero, 2 meso, 3 hydro) from a land-cover raster.

    Local densities of dense vegetation (class 1) and sparse vegetation (class 2)
    are thresholded into zone codes, the two codes are merged per pixel through a
    fixed decision table, and the result is masked by the valley-bottom raster
    for ``qquad`` before being written to ``o_file``.

    :param lc_raster: path of the classified land-cover raster (band 1 is read)
    :param o_file: path of the riparian raster to write
    :param qquad: quarter-quad id used to find the valley-bottom raster
    """
    rc_start = datetime.now()
    logging.info("\tClassifying riparian zones for %s" % lc_raster)
    with rio.open(lc_raster) as class_file:
        class_array = class_file.read(1)
        kwargs = class_file.profile

    def _density_zones(class_value):
        # Zone code per pixel for one land-cover class, from its local density.
        presence = np.where(class_array == class_value, 1, 0)
        # Average density over the whole raster.
        # TODO - evaluate this on something more specific than the qquad area
        file_avg = np.mean(presence)
        local_mean = ndimage.uniform_filter(presence.astype(np.float32), size=vaa_diameter, mode='constant')

        # CRITICAL: lower limits at which a pixel counts as xero / meso / hydro.
        limits = (
            (1, file_avg + np.std(local_mean)),        # xero
            (2, file_avg + (1 - file_avg) * 0.7),      # meso
            (3, file_avg + (1 - file_avg) * 0.9),      # hydro
        )
        xero, meso, hydro = (np.where(local_mean > limit, code, 0) for code, limit in limits)
        # The highest zone whose limit is exceeded wins; 0 means upland.
        return np.maximum(np.maximum(xero, meso), hydro)

    sparse_combine = _density_zones(2)
    dense_combine = _density_zones(1)

    # Decision table: rows = dense zone code (0-3), columns = sparse zone code (0-3).
    decision = np.array([
        [0, 1, 2, 3],
        [1, 1, 2, 3],
        [1, 2, 2, 3],
        [2, 2, 3, 3],
    ])
    riparian = decision[dense_combine, sparse_combine]

    kwargs.update(
        dtype=np.uint8,
        nodata=0,
        compress='lzw'
    )

    valleybottom_ras = findVBRaster(qquad)
    with rio.open(valleybottom_ras) as vb_raster:
        vb_array = vb_raster.read(1).astype(np.float32)

    # Keep riparian codes only where the valley-bottom value exceeds 1.
    clipped_riparian = np.where(vb_array > 1, riparian, 0)

    zone_colors = {
        0: (255, 255, 255),
        1: (186,228,179),
        2: (116,196,118),
        3: (35,139,69)}
    with rio.open(o_file, 'w', **kwargs) as dst:
        dst.write_band(1, clipped_riparian.astype(np.uint8))
        dst.write_colormap(1, zone_colors)
        cmap = dst.colormap(1)  # read back as before; the value is not used

    logging.info("\tFinished riparian classification in %s" % (str(datetime.now()-rc_start)))


def findVBRaster(qquad, overwrite=False):
    """Return the valley-bottom raster clipped to a quarter quad, creating it if needed.

    The source raster ``VBET_VB_loc`` is warped with gdalwarp onto the
    projection, resolution and extent of the NAIP raster found for ``qquad``.

    :param qquad: quarter-quad id; used to find the NAIP raster and to name the output
    :param overwrite: rebuild the output even if it already exists
    :return: ``<loc_valleybottoms>/ValleyBottom_<qquad>.tif``
    :raises IOError: when the source or NAIP raster cannot be opened by GDAL
    """
    # Local import: the command is run from an argument list rather than a shell
    # string, so both projection WKT strings and any path containing spaces
    # arrive at gdalwarp as single arguments.
    import subprocess

    vb_start = datetime.now()
    logging.debug("Starting creation of subset of valley bottom...")
    o_path = os.path.join(loc_valleybottoms, "ValleyBottom_" + qquad + ".tif")

    # TODO - Duplicative scripting. Exists twice in this file and also in the VBET classification
    if not os.path.exists(o_path) or overwrite:
        # The NAIP lookup (a recursive directory walk) and the raster opens are
        # only needed when the subset has to be built, so they live inside the guard.
        naip_path = getFullNAIPPath(qquad, naip_dir)

        source_f = gdal.Open(VBET_VB_loc)
        if source_f is None:
            raise IOError("Unable to open valley bottom raster %s" % VBET_VB_loc)
        sproj = source_f.GetProjectionRef()

        reference_f = gdal.Open(naip_path)
        if reference_f is None:
            raise IOError("Unable to open NAIP raster %s" % naip_path)
        geo_transform = reference_f.GetGeoTransform()
        resx = geo_transform[1]
        resy = geo_transform[5]
        tproj = reference_f.GetProjectionRef()
        minx = geo_transform[0]
        maxy = geo_transform[3]
        maxx = minx + (resx * reference_f.RasterXSize)
        miny = maxy + (resy * reference_f.RasterYSize)

        resampletype = "bilinear"

        gdal_warp = [
            "gdalwarp", "-overwrite", "-tap",
            "-r", resampletype,
            "-s_srs", sproj,
            "-t_srs", tproj,
            "-tr", str(resx), str(resy),
            "-te_srs", tproj,
            "-te", str(minx), str(miny), str(maxx), str(maxy),
            VBET_VB_loc, o_path,
        ]
        try:
            status = subprocess.call(gdal_warp)
        except OSError as exc:
            # Stay best-effort like the old os.system call, but say what happened.
            logging.error("Unable to launch gdalwarp: %s" % exc)
        else:
            if status != 0:
                logging.error("gdalwarp exited with status %s while creating %s" % (status, o_path))

        logging.debug("\tFinished VB subset in %s" % (str(datetime.now() - vb_start)))

    return o_path
        
def get_class_value(geom):
    """Sample the module-level raster ``classras`` at the centroid of ``geom``.

    :param geom: geometry whose centroid x/y is sampled
    :return: pandas Series holding the first sampled value, indexed by
             ``[predicted_column]``; ``None`` if the raster yields no sample
    """
    location = [(geom.centroid.x, geom.centroid.y)]
    first_sample = next(iter(classras.sample(location)), None)
    if first_sample is None:
        return None
    return pd.Series(first_sample, index=[predicted_column])


def apply_and_concat(dataframe, field, func, column_names):
    """Apply ``func`` to every value of ``dataframe[field]`` and append the results as new columns.

    :param dataframe: input frame (not modified)
    :param field: name of the column whose cells are passed to ``func``
    :param func: callable returning one value per entry of ``column_names``
    :param column_names: labels of the columns to append
    :return: new frame made of ``dataframe`` plus the expanded columns
    """
    def _expand(cell):
        return pd.Series(func(cell), index=column_names)

    expanded = dataframe[field].apply(_expand)
    return pd.concat([dataframe, expanded], axis=1)


def calculateGeom(row):
    """Return the row's geometry, with UTM zone 11N points converted via ``transform(utm11, utm12, ...)``.

    :param row: record exposing "geometry" and "PROJ"
    :return: the original geometry, or for rows whose PROJ is
             "NAD83 / UTM zone 11N" a new Point built from the transformed centroid
    """
    geom = row["geometry"]
    if row['PROJ'] != "NAD83 / UTM zone 11N":
        return geom

    x = geom.centroid.x
    y = geom.centroid.y
    return Point(transform(utm11, utm12, x, y))

In [None]:
# Size of the moving window used for the vegetation density assessment.
veg_assessment_area = 0.1 # in acres
vaa_meters = veg_assessment_area * 4046.86  # acres -> square metres
vaa_radius = math.sqrt(vaa_meters/math.pi)  # radius of a circle with that area
# NOTE(review): createRiparianClass passes this float as the uniform_filter
# window size, i.e. it is treated as a pixel count - assumes 1 m pixels, confirm.
vaa_diameter = vaa_radius*2

# LOCATIONS OF FOLDERS HOLDING ALL INPUT RASTER DATA
naip_dir = os.path.abspath(r"Q:\Arid Riparian Project\Data\NAIP_2015_Compressed")

base_datadir = os.path.abspath(r"M:\Data")
ndvi_dir = os.path.join(base_datadir, "NDVI")
savi_dir = os.path.join(base_datadir, "SAVI")
osavi_dir = os.path.join(base_datadir, "OSAVI")
msavi2_dir = os.path.join(base_datadir, "MSAVI2")
evi2_dir = os.path.join(base_datadir, "EVI2")
ndwi_dir = os.path.join(base_datadir, "NDWI")
ndsi_dir = os.path.join(base_datadir, "NDSI")

std3px_dir = os.path.join(base_datadir, "StdDev_3px")
std5px_dir = os.path.join(base_datadir, "StdDev_5px")
std10px_dir = os.path.join(base_datadir, "StdDev_10px")

base_landsatdir = os.path.join(base_datadir, "Landsat8")
landsat_dir = os.path.join(base_landsatdir, "byNAIPDOY_QQuads")

# LOCATION OF LANDSAT RASTER
# (was `os.path.os.path.join`, which only resolved because the path module
# happens to expose its own `os` import)
landsat_file = os.path.join(base_landsatdir, "Landsat1to8_TOA_NAIPAcquiDate_merge_rectified.tif")

# LOCATION OF THE NDSI FILE
ndsi_file = os.path.join(ndsi_dir, "LandsatOLI_NDSI_30m.tif")

# LOCATION OF THE NDWI FILE
ndwi_file = os.path.join(ndwi_dir, "LandsatOLI_NDWI_30m.tif")

# LOCATION OF FILE CONTAINING CLASSIFICATION POINTS PRE-EXTRACT
loc_class_points = os.path.abspath(r"Q:\GoogleDrive\AridRiparianProject\WorkingDirectory\classificationPoints_join.shp")

# LOCATION OF FILE CONTAINING CLASSIFICATION POINTS POST EXTRACT
# ([:-4] strips the ".shp" extension)
loc_points_wRaster_extracts = loc_class_points[:-4] + "_extracts.shp"

# DIRECTORY HOLDING VRTS BY QUARTER QUAD FOR VEGETATION INDICES (FLOAT32) AND STD_DEV (UINT8)
qquad_vrt_dir = os.path.join(base_datadir, "QQuad_VRTs")

loc_classifiedQuarterQuads = os.path.join(base_datadir, "classifiedQuarterQuads")

nhd_dir = os.path.join(base_datadir, "NHD")
loc_valleybottoms = os.path.join(nhd_dir, "VBET_ValleyBottoms")
utilities.useDirectory(loc_valleybottoms)
VBET_VB_loc = os.path.abspath(r"M:\Data\NHD\VBET_ValleyBottoms_20180624.tif")

# DEFINE THE PROJECTION USED OVER ARIZONA. USED FOR TRANSLATING POINT GEOMETRY LATER ON
utm11 = Proj(init="epsg:26911")
utm12 = Proj(init="epsg:26912")

# IDENTIFY RASTER VARIABLES
# THE ORDER OF THESE RASTER VARIABLES MUST COINCIDE WITH THE ARRAY OF EXTRACTS
# CONSTRUCTED IN THE get_values FUNCTION
naip = ["NAIP1", "NAIP2", "NAIP3", "NAIP4"]
landsat = ["Landsat1", "Landsat2", "Landsat3", "Landsat4", "Landsat5", "Landsat6", "Landsat7", "Landsat8"]
landsat_vis = ["NDSI", "NDWI"]
# Must follow the band order that get_VegIndicies_VRT stacks into the VRT
# (NDVI, SAVI, OSAVI, MSAVI2, EVI2). The previous order (NDVI, EVI2, SAVI, ...)
# attached the wrong label to four of the five index columns.
naip_vis = ["NDVI", "SAVI", "OSAVI", "MSAVI2", "EVI2"]
# NOT using texture in this iteration
textures = ["StdDev_3px_band1", "StdDev_3px_band2", "StdDev_3px_band3", "StdDev_3px_band4",
            "StdDev_5px_band1", "StdDev_5px_band2", "StdDev_5px_band3", "StdDev_5px_band4",
            "StdDev_10px_band1", "StdDev_10px_band2", "StdDev_10px_band3", "StdDev_10px_band4"]
filters = ["Gauss1_band1", "Gauss1_band2", "Gauss1_band3", "Gauss1_band4",]

# Same sequence in which get_values samples: NAIP, NAIP indices, Gaussian,
# Landsat bands, Landsat indices.
rasters_names = naip + naip_vis + filters + landsat + landsat_vis
print(rasters_names)
classes = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']

In [None]:
# IF VECTOR FILE OF POINTS WITH RASTER EXTRACTS DOESN'T EXIST, BUILD IT
if not os.path.exists(loc_points_wRaster_extracts):
    if "class_points" not in locals():
        logging.debug("READING IN %s as class_points" % loc_class_points)
        class_points = gpd.read_file(loc_class_points, crs={'init': 'epsg:26912'})

    if "utm_geom" not in class_points:
        logging.debug("ADDING COLUMN 'utm_geom' WITH CORRECT UTM COORDINATES FOR EACH QUARTER QUAD")
        # CREATE TRUE RASTER GEOMETRY COLUMN (BASED ON UTM)
        class_points["utm_geom"] = class_points.apply(calculateGeom, axis=1)

    # NDSI is only used because it is among the last raster columns
    if "NDSI" not in class_points:
        logging.debug("CREATING COLUMNS...")
        # CREATE EMPTY COLUMNS IN DATA FRAME FOR EACH RASTER VARIABLE
        for column in rasters_names:
            class_points[column] = np.NaN

    net_percentage = 0.0
    # ITERATE THROUGH DATAFRAME IN GROUPS BY NAIP_FILE. KEEPS US FROM OPENING/CLOSING
    # RASTERS FOR EACH POINT - INSTEAD ONCE FOR EACH GROUP
    for loc_NAIPFile, group in class_points.groupby("NAIP_FILE"):
        logger.debug("\nStarting raster value extraction for points in qquad %s" % loc_NAIPFile)
        # loc_NAIPFile is left exactly as stored: it is compared against the
        # NAIP_FILE column below. (A str.replace call whose result was discarded
        # used to sit here; it had no effect and has been removed.)

        # LOOK FOR RASTERS FROM WHICH VALUES WILL BE EXTRACTED
        file = os.path.basename(loc_NAIPFile)

        vrt_naipvis = get_VegIndicies_VRT(file)
        #vrt_stddev = get_STDDev_VRT(file)
        gaussf_path = get_GaussianFile(file)

        landsat_path = createSubSetLandsat(loc_NAIPFile, landsat_file, landsat_dir).replace("\\","/")

        landsat_ndsi_path = createSubSetLandsat(loc_NAIPFile, ndsi_file, ndsi_dir).replace("\\", "/")
        landsat_ndwi_path = createSubSetLandsat(loc_NAIPFile, ndwi_file, ndwi_dir).replace("\\", "/")

        # SELECT POINTS WHICH HAVE THIS NAIP PATH VALUE
        in_group = class_points["NAIP_FILE"] == loc_NAIPFile

        net_percentage += 100 * len(class_points.loc[in_group])/len(class_points)
        logger.debug("Percentage of total: %d" % net_percentage)

        # Only if group hasn't had values assigned (Jupyter and Rodeo iterations)
        if group["NDSI"].isnull().values.any():
            # get_values reads these handles as module-level names, so the
            # `as` targets must keep exactly these names.
            with rio.open(loc_NAIPFile) as rasnaip, \
                    rio.open(vrt_naipvis) as rasnaipvis, \
                    rio.open(gaussf_path) as rasgauss, \
                    rio.open(landsat_path) as raslandsat, \
                    rio.open(landsat_ndsi_path) as rasNDSI, \
                    rio.open(landsat_ndwi_path) as rasNDWI:
                class_points.loc[in_group, rasters_names] = \
                    class_points.loc[in_group, "utm_geom"].apply(get_values)

        logger.debug("Finished with group %s at %s" % (loc_NAIPFile, str(datetime.now())))

    # REMOVE ALL ROWS WHICH EXTRACTED NO DATA VALUES FROM LANDSAT
    #for column in landsat:
    #    class_points = class_points[class_points.loc[column] != 32766]

    logger.info("Finished raster value extraction of %s points in %s" % (str(len(class_points)), str(datetime.now() - start)))

    # GEOPANDAS WON'T ALLOW MORE THAN ONE COLUMN WITH GEOMETRY TYPE. REMOVE THE utm_geom COLUMN CREATED PREVIOUSLY
    del class_points['utm_geom']
    logger.debug("WRITING DATAFRAME TO OUTPUT...")
    class_points.to_file(loc_points_wRaster_extracts)

else:
    # The old guard here, `"class_points" not in "locals"`, was a substring test
    # against the literal text "locals" and therefore always true. The file has
    # always been re-read on every run; the read is now unconditional. Re-reading
    # also keeps the positional slice below valid, since it must not see the
    # utm_geom column that a later cell appends.
    logger.info("Reading in point file %s" % loc_points_wRaster_extracts)
    class_points = gpd.read_file(loc_points_wRaster_extracts)
    # Had to delete utm_geom when writing file (can't have two geometry columns);
    # it is recreated in the next cell.
    # NOTE(review): the raster columns are picked by position (18 up to, but
    # excluding, the last column) - confirm against the shapefile's schema.
    rasters_names = class_points.columns.tolist()[18:-1]

In [None]:
# Recreate the UTM geometry column (it is dropped before the points file is written).
class_points["utm_geom"] = class_points.apply(calculateGeom, axis=1)

# Split the points data frame into train (70%) and test (30%).
# A fixed seed makes the split repeatable after Restart & Run All, and .copy()
# gives each part its own frame so later column assignments do not act on a
# slice of class_points.
split_seed = 42
train, test = (part.copy() for part in
               train_test_split(class_points, test_size=0.3, random_state=split_seed))

In [None]:
# CREATE COLUMN FOR PREDICTED CLASSIFICATION VALUES
predicted_column = "CLASS_PREDICT"
test[predicted_column] = "Null"

#rasters_names = class_points.columns.tolist()[20:-2]
logger.info("Available raster variables: \n\t%s" % rasters_names)

# Raster variables used by the random forest. `rf_rasters` used to be defined
# only inside a disabled string literal, so the log line below (and the training
# cell) failed with NameError on a fresh kernel.
# Landsat bands are excluded because createClassifiedFile does not include the
# Landsat subset in its prediction stack. To drop the texture variables instead,
# filter on "StdDev_".
rf_rasters = [name for name in rasters_names if "Landsat" not in name]

logger.info("Using raster variables: \n%s" % rf_rasters)

In [None]:
# TRAIN RANDOM FORESTS
rf_start = datetime.now()
logger.info("Beginning Random Forest Train")
#maxdepth = -1
maxdepth = 60
n_est = 40
n_job = 6
min_per_leaf = 50
crit = "entropy" # gini or entropy
rf_seed = 42  # fixed seed so the fitted forest is repeatable between runs

rf = RandomForestClassifier(verbose=1, max_depth=maxdepth, n_estimators=n_est,
                            n_jobs=n_job, min_samples_leaf=min_per_leaf,
                            criterion=crit, random_state=rf_seed)

# Drop incomplete rows ONCE, over predictors and label together, so X and y are
# guaranteed to hold the same rows. (Previously X dropped rows on the predictors
# only while y dropped on predictors + "Class", which can leave them with
# different lengths. An unused Imputer transform was also removed.)
train_complete = train[rf_rasters + ["Class"]].dropna()

rf.fit(train_complete[rf_rasters], train_complete["Class"])

# %s placeholder added: passing an argument without one makes logging report a
# formatting error instead of the message.
logger.info("Finished Fitting in %s", datetime.now() - rf_start)
#print('Out-of-bag score estimate:', {rf.oob_score_:.3})

In [None]:
"""
# LINEAR SVM
datetimestart = datetime.now()
print("- Beginning Linear SVM Train -")
maxi=1000

svm = LinearSVC(verbose=1, max_iter=maxi)

svm.fit(train[rf_rasters].dropna(),
       train[rf_rasters+["Class"]].dropna()["Class"])
"""

In [None]:
# CREATE CLASSIFIED RASTERS FOR QUARTER QUADS USED IN TRAINING DATA FIRST
# groupby() is used only to get one iteration per distinct NAIP_FILE value; the
# grouped rows themselves are not needed.
for loc_NAIPFile, _training_points in class_points.groupby("NAIP_FILE"):
    classified_File_rf = createClassifiedFile(
        loc_NAIPFile, mltype="RF", rf_classifier=rf, overwrite=False)
    #classified_File_svm = createClassifiedFile(loc_NAIPFile, "SVM", svm, overwrite=True)

    # Disabled: extract the predicted pixel classification into the testing frame.
    # print("Extracting predicted classified values...")
    # with rio.open(classified_File_rf) as classras:
    #     test.loc[test.NAIP_FILE == loc_NAIPFile, [predicted_column]] = \
    #         test.loc[test.NAIP_FILE == loc_NAIPFile, "utm_geom"].apply(get_class_value)

In [None]:
"""THIS CODE BLOCK USES A SHAPEFILE OF THE NAIP FOOTPRINTS AND THE AOI ECOREGIONS FEATURE 
CLASS TO FIND ONLY THE NAMES OF THE QQUADS WHICH WE WAND TO CLASSIFY. THEN IDENTIFIES THE 
ACTUAL PATH OF THE NAIP FILE AND PASSES IT TO THE CLASSIFIER"""


# Target ecoregions (area of interest), read from the project geodatabase.
aoi = gpd.GeoDataFrame.from_file(r"Q:\Arid Riparian Project\AridRiparianProject\AridRiparianProject.gdb", layer='TargetEcoregions')
aoi.crs = fiona.crs.from_epsg(2163)

# NAIP quarter-quad footprints; the AOI is reprojected to match them.
naip_footprints = gpd.read_file(r"Q:\Arid Riparian Project\AridRiparianProject")
aoi.to_crs(naip_footprints.crs, inplace=True)

# Names of the footprints lying within at least one AOI polygon. Testing with
# .any() lists each footprint once; the previous nested loop appended a name
# once per containing polygon, so overlapping ecoregions produced duplicates.
aoi_qquads = [
    footprint.Name
    for _, footprint in naip_footprints.iterrows()
    if aoi.geometry.contains(footprint.geometry).any()
]
#print(len(aoi_qquads))
#print(len(aoi_qquads))


In [None]:
# Riparian rasters already on disk (file names only), gathered once up front.
files_already_created = [
    name
    for _root, _dirs, names in os.walk(r"Q:\GoogleDrive\AridRiparianProject\WorkingDirectory\Data\RiparianClass_VBs")
    for name in names
    if name.endswith(".tif")
]

for qquad_name in aoi_qquads:
    # Characters 2-11 of the footprint name are matched against existing outputs.
    # NOTE(review): presumably this slice is the quarter-quad id, as in
    # "RiparianClassification_<qquad>.tif" - confirm against the footprint naming.
    qquad_id = qquad_name[2:12]
    if any(qquad_id in name for name in files_already_created):
        continue

    # Resolve the NAIP path only for quads that still need work; the lookup
    # walks the whole NAIP directory tree and used to run for every quad.
    fpath = getFullNAIPPath(qquad_name, naip_dir)

    beg = datetime.now()
    logger.debug("Starting on qquad: %s" % qquad_name)
    createClassifiedFile(fpath, "RF", rf, overwrite=False)
    logger.debug("COMPLETED riparian classification. Finished in %s" % (str(datetime.now()-beg)))
    logger.debug("_____________________________________________________________________________________")

In [None]:
# THEN CREATE CLASSIFIED RASTER FROM ALL OTHER QUARTER QUADS
# The classification call is currently disabled, so this pass only walks the
# NAIP tree and builds each .tif path.
for root, dirs, files in os.walk(naip_dir):
    for file in (name for name in files if name.endswith(".tif")):
        fpath = os.path.join(root, file)
        #createClassifiedFile(fpath, "RF", rf, overwrite=False)

logger.info("\n\t\t-- DONE WITH ALL FILES --\n")

----------------------------------------------------
# TESTING SECTION

In [None]:
"""flat_pixels = np.array([1,10,50,500, np.NaN])

if np.isfinite(flat_pixels.any()) and not np.isnan(flat_pixels.any()):
    flat_pixels = np.where(flat_pixels > np.finfo(np.float32).max, np.finfo(np.float32).max, flat_pixels)
    #result = rf_classifier.predict(flat_pixels)
    #np.where(x.values >= np.finfo(np.float32).max,)
    print("MODING")
else:
    print("TURRIBLE")"""