# SMRF Evaluation

In [1]:
import os
import re
from glob import glob
import pandas as pd
import rasterio as rio
import numpy as np
from rasterio.windows import from_bounds
from skll.metrics import kappa

from joblib import Parallel, delayed
from datetime import datetime
import re

from skll.metrics import correlation
from sklearn.metrics import *


1. Loop through SMRF DTMs
1. Pull Parameters from file name
1. Read in SMRF Raster
1. Get Mask from No Data Values
1. Get Window of raster extent
1. Read in 2015 DEM from Window
1. Read in 10m Slope at relative resolution
1. Read in DTG at relative resolution
1. Double check check array shapes
1. Create empty arrays to hold raster names, slope values, dtg values, nodata count, and kappa number
1. For each category of Slope and DTG
    1. Create new mask from slope and dtg
    1. Mask no data mask, mask DEM, mask DTM
    1. Get Count of NoData Mask
    1. Get Kappa Number for difference between the two masked arrays
    1. Append kappa, nodata, etc to new empty arrays
1. Create pandas DF from various arrays
1. Return Pandas DF

In [2]:
# 1. Loop through SMRF DTMs
loc_dem_2015 = "/media/ben/PAG2015Elev/EPC_DEM_2015.vrt"
loc_dtg = "/media/ben/Bertha/PAG2019/EPCExtent_30cm/Elevation_80cmNPS/HAG_NED_80cm_DTG/EPC_HAGNEDDTG80cm_2019.vrt"
loc_slope = "/media/ben/Bertha/PAG2019/OtherData/10mDEMs/DEM10mNED_slope.tif"
loc_dtm = "/media/ben/Bertha/PAG2019/EPCExtent_30cm/Elevation_80cmNPS/DTM80cm/"
dtm_tifs = glob(loc_dtm+"/*/*.tif")

loc_stats_df = './CompletedStatsEvaluations.csv'

slope_breaks = [5,15,25]
dtg_breaks = [25,50,100,200]

slope_breaks = [5,15,25]
dtg_breaks = [25,50,100,200]

slope_categories = {}
for si, slope_break in enumerate(slope_breaks):
    slope_min = slope_breaks[si-1] if si-1 >= 0 else 0
    slope_max = slope_breaks[si] if si+1 < len(slope_breaks) else 90
    slope_categories[slope_break] = {'Min':slope_min, 'Max':slope_max}

dtg_categories = {}
for di, dtg_break in enumerate(dtg_breaks):
    dtg_min = dtg_breaks[di-1] if di-1 >= 0 else 0
    dtg_max = dtg_breaks[di] if di+1 < len(dtg_breaks) else 20000
    dtg_categories[dtg_break] = {'Min':dtg_min, 'Max':dtg_max}
    
    
def getStatsDF(dtm_file, writeOutput=False):
    # 2. Pull Parameters from file name
    fname = os.path.basename(dtm_file)
    params = fname.split("_")[-1].replace(".tif","")
    
    param_list = re.split('[a-zA-z]+',params)
    scalar_param = param_list[1]
    slope_param = param_list[2]
    threshold_param = param_list[3]
    winSize_param = param_list[4]
    
    # 3. Read in SMRF Raster
    with rio.open(dtm_file) as src:
        kwargs = src.profile
        smrf = src.read(1)
        nd_value = src.nodata
        # 4. Get Mask from No Data Values
        smrf_mask = smrf != nd_value
        # 5 Get window of raster extent
        bnds = src.bounds
    
    with rio.open(loc_dem_2015) as src:
        #6. Read in 2015 DEM from Window
        dem = src.read(1, window=from_bounds(bnds.left, bnds.bottom, bnds.right, bnds.top, transform=src.transform), out_shape=(smrf.shape))

    # 7. Read in 10m Slope at relative resolution
    with rio.open(loc_slope) as src:
        slope = src.read(1, window=from_bounds(bnds.left, bnds.bottom, bnds.right, bnds.top, transform=src.transform), out_shape=(smrf.shape))
        
    # 8. Read in DTG at relative resolution
    with rio.open(loc_dtg) as src:
        dtg = src.read(1, window=from_bounds(bnds.left, bnds.bottom, bnds.right, bnds.top, transform=src.transform), out_shape=(smrf.shape))
        
    # 9. Double check check array shapes
    if not smrf.shape == dtg.shape == slope.shape == dem.shape:
        print("Bad Shapes")
        return None
        
    # 10. Create empty arrays to hold raster names, slope values, dtg values, nodata count, and kappa number
    slope_values = []
    dtg_values = []
    valid_counts = []
    sumErrors = []
    sumError2s = []
    
    # 11. For each category of Slope and DTG
    for sv, stolerances in slope_categories.items():
        for dv, dtolerances in dtg_categories.items():
            
            outfile_2015 = f"{fname[:-4]}2015_Sl{sv}DTG{dv}.tif"
            outfile_2019 = f"{fname[:-4]}2019_Sl{sv}DTG{dv}.tif"
            file_2015 = os.path.join("./temp", outfile_2015)
            file_2019 = os.path.join("./temp", outfile_2019)
            
            
            
            # 11a Create new mask from slope and dtg
            slope_mask = (slope >= stolerances['Min']) & (slope < stolerances['Max'])
            dtg_mask =   (dtg   >= dtolerances['Min']) & (dtg   < dtolerances['Max'])
            
            # 11b Mask no data mask, mask DEM, mask DTM
            all_mask = (dtg_mask==True) & (slope_mask==True) & (smrf_mask==True)
            
            if writeOutput and (not os.path.exists(file_2015) or not os.path.exists(file_2019)):
                dem_out = np.where(all_mask == True, dem, -9999)
                smrf_out = np.where(all_mask == True, smrf, -9999)
                
                kwargs.update(nodata=-9999, dtype=np.float32)

                with rio.open(file_2015, 'w', **kwargs) as dst:
                    dst.write(dem_out.astype(np.float32),1)
                with rio.open(file_2019, 'w', **kwargs) as dst:
                    dst.write(smrf_out.astype(np.float32),1)


                
            
            # 11c Mask SMRF and DTM
            dem_valid = dem[all_mask==True]
            smrf_valid = smrf[all_mask==True]
            #print(len(dem_valid), len(smrf_valid))
            
            error = np.absolute(dem_valid-smrf_valid)
            error_2 = error**2
            
            # Get Count of non-masked data
            vc = len(error)
            
            # Get sum of absolute error values
            se = error.sum()
            # Get sum of absolute error values
            se2 = error_2.sum()
            
            
            # 11c Append slope value, dtg value, counts, sum error and sum error squared, to new empty arrays
            slope_values.append(stolerances['Max'])
            dtg_values.append(dtolerances['Max'])
            valid_counts.append(vc)
            sumErrors.append(se)
            sumError2s.append(se2)

    raster_name = [fname] * len(valid_counts)
    scalar_params = [scalar_param] * len(valid_counts)
    slope_params = [slope_param] * len(valid_counts)
    threshold_params = [threshold_param] * len(valid_counts)
    winSize_params = [winSize_param] * len(valid_counts)
    
    d = {'DTMRaster': raster_name,
         "Slope_Max": slope_values,
         "DTG_Max": dtg_values,
         "Scalar_Param" : scalar_params,
         "Slope_Param": slope_params,
         "Threshold_Param": threshold_params,
         "WindowSize_Param": winSize_params,
         "Valid_Count": valid_counts,
         "SumError": sumErrors,
         "SumError2": sumError2s}
    
    df = pd.DataFrame(data=d)
    
    return df



In [3]:
def getToDo(dtm_files, csv_location = './CompletedStatsEvaluations.csv'):
    data_frames = []
    to_do_files = []

    if os.path.exists(csv_location):
        stats_df = pd.read_csv(csv_location)
        data_frames.append(stats_df)

        files_done = set(stats_df['DTMRaster'].to_list())
    else:
        files_done = []

    for dtm in dtm_tifs:
        if os.path.basename(dtm) not in files_done:
            to_do_files.append(dtm)

    print(f"{len(to_do_files)} remaining to process")
    
    
    return to_do_files, data_frames

#### TEST FUNCTIONS

In [5]:
to_do_tifs, dfs = getToDo(dtm_tifs, csv_location = loc_stats_df)

for i, dtm in enumerate(to_do_tifs[:1]):
    print(f"{datetime.now()}  -  Starting {i}, {len(to_do_tifs)-i} remaining")
    dfs.append(getStatsDF(dtm))
    stats_df = pd.concat(dfs)
    stats_df.to_csv(loc_stats_df)

0 remaining to process


In [11]:
loc_stats_df

'./CompletedStatsEvaluations.csv'

In [None]:
to_do_tifs, dfs = getToDo(dtm_tifs, csv_location = loc_stats_df)

batch_size = 50

for i in range(0, len(to_do_tifs), batch_size):
    batch = to_do_tifs[i:i + batch_size]
    print(f"{datetime.now()}  -  Starting {i}-{i+batch_size}")
    try:
        new_dfs = Parallel(n_jobs=5, verbose=5, backend="loky")(delayed(getStatsDF)(dtm) for dtm in batch)
        dfs += new_dfs
        stats_df = pd.concat(dfs)
        stats_df.to_csv(loc_stats_df)
    except:
        print(f"Issue with batch starting at {i}")


In [None]:
to_do_tifs, dfs = getToDo(dtm_tifs, csv_location = loc_stats_df)

for i, dtm in enumerate(to_do_tifs):
    print(f"{datetime.now()}  -  Starting {i}, {len(to_do_tifs)-i} remaining")
    dfs.append(getStatsDF(dtm))
    stats_df = pd.concat(dfs)
    stats_df.to_csv(loc_stats_df)

___________________________________
## Evalute Dataframe of results

In [13]:
stats_df = pd.read_csv("CompletedStatsEvaluations.csv")
stats_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1,DTMRaster,Slope_Max,DTG_Max,Scalar_Param,Slope_Param,Threshold_Param,WindowSize_Param,Valid_Count,SumError,SumError2
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,E0860_N370_9_Sc0.25Sl0.005Th0.25WS100.tif,5,25,0.25,0.005,0.25,100,4680225,3680750.0,3840429.0
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,E0860_N370_9_Sc0.25Sl0.005Th0.25WS100.tif,5,50,0.25,0.005,0.25,100,0,0.0,0.0
2,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,E0860_N370_9_Sc0.25Sl0.005Th0.25WS100.tif,5,100,0.25,0.005,0.25,100,0,0.0,0.0
3,3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,E0860_N370_9_Sc0.25Sl0.005Th0.25WS100.tif,5,20000,0.25,0.005,0.25,100,0,0.0,0.0
4,4,4.0,4.0,4.0,4.0,4.0,4.0,4.0,E0860_N370_9_Sc0.25Sl0.005Th0.25WS100.tif,15,25,0.25,0.005,0.25,100,4827,14353.32,89834.65


In [14]:
stats_df["subTileName"] = stats_df['DTMRaster'].apply(lambda f: "_".join(f.split("_")[:3]))
len(stats_df["subTileName"].unique())

81

In [15]:
#stats_df = pd.read_csv(loc_stats_df)
display(stats_df.head())

# Re-establish slope_values in case function was changed above
slope_values = stats_df['Slope_Param'].value_counts().keys().tolist()
scalar_values = stats_df['Scalar_Param'].value_counts().keys().tolist()
threshold_values = stats_df['Threshold_Param'].value_counts().keys().tolist()
window_values = stats_df['WindowSize_Param'].value_counts().keys().tolist()
slope_values.sort()
scalar_values.sort()
threshold_values.sort()
window_values.sort()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1,DTMRaster,Slope_Max,DTG_Max,Scalar_Param,Slope_Param,Threshold_Param,WindowSize_Param,Valid_Count,SumError,SumError2,subTileName
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,E0860_N370_9_Sc0.25Sl0.005Th0.25WS100.tif,5,25,0.25,0.005,0.25,100,4680225,3680750.0,3840429.0,E0860_N370_9
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,E0860_N370_9_Sc0.25Sl0.005Th0.25WS100.tif,5,50,0.25,0.005,0.25,100,0,0.0,0.0,E0860_N370_9
2,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,E0860_N370_9_Sc0.25Sl0.005Th0.25WS100.tif,5,100,0.25,0.005,0.25,100,0,0.0,0.0,E0860_N370_9
3,3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,E0860_N370_9_Sc0.25Sl0.005Th0.25WS100.tif,5,20000,0.25,0.005,0.25,100,0,0.0,0.0,E0860_N370_9
4,4,4.0,4.0,4.0,4.0,4.0,4.0,4.0,E0860_N370_9_Sc0.25Sl0.005Th0.25WS100.tif,15,25,0.25,0.005,0.25,100,4827,14353.32,89834.65,E0860_N370_9


In [17]:
for col in stats_df.columns.tolist():
    if "Unnamed: " in col:
        del stats_df[col]
#del stats_df['FilePath'], stats_df['File']
stats_df.columns

Index(['DTMRaster', 'Slope_Max', 'DTG_Max', 'Scalar_Param', 'Slope_Param',
       'Threshold_Param', 'WindowSize_Param', 'Valid_Count', 'SumError',
       'SumError2', 'subTileName'],
      dtype='object')

In [18]:
stats_df_grp = stats_df.groupby(by = ["DTG_Max","Slope_Max", "Slope_Param", "Scalar_Param", "Threshold_Param", "WindowSize_Param"]).agg(
        {"SumError":'sum', "SumError2":'sum',"Valid_Count":'sum'})
stats_df_grp

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,SumError,SumError2,Valid_Count
DTG_Max,Slope_Max,Slope_Param,Scalar_Param,Threshold_Param,WindowSize_Param,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
25,5,0.005,0.0,0.25,100,2.073454e+08,6.188887e+08,267814069
25,5,0.005,0.0,0.25,200,2.106033e+08,5.772800e+08,271494663
25,5,0.005,0.0,0.50,100,2.261047e+08,6.664189e+08,307350341
25,5,0.005,0.0,0.50,200,2.302424e+08,6.233356e+08,312184515
25,5,0.005,0.0,1.00,100,2.472249e+08,7.108124e+08,348231727
...,...,...,...,...,...,...,...,...
20000,90,0.250,1.5,0.50,200,1.684396e+06,8.111611e+06,559882
20000,90,0.250,1.5,1.00,100,1.663975e+06,7.550302e+06,542351
20000,90,0.250,1.5,1.00,200,1.702035e+06,8.229310e+06,559923
20000,90,0.250,1.5,1.50,100,1.682444e+06,7.696996e+06,542377


In [19]:
# Calculate MAE and RMSE

In [21]:
stats_df_grp['MAE'] = stats_df_grp.apply(lambda r: r.SumError/r.Valid_Count, axis=1)
stats_df_grp['RMSE']= stats_df_grp.apply(lambda r: np.sqrt(r.SumError2/r.Valid_Count), axis=1)
stats_df_grp.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,SumError,SumError2,Valid_Count,MAE,RMSE
DTG_Max,Slope_Max,Slope_Param,Scalar_Param,Threshold_Param,WindowSize_Param,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
25,5,0.005,0.0,0.25,100,207345400.0,618888700.0,267814069,0.774214,1.520161
25,5,0.005,0.0,0.25,200,210603300.0,577280000.0,271494663,0.775718,1.458185
25,5,0.005,0.0,0.5,100,226104700.0,666418900.0,307350341,0.735658,1.472505
25,5,0.005,0.0,0.5,200,230242400.0,623335600.0,312184515,0.73752,1.413043
25,5,0.005,0.0,1.0,100,247224900.0,710812400.0,348231727,0.709944,1.428708
25,5,0.005,0.0,1.0,200,252392200.0,664958000.0,354290676,0.712387,1.36999
25,5,0.005,0.0,1.5,100,264794900.0,751247300.0,369839081,0.715973,1.42523
25,5,0.005,0.0,1.5,200,270648900.0,700240800.0,376658646,0.718552,1.363483
25,5,0.005,0.25,0.25,100,208679100.0,628794200.0,269826165,0.773384,1.526554
25,5,0.005,0.25,0.25,200,211946900.0,584715000.0,273574457,0.774732,1.461956


In [63]:
stats_df_grp_reset = stats_df_grp.reset_index(level=['Slope_Param', 'Scalar_Param','Threshold_Param', 'WindowSize_Param'])

best_fits = []
worst_fits = []

#for stat in ['MAE', 'RMSE']:
def getMinimumGroupOfStat(df, stat):
    stat_best_fits = []
    stat_worst_fits = []
    
    for name, group in df.groupby(by=['DTG_Max','Slope_Max']):
        best_fit = group[group[stat] == group[stat].min()].copy()
        worst_fit = group[group[stat] == group[stat].max()].copy()

        best_fit.rename(columns={stat:stat+"_min"}, inplace=True)
        worst_fit.rename(columns={stat:stat+"_min"}, inplace=True)
        stat_best_fits.append(best_fit)
        stat_worst_fits.append(worst_fit)

    stat_best_fits = pd.concat(stat_best_fits)
    stat_best_fits['Stat'] = stat
    
    stat_worst_fits = pd.concat(stat_worst_fits)
    stat_worst_fits['Stat'] = stat
    
    return stat_best_fits.reset_index()

mae_best_fits = getMinimumGroupOfStat(stats_df_grp_reset, 'MAE')
rmse_best_fits = getMinimumGroupOfStat(stats_df_grp_reset, 'RMSE')

print(f"STATISTIC: Mean Absolute Error (MAE)")
display(mae_best_fits)
print(f"STATISTIC: Root Mean Squared Error (RMSE)")
display(rmse_best_fits)

STATISTIC: Mean Absolute Error (MAE)


Unnamed: 0,DTG_Max,Slope_Max,Slope_Param,Scalar_Param,Threshold_Param,WindowSize_Param,SumError,SumError2,Valid_Count,MAE_min,RMSE,Stat
0,25,5,0.005,0.0,1.0,100,247224900.0,710812400.0,348231727,0.709944,1.428708,MAE
1,25,15,0.01,1.0,1.5,200,107167600.0,313104500.0,64953670,1.649908,2.195547,MAE
2,25,90,0.01,1.5,1.5,200,20648610.0,115847500.0,7536484,2.73982,3.920658,MAE
3,50,5,0.01,0.0,0.25,200,1940934.0,18249730.0,869194,2.233027,4.582156,MAE
4,50,15,0.01,0.0,0.25,200,4799759.0,14149720.0,3461850,1.386472,2.021714,MAE
5,50,90,0.01,0.0,0.25,200,1981120.0,9293482.0,740051,2.677005,3.543712,MAE
6,100,5,0.01,0.0,0.25,200,2832633.0,51264280.0,337486,8.393334,12.324791,MAE
7,100,15,0.005,0.0,0.25,100,3602730.0,13288510.0,2258362,1.595285,2.425724,MAE
8,100,90,0.01,0.0,0.25,200,1975718.0,8944431.0,728990,2.710212,3.502802,MAE
9,20000,5,0.005,0.0,0.25,200,91890860.0,1943271000.0,4421642,20.782068,20.964036,MAE


STATISTIC: Root Mean Squared Error (RMSE)


Unnamed: 0,DTG_Max,Slope_Max,Slope_Param,Scalar_Param,Threshold_Param,WindowSize_Param,SumError,SumError2,Valid_Count,MAE,RMSE_min,Stat
0,25,5,0.01,0.0,1.0,200,290089600.0,730352600.0,399680916,0.725803,1.351791,RMSE
1,25,15,0.01,1.0,1.5,200,107167600.0,313104500.0,64953670,1.649908,2.195547,RMSE
2,25,90,0.01,0.0,0.25,200,19106370.0,102588200.0,6881758,2.77638,3.860993,RMSE
3,50,5,0.01,0.0,0.25,200,1940934.0,18249730.0,869194,2.233027,4.582156,RMSE
4,50,15,0.01,0.0,0.25,200,4799759.0,14149720.0,3461850,1.386472,2.021714,RMSE
5,50,90,0.01,0.0,0.25,200,1981120.0,9293482.0,740051,2.677005,3.543712,RMSE
6,100,5,0.01,0.0,0.25,200,2832633.0,51264280.0,337486,8.393334,12.324791,RMSE
7,100,15,0.005,0.0,0.25,100,3602730.0,13288510.0,2258362,1.595285,2.425724,RMSE
8,100,90,0.01,0.0,0.25,200,1975718.0,8944431.0,728990,2.710212,3.502802,RMSE
9,20000,5,0.005,0.0,0.25,200,91890860.0,1943271000.0,4421642,20.782068,20.964036,RMSE


In [23]:
mae_best_fits

Unnamed: 0,DTG_Max,Slope_Max,Slope_Param,Scalar_Param,Threshold_Param,WindowSize_Param,SumError,SumError2,Valid_Count,MAE_min,RMSE,Stat
0,25,5,0.005,0.0,1.0,100,247224900.0,710812400.0,348231727,0.709944,1.428708,MAE
1,25,15,0.01,1.0,1.5,200,107167600.0,313104500.0,64953670,1.649908,2.195547,MAE
2,25,90,0.01,1.5,1.5,200,20648610.0,115847500.0,7536484,2.73982,3.920658,MAE
3,50,5,0.01,0.0,0.25,200,1940934.0,18249730.0,869194,2.233027,4.582156,MAE
4,50,15,0.01,0.0,0.25,200,4799759.0,14149720.0,3461850,1.386472,2.021714,MAE
5,50,90,0.01,0.0,0.25,200,1981120.0,9293482.0,740051,2.677005,3.543712,MAE
6,100,5,0.01,0.0,0.25,200,2832633.0,51264280.0,337486,8.393334,12.324791,MAE
7,100,15,0.005,0.0,0.25,100,3602730.0,13288510.0,2258362,1.595285,2.425724,MAE
8,100,90,0.01,0.0,0.25,200,1975718.0,8944431.0,728990,2.710212,3.502802,MAE
9,20000,5,0.005,0.0,0.25,200,91890860.0,1943271000.0,4421642,20.782068,20.964036,MAE


In [28]:
bf_compare = pd.merge(mae_best_fits, rmse_best_fits[['DTG_Max', 'Slope_Max', 'RMSE_min']], left_on=['DTG_Max','Slope_Max'], right_on=['DTG_Max','Slope_Max'])
rows_w_param_discrepancies = bf_compare[bf_compare.RMSE != bf_compare.RMSE_min][['DTG_Max','Slope_Max','MAE_min', 'RMSE_min']]
#display(rows_w_mae_rmse_discrepancies)
for i,row in rows_w_param_discrepancies.iterrows():
    rmse_params = rmse_best_fits[(rmse_best_fits.DTG_Max == row.DTG_Max) & (rmse_best_fits.Slope_Max == row.Slope_Max)][['Slope_Param', 'Scalar_Param','Threshold_Param', 'WindowSize_Param', 'RMSE_min']]
    rmse_params['Stat'] = 'RMSE'
    print(f"\nBest Parameter Difference for DTG Max of {row.DTG_Max} and Slope Max of {row.Slope_Max}")
    mae_params = mae_best_fits[(mae_best_fits.DTG_Max == row.DTG_Max) & (mae_best_fits.Slope_Max == row.Slope_Max)][['Slope_Param', 'Scalar_Param','Threshold_Param', 'WindowSize_Param', 'MAE_min']]
    
    mae_params['Stat'] = 'MAE'
    stat_comp = pd.concat([rmse_params, mae_params])
    
    mae_value_for_best_rmse = stats_df_grp_reset[stats_df_grp_reset['RMSE'] == row.RMSE_min]['MAE'].values[0]
    mae_diff = abs(row.MAE_min-mae_value_for_best_rmse)
    print(f"Difference in MAE for best parameters evalued against RMSE and MAE:  {mae_diff}")
    
    rmse_value_for_best_rmse = stats_df_grp_reset[stats_df_grp_reset['MAE'] == row.MAE_min]['MAE'].values[0]
    rmse_diff = abs(row.RMSE_min-rmse_value_for_best_rmse)
    print(f"Difference in RMSE for best parameters evalued against RMSE and MAE: {rmse_diff}")
    display(stat_comp)


Best Parameter Difference for DTG Max of 25.0 and Slope Max of 5.0
Difference in MAE for best parameters evalued against RMSE and MAE:  0.015859172406689992
Difference in RMSE for best parameters evalued against RMSE and MAE: 0.6418473335023152


Unnamed: 0,Slope_Param,Scalar_Param,Threshold_Param,WindowSize_Param,RMSE_min,Stat,MAE_min
0,0.01,0.0,1.0,200,1.351791,RMSE,
0,0.005,0.0,1.0,100,,MAE,0.709944



Best Parameter Difference for DTG Max of 25.0 and Slope Max of 90.0
Difference in MAE for best parameters evalued against RMSE and MAE:  0.036559587227920076
Difference in RMSE for best parameters evalued against RMSE and MAE: 1.1211725542628477


Unnamed: 0,Slope_Param,Scalar_Param,Threshold_Param,WindowSize_Param,RMSE_min,Stat,MAE_min
2,0.01,0.0,0.25,200,3.860993,RMSE,
2,0.01,1.5,1.5,200,,MAE,2.73982



Best Parameter Difference for DTG Max of 20000.0 and Slope Max of 90.0
Difference in MAE for best parameters evalued against RMSE and MAE:  0.022179292582356958
Difference in RMSE for best parameters evalued against RMSE and MAE: 0.4257968964057248


Unnamed: 0,Slope_Param,Scalar_Param,Threshold_Param,WindowSize_Param,RMSE_min,Stat,MAE_min
11,0.005,0.0,0.25,100,3.167943,RMSE,
11,0.01,0.0,0.25,200,,MAE,2.742147


In [61]:
# Using MAE for Evaluations...
num_min_slopes = len(mae_best_fits[mae_best_fits.Slope_Param == min(slope_values)])
if num_min_slopes > 0 and min(slope_values) != 0:
    print(f"Num with minimum slope value of {min(slope_values)} : {num_min_slopes}")

num_min_scalars = len(mae_best_fits[mae_best_fits.Threshold_Param == min(scalar_values)])
if num_min_scalars > 0 and min(scalar_values) != 0:
    print(f"Num with minimum scalar of {min(scalar_values)} : {um_min_scalars}")
    
num_min_threshold = len(mae_best_fits[mae_best_fits.Threshold_Param == min(threshold_values)])
if num_min_threshold > 0 and min(threshold_values) != 0:
    print(f"Num with minimum threshold value of {min(threshold_values)} : {num_min_threshold}")


num_max_slopes = len(mae_best_fits[mae_best_fits.Slope_Param == max(slope_values)])
if num_max_slopes > 0 and max(slope_values) != 90:
    print(f"Num with maximum slope value of {max(slope_values)} : {num_max_slopes}")

num_max_scalars = len(mae_best_fits[mae_best_fits.Threshold_Param == max(scalar_values)])
if num_max_scalars > 0:
    print(f"Num with maximum scalar value of {max(scalar_values)} : {num_max_scalars}")
    
num_max_threshold = len(mae_best_fits[mae_best_fits.Threshold_Param == max(threshold_values)])
if num_max_threshold > 0:
    print(f"Num with maximum threshold value of {max(threshold_values)} : {num_max_threshold}")

Num with minimum slope value of 0.005 : 3
Num with minimum threshold value of 0.25 : 9
Num with maximum scalar value of 1.5 : 2
Num with maximum threshold value of 1.5 : 2


#convert MSE to RMSE
stats_df['RMSE'] = np.sqrt(stats_df['MSE'])

for stat in ['Kappa', 'RMSE', 'MAE', 'EVS', 'R2']:
    
    temp = stats_df.groupby(by = ["DTG_Max","Slope_Max", "Slope_Param", "Scalar_Param", "Threshold_Param", "WindowSize_Param"])[stat].agg([
        ('Min', 'min'),
        ('Max', 'max'),
        ('Mean', 'mean')]).add_prefix(stat)
    print(len(temp))
    temp = temp.reset_index(level=['Slope_Param', 'Scalar_Param','Threshold_Param', 'WindowSize_Param'])
    
    best_fits = []
    worst_fits = []
    
    mean_col = f'{stat}Mean'
    for name, group in temp.groupby(by=['DTG_Max','Slope_Max']):
        if stat == 'MSE' or stat == 'MAE':
            best_fit = group[group[mean_col] == group[mean_col].min()]
            worst_fit = group[group[mean_col] == group[mean_col].max()]
        else:
            best_fit = group[group[mean_col] == group[mean_col].max()]
            worst_fit = group[group[mean_col] == group[mean_col].min()]
            
        best_fits.append(best_fit)
        worst_fits.append(worst_fit)

    best_fits = pd.concat(best_fits)
    worst_fits = pd.concat(worst_fits)
    print(f"STATISTIC: {stat}")
    
    display(best_fits)

In [None]:





        
#stats_df[['Valid_Count',"SumError","SumError2"]] = stats_df.apply(lambda row: getPixelCounts(row),axis=1)
#stats_df['Valid_Count']
#stats_df.to_csv(loc_stats_df)


In [None]:
import math

def split(dfm, chunk_size):
    indices = index_marks(dfm.shape[0], chunk_size)
    return np.split(dfm, indices)

def index_marks(nrows, chunk_size):
    return range(chunk_size, math.ceil(nrows / chunk_size) * chunk_size, chunk_size)

def applyGetStats(df):
    df[['Valid_Count',"SumError","SumError2"]] = df.apply(lambda row: getStats(row), axis=1)
    return df

try:
    del invalid['Valid_Count'], invalid["SumError"], invalid["SumError2"]
except:
    print("No rows to delete")
    
#invalid_chunks = split(invalid, 50)
#test_chunks = split(test,10)
#len(test_chunks)

In [None]:
rasters_df = dfs[0][['DTMRaster', 'FilePath', 'DTG_Max',
       'Slope_Max', 'Slope_Param', 'Scalar_Param', 'Threshold_Param',
       'WindowSize_Param']].copy()


In [None]:
rasters_df

In [None]:
#test = rasters_df.iloc[:250].copy()
rasters_df_chunks = split(rasters_df,50)

s1 = datetime.now()
rasters_df_processed = Parallel(n_jobs=5, verbose=5, backend="loky")(delayed(applyGetStats)(dataFrame) for dataFrame in rasters_df_chunks)
s2 = datetime.now()
print(f"\nElapsed: {s2-s1}\n")

In [None]:
def getStats(r):
    if r.name % 1000 == 0:
        print(r.name)
    fname_2015 = f"{r.DTMRaster[:-4]}2015_Sl{r.Slope_Max}DTG{r.DTG_Max}.tif"
    fname_2019 = f"{r.DTMRaster[:-4]}2019_Sl{r.Slope_Max}DTG{r.DTG_Max}.tif"
    file_2015 = os.path.join("./temp", fname_2015)
    file_2019 = os.path.join("./temp", fname_2019)
    
    if os.path.exists(file_2015) and os.path.exists(file_2019):
        try:
            with rio.open(file_2015) as src:
                a1 = src.read(1)
                a1 = a1[a1!=-9999]
            with rio.open(file_2019) as src:
                a2 = src.read(1)
                a2 = a2[a2!=-9999]
                
            error = np.absolute(a1-a2)
        except:
            os.remove(file_2015)
            os.remove(file_2019)
            error = getDiff(r.FilePath, slope_target=r.Slope_Max, dtg_target=r.DTG_Max)
    else:
        #print(f"Getting values for slope: {r.Slope_Max}, dtg: {r.DTG_Max}")
        error = getDiff(r.FilePath, slope_target=r.Slope_Max, dtg_target=r.DTG_Max)
    
    vc = len(error)
    error_2 = error**2
    se = error.sum()
    se2 = error_2.sum()
    
    return pd.Series([vc,se,se2])


def getDiff(dtm_file, slope_target, dtg_target,  writeOutput=False):
    # 2. Pull Parameters from file name
    fname = os.path.basename(dtm_file)
    params = fname.split("_")[-1].replace(".tif","")
    
    param_list = re.split('[a-zA-z]+',params)
    scalar_param = param_list[1]
    slope_param = param_list[2]
    threshold_param = param_list[3]
    winSize_param = param_list[4]
    
    
    s1 = datetime.now()
    # 3. Read in SMRF Raster
    with rio.open(dtm_file) as src:
        kwargs = src.profile
        smrf = src.read(1)
        nd_value = src.nodata
        # 4. Get Mask from No Data Values
        smrf_mask = smrf != nd_value
        # 5 Get window of raster extent
        bnds = src.bounds
    
    with rio.open(loc_dem_2015) as src:
        #6. Read in 2015 DEM from Window
        dem = src.read(1, window=from_bounds(bnds.left, bnds.bottom, bnds.right, bnds.top, transform=src.transform), out_shape=(smrf.shape))

    # 7. Read in 10m Slope at relative resolution
    with rio.open(loc_slope) as src:
        slope = src.read(1, window=from_bounds(bnds.left, bnds.bottom, bnds.right, bnds.top, transform=src.transform), out_shape=(smrf.shape))
        
    # 8. Read in DTG at relative resolution
    with rio.open(loc_dtg) as src:
        dtg = src.read(1, window=from_bounds(bnds.left, bnds.bottom, bnds.right, bnds.top, transform=src.transform), out_shape=(smrf.shape))
        
    # 9. Double check check array shapes
    if not smrf.shape == dtg.shape == slope.shape == dem.shape:
        print("Bad Shapes")
        return None
    
    s2 = datetime.now()
    #print(f"Took {s2-s1} to read in rasters")
    
    #for sv, stolerances in slope_categories.items():
    #    for dv, dtolerances in dtg_categories.items():
            outfile_2015 = f"./temp/{fname[:-4]}2015_Sl{sv}DTG{dv}.tif"
            outfile_2019 = f"./temp/{fname[:-4]}2019_Sl{sv}DTG{dv}.tif"
    
            if stolerances['Max'] != slope_target or dtolerances['Max'] != dtg_target:
                continue
            
            #print(f"Targeting slope: {slope_target}, DTG: {dtg_target}")
                
            # 11a Create new mask from slope and dtg
            slope_mask = (slope >= stolerances['Min']) & (slope < stolerances['Max'])
            dtg_mask =   (dtg   >= dtolerances['Min']) & (dtg   < dtolerances['Max'])
            
            # 11b Mask no data mask, mask DEM, mask DTM
            all_mask = (dtg_mask==True) & (slope_mask==True) & (smrf_mask==True)
            
            if writeOutput:
                # Temp code to check outputs
                dem_out = np.where(all_mask == True, dem, -9999)
                smrf_out = np.where(all_mask == True, smrf, -9999)
                
                kwargs.update(nodata=-9999, dtype=np.float32)

                with rio.open(outfile_2019, 'w', **kwargs) as dst:
                    dst.write(smrf_out.astype(np.float32),1)
                with rio.open(outfile_2015, 'w', **kwargs) as dst:
                    dst.write(dem_out.astype(np.float32),1)
            
            dem_valid = dem[all_mask==True]
            smrf_valid = smrf[all_mask==True]
            #print(len(dem_valid), len(smrf_valid))
            
            diff_error = np.absolute(dem_valid-smrf_valid)
            #print(diff_error)
            
            return diff_error



In [None]:
invalid_valid = valid[valid.Valid_Count==-9999]
valid_valid = valid[~(valid.Valid_Count==-9999)]
invalid_valid_chunks = split(invalid_valid,50)
invalid_Processed = Parallel(n_jobs=5, verbose=5, backend="loky")(delayed(applyGetStats)(dataFrame) for dataFrame in invalid_valid_chunks)
invalid_Processed.append(valid_valid)
valid = pd.concat(invalid_Processed)
valid.shape