In [2]:
# ESSENTIALS
import numpy as np
import matplotlib.pyplot as plt

# CLUSTERING AND RANDOM FOREST
import skfuzzy as fuzz
import sklearn
from sklearn.datasets import make_blobs
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection

# DATA LIBRARIES
import geopandas as gpd
from glob import glob
import pandas as pd

# PREFERENCES
pd.set_option('display.max_columns', 500)

Define our Scikit-Learn Compliant Class so we can implement validation schemes within. Only two methods required: predict() and fit()

In [21]:
class ModelSelector():

    def __init__(self, c_kwargs={}, rf_kwargs={}):
       self.c_kwargs=c_kwargs        # CLUSTERING HYPERPARAMETERS
       self.rf_kwargs=rf_kwargs      # RANDOM FOREST HYPERPARAMETERS
       self.m = 2                    # EXPONENTIATION COEFFICIENT FOR CLUSTERING. TODO: MAKE ADJUSTABLE

    def fuzzyCluster(self, data):
        # Wraps Fuzzy Cluster function, only outputting percent belongs and formal cluster.

        # CHECK THAT REQUIRED FIELDS ARE IN KWARGS, IF NOT ADD
        if "error" not in self.c_kwargs:
            self.c_kwargs['error']=0.005

        if "maxiter" not in self.c_kwargs:
            self.c_kwargs['maxiter']=1000

        cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(data.T, self.n_centers, self.m, **self.c_kwargs)
        label = np.argmax(u, axis=0)
        return cntr, u, fpc, label

    def howManyClusters(self, X, mintest=2,maxtest=15):
        # Determines how many clusters should be used using the Fuzzy Partitions Coefficient (FPC)
        # https://scikit-fuzzy.github.io/scikit-fuzzy/auto_examples/plot_cmeans.html#example-plot-cmeans-py
        # TODO: FIGURE OUT IF THIS METHOD IS APPROPRIATE OR NOT
        fpcs = []
        listtests = np.arange(mintest,maxtest)
        for ncenters in listtests:
            self.n_centers = ncenters
            _, _, fpc, _ = self.fuzzyCluster(X)
            fpcs.append(fpc)
        return listtests[np.argmax(fpcs)]

    def train_rf(self, X_train, y_train, rf_controls={}):
        # ADAPTED FROM https://stackoverflow.com/questions/28489667/combining-random-forest-models-in-scikit-learn

        # RF CONTROLS PASSED DIRECTLY FROM PARAMETER, DEFAULT IS EMPTY
        rf = RandomForestRegressor(**rf_controls) 

        # RF FITTING 
        rf.fit(X_train, y_train)

        return rf

    def fit(self, attributes, model_perf):
        # GET NUMBER OF CLUSTERS VIA MAXIMUM FUZZY PARTITION COEFFICIENT 
        self.n_centers = self.howManyClusters(model_perf)

        # RUN CLUSTERING AND SAVE CENTERS FOR FUTURE PREDICTIONS
        cntr, u, fpc, label = self.fuzzyCluster(model_perf)
        # cntr:  CLUSTER CENTERS, WHICH ARE ON N-DIM MODEL PERFORMANCE SPACE
        #        WHERE N IS THE NUMBER OF MODELS BEING COMPARED. 
        # u:     CLUSTER MEMBERSHIP MATRIX (% BELONGING)
        # fpc:   FUZZY PARTITION COEFFICIENT FOR CLUSTERING RUN
        # label: "CLOSEST CLUSTER", DEFINED BY MAXIMUM CLUSTER MEMBERSHIP (argmax(u))
        self.centers = cntr

        # CREATE RANDOM FOREST AND TRAIN
        self.rf = self.train_rf(attributes, model_perf, rf_controls=self.rf_kwargs)

        return self

    def predict(self, attributes):

        # CHECK WHETHER MODEL HAS BEEN TRAINED
        if self.rf is None:
            raise(Exception("ModelSelector isn't trained!"))

        # GET RANDOM FOREST PREDICTION
        pred_cluster_scores = self.rf.predict(attributes)

        self.pred_cluster_scores = pred_cluster_scores # FOR TROUBLESHOOTING, DELETEME

        # CALCULATE PROBABILITY OF INDIVIDUAL MODELS
        prob_model = pred_cluster_scores * self.centers

        self.prob_model = prob_model # FOR TROUBLESHOOTING, DELETEME

        return prob_model


Read CAMELS Data

In [21]:
# FILEPATH TO SHAPEFILE CONTAINING CAMELS DATASET
camelsdir = r"C:\Users\franc\OneDrive - University Of Houston\000_SIResearch\data\HCDN_nhru_final\HCDN_nhru_final_671.shp"

# DIRECTORY TO FOLDER CONTAINING CAMELS ATTRIBUTE TEXTFILES
# PRIOR TO THIS STEP MAKE SURE THE README IN THE FILE SYSTEM HAS BEEN REMOVED (or the file extension has been changed)
attdir = r"C:\Users\franc\OneDrive - University Of Houston\000_SIResearch\data\camels_attributes_v2.0\camels_attributes_v2.0\\"

# READ CAMELS DATASET
camels = gpd.read_file(camelsdir)


# COPY TO KEEP ORIGINAL IN MEMORY
camels_df = camels 

# LOOP THROUGH AND JOIN
filelist = glob(attdir + "*.txt")
for i in filelist:
    currdf = pd.read_csv(i, sep=";")
    camels_df = camels_df.merge(currdf, how='left', left_on="hru_id", right_on="gauge_id")

# DEFINE WHAT WE WANT TO RUN ON
perf_dir = r"C:\Users\franc\OneDrive - University Of Houston\000_SIResearch\Repo\nextgen-form-eval\FrankTests\data\JonathanTests\\"
perf_prefixes = ["daymet_time_split1", "daymet_time_split2", "nldas_time_split1", "nldas_time_split2"]
perf_prefixes_abb = ["daymetS1", "daymetS2", "nldasS1", "nldasS2"]
perf_metrics = ["KGE"]

# CAMELS COPY TO MODIFY BELOW

# LOOP THROUGH EACH TO ADD TO CAMELS DATASET
for i in range(0, len(perf_prefixes)):
    for ii in range(0, len(perf_metrics)):
        currdir = perf_dir + perf_prefixes[i] + "_" + perf_metrics[ii] + ".csv"
        currdf = pd.read_csv(currdir).add_prefix(perf_prefixes_abb[i] + "_" + perf_metrics[ii] + "_")
        first_col_name = currdf.columns.to_list()[0]
        camels_df = camels_df.merge(currdf, how='right', left_on="hru_id", right_on=first_col_name)


# camels_df = camels_df.dropna(axis = 0, how = 'any')
camels_df.head()

  result = DataFrame.merge(self, *args, **kwargs)
  result = DataFrame.merge(self, *args, **kwargs)


Unnamed: 0,hru_id,ann_P,lon_cen,lat_cen,AREA,elev_mean_x,ave_T,july_T,Perimeter,geometry,gauge_id_x,p_mean,pet_mean,p_seasonality,frac_snow,aridity,high_prec_freq,high_prec_dur,high_prec_timing,low_prec_freq,low_prec_dur,low_prec_timing,gauge_id_y,geol_1st_class,glim_1st_class_frac,geol_2nd_class,glim_2nd_class_frac,carbonate_rocks_frac,geol_porostiy,geol_permeability,gauge_id_x.1,q_mean,runoff_ratio,slope_fdc,baseflow_index,stream_elas,q5,q95,high_q_freq,high_q_dur,low_q_freq,low_q_dur,zero_q_freq,hfd_mean,gauge_id_y.1,huc_02,gauge_name,gauge_id_x.2,soil_depth_pelletier,soil_depth_statsgo,soil_porosity,soil_conductivity,max_water_content,sand_frac,silt_frac,clay_frac,water_frac,organic_frac,other_frac,gauge_id_y.2,gauge_lat,gauge_lon,elev_mean_y,slope_mean,area_gages2,area_geospa_fabric,gauge_id,frac_forest,lai_max,lai_diff,gvf_max,gvf_diff,dom_land_cover_frac,dom_land_cover,root_depth_50,root_depth_99,daymetS1_KGE_Unnamed: 0,daymetS1_KGE_lstm,daymetS1_KGE_mc,daymetS1_KGE_sac,daymetS2_KGE_Unnamed: 0,daymetS2_KGE_lstm,daymetS2_KGE_mc,daymetS2_KGE_sac,nldasS1_KGE_Unnamed: 0,nldasS1_KGE_lstm,nldasS1_KGE_mc,nldasS1_KGE_sac,nldasS2_KGE_Unnamed: 0,nldasS2_KGE_lstm,nldasS2_KGE_mc,nldasS2_KGE_sac
0,1022500,0.0,-68.07313,44.79691,620387300.0,103.6042,,,312624,"POLYGON ((-67.97836 44.61310, -67.97800 44.613...",1022500,3.608126,2.119256,-0.11453,0.245259,0.587356,20.55,1.205279,son,233.65,3.662226,jja,1022500,Acid plutonic rocks,0.590658,Siliciclastic sedimentary rocks,0.164618,0.0,0.071,-14.2138,1022500,2.173062,0.602269,1.77628,0.554478,1.702782,0.204734,7.123049,3.9,2.294118,65.15,17.144737,0.0,166.25,1022500,1,"Narraguagus River at Cherryfield, Maine",1022500,17.412808,1.491846,0.415905,2.375005,0.626229,59.390156,28.080937,12.037646,1.226913,0.0,0.358472,1022500,44.60797,-67.93524,92.68,17.79072,573.6,620.38,1022500,0.9232,4.871392,3.746692,0.863936,0.337712,0.820493,Mixed Forests,0.237435,2.238444,1022500,0.696263,0.713571,0.713571,1022500,0.696263,0.713571,0.713571,1022500,0.832636,0.838801,0.838801,1022500,0.832636,0.838801,0.838801
1,1031500,0.0,-69.58119,45.23568,766544700.0,304.74349,,,309614,"MULTIPOLYGON (((-69.33810 45.12317, -69.33800 ...",1031500,3.522957,2.071324,0.104091,0.291836,0.58795,18.9,1.148936,son,227.35,3.473644,djf,1031500,Siliciclastic sedimentary rocks,0.448928,Metamorphics,0.443863,0.026258,0.0747,-14.841,1031500,2.030242,0.576289,1.494019,0.445091,1.648693,0.111345,8.010503,18.9,3.286957,94.8,14.697674,0.0,181.0,1031500,1,"Piscataquis River near Dover-Foxcroft, Maine",1031500,7.252557,1.279047,0.450236,1.373292,0.559123,35.26903,50.841232,12.654125,0.674594,0.0,0.0,1031500,45.17501,-69.3147,247.8,29.56035,769.05,766.53,1031500,0.9548,4.903259,3.990843,0.870668,0.398619,1.0,Mixed Forests,0.25,2.4,1031500,0.892571,0.862105,0.862105,1031500,0.892571,0.862105,0.862105,1031500,0.770392,0.719698,0.719698,1031500,0.770392,0.719698,0.719698
2,1047000,0.0,-70.16213,44.98744,904956200.0,379.779978,,,310157,"POLYGON ((-70.10847 45.21669, -70.10858 45.216...",1047000,3.323146,2.090024,0.147776,0.280118,0.628929,20.1,1.165217,son,235.9,3.691706,djf,1047000,Metamorphics,0.308488,Acid plutonic rocks,0.288613,0.0,0.0522,-14.4819,1047000,2.18287,0.656868,1.415939,0.473465,1.510238,0.196458,8.095148,14.95,2.577586,71.55,12.776786,0.0,184.8,1047000,1,"Carrabassett River near North Anson, Maine",1047000,5.359655,1.392779,0.422749,2.615154,0.561181,55.163133,34.185443,10.303622,0.0,0.0,0.147867,1047000,44.8692,-69.9551,310.38,49.92122,909.1,904.94,1047000,0.9906,5.086811,4.300978,0.891383,0.445473,0.85045,Mixed Forests,0.241027,2.34018,1047000,0.781862,0.780109,0.780109,1047000,0.781862,0.780109,0.780109,1047000,0.736531,0.764304,0.764304,1047000,0.736531,0.764304,0.764304
3,1052500,0.0,-71.17197,44.96168,396110300.0,646.073604,,,172588,"POLYGON ((-71.10862 45.12730, -71.10825 45.127...",1052500,3.730858,2.096423,0.152097,0.352698,0.561914,13.5,1.129707,jja,193.5,2.896707,mam,1052500,Siliciclastic sedimentary rocks,0.497458,Metamorphics,0.374062,0.0,0.0711,-15.1658,1052500,2.405105,0.644652,1.301062,0.4597,1.025555,0.305965,8.669019,14.1,2.517857,58.9,7.31677,0.0,197.2,1052500,1,"Diamond River near Wentworth Location, NH",1052500,1.301349,1.494807,0.452326,1.262995,0.615538,30.557666,52.614646,11.143326,0.0,0.0,5.675527,1052500,44.87739,-71.05749,615.7,60.05183,383.82,396.1,1052500,1.0,4.80083,4.124313,0.880034,0.477328,0.593588,Mixed Forests,0.225615,2.237435,1052500,0.891948,0.8937,0.8937,1052500,0.891948,0.8937,0.8937,1052500,0.741548,0.747145,0.747145,1052500,0.741548,0.747145,0.747145
4,1054200,0.0,-71.05872,44.31072,181328700.0,629.165746,,,111995,"POLYGON ((-70.97999 44.39574, -70.97963 44.395...",1054200,4.067132,2.128355,0.10496,0.299642,0.523306,17.5,1.194539,son,220.3,3.263704,mam,1054200,Metamorphics,0.871443,Siliciclastic sedimentary rocks,0.104295,0.0,0.0288,-14.2147,1054200,2.731742,0.671663,1.319645,0.43705,1.357808,0.256851,10.095605,16.05,1.888235,82.2,9.963636,0.0,183.45,1054200,1,"Wild River at Gilead, Maine",1054200,0.914754,1.477442,0.418461,2.919142,0.594526,58.376169,31.836472,9.851262,0.0,0.0,0.0,1054200,44.39044,-70.97964,472.31,90.13951,180.98,181.33,1054200,1.0,5.019827,4.283551,0.886711,0.455084,0.580954,Mixed Forests,0.224857,2.232382,1054200,0.885035,0.815074,0.815074,1054200,0.885035,0.815074,0.815074,1054200,0.849023,0.86536,0.86536,1054200,0.849023,0.86536,0.86536


In [22]:
camels_df.shape

(531, 92)

Unnamed: 0,hru_id,ann_P,lon_cen,lat_cen,AREA,elev_mean_x,ave_T,july_T,Perimeter,geometry,gauge_id_x,p_mean,pet_mean,p_seasonality,frac_snow,aridity,high_prec_freq,high_prec_dur,high_prec_timing,low_prec_freq,low_prec_dur,low_prec_timing,gauge_id_y,geol_1st_class,glim_1st_class_frac,geol_2nd_class,glim_2nd_class_frac,carbonate_rocks_frac,geol_porostiy,geol_permeability,gauge_id_x.1,q_mean,runoff_ratio,slope_fdc,baseflow_index,stream_elas,q5,q95,high_q_freq,high_q_dur,low_q_freq,low_q_dur,zero_q_freq,hfd_mean,gauge_id_y.1,huc_02,gauge_name,gauge_id_x.2,soil_depth_pelletier,soil_depth_statsgo,soil_porosity,soil_conductivity,max_water_content,sand_frac,silt_frac,clay_frac,water_frac,organic_frac,other_frac,gauge_id_y.2,gauge_lat,gauge_lon,elev_mean_y,slope_mean,area_gages2,area_geospa_fabric,gauge_id,frac_forest,lai_max,lai_diff,gvf_max,gvf_diff,dom_land_cover_frac,dom_land_cover,root_depth_50,root_depth_99,daymetS1_KGE_Unnamed: 0,daymetS1_KGE_lstm,daymetS1_KGE_mc,daymetS1_KGE_sac
0,1013500,0.0,-68.56551,47.01169,2303988000.0,277.49349,,,647993,"MULTIPOLYGON (((-68.35650 46.90311, -68.35612 ...",1013500,3.126679,1.971555,0.18794,0.31344,0.630559,12.95,1.348958,son,202.2,3.427119,mam,1013500,Siliciclastic sedimentary rocks,0.815904,Basic volcanic rocks,0.179729,0.0,0.1714,-14.7019,1013500,1.699155,0.543437,1.528219,0.585226,1.845324,0.241106,6.373021,6.1,8.714286,41.35,20.170732,0.0,207.25,1013500,1,"Fish River near Fort Kent, Maine",1013500,7.404762,1.248408,0.461149,1.106522,0.558055,27.841827,55.15694,16.275732,5.376698,0.408717,0.0,1013500,47.23739,-68.58264,250.31,21.64152,2252.7,2303.95,1013500,0.9063,4.167304,3.340732,0.804567,0.371648,0.883452,Mixed Forests,,,,,,
1,1022500,0.0,-68.07313,44.79691,620387300.0,103.6042,,,312624,"POLYGON ((-67.97836 44.61310, -67.97800 44.613...",1022500,3.608126,2.119256,-0.11453,0.245259,0.587356,20.55,1.205279,son,233.65,3.662226,jja,1022500,Acid plutonic rocks,0.590658,Siliciclastic sedimentary rocks,0.164618,0.0,0.071,-14.2138,1022500,2.173062,0.602269,1.77628,0.554478,1.702782,0.204734,7.123049,3.9,2.294118,65.15,17.144737,0.0,166.25,1022500,1,"Narraguagus River at Cherryfield, Maine",1022500,17.412808,1.491846,0.415905,2.375005,0.626229,59.390156,28.080937,12.037646,1.226913,0.0,0.358472,1022500,44.60797,-67.93524,92.68,17.79072,573.6,620.38,1022500,0.9232,4.871392,3.746692,0.863936,0.337712,0.820493,Mixed Forests,0.237435,2.238444,1022500.0,0.696263,0.713571,0.713571
2,1030500,0.0,-68.14985,45.80527,3676155000.0,174.433896,,,662248,"MULTIPOLYGON (((-67.83991 45.36614, -67.83955 ...",1030500,3.274405,2.043594,0.047358,0.277018,0.624111,17.15,1.207746,son,215.6,3.514262,djf,1030500,Siliciclastic sedimentary rocks,0.573305,Metamorphics,0.28701,0.05214,0.1178,-14.4918,1030500,1.820108,0.555859,1.87111,0.508441,1.377505,0.107149,6.854887,12.25,7.205882,89.25,19.402174,0.0,184.9,1030500,1,"Mattawamkeag River near Mattawamkeag, Maine",1030500,19.011414,1.461363,0.459091,1.289807,0.65302,32.235458,51.779182,14.776824,1.634345,1.330278,0.022016,1030500,45.50097,-68.30596,143.8,12.79195,3676.17,3676.09,1030500,0.8782,4.6852,3.665543,0.858502,0.351393,0.975258,Mixed Forests,,,,,,
3,1031500,0.0,-69.58119,45.23568,766544700.0,304.74349,,,309614,"MULTIPOLYGON (((-69.33810 45.12317, -69.33800 ...",1031500,3.522957,2.071324,0.104091,0.291836,0.58795,18.9,1.148936,son,227.35,3.473644,djf,1031500,Siliciclastic sedimentary rocks,0.448928,Metamorphics,0.443863,0.026258,0.0747,-14.841,1031500,2.030242,0.576289,1.494019,0.445091,1.648693,0.111345,8.010503,18.9,3.286957,94.8,14.697674,0.0,181.0,1031500,1,"Piscataquis River near Dover-Foxcroft, Maine",1031500,7.252557,1.279047,0.450236,1.373292,0.559123,35.26903,50.841232,12.654125,0.674594,0.0,0.0,1031500,45.17501,-69.3147,247.8,29.56035,769.05,766.53,1031500,0.9548,4.903259,3.990843,0.870668,0.398619,1.0,Mixed Forests,0.25,2.4,1031500.0,0.892571,0.862105,0.862105
4,1047000,0.0,-70.16213,44.98744,904956200.0,379.779978,,,310157,"POLYGON ((-70.10847 45.21669, -70.10858 45.216...",1047000,3.323146,2.090024,0.147776,0.280118,0.628929,20.1,1.165217,son,235.9,3.691706,djf,1047000,Metamorphics,0.308488,Acid plutonic rocks,0.288613,0.0,0.0522,-14.4819,1047000,2.18287,0.656868,1.415939,0.473465,1.510238,0.196458,8.095148,14.95,2.577586,71.55,12.776786,0.0,184.8,1047000,1,"Carrabassett River near North Anson, Maine",1047000,5.359655,1.392779,0.422749,2.615154,0.561181,55.163133,34.185443,10.303622,0.0,0.0,0.147867,1047000,44.8692,-69.9551,310.38,49.92122,909.1,904.94,1047000,0.9906,5.086811,4.300978,0.891383,0.445473,0.85045,Mixed Forests,0.241027,2.34018,1047000.0,0.781862,0.780109,0.780109


In [None]:

# CLEAN UP NONSENSICAL DATA (EG, BASIN LABELS)
# SO LETS GET A LIST OF VARIABLE NAMES WE WANT TO KEEP.

# TO START WE WILL KEEP THE SAME VARIABLES AS Kratzert et al. 2019, AS SHOWN BY OUR
# INTERNAL SPREADSHEET Attributes_CAMELS_vs_NHDPlus
varstokeep = ['p_mean',
'pet_mean',
'aridity',
'p_seasonality',
'frac_snow',                   # In spreadsheet as 'frac_snow_daily'
'high_prec_freq',
'high_prec_dur',
'low_prec_freq',
'low_prec_dur',
'elev_mean_x',                 # In spreadsheet as 'elev_mean' 
'slope_mean',
'area_gages2',
'frac_forest',                 # In spreadsheet as 'forest_frac'
'lai_max',
'lai_diff',
'gvf_max',
'gvf_diff',
'soil_depth_pelletier',
'soil_depth_statsgo',
'soil_porosity',
'soil_conductivity',
'max_water_content',
'sand_frac',
'silt_frac',
'clay_frac',
'carbonate_rocks_frac',         # In spreadsheet as 'carb_rocks_frac'
'geol_permeability']

# DO NOT DELETE!!! ---------------------------------------------------------------------------------------------
# CHECK OUR VALUES ARE GOOD
tflist = []
for i in varstokeep: # LOOP THROUGH AND CHECK COLUMN NAME IS IN DATAFRAME, STORE RESULT IN TFLIST
    tflist.append(i in camels_df)

# CONVERT TF LIST TO NUMPY ARRAY, THEN CHECK IF ALL ARE TRUE. IF NOT, PRINT WHICH ONE ISN'T AND RAISE EXCEPTION
tflist = np.array(tflist)
if np.any(np.logical_not(tflist)):
    print("\n".join(np.array(varstokeep)[np.logical_not(tflist)]))
    raise(Exception("Printed values not in CAMELS DataFrame"))
# ---------------------------------------------------------------------------------------------------------------

inputdataset = camels_df[varstokeep]


# TODO: ONCE WE GET THE REAL PERFORMANCE METRICS, MODIFY ACCORDINGLY

## Wrapper function for SKLearn MakeBlobs
#def generateRandomFit(size, models, std = 0.05, center = (0.1, 0.9), n = 5):
#    # Random state = 1 for reproducibility
#    X, y = make_blobs(n_samples=size, cluster_std=std, center_box=center, centers=n, n_features=models, random_state=1)
#    return X, y

## X, y = generateRandomFit(671, 3)

## outputdataset = X 



In [34]:
test = ModelSelector()

test.fit(inputdataset, outputdataset)


print(test.n_centers)

test_pred = test.predict(inputdataset.iloc[0:2])

print(test.pred_cluster_scores)
print(test.centers)
print(test_pred.shape)

float(np.nanmean((sim-obs)**2))

2
[[0.31996677 0.4028829  0.20233777]
 [0.37271176 0.33812005 0.23524379]]
[[0.37115071 0.35500335 0.42366885]
 [0.35318508 0.71803877 0.12237367]]
(2, 3)
