Find your regressor. test the need for a lot of data , different regressors and which months I should consider (forget the winter?)
    
    1) break dataset into .001, .01, .1. 
    
    2) find the smallest data size to experiment on
    
    3) tune and compare a variety of classifiers (rf and xgb) on appropriate size of data: make a dataset with columns for classifier, dataset size, and metrics for 
    
        1) overall performance 
    
        2) performance by month (?)


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
from pyprojroot import here
import math
import gc
import pickle
import os

In [2]:
# break dataset into .001, .01, .1. 
# if you don't have subsamples of the dataset, make them, otherwise, load them

outpath = str(here("./data/for_analysis/sample/"))
fracs = [.001, .01, .1] #, 1, .0001, .00001] # I should add the full dataset as an option
samples = {} #dictionary of subsamples

if not os.path.exists(outpath):
    os.makedirs(outpath)

    # read in dataset
    dataset = pd.read_csv(str(here("./data/for_analysis/counterfactual.csv")))
    dataset = dataset.query('ET >= 0') # remove missing data
    dataset.head()
    
    for frac in fracs: # make and save subsets for each frac
        samples[frac] = dataset.sample(frac = frac)
        samples[frac].to_csv(outpath+"/sample"+str(frac)+".csv", index=False)
else:
    for frac in fracs: # read in subsets for each frac   
        samples[frac] = pd.read_csv(outpath+"/sample"+str(frac)+".csv")


In [3]:
# train a rf on all sizes of data. 

random_split_eval = []

for frac in fracs: 
    
    dataset = samples[frac]
    
    # split between predictors and predicted
    X = dataset.iloc[:, 0:(dataset.shape[1]-1)].values # everything, including lat, lon, and date, are predictors. 
    # I might want to eventually redefine dates as times of year to make the actual year not matter

    y = dataset.iloc[:, (dataset.shape[1]-1)].values # Predict ET
    # print(X)

    # make train test split for a random (not spatial) hold out validation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # random state is for reproducibility to consistently get the same random shuffle

    # build the regressor
    regressor = RandomForestRegressor(n_estimators=100, random_state=0) # I stick with the default recommended 100 trees in my forest
    regressor.fit(X_train, y_train)
    
    # predict y 
    y_pred = regressor.predict(X_test)
    
    # evaluate
    random_test = pd.DataFrame({'frac' : [frac], 
                   'r2' : [np.corrcoef(y_test, y_pred)[0,1]**2],
                   'r2_score' : [metrics.r2_score(y_test, y_pred)], 
                   'rmse' : [np.sqrt(metrics.mean_squared_error(y_test, y_pred))]})
    random_split_eval.append(random_test)

random_split_eval = pd.concat(random_split_eval, axis=0)
print(random_split_eval)
    
    

    frac        r2  r2_score      rmse  frac        r2  r2_score      rmse  \
0  0.001  0.928135  0.928116  4.761597  0.01  0.963025  0.962988  3.472063   

   frac        r2  r2_score      rmse  
0   0.1  0.983443  0.983433  2.318941  


The metrics do improve with sample size, so the full sample should be used for the final analysis, but for now we use the smallest subsample in order to save on computation. 

In [8]:
# choose the dataset size to continue working with
dataset = samples[.001]
# dataset = dataset.sample(frac = .01) # make even smaller for testing purposes
del samples

# split between predictors and predicted
X = dataset.iloc[:, 0:(dataset.shape[1]-1)].values # everything, including lat, lon, and date, are predictors. 
# I might want to eventually redefine dates as times of year to make the actual year not matter

y = dataset.iloc[:, (dataset.shape[1]-1)].values # Predict ET
# print(X)

# make train test split for a random (not spatial) hold out validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # random state is for reproducibility to consistently get the same random shuffle


NameError: name 'samples' is not defined

In [6]:
# define an evaluation function 
def r2_rmse(g):
    r2 = np.corrcoef(g['ET'], g['ET_pred'])[0,1]**2
    r2_score = metrics.r2_score(g['ET'], g['ET_pred'])
    rmse = np.sqrt(metrics.mean_squared_error(g['ET'], g['ET_pred']))
    count = g.shape[0]
    return pd.Series(dict(r2 = r2, r2_score = r2_score, rmse = rmse, count = count))

In [12]:
# try to improve the RF by tuning hyperparameters
# see: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters

# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

# We can view the best parameters from fitting the random search
print(rf_random.best_params_)

# evaluate this improved RF
# predict y 
y_pred = rf_random.predict(X_test)

# evaluate
# random_test = dict(val_type = 'random_test', 
#                    r2 = np.corrcoef(y_test, y_pred)[0,1]**2,
#                    r2_score = metrics.r2_score(y_test, y_pred), 
#                    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# make a df with monthgroup, y, and pred
df_rand_eval = pd.DataFrame({'monthgroup': X_test[:,dataset.columns.get_loc('monthgroup')], 
                             'ET':y_test, 
                             'ET_pred': y_pred})

# evaluate
random_test = pd.DataFrame(r2_rmse(df_rand_eval)).transpose()
random_test["fold_type"] = "random_test"
print(random_test)

# evaluate by monthgroup
random_test_by_month = df_rand_eval.groupby(['monthgroup']).apply(r2_rmse).reset_index()
random_test_by_month["fold_type"] = "random_test"
print(random_test_by_month)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'n_estimators': 1366, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 100, 'bootstrap': True}
         r2  r2_score      rmse    count    fold_type
0  0.928648  0.928627  4.744654  23126.0  random_test
   monthgroup        r2  r2_score      rmse   count    fold_type
0         0.0  0.537561  0.535175  6.244479   941.0  random_test
1         1.0  0.815317  0.814258  5.619110  1231.0  random_test
2         2.0  0.932552  0.931614  5.307099  2487.0  random_test
3         3.0  0.955092  0.954894  1.170514  9726.0  random_test
4         4.0  0.850332  0.849614  6.444139  6054.0  random_test
5         5.0  0.827679  0.827278  6.406084  2687.0  random_test


Wow it does way better in the growing season where there are fewer clouds. Not surprised... might want to toss the rest

In [13]:
# To do this I first generate an extra column for my dataset called cv_fold which corresponds to its location
dataset = dataset.assign(cv_fold = lambda x: x.x.apply(math.floor)*1000 + x.y.apply(math.floor))

# crossvalidate and make crossvalidation dataset

df = dataset
del dataset

In [14]:
# elect the best model to try validating by location

n_fold = len(set(df['cv_fold'])) # set is same as unique function in R
kf = GroupKFold(n_fold)
split = kf.split(df, groups = df['cv_fold'])

cv_df = pd.DataFrame()

for i, (train_idx, test_idx) in enumerate(split):
    print(f'Starting training fold {i + 1} of {n_fold}.')
    _ = gc.collect()

    X_train = X[train_idx,:]
    X_test = X[test_idx,:]
    y_train = y[train_idx]
    y_test = y[test_idx]

    regressor = RandomForestRegressor(random_state=0) 
    regressor.set_params(**rf_random.best_params_) # use the parameters from the randomized search
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # cv_fold = np.repeat(df.loc[test_idx]['cv_fold'].iloc[0], X_test.shape[0])
    df_to_append = pd.DataFrame({# 'cv_fold': cv_fold, 
                                 'monthgroup': X_test[:,df.columns.get_loc('monthgroup')], 
                                 'ET': y_test, 
                                 'ET_pred': y_pred})

    cv_df = cv_df.append(df_to_append, ignore_index = True)

print("Done!!")

# save the full predictions using the spatial CV
cv_df.to_csv(str(here("./data/for_analysis/sklearn_RF_full_cv_outputs_1x1.csv")), index=False)


Starting training fold 1 of 10.
Starting training fold 2 of 10.
Starting training fold 3 of 10.
Starting training fold 4 of 10.
Starting training fold 5 of 10.
Starting training fold 6 of 10.
Starting training fold 7 of 10.
Starting training fold 8 of 10.
Starting training fold 9 of 10.
Starting training fold 10 of 10.
Done!!


In [18]:
len(df)

12

In [73]:
# evaluate

# get r2, rmse, and count by cv_fold
cv_stats = cv_df.groupby('cv_fold').apply(r2_rmse).reset_index()

# save this df
cv_stats.to_csv(str(here("./data/for_analysis/sklearn_RF_cv_fold_stats_1x1.csv")), index=False)

# make a df for general stats for both the spatial cv and the random 20% test
spatial_cv = pd.DataFrame(r2_rmse(cv_df)).transpose()
spatial_cv["fold_type"] = "spatial_cv"

test_stats = pd.concat([spatial_cv, random_test])
print(test_stats)

# save this df
test_stats.to_csv(str(here("./data/for_analysis/sklearn_RF_validation_stats_1x1.csv")), index=False)

# grouped by month, get r2, rmse, and count
cv_stats_by_month = cv_df.groupby('monthgroup').apply(r2_rmse).reset_index()
cv_stats_by_month["fold_type"] = "spatial_cv"

# concat
test_stats_by_month = pd.concat([cv_stats_by_month, random_test_by_month])
print(test_stats_by_month)

# save 
test_stats_by_month.to_csv(str(here("./data/for_analysis/sklearn_RF_test_stats_by_month_1x1.csv")), index=False)

         r2  r2_score          rmse   count    fold_type
0  0.727424  0.725608  8.514942e+37  3218.0   spatial_cv
0  0.798889  0.793866  7.359809e+37   644.0  random_test




In [None]:
# also try lgb maybe? 
# see: https://www.analyticsvidhya.com/blog/2021/08/complete-guide-on-how-to-use-lightgbm-in-python/
