In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import GroupKFold
from pyprojroot import here
import math
import gc
import pickle
import time

In [2]:
# read in dataset
dataset = pd.read_csv(str(here("./data/for_analysis/counterfactual.csv")))
dataset.head()

Unnamed: 0,y,x,agriculture,counterfactual,elevation,aspect,slope,soil,monthgroup,PET,ET
0,42.009453,-122.378251,-3.4000000000000003e+38,1,938.7329,-3.4000000000000003e+38,-3.4000000000000003e+38,0.251709,0,1.309877,-3.4000000000000003e+38
1,42.009453,-122.378251,-3.4000000000000003e+38,1,938.7329,-3.4000000000000003e+38,-3.4000000000000003e+38,0.251709,1,3.607158,-3.4000000000000003e+38
2,42.009453,-122.378251,-3.4000000000000003e+38,1,938.7329,-3.4000000000000003e+38,-3.4000000000000003e+38,0.251709,2,5.558523,-3.4000000000000003e+38
3,42.009453,-122.378251,-3.4000000000000003e+38,1,938.7329,-3.4000000000000003e+38,-3.4000000000000003e+38,0.251709,3,4.548428,-3.4000000000000003e+38
4,42.009453,-122.378251,-3.4000000000000003e+38,1,938.7329,-3.4000000000000003e+38,-3.4000000000000003e+38,0.251709,4,1.630449,-3.4000000000000003e+38


In [None]:
# remove missing data
dataset = dataset.query(ET >= 0)

In [3]:
# save a random subset of the data in case you want that because the full dataset is enormous
sample = dataset.sample(frac = .01)
sample.to_csv(str(here("./data/for_analysis/counterfactual_sample.csv")), index=False)



In [6]:
start_time = time.time()
# split between predictors and predicted
X = dataset.iloc[:, 0:(dataset.shape[1]-1)].values # everything, including lat, lon, and date, are predictors. 
# I might want to eventually redefine dates as times of year to make the actual year not matter

y = dataset.iloc[:, (dataset.shape[1]-1)].values # Predict ET
print(X)

# make train test split for a random (not spatial) hold out validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # random state is for reproducibility to consistently get the same random shuffle

# build the regressor
regressor = RandomForestRegressor(n_estimators=100, random_state=0) # I stick with the default recommended 100 trees in my forest
regressor.fit(X_train, y_train)

# pickle the regressor
with open(here("./data/for_analysis/sklearn_RF.pkl"), 'wb') as f:
    pickle.dump(regressor, f)

print("Time this took:", time.time() - start_time)

[[ 3.54887461e+01 -1.19782336e+02 -3.40000000e+38 ...  8.80000000e-01
   0.00000000e+00  2.19787400e+00]
 [ 3.69160579e+01 -1.17566910e+02 -3.40000000e+38 ...  1.07962035e-01
   2.00000000e+00  5.19120030e+00]
 [ 3.98798436e+01 -1.20045461e+02 -3.40000000e+38 ...  4.43838800e-01
   5.00000000e+00  4.84140460e-01]
 ...
 [ 3.97990762e+01 -1.22580169e+02 -3.40000000e+38 ...  3.13847570e-01
   0.00000000e+00  1.92314580e+00]
 [ 3.58376866e+01 -1.17751161e+02 -3.40000000e+38 ...  2.50204100e-01
   1.00000000e+00  4.27879520e+00]
 [ 3.55259748e+01 -1.16959262e+02 -3.40000000e+38 ...  5.19800000e-01
   0.00000000e+00  2.53230900e+00]]


KeyboardInterrupt: 

In [7]:
dataset.head()

Unnamed: 0,y,x,agriculture,counterfactual,elevation,aspect,slope,soil,monthgroup,PET,ET
177805446,35.488746,-119.782336,-3.4000000000000003e+38,1,216.70773,1.336026,0.011207,0.88,0,2.197874,42.92213
104799038,36.916058,-117.56691,-3.4000000000000003e+38,1,2306.6936,5.074407,0.462482,0.107962,2,5.1912,-3.4000000000000003e+38
41993555,39.879844,-120.045461,-3.4000000000000003e+38,1,1437.7153,2.460312,0.174732,0.443839,5,0.48414,-3.4000000000000003e+38
300150112,33.390055,-117.571327,-3.4000000000000003e+38,1,70.6514,2.182943,0.160779,0.211668,4,2.663345,75.32075
139322358,36.138671,-118.124079,-3.4000000000000003e+38,1,2615.4263,3.539625,0.088929,0.13744,0,0.995533,-3.4000000000000003e+38


In [8]:
print(y)

[ 4.292213e+01 -3.400000e+38 -3.400000e+38 ... -3.400000e+38 -3.400000e+38
 -3.400000e+38]


In [16]:
# count the missing ET values 
(y != -3.400000e+38).sum()

1156945

In [13]:
ag = pd.read_csv(str(here("./data/for_analysis/agriculture.csv")))
ag.head()

Unnamed: 0,y,x,agriculture,counterfactual,elevation,aspect,slope,soil,monthgroup,PET,ET
0,42.00125,-121.898063,1,-3.4000000000000003e+38,1276.3254,2.546305,0.020388,0.9,0,1.330335,-3.4000000000000003e+38
1,42.00125,-121.898063,1,-3.4000000000000003e+38,1276.3254,2.546305,0.020388,0.9,1,3.74402,-3.4000000000000003e+38
2,42.00125,-121.898063,1,-3.4000000000000003e+38,1276.3254,2.546305,0.020388,0.9,2,5.714141,-3.4000000000000003e+38
3,42.00125,-121.898063,1,-3.4000000000000003e+38,1276.3254,2.546305,0.020388,0.9,3,4.65945,-3.4000000000000003e+38
4,42.00125,-121.898063,1,-3.4000000000000003e+38,1276.3254,2.546305,0.020388,0.9,4,1.653674,-3.4000000000000003e+38


In [14]:
y_ag = ag.iloc[:, (ag.shape[1]-1)].values
(y_ag == -3.400000e+38).sum()

39370386

19374306

In [None]:

# predict y 
y_pred = regressor.predict(X_test)

# evaluate
random_test = dict(val_type = 'random_test', 
                   r2 = np.corrcoef(y_test.apply(int), y_pred.apply(int))[0,1]**2,
                   r2_score = metrics.r2_score(y_test, y_pred), 
                   rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# Read in the ag dataset and predict ET using this regressor
ag = pd.read_csv(str(here("./data/for_analysis/agriculture.csv")))
ag.head()
X = ag.iloc[:, 0:(ag.shape[1]-1)].values # everything, including lat, lon, and date, are predictors. 
ag["ET_pred"] = regressor.predict(X)
ag.to_csv(str(here("./data/for_analysis/agriculture_sklearn_RF.csv")), index=False)

# For a spatially informed split I instead do cross-validation by splitting california into 1 degree lon by 1 degree lat cubes. 

# To do this I first generate an extra column for my dataset called cv_fold which corresponds to its location
dataset = dataset.assign(cv_fold = lambda x: x.x.apply(math.floor)*1000 + x.y.apply(math.floor))

# crossvalidate and make crossvalidation dataset

df = dataset

n_fold = len(set(df['cv_fold'])) # set is same as unique function in R
kf = GroupKFold(n_fold)
split = kf.split(df, groups = df['cv_fold'])

cv_df = pd.DataFrame(columns = ['cv_fold', 'start_date', 'ET', 'ET_pred'])

for i, (train_idx, test_idx) in enumerate(split):
    print(f'Starting training fold {i + 1} of {n_fold}.')
    _ = gc.collect()

    X_train = X[train_idx,:]
    X_test = X[test_idx,:]
    y_train = y[train_idx]
    y_test = y[test_idx]

    regressor = RandomForestRegressor(n_estimators=100, random_state=0) # I stick with the default recommended 100 trees in my forest
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    cv_fold = np.repeat(df.loc[test_idx]['cv_fold'].iloc[0], X_test.shape[0])
    df_to_append = pd.DataFrame({'cv_fold': cv_fold, 'start_date': X_test[:,df.columns.get_loc('start_date')], 'ET':y_test, 'ET_pred': y_pred})

    cv_df = cv_df.append(df_to_append, ignore_index = True)

print("Done!!")

# save this df
cv_df.to_csv(str(here("./data/for_analysis/sklearn_RF_full_cv_outputs_1x1.csv")), index=False)

# get r2, rmse, and count by cv_fold

def r2_rmse(g):
    r2 = np.corrcoef(g['ET'].apply(int), g['ET_pred'].apply(int))[0,1]**2
    r2_score = metrics.r2_score(g['ET'], g['ET_pred'])
    rmse = np.sqrt(metrics.mean_squared_error(g['ET'], g['ET_pred']))
    count = g.shape[0]
    return pd.Series(dict(r2 = r2, r2_score = r2_score, rmse = rmse, count = count))

cv_stats = cv_df.groupby('cv_fold').apply(r2_rmse).reset_index()

# save this df
cv_stats.to_csv(str(here("./data/for_analysis/sklearn_RF_cv_fold_stats_1x1.csv")), index=False)

# make a df for general stats for both the spatial cv and the random 20% test
spatial_cv = dict(val_type = "spatial_cv", 
                  r2 = np.corrcoef(cv_df['ET'].apply(int), cv_df['ET_pred'].apply(int))[0,1]**2,
                  r2_score = metrics.r2_score(cv_df['ET'], cv_df['ET_pred']), 
                  rmse = np.sqrt(metrics.mean_squared_error(cv_df['ET'], cv_df['ET_pred'])))

test_stats = pd.DataFrame([spatial_cv, random_test])
print(test_stats)

# save this df
test_stats.to_csv(str(here("./data/for_analysis/sklearn_RF_validation_stats_1x1.csv")), index=False)