In [178]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import GroupKFold
from pyprojroot import here
import math
import gc

In [179]:
# check https://stackabuse.com/random-forest-algorithm-with-python-and-scikit-learn/ for original code

In [180]:
dataset = pd.read_csv(here("./data/for_analysis/counterfactual.csv"))
dataset.head()



FileNotFoundError: [Errno 2] No such file or directory: '/Users/annaboser/Documents/GitHub/ET_agriculture/data/for_analysis/counterfactual.csv'

In [205]:
# dataset = pd.DataFrame(np.array([[1, 2, 3, 4], [4, 5, 6, 8], [7, 8, 9, 9], [10,11,12, 12]]),
#                    columns=['x', 'y', 'start_date', 'c'])
# dataset = pd.concat([dataset]*6, ignore_index = True)
# dataset

Unnamed: 0,x,y,start_date,c
0,1,2,3,4
1,4,5,6,8
2,7,8,9,9
3,10,11,12,12
4,1,2,3,4
5,4,5,6,8
6,7,8,9,9
7,10,11,12,12
8,1,2,3,4
9,4,5,6,8


In [206]:
dataset.shape[1]

4

In [207]:
dataset.iloc[:, 2].values

array([ 3,  6,  9, 12,  3,  6,  9, 12,  3,  6,  9, 12,  3,  6,  9, 12,  3,
        6,  9, 12,  3,  6,  9, 12])

In [208]:
X = dataset.iloc[:, 0:(dataset.shape[1]-1)].values # everything, including lat, lon, and date, are predictors. 
# I might want to eventually redefine dates as times of year to make the actual year not matter

y = dataset.iloc[:, (dataset.shape[1]-1)].values # Predict ET
print(X)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]
 [ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]
 [ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]
 [ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]
 [ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]
 [ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


In [209]:
# make train test split. I note that I should not be doing this randomly and instead should be holding out huge chunks. Will do in the future. 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # random state is for reproducibility to consistently get the same random shuffle
print(X_train)

[[ 4  5  6]
 [ 4  5  6]
 [10 11 12]
 [ 1  2  3]
 [ 1  2  3]
 [ 7  8  9]
 [ 4  5  6]
 [ 1  2  3]
 [ 7  8  9]
 [ 4  5  6]
 [ 7  8  9]
 [ 4  5  6]
 [10 11 12]
 [10 11 12]
 [10 11 12]
 [ 1  2  3]
 [ 4  5  6]
 [10 11 12]
 [ 1  2  3]]


In [210]:
regressor = RandomForestRegressor(n_estimators=100, random_state=0) # I stick with the default recommended 100 trees in my forest
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [212]:
print('R2: ', metrics.r2_score(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
random_test = dict(val_type = 'random_test', 
                   r2 = np.corrcoef(y_test.apply(int), y_pred.apply(int))[0,1]**2,
                   r2_score = metrics.r2_score(y_test, y_pred), 
                   rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

R2:  0.999855421686747
Root Mean Squared Error: 0.030983866769658676


AttributeError: 'numpy.ndarray' object has no attribute 'apply'

In [213]:
# fit this model to the ag dataset and save that

In [214]:
# For a spatially informed split I instead do cross-validation by splitting california into 1 degree lon by 1 degree lat cubes. 
# To do this I first generate an extra column for my dataset called cv_fold which corresponds to its location

# def assign_cv_fold(d):
#     # to make 1x1 degree cubes, all I need to do is floor the lat and lon variables and combine them to make a unique coordinate
#     return "".join([math.floor(d['x']), math.floor(d['y'])])

# dataset = dataset.assign(cv_fold = dataset.apply(assign_cv_fold))

dataset = dataset.assign(cv_fold = lambda x: x.x.apply(math.floor)*1000 + x.y.apply(math.floor))
dataset.head()

Unnamed: 0,x,y,start_date,c,cv_fold
0,1,2,3,4,1002
1,4,5,6,8,4005
2,7,8,9,9,7008
3,10,11,12,12,10011
4,1,2,3,4,1002


In [215]:
# crossvalidate and make crossvalidation dataset

df = dataset

n_fold = len(set(df['cv_fold'])) # set is same as unique function in R
kf = GroupKFold(n_fold)
split = kf.split(df, groups = df['cv_fold'])

cv_df = pd.DataFrame(columns = ['cv_fold', 'start_date', 'ET', 'ET_pred'])

for i, (train_idx, test_idx) in enumerate(split):
    print(f'Starting training fold {i + 1} of {n_fold}.')
    _ = gc.collect()

    X_train = X[train_idx,:]
    X_test = X[test_idx,:]
    y_train = y[train_idx]
    y_test = y[test_idx]

    regressor = RandomForestRegressor(n_estimators=100, random_state=0) # I stick with the default recommended 100 trees in my forest
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    cv_fold = np.repeat(df.loc[test_idx]['cv_fold'].iloc[0], X_test.shape[0])
    df_to_append = pd.DataFrame({'cv_fold': cv_fold, 'start_date': X_test[:,df.columns.get_loc('start_date')], 'ET':y_test, 'ET_pred': y_pred})

    cv_df = cv_df.append(df_to_append, ignore_index = True)

print("Done!!")

# save this df
cv_df.to_csv(str(here("./data/for_analysis/full_cv_outputs_1x1.csv")), index=False)

Starting training fold 1 of 4.
Starting training fold 2 of 4.
Starting training fold 3 of 4.
Starting training fold 4 of 4.
Done!!


In [216]:
# get r2, rmse, and count by cv_fold

def r2_rmse(g):
    r2 = np.corrcoef(g['ET'].apply(int), g['ET_pred'].apply(int))[0,1]**2
    r2_score = metrics.r2_score(g['ET'], g['ET_pred'])
    rmse = np.sqrt(metrics.mean_squared_error(g['ET'], g['ET_pred']))
    count = g.shape[0]
    return pd.Series(dict(r2 = r2, r2_score = r2_score, rmse = rmse, count = count))

cv_stats = cv_df.groupby('cv_fold').apply(r2_rmse).reset_index()

# save this df
cv_stats.to_csv(str(here("./data/for_analysis/cv_fold_stats_1x1.csv")), index=False)

  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]
  c /= stddev[:, None]


In [218]:
# make a df for general stats for both the spatial cv and the random 20% test
spatial_cv = dict(val_type = "spatial_cv", 
                  r2 = np.corrcoef(cv_df['ET'].apply(int), cv_df['ET_pred'].apply(int))[0,1]**2,
                  r2_score = metrics.r2_score(cv_df['ET'], cv_df['ET_pred']), 
                  rmse = np.sqrt(metrics.mean_squared_error(cv_df['ET'], cv_df['ET_pred'])))

test_stats = pd.DataFrame([spatial_cv, random_test])
print(test_stats)

# save this df
test_stats.to_csv(here("./data/for_analysis/cv_fold_stats_1x1.csv"), index=False)

      val_type        r2  r2_score      rmse
0   spatial_cv  0.046707 -0.282443  3.240370
1  random_test  0.999855       NaN  0.030984
