In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics
import sklearn.model_selection as ms

In [None]:
from itertools import combinations

# Lazy iterator over every 2-element combination drawn from the two sample
# lists; with only two items there is exactly one such combination.
combos = combinations(
    (
        [0, 1, 2],
        [3, 4, 5],
    ),
    2,
)

In [None]:
# import itertools package
import itertools
from itertools import permutations

# Candidate values for the two hyperparameters being paired up
n_estimators = list(range(85, 95))
max_depth = list(range(55, 65))

# Every (n_estimators, max_depth) pairing, i.e. the Cartesian product of the
# two ranges: 10 x 10 = 100 tuples.
# permutations(n_estimators, len(max_depth)) would instead enumerate
# 10! = 3,628,800 orderings and hold ~36 million tuples in memory, which is
# not needed to cover the grid of hyperparameter pairs.
unique_combinations = list(itertools.product(n_estimators, max_depth))

# Report how many candidate pairs were generated
print(len(unique_combinations))

In [None]:
# Preview only the first few entries; displaying the whole list would flood
# the cell output and bloat the saved notebook.
unique_combinations[:5]

In [None]:
# Load the full dataset from a CSV path relative to the notebook's working
# directory. Later cells read the GW_MEAS_DATE, PRCP, TMAX, TMIN, ELEVATION,
# DEPTH and WATER_ELEVATION columns from this frame.
data = pd.read_csv('data/full.csv')

In [None]:
# Parse the measurement dates first, then store them back as plain numbers so
# the column can be fed to the regressors as an ordinary numeric feature.
# NOTE(review): for datetime64[ns] values pd.to_numeric presumably yields
# nanoseconds since the Unix epoch - confirm if the scale matters downstream.
gw_meas_datetimes = pd.to_datetime(data['GW_MEAS_DATE'])
data['GW_MEAS_DATE'] = pd.to_numeric(gw_meas_datetimes)

In [None]:
# Predictor columns shared by both models, and the two regression targets
# (index 0 = depth, index 1 = water elevation).
x_cols = ['GW_MEAS_DATE', 'PRCP', 'TMAX', 'TMIN', 'ELEVATION']
y_cols = ['DEPTH', 'WATER_ELEVATION']

# Fixed seed so the 80/20 partition - and every metric computed from it - is
# identical after Restart Kernel -> Run All. Using the same seed for both
# calls also puts the same rows in the test set for both targets.
SPLIT_SEED = 0

# Depth target split
d_x_train, d_x_test, d_y_train, d_y_test = ms.train_test_split(
    data[x_cols], data[y_cols[0]], test_size=0.2, random_state=SPLIT_SEED
)
# Water-elevation target split
we_x_train, we_x_test, we_y_train, we_y_test = ms.train_test_split(
    data[x_cols], data[y_cols[1]], test_size=0.2, random_state=SPLIT_SEED
)

In [None]:
from math import ceil

# Threshold used as min_samples_split for the absolute-error forest:
# 5% of the depth training rows, rounded up to a whole sample count.
min_samples = ceil(
    .05 * len(d_x_train)
)

In [None]:
# Show the computed threshold (5% of the depth training rows, rounded up)
min_samples

In [None]:
# Depth model, variant 1: 100 trees grown with the squared-error criterion and
# a fixed seed, trained on the depth training split. fit() returns the fitted
# estimator itself, so construction and training are chained.
d_model = RandomForestRegressor(
    criterion='squared_error',
    n_estimators=100,
    n_jobs=11,          # number of parallel workers requested for fit/predict
    random_state=0,
    verbose=1,
).fit(d_x_train, d_y_train)

In [None]:
# Predict depth for the held-out test features with the currently fitted d_model
d_predictions = d_model.predict(d_x_test)

In [None]:
# Score the depth predictions against the held-out depth targets.
r2 = metrics.r2_score(d_y_test, d_predictions)

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the number of
# test rows and p the number of feature columns. The "- 1" in the denominator
# accounts for the residual degrees of freedom.
adj_r2 = 1 - (1 - r2) * (len(d_y_test) - 1) / (len(d_y_test) - len(d_x_test.columns) - 1)

mae = metrics.mean_absolute_error(d_y_test, d_predictions)
mse = metrics.mean_squared_error(d_y_test, d_predictions)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target

print('Depth Metrics: (squared error)')
print('R2:', r2)
print('Adjusted R2:', adj_r2)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)

In [None]:
# Depth model, variant 2: same forest size and seed, but trees are grown with
# the absolute-error criterion and a node is only split when it holds at least
# `min_samples` rows. This rebinds `d_model`, replacing the previous variant.
d_model = RandomForestRegressor(
    criterion='absolute_error',
    min_samples_split=min_samples,
    n_estimators=100,
    n_jobs=11,          # number of parallel workers requested for fit/predict
    random_state=0,
    verbose=2,
).fit(d_x_train, d_y_train)

In [None]:
# Predict depth for the held-out test features with the currently fitted
# d_model (overwrites the predictions from the previous depth model)
d_predictions = d_model.predict(d_x_test)

In [None]:
# Score the depth predictions against the held-out depth targets.
r2 = metrics.r2_score(d_y_test, d_predictions)

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the number of
# test rows and p the number of feature columns. The "- 1" in the denominator
# accounts for the residual degrees of freedom.
adj_r2 = 1 - (1 - r2) * (len(d_y_test) - 1) / (len(d_y_test) - len(d_x_test.columns) - 1)

mae = metrics.mean_absolute_error(d_y_test, d_predictions)
mse = metrics.mean_squared_error(d_y_test, d_predictions)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target

print('Depth Metrics: (absolute error)')
print('R2:', r2)
print('Adjusted R2:', adj_r2)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)

In [None]:
# Water-elevation model: 100 trees grown with the squared-error criterion.
# random_state is fixed (matching the depth models) so the bootstrap samples
# and feature draws - and therefore the reported metrics - are reproducible.
we_model = RandomForestRegressor(
    n_estimators=100,
    criterion ='squared_error',
    random_state=0,
    n_jobs=11,
    verbose=2
    )

# Train on the water-elevation training split
we_model = we_model.fit(we_x_train, we_y_train)

In [None]:
# Predict water elevation for the held-out test features with the fitted we_model
we_predictions = we_model.predict(we_x_test)

In [None]:
# Score the water-elevation predictions against the held-out targets.
r2 = metrics.r2_score(we_y_test, we_predictions)

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the number of
# test rows and p the number of feature columns. The "- 1" in the denominator
# accounts for the residual degrees of freedom.
adj_r2 = 1 - (1 - r2) * (len(we_y_test) - 1) / (len(we_y_test) - len(we_x_test.columns) - 1)

mae = metrics.mean_absolute_error(we_y_test, we_predictions)
mse = metrics.mean_squared_error(we_y_test, we_predictions)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target

print('Water Elevation Metrics:')
print('R2:', r2)
print('Adjusted R2:', adj_r2)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)

### Optimize Hyperparameters

In [None]:
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor

# Build an untrained, seeded forest purely to inspect the hyperparameter
# values it would use when nothing else is specified.
rf = RandomForestRegressor(random_state = 42)
base_params = rf.get_params()

print('Parameters currently in use:\n')
pprint(base_params)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space sampled by the randomized hyperparameter search below.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))

# Number of features to consider at every split.
# 1.0 means "use every feature", which is what 'auto' meant for regressors;
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it makes
# every candidate that samples it fail to fit.
max_features = [1.0, 'sqrt']

# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = list(range(10, 111, 10))
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters.
# First create the base model to tune. It is seeded so that each candidate's
# forest is built deterministically; seeding only the search would fix which
# candidates are sampled but not how their trees are grown.
rf = RandomForestRegressor(random_state=42)

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model on the water-elevation training split
rf_random.fit(we_x_train, we_y_train)

In [None]:
# Hyperparameter setting selected as best by the fitted randomized search
rf_random.best_params_

In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based accuracy score for a fitted regressor.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(test_features)``.
    test_features : array-like
        Passed unchanged to ``model.predict``.
    test_labels : array-like
        True target values, one per prediction.

    Returns
    -------
    float
        ``100 - MAPE``, where MAPE is the mean of ``|error| / |label|`` in
        percent over the rows whose label is non-zero. NaN when every label
        is zero, because a percentage error is undefined there.
    """
    predictions = np.asarray(model.predict(test_features), dtype=float)
    actual = np.asarray(test_labels, dtype=float)
    errors = np.abs(predictions - actual)

    # Divide by the magnitude of the label so negative targets do not flip the
    # sign of the percentage error, and skip zero labels to avoid inf/NaN.
    nonzero = actual != 0
    if nonzero.any():
        mape = 100 * np.mean(errors[nonzero] / np.abs(actual[nonzero]))
    else:
        mape = float('nan')
    accuracy = 100 - mape

    print('Model Performance')
    # Error is reported in the target's own units, so no unit is named here.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy
# Reference point: a seeded forest with default hyperparameters, trained and
# scored on the water-elevation split. fit() returns the fitted estimator.
base_model = RandomForestRegressor(random_state = 42).fit(we_x_train, we_y_train)
base_accuracy = evaluate(base_model, we_x_test, we_y_test)

# Best candidate found by the randomized search, scored on the same test split
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, we_x_test, we_y_test)

# Relative change in accuracy of the tuned model versus the default one
print(
    'Improvement of {:0.2f}%.'.format(
        100 * (random_accuracy - base_accuracy) / base_accuracy
    )
)