In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.tree as tree
import sklearn.model_selection as ms
import sklearn.metrics as metrics

In [59]:
data = pd.read_csv('data/full.csv')

In [60]:
data['GW_MEAS_DATE'] = pd.to_numeric(pd.to_datetime(data['GW_MEAS_DATE']))

In [61]:
from collections import Counter

# Create a Counter object for the WID column.
counter = Counter(data['WID'])

# Create a list of WID values that have 100 or more depth samples.
wids = []
for i in counter.items():
    if i[1] >= 100:
        wids.append(i[0])

In [62]:
model = tree.DecisionTreeRegressor(max_depth=4)

In [63]:
well = data[data['WID'] == wids[0]]

In [64]:
x_cols = ['GW_MEAS_DATE', 'PRCP', 'TMAX', 'TMIN', 'ELEVATION']
y_cols = ['DEPTH']

In [65]:
well = well[x_cols+y_cols].reset_index(drop=True)

In [66]:
x_train, x_test, y_train, y_test = ms.train_test_split(well[x_cols], well[y_cols], test_size=0.2)

In [67]:
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

In [68]:
r2 = metrics.r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - len(x_test.columns))
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

In [69]:
print('R2:', r2)
print('Adjusted R2:', adj_r2)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)

R2: 0.945730563919186
Adjusted R2: 0.9425845096536315
MAE: 1.2340879455691358
MSE: 2.465789258199417
RMSE: 1.5702831777101278


In [56]:
r2

0.9032089194126874

In [57]:
adj_r2

0.8975978422771911

In [74]:
from scipy import stats
from itertools import combinations

cols = ['GW_MEAS_DATE', 'PRCP', 'TMAX', 'TMIN', 'ELEVATION']

# Create blank dataframe for the evaluations.
blank = pd.DataFrame(columns=['WID','R2','ADJ_R2','MAE','MSE','RMSE'])

f_res = pd.DataFrame(columns=['PARA','R2','ADJ_R2','MAE','MSE','RMSE'])
f_res_count = 0

t_count = 31

for n in range(1, 6):
        # Load linear regression model.
        model = tree.DecisionTreeRegressor(max_depth=6)
        combos = list(combinations(cols, n))
        for para in combos:
                parameters = list(para)
                para_id = "['{}']".format("', '".join(parameters))
                
                # Copy the blank dataframe to the results dataframe.
                res = blank.copy()

                # Set count for results dataframe.
                res_count = 0

                # Loop through the WID values. These have 100 or more DEPTH samples.
                for j in wids:
                        try:
                                # Create temp dataframe from the blank dataframe to store specific well evaluation results.
                                tdf = blank.copy()

                                # Set count for temp dataframe.
                                tdf_count = 0

                                # Create subset of the data dataframe for the specific well.
                                df = data[data['WID'] == j].copy()

                                # Sort Date values
                                df.sort_values(by='GW_MEAS_DATE', inplace=True)

                                df.reset_index(drop=True, inplace=True)

                                # Loop through model processing 3 times to get an average of the results.
                                for n in range(3):

                                        x_train, x_test, y_train, y_test = ms.train_test_split(
                                                df[parameters], df['DEPTH'], test_size=0.2
                                                )

                                        #x_train = x_train.to_numpy().reshape(-1, 1)
                                        #x_test = x_test.to_numpy().reshape(-1, 1)
                                        y_train = y_train.to_numpy()
                                        y_test = y_test.to_numpy()

                                        model.fit(x_train, y_train)

                                        y_pred = model.predict(x_test)

                                        r2 = metrics.r2_score(y_test, y_pred)
                                        adj_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - len(x_test.columns))
                                        mae = metrics.mean_absolute_error(y_test, y_pred)
                                        mse = metrics.mean_squared_error(y_test, y_pred)
                                        rmse = np.sqrt(mse)

                                        tdf.loc[tdf_count] = [j, r2, adj_r2, mae, mse, rmse]
                                        tdf_count += 1

                                # Add the temp dataframe to the results dataframe.
                                tdf = tdf.mean(numeric_only=True).to_frame().T.reset_index().rename(columns={'index':'WID'})
                                tdf['WID'] = j
                                res.loc[res_count] = tdf.loc[0]
                                res_count += 1
                                
                        except:
                                print(f'{j} failed')
                                continue

                # Add the results dataframe to the final results dataframe.
                res_mean = res.mean(numeric_only=True).to_frame().T.reset_index().rename(columns={'index':'PARA'})
                res_mean['PARA'] = para_id
                f_res.loc[f_res_count] = res_mean.loc[0]
                f_res_count += 1
                pct = (f_res_count / t_count) * 100
                print(f'{pct:.2f}%')

3.23%
6.45%
9.68%
12.90%
16.13%
19.35%
22.58%
25.81%
29.03%
32.26%
35.48%
38.71%
41.94%
45.16%
48.39%
51.61%
54.84%
58.06%
61.29%
64.52%
67.74%
70.97%
74.19%
77.42%
80.65%
83.87%
87.10%
90.32%
93.55%
96.77%
100.00%


In [75]:
f_res

Unnamed: 0,PARA,R2,ADJ_R2,MAE,MSE,RMSE
0,['GW_MEAS_DATE'],0.153451,0.153451,0.977158,9.833199,1.871986
1,['PRCP'],-0.208902,-0.208902,4.074467,46.505119,5.452131
2,['TMAX'],-0.427099,-0.427099,4.329969,52.144609,5.786183
3,['TMIN'],-0.417447,-0.417447,4.132182,47.56671,5.550357
4,['ELEVATION'],-0.054023,-0.054023,3.922257,41.566509,5.136541
5,"['GW_MEAS_DATE', 'PRCP']",0.519917,0.501005,0.980099,9.720639,1.925821
6,"['GW_MEAS_DATE', 'TMAX']",0.701079,0.691553,0.998982,8.412583,1.876464
7,"['GW_MEAS_DATE', 'TMIN']",0.438453,0.416967,1.014672,10.350367,1.987032
8,"['GW_MEAS_DATE', 'ELEVATION']",0.712696,0.703664,0.944348,8.107384,1.838147
9,"['PRCP', 'TMAX']",-0.551904,-0.60464,4.341037,52.705466,5.851645


In [71]:
f_res

Unnamed: 0,PARA,R2,ADJ_R2,MAE,MSE,RMSE
0,['GW_MEAS_DATE'],0.572102,0.555723,0.955876,9.037452,1.862853
1,['PRCP'],-0.200991,-0.239687,4.05179,45.325931,5.382292
2,['TMAX'],-0.476276,-0.527353,4.29797,50.148077,5.726227
3,['TMIN'],-0.343491,-0.388941,4.188728,48.803469,5.577627
4,['ELEVATION'],-0.049085,-0.083285,3.897397,40.226032,5.079521
5,"['GW_MEAS_DATE', 'PRCP']",0.583523,0.550102,0.966428,9.299028,1.898785
6,"['GW_MEAS_DATE', 'TMAX']",0.475922,0.433543,1.033341,10.055431,2.019439
7,"['GW_MEAS_DATE', 'TMIN']",0.531526,0.494517,1.020559,10.069598,1.991688
8,"['GW_MEAS_DATE', 'ELEVATION']",0.43376,0.383963,0.953101,9.121592,1.864729
9,"['PRCP', 'TMAX']",-0.537147,-0.646249,4.32903,51.884627,5.811943


In [76]:
f_res = f_res.sort_values(by='ADJ_R2', ascending=False).reset_index(drop=True)

In [73]:
f_res

Unnamed: 0,PARA,R2,ADJ_R2,MAE,MSE,RMSE
0,"['GW_MEAS_DATE', 'TMAX', 'ELEVATION']",0.660172,0.625502,1.032947,10.055494,2.002649
1,['GW_MEAS_DATE'],0.572102,0.555723,0.955876,9.037452,1.862853
2,"['GW_MEAS_DATE', 'PRCP']",0.583523,0.550102,0.966428,9.299028,1.898785
3,"['GW_MEAS_DATE', 'TMIN']",0.531526,0.494517,1.020559,10.069598,1.991688
4,"['GW_MEAS_DATE', 'TMAX']",0.475922,0.433543,1.033341,10.055431,2.019439
5,"['GW_MEAS_DATE', 'PRCP', 'TMAX', 'ELEVATION']",0.494604,0.406983,1.043534,9.883692,2.004917
6,"['GW_MEAS_DATE', 'PRCP', 'TMIN', 'ELEVATION']",0.483851,0.390596,1.031916,10.29765,2.010539
7,"['GW_MEAS_DATE', 'ELEVATION']",0.43376,0.383963,0.953101,9.121592,1.864729
8,"['GW_MEAS_DATE', 'PRCP', 'TMAX', 'TMIN']",0.47398,0.37947,1.058702,10.280149,2.03726
9,"['GW_MEAS_DATE', 'TMAX', 'TMIN', 'ELEVATION']",0.460208,0.362722,1.070013,10.930926,2.08106


In [77]:
f_res

Unnamed: 0,PARA,R2,ADJ_R2,MAE,MSE,RMSE
0,"['GW_MEAS_DATE', 'PRCP', 'ELEVATION']",0.745923,0.729173,0.956801,9.085489,1.836905
1,"['GW_MEAS_DATE', 'ELEVATION']",0.712696,0.703664,0.944348,8.107384,1.838147
2,"['GW_MEAS_DATE', 'TMAX']",0.701079,0.691553,0.998982,8.412583,1.876464
3,"['GW_MEAS_DATE', 'PRCP', 'TMAX']",0.661178,0.639102,1.021807,9.46828,1.982266
4,"['GW_MEAS_DATE', 'PRCP', 'TMAX', 'ELEVATION']",0.665461,0.630342,1.037114,9.922872,1.998134
5,"['GW_MEAS_DATE', 'PRCP', 'TMAX', 'TMIN']",0.622671,0.584728,1.068887,10.727807,2.035377
6,"['GW_MEAS_DATE', 'PRCP']",0.519917,0.501005,0.980099,9.720639,1.925821
7,"['GW_MEAS_DATE', 'TMIN']",0.438453,0.416967,1.014672,10.350367,1.987032
8,"['GW_MEAS_DATE', 'TMAX', 'TMIN', 'ELEVATION']",0.454054,0.382207,1.023472,10.040408,2.001735
9,"['GW_MEAS_DATE', 'TMAX', 'ELEVATION']",0.410969,0.360692,1.03074,9.882596,2.006316


In [None]:
from scipy import stats
from itertools import combinations

cols = ['GW_MEAS_DATE', 'PRCP', 'TMAX', 'TMIN', 'ELEVATION']

# Create blank dataframe for the evaluations.
blank = pd.DataFrame(columns=['WID','R2','ADJ_R2','MAE','MSE','RMSE'])

f_res = pd.DataFrame(columns=['PARA','R2','ADJ_R2','MAE','MSE','RMSE'])
f_res_count = 0

t_count = 31

for n in range(1, 6):
        # Load linear regression model.
        model = tree.DecisionTreeRegressor(max_depth=6)
        combos = list(combinations(cols, n))
        for para in combos:
                parameters = list(para)
                para_id = "['{}']".format("', '".join(parameters))
                
                # Copy the blank dataframe to the results dataframe.
                res = blank.copy()

                # Set count for results dataframe.
                res_count = 0

                # Loop through the WID values. These have 100 or more DEPTH samples.
                for j in wids:
                        try:
                                # Create temp dataframe from the blank dataframe to store specific well evaluation results.
                                tdf = blank.copy()

                                # Set count for temp dataframe.
                                tdf_count = 0

                                # Create subset of the data dataframe for the specific well.
                                df = data[data['WID'] == j].copy()

                                # Sort Date values
                                df.sort_values(by='GW_MEAS_DATE', inplace=True)

                                df.reset_index(drop=True, inplace=True)

                                # Loop through model processing 3 times to get an average of the results.
                                for n in range(3):

                                        x_train, x_test, y_train, y_test = ms.train_test_split(
                                                df[parameters], df['DEPTH'], test_size=0.2
                                                )

                                        #x_train = x_train.to_numpy().reshape(-1, 1)
                                        #x_test = x_test.to_numpy().reshape(-1, 1)
                                        y_train = y_train.to_numpy()
                                        y_test = y_test.to_numpy()

                                        model.fit(x_train, y_train)

                                        y_pred = model.predict(x_test)

                                        r2 = metrics.r2_score(y_test, y_pred)
                                        adj_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - len(x_test.columns))
                                        mae = metrics.mean_absolute_error(y_test, y_pred)
                                        mse = metrics.mean_squared_error(y_test, y_pred)
                                        rmse = np.sqrt(mse)

                                        tdf.loc[tdf_count] = [j, r2, adj_r2, mae, mse, rmse]
                                        tdf_count += 1

                                # Add the temp dataframe to the results dataframe.
                                tdf = tdf.mean(numeric_only=True).to_frame().T.reset_index().rename(columns={'index':'WID'})
                                tdf['WID'] = j
                                res.loc[res_count] = tdf.loc[0]
                                res_count += 1
                                
                        except:
                                print(f'{j} failed')
                                continue

                # Add the results dataframe to the final results dataframe.
                res_mean = res.mean(numeric_only=True).to_frame().T.reset_index().rename(columns={'index':'PARA'})
                res_mean['PARA'] = para_id
                f_res.loc[f_res_count] = res_mean.loc[0]
                f_res_count += 1
                pct = (f_res_count / t_count) * 100
                print(f'{pct:.2f}%')

In [97]:
model = tree.DecisionTreeRegressor(max_depth=3)

x_train, x_test, y_train, y_test = ms.train_test_split(data[x_cols], data['WATER_ELEVATION'], test_size=0.2)

In [98]:
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

In [99]:
r2 = metrics.r2_score(y_test, y_pred)
adj_r2 = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - len(x_test.columns))
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'R2: {r2}')
print(f'Adjusted R2: {adj_r2}')
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')

R2: 0.9680884853502995
Adjusted R2: 0.9680873648518874
MAE: 32.74839709735421
MSE: 2380.268108958048
RMSE: 48.78799144213715
