## Mayberry GPP Regression Using West Pond Data (10-fold Cross-Validation)

### Train Features
1. land surface temp (wp_LST.day)
2. sensible heat flux (wp_h)
3. latent heat flux (wp_le)
4. net radiation (net_rad)
5. avg air temp (avg_air_temp)

In [1]:
import sys
sys.path.append('../')
import exp
import regression as r

In [2]:
# Load the dataset returned by the project helper exp.get_exp1_data() and
# preview the first rows. The preview shows shared meteorological columns plus
# site-prefixed columns (wp_* and mb_*).
df = exp.get_exp1_data()
df.head()

Unnamed: 0,avg_air_temp,avg_soil_temp,doy,net_rad,year,wp_ch4_gf,wp_co2_gf,wp_er,wp_gpp,wp_h,...,mb_bnd2,mb_bnd3,mb_bnd7,mb_evi,mb_lswi,mb_ndvi,wp_LST.day,wp_LST.night,mb_LST.day,mb_LST.night
0,19.2,22.3,195,190.0,2012,4332.368657,-304.542172,145.072376,-449.614548,1447.549899,...,0.187575,0.025212,0.053137,0.298162,0.56237,0.6491,29.61,17.285,26.335,18.645
1,19.3,21.8,196,189.0,2012,5305.896768,-335.648791,150.278671,-485.927462,1921.833137,...,0.186562,0.024569,0.051306,0.296544,0.574074,0.6504,29.63,17.2325,26.4075,18.5925
2,20.3,21.9,197,187.0,2012,6215.371936,-313.150966,158.307017,-471.457982,1176.374322,...,0.18555,0.023925,0.049475,0.294925,0.585779,0.6517,29.65,17.18,26.48,18.54
3,16.8,22.4,198,186.0,2012,7129.353337,-339.900067,153.561669,-493.461736,2575.636175,...,0.184537,0.023281,0.047644,0.293306,0.597483,0.653,29.67,17.1275,26.5525,18.4875
4,17.0,21.5,199,151.0,2012,7070.768573,-319.771564,144.05348,-463.825044,1916.08126,...,0.183525,0.022638,0.045812,0.291687,0.609188,0.6543,29.69,17.075,26.625,18.435


In [3]:
# Predictor columns fed to every model in this notebook.
train_cols = ["wp_LST.day", "wp_h", "wp_le", "net_rad", "avg_air_temp"]
# Regression target column (kept as a one-element list, as featurize expects).
test_col = ["mb_gpp"]

# Extract the raw feature matrix / target from the frame, then run the shared
# preprocessing step, which also hands back the fitted scaler.
X_raw, Y_raw = exp.featurize(df, train_cols, test_col)
X, Y, scaler = r.preprocess(X_raw, Y_raw)

# Sanity check: (n_samples, n_features).
X.shape

(1028, 5)

In [4]:
# Cross-validate a random forest regressor on the selected features;
# feature_names is used to label the printed feature importances.
# NOTE(review): the estimator repr in the output shows random_state=None, so
# scores and importances will vary between runs -- consider fixing a seed.
r.random_forests_cross_val(X, Y, feature_names=train_cols)

Running Random Forests Cross Validation...
10-fold CV Acc Mean:  0.859997689207
CV Scores:  0.854104608063, 0.882607178299, 0.861833984627, 0.871217810771, 0.893061398507, 0.847300257134, 0.842415576594, 0.844418645768, 0.848627853407, 0.854389578897
OOB score: 0.862598095462
Feature Importances:
('wp_le', 0.43979250141814324)
('wp_LST.day', 0.22736138557216262)
('avg_air_temp', 0.13839147566433041)
('net_rad', 0.13300199239661781)
('wp_h', 0.06145264494874593)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [5]:
# Cross-validate gradient boosted trees on the same features.
# NOTE(review): despite the "xgb" in the helper name, the estimator repr in the
# output is sklearn's GradientBoostingRegressor, with random_state=None (results
# are not seeded).
r.xgb_trees_cross_val(X, Y, feature_names=train_cols)

Running Gradient Boosted Trees Cross Validation...
10-fold CV Acc Mean:  0.84391797134
CV Scores:  0.82949684828, 0.850965756478, 0.871584476819, 0.867571405919, 0.878779542623, 0.836216026071, 0.845353124672, 0.795711931529, 0.824075044926, 0.839425556081
Feature Importances:
('wp_le', 0.27960652158149896)
('wp_LST.day', 0.24690003020878962)
('net_rad', 0.18687339256142393)
('avg_air_temp', 0.16362414437404138)
('wp_h', 0.12299591127424625)


GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=3, max_features='sqrt', max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [6]:
# Cross-validate a support vector model on the same features.
# NOTE(review): the helper name and printed banner say "SVC", but the estimator
# repr in the output is SVR (RBF kernel) -- i.e. regression, not classification.
r.svc_cross_val(X, Y)

Running SVC Cross Validation...
10-fold CV Acc Mean:  0.518785083133
CV Scores:  0.550024539583, 0.5750983232, 0.456754251914, 0.559876929893, 0.584904954278, 0.513020715155, 0.56004903383, 0.4417556376, 0.416838033912, 0.529528411964


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [7]:
# Cross-validate a neural network regressor on the same features.
# Per the estimator repr in the output: TensorFlowEstimator with a tanh_dnn
# model_fn, SGD optimizer, steps=5000, tf_random_seed=42, verbose=1.
# NOTE(review): the loss log restarts at "Step #1" several times, presumably
# once per CV fold -- confirm against regression.dnn_cross_val.
r.dnn_cross_val(X, Y)

Running Neural Network Cross Validation...
Step #1, avg. loss: 74004.65625
Step #501, epoch #50, avg. loss: 15053.18066
Step #1001, epoch #100, avg. loss: 6272.86230
Step #1501, epoch #150, avg. loss: 4687.00977
Step #2001, epoch #200, avg. loss: 3681.34644
Step #2501, epoch #250, avg. loss: 3087.14209
Step #3001, epoch #300, avg. loss: 2601.05542
Step #3501, epoch #350, avg. loss: 2351.09790
Step #4001, epoch #400, avg. loss: 2083.46387
Step #4501, epoch #450, avg. loss: 1874.39978
Step #1, avg. loss: 64593.41016
Step #501, epoch #50, avg. loss: 15650.83301
Step #1001, epoch #100, avg. loss: 6380.18799
Step #1501, epoch #150, avg. loss: 4644.95215
Step #2001, epoch #200, avg. loss: 3630.39502
Step #2501, epoch #250, avg. loss: 3075.04199
Step #3001, epoch #300, avg. loss: 2580.29102
Step #3501, epoch #350, avg. loss: 2318.72119
Step #4001, epoch #400, avg. loss: 2060.11841
Step #4501, epoch #450, avg. loss: 1907.39856
Step #1, avg. loss: 69121.03906
Step #501, epoch #50, avg. loss: 15

TensorFlowEstimator(batch_size=100, class_weight=None,
          continue_training=False, early_stopping_rounds=None,
          keep_checkpoint_every_n_hours=10000, learning_rate=0.1,
          max_to_keep=5, model_fn=<function tanh_dnn at 0x10a847b90>,
          n_classes=0, num_cores=4, optimizer='SGD', steps=5000,
          tf_master='', tf_random_seed=42, verbose=1)