## ER Cross Validation

### Train Features
1. land surface temp (wp_LST.day)
2. sensible heat flux (wp_h)
3. latent heat flux (wp_le)
4. net radiation (net_rad)
5. avg air temp (avg_air_temp)

In [1]:
import sys

import numpy as np

sys.path.append('../')
import exp
import regression as r

In [2]:
# Load the dataset through the project-local `exp` helper (presumably the
# "experiment 1" table, going by the helper's name -- confirm in exp.py).
df = exp.get_exp1_data()
# Preview the first rows to sanity-check which columns are available.
df.head()

Unnamed: 0,avg_air_temp,avg_soil_temp,doy,net_rad,year,wp_ch4_gf,wp_co2_gf,wp_er,wp_gpp,wp_h,...,mb_bnd2,mb_bnd3,mb_bnd7,mb_evi,mb_lswi,mb_ndvi,wp_LST.day,wp_LST.night,mb_LST.day,mb_LST.night
0,19.2,22.3,195,190.0,2012,4332.368657,-304.542172,145.072376,-449.614548,1447.549899,...,0.187575,0.025212,0.053137,0.298162,0.56237,0.6491,29.61,17.285,26.335,18.645
1,19.3,21.8,196,189.0,2012,5305.896768,-335.648791,150.278671,-485.927462,1921.833137,...,0.186562,0.024569,0.051306,0.296544,0.574074,0.6504,29.63,17.2325,26.4075,18.5925
2,20.3,21.9,197,187.0,2012,6215.371936,-313.150966,158.307017,-471.457982,1176.374322,...,0.18555,0.023925,0.049475,0.294925,0.585779,0.6517,29.65,17.18,26.48,18.54
3,16.8,22.4,198,186.0,2012,7129.353337,-339.900067,153.561669,-493.461736,2575.636175,...,0.184537,0.023281,0.047644,0.293306,0.597483,0.653,29.67,17.1275,26.5525,18.4875
4,17.0,21.5,199,151.0,2012,7070.768573,-319.771564,144.05348,-463.825044,1916.08126,...,0.183525,0.022638,0.045812,0.291687,0.609188,0.6543,29.69,17.075,26.625,18.435


In [3]:
# Predictor columns, in the order handed to featurize(); the same list is reused
# by later cells as `feature_names` when feature importances are reported.
train_cols = ["wp_LST.day", "wp_h", "wp_le", "net_rad", "avg_air_temp"]
# Presumably builds the feature matrix X from `train_cols` and the target Y from
# the "wp_er" column -- NOTE(review): confirm return types in exp.featurize.
X, Y = exp.featurize(df, train_cols, ["wp_er"])
# preprocess() hands back new X/Y plus a `scaler` object; presumably it rescales
# the data -- TODO confirm in regression.py. X and Y are rebound here, so every
# later cell sees the preprocessed versions.
X, Y, scaler = r.preprocess(X, Y)
# Sanity check: the second dimension is expected to equal len(train_cols).
X.shape

(1028, 5)

In [4]:
# Cross-validate the random-forest regressor on the preprocessed data and report
# per-feature importances labelled with `train_cols`.
#
# The estimator returned by the helper reports random_state=None, in which case
# scikit-learn draws from NumPy's global RNG. Seed it right before the call so the
# CV scores, OOB score and importances are reproducible on every re-run of this
# cell (42 matches the tf_random_seed already used by the neural-network model).
np.random.seed(42)
r.random_forests_cross_val(X, Y, feature_names=train_cols)

Running Random Forests Cross Validation...
10-fold CV Acc Mean:  0.857337985068
CV Scores:  0.874681300328, 0.868921260455, 0.875852586341, 0.795715040219, 0.878931276735, 0.870778789434, 0.816441920564, 0.873346636766, 0.808799028991, 0.909912010852
OOB score: 0.858987369948
Feature Importances:
('wp_LST.day', 0.35723829424592546)
('wp_le', 0.27470074099284991)
('avg_air_temp', 0.20466046289626189)
('net_rad', 0.11232348011625294)
('wp_h', 0.051077021748709581)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [5]:
# Cross-validate the gradient-boosted-trees regressor on the preprocessed data and
# report per-feature importances labelled with `train_cols`.
#
# The estimator returned by the helper reports random_state=None together with
# max_features='sqrt', so feature sub-sampling draws from NumPy's global RNG.
# Seed it right before the call so the scores and importances are reproducible on
# every re-run of this cell (42 matches the tf_random_seed already used by the
# neural-network model).
np.random.seed(42)
r.xgb_trees_cross_val(X, Y, feature_names=train_cols)

Running Gradient Boosted Trees Cross Validation...
10-fold CV Acc Mean:  0.831542752229
CV Scores:  0.843868738761, 0.825418826583, 0.860540668419, 0.750514668222, 0.85004821534, 0.847289841829, 0.787336934199, 0.872095866296, 0.802508547125, 0.875805215519
Feature Importances:
('wp_LST.day', 0.24507578703390023)
('wp_le', 0.22793142628476207)
('net_rad', 0.20282047142521878)
('wp_h', 0.17519659874738891)
('avg_air_temp', 0.14897571650872984)


GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=3, max_features='sqrt', max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=200,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [6]:
# NOTE: despite the "svc" in the helper's name, the estimator it returns (shown in
# this cell's output) is an SVR with an RBF kernel, i.e. this is a regression fit.
# Unlike the tree-based cells, no feature_names are passed here.
r.svc_cross_val(X, Y)

Running SVC Cross Validation...
10-fold CV Acc Mean:  0.771182127639
CV Scores:  0.814170348489, 0.762410402554, 0.731739100746, 0.71165490303, 0.811906981889, 0.745120307347, 0.767490936253, 0.830131465544, 0.778929000292, 0.758267830248


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [7]:
# Cross-validate the neural-network regressor provided by the project helper.
# The returned estimator's repr reports verbose=1, steps=5000 and tf_random_seed=42.
# The output logs the loss every 500 steps and the step counter restarts at 1
# several times (presumably once per CV fold -- confirm), so expect long output.
r.dnn_cross_val(X, Y)

Running Neural Network Cross Validation...
Step #1, avg. loss: 16350.42480
Step #501, epoch #50, avg. loss: 1104.73523
Step #1001, epoch #100, avg. loss: 330.36124
Step #1501, epoch #150, avg. loss: 280.13647
Step #2001, epoch #200, avg. loss: 241.87595
Step #2501, epoch #250, avg. loss: 212.85681
Step #3001, epoch #300, avg. loss: 188.88225
Step #3501, epoch #350, avg. loss: 172.81244
Step #4001, epoch #400, avg. loss: 155.33730
Step #4501, epoch #450, avg. loss: 141.31464
Step #1, avg. loss: 16027.59473
Step #501, epoch #50, avg. loss: 1125.91309
Step #1001, epoch #100, avg. loss: 336.02170
Step #1501, epoch #150, avg. loss: 279.05429
Step #2001, epoch #200, avg. loss: 237.17784
Step #2501, epoch #250, avg. loss: 200.79710
Step #3001, epoch #300, avg. loss: 175.23323
Step #3501, epoch #350, avg. loss: 156.59172
Step #4001, epoch #400, avg. loss: 138.03038
Step #4501, epoch #450, avg. loss: 118.97449
Step #1, avg. loss: 15808.92383
Step #501, epoch #50, avg. loss: 1081.33923
Step #100

TensorFlowEstimator(batch_size=100, class_weight=None,
          continue_training=False, early_stopping_rounds=None,
          keep_checkpoint_every_n_hours=10000, learning_rate=0.1,
          max_to_keep=5, model_fn=<function tanh_dnn at 0x10df84b90>,
          n_classes=0, num_cores=4, optimizer='SGD', steps=5000,
          tf_master='', tf_random_seed=42, verbose=1)