Fill train gap and test with any primary solver (e.g. RFE) and then use LSTM to predict the test.

In [44]:
import numpy as np
import pandas as pd

import lightgbm as lgb
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from copy import deepcopy

from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args

from keras.models import Sequential
from keras.layers import LSTM, Bidirectional
from keras.layers import Dense

import sys
sys.path.insert(0, '../configs/')
sys.path.insert(0, '../src/')

import utils
import test_config as conf
import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [45]:
Xvar = conf.variables['xvar']
yvar = conf.variables['yvar']
frac = 0.7    #If missing data > 48 %, find a new test window

path_to_package = '/Users/pluto/Desktop/bag/tutoring/atbin/imputation/package/'

143:layer_train_test_set:Data used only between dates 2013-03-03 00:00:00 and 2013-07-31 23:30:00 (both inclusive).
164:layer_train_test_set:Test interval start: 2013-05-17 19:30:00 end: 2013-06-16 19:30:00


Number of missing days 30 > 21 = (0.7*30).
Gap condition not satisfied.
Number of missing days 24 > 21 = (0.7*30).
Gap condition not satisfied.


((7248, 12), (1441, 12), (5807, 12))

#### Data preprocessing

In [None]:
# Independent split
full_df = pd.read_csv(path_to_package + 'data_out/Gingin_L4_processed.csv', parse_dates=['DateTime'])

test_df_, train_df_ = train_test_split.layer_train_test_set(full_df, conf, missing_frac=frac)

df = pd.concat([test_df_, train_df_])
df.shape, test_df_.shape, train_df_.shape

In [47]:
# Split same as in imputation package
full_df = pd.read_csv(path_to_package + 'data_out/temp_full.csv', parse_dates=['DateTime'], 
            usecols= Xvar + [yvar, conf.variables['tvar'], 'Set_rank'])

test_df_ = full_df[full_df['Set_rank']=='test']
train_df_ = full_df[~(full_df['Set_rank']=='test')]

df = pd.concat([test_df_, train_df_])
df.shape, test_df_.shape, train_df_.shape

((7248, 12), (1441, 12), (5807, 12))

In [48]:
assert df.shape[0] == train_df_.shape[0]  + test_df_.shape[0]

In [49]:
test_df_.info(), train_df_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1441 entries, 3593 to 5033
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Ta        1441 non-null   float64       
 1   Ws        1441 non-null   float64       
 2   Fg        1441 non-null   float64       
 3   VPD       1441 non-null   float64       
 4   Fn        1441 non-null   float64       
 5   q         1441 non-null   float64       
 6   Ts        1441 non-null   float64       
 7   Sws       1441 non-null   float64       
 8   EVI       1441 non-null   float64       
 9   Set_rank  1441 non-null   object        
 10  DateTime  1441 non-null   datetime64[ns]
 11  Fc        1173 non-null   float64       
dtypes: datetime64[ns](1), float64(10), object(1)
memory usage: 146.4+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5807 entries, 0 to 7247
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    ----

(None, None)

### Filling training gap and test data using primary solver 

In [50]:
# --------- Training
# Layer 1 training parameters
N_FOLDS = 3
N_CALLS = 51
THRESHOLD = 0.05

# List of models

model_library = {
    'LGBM':
        {
            'params_space': [Integer(2, 10, name='num_leaves'),
                             Categorical(['regression'], name="objective"),
                             Integer(2, 10, name='min_data_in_leaf'),
                             Real(10 ** -4, 10 ** 0, "uniform", name='learning_rate'),
                             Integer(100, 500, name='n_estimators')],
            'model_instance': lgb.LGBMRegressor(),
            'data': 'subset1'},

    'RFE':
        {
            'params_space': [Integer(2, 25, name='max_depth'),
                             Integer(2, 15, name='min_samples_leaf'),
                             Integer(2, 15, name='min_samples_split'),
                             Integer(100, 500, name='n_estimators')],
            'model_instance': RandomForestRegressor(),
            'data': 'subset2'
        },
}

solvers = ['LGBM']

In [51]:
print(yvar, Xvar)

Fc ['Ta', 'Ws', 'Fg', 'VPD', 'Fn', 'q', 'Ts', 'Sws', 'EVI']


In [52]:
train_primary = train_df_[~train_df_[yvar].isna()]
X_train_primary = train_primary[Xvar]
y_train_primary = train_primary[yvar]

train_primary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3945 entries, 793 to 7239
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Ta        3945 non-null   float64       
 1   Ws        3945 non-null   float64       
 2   Fg        3945 non-null   float64       
 3   VPD       3945 non-null   float64       
 4   Fn        3945 non-null   float64       
 5   q         3945 non-null   float64       
 6   Ts        3945 non-null   float64       
 7   Sws       3945 non-null   float64       
 8   EVI       3945 non-null   float64       
 9   Set_rank  3945 non-null   object        
 10  DateTime  3945 non-null   datetime64[ns]
 11  Fc        3945 non-null   float64       
dtypes: datetime64[ns](1), float64(10), object(1)
memory usage: 400.7+ KB


In [53]:
for key in solvers:
    label = f'val_single_{key}'
    val = deepcopy(model_library[key])
    reg = val['model_instance']
    params_space = val['params_space']


    # Bayesian opt. part
    @use_named_args(params_space)
    def jth_objective(**params):
        cls = reg.set_params(**params)
        return utils.objective_core(cls, X_train_primary, y_train_primary,
                                    label, [1,0],
                                    nfolds=N_FOLDS, **params)


    res = gp_minimize(jth_objective, params_space, n_calls=N_CALLS, random_state=0)
    "Best score=%.4f" % res.fun

    # Generating final optimized model instance
    print("Optimal parameters")
    params = {}
    for param, value in zip(params_space, res.x):
        print(f"Param: {param.name}, value: {value}")
        params[param.name] = value

    jth_model = reg.set_params(**params)
    jth_model.fit(X_train_primary.values, y_train_primary.values)

    # Model instance for ensemble
    model_library[key]['model_instance_single'] = jth_model

------ Sampling new data point ------
RMSE: 3.568, R^2: 0.245, MBE: -0.141
RMSE: 3.321, R^2: 0.306, MBE: 0.073
RMSE: 3.455, R^2: 0.290, MBE: 0.074
Params: {'num_leaves': 7, 'objective': 'regression', 'min_data_in_leaf': 9, 'learning_rate': 0.8472670136102473, 'n_estimators': 349, 'n_jobs': -1}
Score: 3.4483547966607095
------ Sampling new data point ------
RMSE: 3.018, R^2: 0.383, MBE: 0.005
RMSE: 3.089, R^2: 0.354, MBE: 0.049
RMSE: 2.792, R^2: 0.437, MBE: -0.079
Params: {'num_leaves': 5, 'objective': 'regression', 'min_data_in_leaf': 2, 'learning_rate': 0.27272902895065526, 'n_estimators': 291, 'n_jobs': -1}
Score: 2.9662875669689908
------ Sampling new data point ------
RMSE: 3.111, R^2: 0.355, MBE: 0.023
RMSE: 3.281, R^2: 0.319, MBE: 0.035
RMSE: 3.385, R^2: 0.333, MBE: 0.120
Params: {'num_leaves': 8, 'objective': 'regression', 'min_data_in_leaf': 5, 'learning_rate': 0.836095155661024, 'n_estimators': 235, 'n_jobs': -1}
Score: 3.2586259074137836
------ Sampling new data point ------


------ Sampling new data point ------
RMSE: 2.367, R^2: 0.505, MBE: -0.041
RMSE: 3.007, R^2: 0.425, MBE: -0.081
RMSE: 2.895, R^2: 0.423, MBE: 0.110
Params: {'num_leaves': 4, 'objective': 'regression', 'min_data_in_leaf': 5, 'learning_rate': 0.02857873416524894, 'n_estimators': 132, 'n_jobs': -1}
Score: 2.75637120878309
------ Sampling new data point ------
RMSE: 2.703, R^2: 0.450, MBE: -0.153
RMSE: 3.021, R^2: 0.403, MBE: 0.082
RMSE: 2.632, R^2: 0.465, MBE: 0.067
Params: {'num_leaves': 2, 'objective': 'regression', 'min_data_in_leaf': 3, 'learning_rate': 0.03110790731852642, 'n_estimators': 378, 'n_jobs': -1}
Score: 2.7853421306577033
------ Sampling new data point ------
RMSE: 2.914, R^2: 0.397, MBE: 0.131
RMSE: 2.692, R^2: 0.480, MBE: -0.063
RMSE: 2.703, R^2: 0.460, MBE: -0.035
Params: {'num_leaves': 3, 'objective': 'regression', 'min_data_in_leaf': 5, 'learning_rate': 0.026872920970838692, 'n_estimators': 349, 'n_jobs': -1}
Score: 2.769835677691207
------ Sampling new data point ---

In [54]:
# Predicting for entire X
for key in solvers:
    val = model_library[key]
    cls = val[f'model_instance_single']
    df['yall_predicted'] = cls.predict(df[Xvar])
    
assert df['yall_predicted'].shape[0] == df.shape[0]

In [55]:
# Labelling test as nan so that in next step we can predict for Xtest and gaps in X_train
# at once.
new_yvar = yvar + '_filled'

df[new_yvar] = df[yvar].copy()
test_filter = (df['Set_rank']=='test')
df.loc[test_filter, new_yvar] = np.nan

assert test_df_.shape[0] == df[test_filter].shape[0]

df[new_yvar] = df[new_yvar].fillna(df['yall_predicted'])
df.drop(columns={'yall_predicted'}, inplace=True)

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7248 entries, 3593 to 7247
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Ta         7248 non-null   float64       
 1   Ws         7248 non-null   float64       
 2   Fg         7248 non-null   float64       
 3   VPD        7248 non-null   float64       
 4   Fn         7248 non-null   float64       
 5   q          7248 non-null   float64       
 6   Ts         7248 non-null   float64       
 7   Sws        7248 non-null   float64       
 8   EVI        7248 non-null   float64       
 9   Set_rank   7248 non-null   object        
 10  DateTime   7248 non-null   datetime64[ns]
 11  Fc         5118 non-null   float64       
 12  Fc_filled  7248 non-null   float64       
dtypes: datetime64[ns](1), float64(11), object(1)
memory usage: 792.8+ KB


In [57]:
# Combining data frame for scaling
ymean, ystd = df[new_yvar].mean(), df[new_yvar].std()
yvar_val = df[new_yvar].values
yvar_val = (yvar_val - ymean)/ystd
yscale = (ymean, ystd)

In [58]:
dtime = df['DateTime'].values
set_rank = df['Set_rank'].values

scaler = StandardScaler()
df = scaler.fit_transform(df[Xvar])
df = pd.DataFrame.from_records(df, columns=Xvar)
df['DateTime'] = dtime
df['Set_rank'] = set_rank
df[new_yvar + '_scaled'] = yvar_val
df.sort_values('DateTime', inplace=True)

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7248 entries, 1441 to 7247
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Ta                7248 non-null   float64       
 1   Ws                7248 non-null   float64       
 2   Fg                7248 non-null   float64       
 3   VPD               7248 non-null   float64       
 4   Fn                7248 non-null   float64       
 5   q                 7248 non-null   float64       
 6   Ts                7248 non-null   float64       
 7   Sws               7248 non-null   float64       
 8   EVI               7248 non-null   float64       
 9   DateTime          7248 non-null   datetime64[ns]
 10  Set_rank          7248 non-null   object        
 11  Fc_filled_scaled  7248 non-null   float64       
dtypes: datetime64[ns](1), float64(10), object(1)
memory usage: 736.1+ KB


In [60]:
# ------- Single model run ------
Xtrain_ = df.loc[df['Set_rank']!='test', Xvar]
ytrain_ = df.loc[df['Set_rank']!='test', new_yvar + '_scaled']

Xtest_ = df.loc[df['Set_rank']=='test', Xvar]
ytest_ = df.loc[df['Set_rank']=='test', new_yvar + '_scaled']

print('Train data:', Xtrain_.shape, ytrain_.shape)
print('Test data:', Xtest_.shape, ytest_.shape)

Train data: (5807, 9) (5807,)
Test data: (1441, 9) (1441,)


### LSTM with yvar = NAN removed

In [69]:
# LSTM -- Single
NSTEPS = 5
NFEATURES = Xtrain_.shape[1]

# convert into input/output sequences
dataset_train = np.column_stack((Xtrain_, ytrain_))
dataset_trainX, dataset_trainy = utils.split_sequences(dataset_train, NSTEPS)
print(dataset_trainX.shape, dataset_trainy.shape)

# define model
model_lstm = Sequential()
#model_lstm.add(LSTM(5, input_shape=(NSTEPS, NFEATURES), activation='relu', dropout=0.5, recurrent_dropout=0.5))
model_lstm.add(Bidirectional(LSTM(5, input_shape=(NSTEPS, NFEATURES), activation='relu', dropout=0.5, recurrent_dropout=0.5)))
#model_lstm.add(Dense(3, kernel_initializer='normal', activation='relu'))
model_lstm.add(Dense(1, activation='linear'))
model_lstm.compile(optimizer='adam', loss='mean_squared_error')
history = model_lstm.fit(dataset_trainX, dataset_trainy,
                            validation_split=0.5, shuffle=False,
                            epochs=50, batch_size=32, verbose=2)

dataset_test = np.column_stack((Xtest_, ytest_))
dataset_testX, dataset_testy = utils.split_sequences(dataset_test, n_steps=NSTEPS)
yhat_test = model_lstm.predict(dataset_testX, verbose=0)

metric_lstm = utils.diagnostic_stats(dataset_testy*ystd + ymean,
                                     yhat_test.squeeze()*ystd + ymean)

print(yhat_test.shape, dataset_testy.shape)
yhat_test = np.concatenate((np.array([np.nan]*(NSTEPS-1)), yhat_test.squeeze()))

(5803, 5, 9) (5803,)
Train on 2901 samples, validate on 2902 samples
Epoch 1/50
 - 5s - loss: 3.8783 - val_loss: 1.2637
Epoch 2/50
 - 1s - loss: 2.6373 - val_loss: 1.2807
Epoch 3/50
 - 1s - loss: 1.6523 - val_loss: 1.2495
Epoch 4/50
 - 1s - loss: 1.4871 - val_loss: 1.2372
Epoch 5/50
 - 1s - loss: 1.1922 - val_loss: 1.1988
Epoch 6/50
 - 1s - loss: 1.0534 - val_loss: 1.1712
Epoch 7/50
 - 1s - loss: 1.0360 - val_loss: 1.1513
Epoch 8/50
 - 1s - loss: 0.9628 - val_loss: 1.1279
Epoch 9/50
 - 1s - loss: 0.9092 - val_loss: 1.1023
Epoch 10/50
 - 1s - loss: 0.8790 - val_loss: 1.0893
Epoch 11/50
 - 1s - loss: 0.8559 - val_loss: 1.0678
Epoch 12/50
 - 1s - loss: 0.8269 - val_loss: 1.0400
Epoch 13/50
 - 1s - loss: 0.7925 - val_loss: 1.0204
Epoch 14/50
 - 1s - loss: 0.7460 - val_loss: 1.0070
Epoch 15/50
 - 1s - loss: 0.7920 - val_loss: 0.9820
Epoch 16/50
 - 1s - loss: 0.7402 - val_loss: 0.9537
Epoch 17/50
 - 1s - loss: 0.7323 - val_loss: 0.9403
Epoch 18/50
 - 1s - loss: 0.7277 - val_loss: 0.9202
Epoc

In [70]:
yhat_test.shape

(1441,)

In [62]:
#df.loc[test_filter, yvar + f'_predicted_test_filled_LSTM'] = yhat_test * ystd + ymean

utils.SCORES['LSTM' + '_' + 'single'] = {'rmse':metric_lstm[0],
                                          'rsqr':metric_lstm[1],
                                          'mbe':metric_lstm[2],
                                          'corr':metric_lstm[3],
                                          'stddev':metric_lstm[4]}

In [63]:
utils.SCORES

{'val_single_LGBM': {'rmse': 2.83968335151147,
  'rsqr': 0.4250878765268005,
  'mbe': -0.01877053754209072,
  'corr': 0.6509234707622525,
  'stddev': 2.759883505851562},
 'LSTM_single': {'rmse': 1.084089379496343,
  'rsqr': 0.8127185095144449,
  'mbe': 0.10242508023806093,
  'corr': 0.9015090102491516,
  'stddev': 2.2625182}}

In [64]:
pd.DataFrame.from_dict(utils.SCORES).T.round(3)

Unnamed: 0,rmse,rsqr,mbe,corr,stddev
val_single_LGBM,2.84,0.425,-0.019,0.651,2.76
LSTM_single,1.084,0.813,0.102,0.902,2.263


In [67]:
#Scores from imputation package

In [68]:
pd.read_csv(path_to_package + "data_out/temp_full_score.csv")

Unnamed: 0,Models,rmse,rsqr,mbe,corr,stddev
0,val_Layer1_LGBM,0.512,0.372,0.004,0.609,0.425
1,val_Layer1_RFE,0.51,0.374,-0.001,0.601,0.379
2,val_Layer1_SVM,0.476,0.413,0.003,0.637,0.43
3,val_Layer1_GP,0.466,0.365,0.035,0.603,0.355
4,val_Layer1_ANN,0.462,0.404,-0.013,0.626,0.368
5,val_Layer2_ensemble_LGBM,0.469,0.439,0.001,0.663,0.424
6,val_Layer2_single_LGBM,0.459,0.445,-0.0,0.666,0.409
7,val_single_LGBM,2.744,0.455,-0.013,0.674,2.566
8,LSTM_single,2.648,0.195,2.351,0.442,1.072
9,Layer2_LGBM_single,2.513,0.518,-0.237,0.719,2.444


In [None]:
e