import the basics required packages.

In [3]:
#  Import some data manipulation and plotting packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#ignore warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error,median_absolute_error,mean_squared_log_error, r2_score
from sklearn.model_selection import GridSearchCV

# Read Datasets

In [6]:
# Read all the given datasets into separate dataframes
aquifers = pd.read_csv("Feature-Engineered-Datasets/aquifers_fe.csv")
lakes = pd.read_csv('Feature-Engineered-Datasets/lakes_fe.csv')
rivers = pd.read_csv('Feature-Engineered-Datasets/rivers_fe.csv')
springs = pd.read_csv('Feature-Engineered-Datasets/springs_fe.csv')

# Model

# ==============================================================
# ==============================================================
## ============================= aquifers ====================================
# ==============================================================
# ==============================================================

In [7]:
aquifers.head()

Unnamed: 0,Mean_Rainfall,Mean_Temp,Actual_Depth,Actual_Volume,Actual_Hydrometry,Date
0,0.415556,6.625,-6.08026,-8019.271158,-0.083056,1998-01-04
1,2.054444,6.075,-6.06452,-7956.571285,-0.104167,1998-01-05
2,0.921111,9.0875,-6.15706,-7715.808854,0.011944,1998-01-06
3,0.878889,12.325,-6.10774,-7731.378766,-0.008611,1998-01-07
4,0.908889,12.65,-6.0531,-7812.676449,-0.072222,1998-01-08


In [8]:
# Normalizing the columns to remove negative values
aquifers['Mean_Rainfall'] = (aquifers['Mean_Rainfall'] - aquifers['Mean_Rainfall'].min()) / (aquifers['Mean_Rainfall'].max() - aquifers['Mean_Rainfall'].min())
aquifers['Mean_Temp'] = (aquifers['Mean_Temp'] - aquifers['Mean_Temp'].min()) / (aquifers['Mean_Temp'].max() - aquifers['Mean_Temp'].min())
aquifers['Actual_Depth'] = (aquifers['Actual_Depth'] - aquifers['Actual_Depth'].min()) / (aquifers['Actual_Depth'].max() - aquifers['Actual_Depth'].min())
aquifers['Actual_Volume'] = (aquifers['Actual_Volume'] - aquifers['Actual_Volume'].min()) / (aquifers['Actual_Volume'].max() - aquifers['Actual_Volume'].min())
aquifers['Actual_Hydrometry'] = (aquifers['Actual_Hydrometry'] - aquifers['Actual_Hydrometry'].min()) / (aquifers['Actual_Hydrometry'].max() - aquifers['Actual_Hydrometry'].min())

In [9]:
aquifers.head()

Unnamed: 0,Mean_Rainfall,Mean_Temp,Actual_Depth,Actual_Volume,Actual_Hydrometry,Date
0,0.007267,0.295126,0.984879,0.395764,0.090032,1998-01-04
1,0.035926,0.276216,0.985372,0.401094,0.084072,1998-01-05
2,0.016108,0.379792,0.982474,0.421561,0.116854,1998-01-06
3,0.015369,0.491104,0.984019,0.420238,0.11105,1998-01-07
4,0.015894,0.502278,0.985729,0.413327,0.093091,1998-01-08


In [10]:
# separating the date and target feature
X = aquifers.drop(['Date','Actual_Depth'],axis=1)
y = aquifers['Actual_Depth']

In [11]:
#Dividing the dataset into train and test for features as well as labels
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [12]:
#checking the number of rows in training data
len(X_train)

6523

In [14]:

#checking the number of rows in training labels
len(y_train)

6523

In [15]:
#checking the number of rows in testing data
len(X_test)

1631

In [16]:
#checking the number of rows in testing labels
len(y_test)

1631

# ================================================================

### knn

In [17]:
# create a knn regressor
neigh = KNeighborsRegressor()

In [18]:
# find optimal parameters with grid search
param_grid = {
    'n_neighbors': [3,5,7,9,11,19],
    'weights': ['uniform', 'distance'],
    'metric' : ['euclidean','manhattan']
}
grid = GridSearchCV(estimator=neigh, param_grid=param_grid, cv=10,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [19]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [20]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.0028144705118891005
Best Params:  {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}


In [21]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [22]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [23]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.02103068051342738

In [24]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.05328383316662198

In [25]:
# find r2 score
r2_score(y_test,y_pred)

0.9607730924181563

## ===========================================================================


### Multiple Linear Regression

In [26]:
# create a linear regressor
lr = LinearRegression()

In [27]:
# fit the lr on the train data
lr = lr.fit(X_train, y_train)

In [28]:
# find predictions for test data
y_pred = lr.predict(X_test)

In [29]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [30]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.3866462761518887

In [31]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.31056503793039225

In [32]:
# find r2 score
r2_score(y_test,y_pred)

-0.361099450434454

R2  is negative only when the chosen model does not follow the trend of the data, so fits worse than a horizontal line.

## ==========================================================================================================================================================

### Random Forest

In [None]:
# create a random forest regressor
rf = RandomForestRegressor()

In [None]:
# find optimal parameters with grid search
param_grid = {
'bootstrap': [True, False],
 'max_depth': [None,2,10, 20, 50, 100],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100,200,500, 1000]
             }
grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 33.6min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 45.0min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 79.3min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 122.7min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 196.9min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 242.8min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 358.7min
[Parallel(n_jobs=-1)]: Done 4320 out of 4320 | elapsed: 392.0min finished


In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.0021697543201036028
Best Params:  {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.019655865584285948

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.04842439087861878

In [None]:
# find r2 score
r2_score(y_test,y_pred)

0.9668303549718164

# ===============================================================

### SGDRegression

In [33]:
# create a linear regressor
lr = SGDRegressor()

In [34]:
# find optimal parameters with grid search
param_grid = {
    
    'loss' : ['squared_loss','huber'],
    'alpha': [0.0001,0.001,0.01,0.1],
     'eta0': [0.01,0.1,1,10,100],
    'tol' : [0.00001,0.0001,0.001,0.01,0.1]    
 }
grid = GridSearchCV(estimator=lr, param_grid=param_grid,cv=5,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [35]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [36]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.04661886028567611
Best Params:  {'alpha': 0.01, 'eta0': 0.01, 'loss': 'huber', 'tol': 0.01}


In [37]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [38]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [39]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.21549702959981842

In [40]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.2411878618526684

In [41]:
# find r2 score
r2_score(y_test,y_pred)

0.11108558019503556

# ================================================================ 

### Decision Tree Regressor

In [None]:
dt = DecisionTreeRegressor()

In [None]:
# find optimal parameters with grid search
param_grid = {
    
    'criterion' : ['mse','mae'],
    'max_depth': range(10,20),
     'min_samples_split': range(2,10),
    'min_samples_leaf' : range(1,5)    
 }
grid = GridSearchCV(estimator=dt, param_grid=param_grid, cv=10,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 10 folds for each of 640 candidates, totalling 6400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:   53.0s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 40.9min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 64.8min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 90.8min
[Parallel(n_jobs=-1)]: Done 6400 out of 6400 | elapsed: 99.5min finished


In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.0029725895160034803
Best Params:  {'criterion': 'mae', 'max_depth': 12, 'min_samples_leaf': 2, 'min_samples_split': 4}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.01684240996782499

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.05367391890221074

In [None]:
# find r2 score
r2_score(y_test,y_pred)

0.9561599730076912

# ================================================================

### XGBoost Regressor

In [None]:
xgbtree = XGBRegressor()

In [None]:
# find optimal parameters with grid search
param_grid = {
              'learning_rate': [0.001,0.01,0.1,1],
              'max_depth': range(2,10),
              'subsample':[0.01,0.1,1] ,
              'colsample_bytree': [0.01,0.1,1],
              'n_estimators': [100,200,500,1000]}
grid = GridSearchCV(estimator=xgbtree, param_grid=param_grid, cv=5,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 19.9min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 27.1min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 35.3min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 45.0min
[Parallel(n_jobs=-1)]: Done 4996 tasks      | elapsed: 60.7min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 74.4min finished




In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.016081142855098807
Best Params:  {'colsample_bytree': 1, 'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 1000, 'subsample': 1}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.02528232677151125

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.053061569375817304

In [None]:
# find r2 score
r2_score(y_test,y_pred)

0.9585200503118089

# ================================================================

### AdaBoost Regressor

In [42]:
abreg = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)

In [43]:
# find optimal parameters with grid search
param_grid = { 'loss' : ['linear', 'square', 'exponential'],
              'learning_rate': [0.001,0.01,0.1,1],
              'base_estimator__max_depth': range(2,10),
              'n_estimators': [100,200,500,1000]}
grid = GridSearchCV(estimator=abreg, param_grid=param_grid, cv=5,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 27.6min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 53.7min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 100.0min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 108.9min finished


In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.014184367982375096
Best Params:  {'base_estimator__max_depth': 9, 'learning_rate': 1, 'loss': 'square', 'n_estimators': 200}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.02192634636241439

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.051313886313701484

In [None]:
# find r2 score
r2_score(y_test,y_pred)

0.9641048020090041

# ===============================================================

### MLP Regressor

In [44]:
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

In [None]:
import datetime

In [46]:
# Creating a Neural Network Model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

In [47]:
model = Sequential()
model.add(Dense(1000,activation='relu'))
model.add(Dense(1000,activation='relu'))
model.add(Dense(1000,activation='relu'))
model.add(Dense(1000,activation='relu'))
model.add(Dense(1))

In [48]:
import tensorflow.keras.backend as K

In [49]:

def rmsle(y_true, y_pred):
    msle = tf.keras.losses.MeanSquaredLogarithmicError()
    return K.sqrt(msle(y_true, y_pred)) 

In [50]:
model.compile(optimizer='Adam',loss=rmsle,metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [52]:
model.fit(x=X_train,y=y_train,
          validation_data=(X_test,y_test),
          
          batch_size=128,epochs=100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x1436b00e3a0>

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              5000      
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_3 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 1001      
Total params: 3,009,001
Trainable params: 3,009,001
Non-trainable params: 0
_________________________________________________________________


In [53]:
K.clear_session()

In [54]:
y_pred = model.predict(X_test)

In [55]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.019173547932702206

In [56]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.047100793422137466

In [57]:
# find r2 score
r2_score(y_test,y_pred)

0.9680868404916957

# =============================================================
# =============================================================
## ============================= Lakes =====================================
# =============================================================
# =============================================================

In [58]:
lakes.head()

Unnamed: 0,Date,Mean_Rainfall,Mean_Temp,Actual_Flow_Rate,Actual_Lake_Level
0,0,2.857312,14.530141,2.578255,249.606745
1,1,2.857312,14.530141,2.578255,249.606745
2,2,2.857312,14.530141,2.578255,249.606745
3,3,2.857312,14.530141,2.578255,249.606745
4,4,2.857312,14.530141,2.578255,249.606745


In [59]:
# Normalizing the columns to remove negative values
lakes['Mean_Rainfall'] = (lakes['Mean_Rainfall'] - lakes['Mean_Rainfall'].min()) / (lakes['Mean_Rainfall'].max() - lakes['Mean_Rainfall'].min())
lakes['Mean_Temp'] = (lakes['Mean_Temp'] - lakes['Mean_Temp'].min()) / (lakes['Mean_Temp'].max() - lakes['Mean_Temp'].min())
lakes['Actual_Flow_Rate'] = (lakes['Actual_Flow_Rate'] - lakes['Actual_Flow_Rate'].min()) / (lakes['Actual_Flow_Rate'].max() - lakes['Actual_Flow_Rate'].min())
lakes['Actual_Lake_Level'] = (lakes['Actual_Lake_Level'] - lakes['Actual_Lake_Level'].min()) / (lakes['Actual_Lake_Level'].max() - lakes['Actual_Lake_Level'].min())

In [60]:
lakes.head()

Unnamed: 0,Date,Mean_Rainfall,Mean_Temp,Actual_Flow_Rate,Actual_Lake_Level
0,0,0.033568,0.505213,0.028683,0.658369
1,1,0.033568,0.505213,0.028683,0.658369
2,2,0.033568,0.505213,0.028683,0.658369
3,3,0.033568,0.505213,0.028683,0.658369
4,4,0.033568,0.505213,0.028683,0.658369


In [61]:
# separating the date and target feature
X = lakes.drop(['Date','Actual_Flow_Rate','Actual_Lake_Level'],axis=1)
y = lakes[['Actual_Flow_Rate','Actual_Lake_Level']]

In [62]:
#Dividing the dataset into train and test for features as well as labels
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [63]:
#checking the number of rows in training data
len(X_train)

5282

In [64]:
#checking the number of rows in training labels
len(y_train)

5282

In [65]:
#checking the number of rows in testing data
len(X_test)

1321

In [66]:
#checking the number of rows in testing labels
len(y_test)

1321

# ================================================================

### knn

In [67]:
# create a knn regressor
neigh = KNeighborsRegressor()

In [68]:
# find optimal parameters with grid search
param_grid = {
    'n_neighbors': [3,5,7,9,11,19],
    'weights': ['uniform', 'distance'],
    'metric' : ['euclidean','manhattan']
}
grid = GridSearchCV(estimator=neigh, param_grid=param_grid,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [69]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [70]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.007168036852265819
Best Params:  {'metric': 'euclidean', 'n_neighbors': 19, 'weights': 'uniform'}


In [71]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [72]:
# normalizing the predicted values
y_pred[:,0] = (y_pred[:,0] - y_pred[:,0].min())/(y_pred[:,0].max() - y_pred[:,0].min())

In [73]:

# normalizing the predicted values
y_pred[:,1] = (y_pred[:,1] - y_pred[:,1].min())/(y_pred[:,1].max() - y_pred[:,1].min())

In [74]:
# find median absolute error
median_absolute_error(y_test.iloc[:,0],y_pred[:,0])

0.11279539978027128

In [75]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test.iloc[:,0],y_pred[:,0]))

0.1570520689352546

In [76]:
# find r2 score
r2_score(y_test.iloc[:,0],y_pred[:,0])

-18.42880247998059

#### the secoond prediction

In [77]:
# find median absolute error
median_absolute_error(y_test.iloc[:,1],y_pred[:,1])

0.1811120486734537

In [78]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test.iloc[:,1],y_pred[:,1]))

0.1791217384377378

In [79]:
# find r2 score
r2_score(y_test.iloc[:,1],y_pred[:,1])

-1.4160763701487715

# ================================================================

### Multiple Linear Regression

In [80]:
# create a linear regressor
lr = LinearRegression()

In [81]:
# fit the lr on train data
lr = lr.fit(X_train, y_train)

In [82]:
# find the predictions on test data
y_pred = lr.predict(X_test)

In [83]:
# normalize the predictions
y_pred[:,0] = (y_pred[:,0] - y_pred[:,0].min())/(y_pred[:,0].max() - y_pred[:,0].min())

In [84]:
# normalize the predictions
y_pred[:,1] = (y_pred[:,1] - y_pred[:,1].min())/(y_pred[:,1].max() - y_pred[:,1].min())

In [85]:
# find median absolute error
median_absolute_error(y_test.iloc[:,0],y_pred[:,0])

0.5015375144710312

In [86]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test.iloc[:,0],y_pred[:,0]))

0.4073832306770018

In [87]:
# find r2 score
r2_score(y_test.iloc[:,0],y_pred[:,0])

-156.95424008399553

####  the second prediction

In [88]:
# find median absolute error
median_absolute_error(y_test.iloc[:,1],y_pred[:,1])

0.16368554371553756

In [89]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test.iloc[:,1],y_pred[:,1]))

0.14938090265660803

In [90]:
# find r2 score
r2_score(y_test.iloc[:,1],y_pred[:,1])

-0.8211047468624715

R2  is negative only when the chosen model does not follow the trend of the data, so fits worse than a horizontal line.

# ================================================================

### Random Forest

In [91]:
# create a random forest regressor
rf = RandomForestRegressor()

In [92]:
# find optimal parameters with grid search
param_grid = {
'bootstrap': [True, False],
 'max_depth': [None,10, 20, 50, 100],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100,200,500, 1000]
             }
grid = GridSearchCV(estimator=rf, param_grid=param_grid,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 21.3min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 34.7min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 50.6min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 73.4min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 101.1min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 116.7min finished


In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.00704013646129713
Best Params:  {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 500}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
# normalize the predictions
y_pred[:,0] = (y_pred[:,0] - y_pred[:,0].min())/(y_pred[:,0].max() - y_pred[:,0].min())

In [None]:
# normalize the predictions
y_pred[:,1] = (y_pred[:,1] - y_pred[:,1].min())/(y_pred[:,1].max() - y_pred[:,1].min())

In [None]:
# find median absolute error
median_absolute_error(y_test.iloc[:,0],y_pred[:,0])

0.13486025796631473

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test.iloc[:,0],y_pred[:,0]))

0.18098893002166355

In [None]:
# find r2 score
r2_score(y_test.iloc[:,0],y_pred[:,0])

-25.890108945315262

#### the second prediction

In [None]:
# find median absolute error
median_absolute_error(y_test.iloc[:,1],y_pred[:,1])

0.13719308429587834

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test.iloc[:,1],y_pred[:,1]))

0.1461849494815439

In [None]:
# find r2 score
r2_score(y_test.iloc[:,1],y_pred[:,1])

-0.6840530272584708

# ================================================================

### SGRegression

In [93]:
from sklearn.multioutput import MultiOutputRegressor

In [94]:
# create a linear regressor
lr = MultiOutputRegressor(SGDRegressor())

In [95]:
# find optimal parameters with grid search
param_grid = {
    
    'estimator__loss' : ['squared_loss','huber'],
    'estimator__alpha': [0.0001,0.001,0.01,0.1],
     'estimator__eta0': [0.01,0.1,1,10,100],
    'estimator__tol' : [0.00001,0.0001,0.001,0.01,0.1]    
 }
grid = GridSearchCV(estimator=lr, param_grid=param_grid,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [96]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [97]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.0499613785813279
Best Params:  {'estimator__alpha': 0.01, 'estimator__eta0': 1, 'estimator__loss': 'huber', 'estimator__tol': 1e-05}


In [98]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [99]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [100]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.11297241870299786

In [101]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.12222366848096275

In [102]:
# find r2 score
r2_score(y_test,y_pred)

-0.8425463518399292

# ================================================================

### Decision Tree Regressor

In [None]:
dt = DecisionTreeRegressor()

In [None]:
# find optimal parameters with grid search
param_grid = {
    
    'criterion' : ['mse','mae'],
    'max_depth': range(10,20),
     'min_samples_split': range(2,10),
    'min_samples_leaf' : range(1,5)    
 }
grid = GridSearchCV(estimator=dt, param_grid=param_grid, cv=10,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 10 folds for each of 640 candidates, totalling 6400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 202 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 1402 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 3262 tasks      | elapsed:   58.4s
[Parallel(n_jobs=-1)]: Done 3612 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 4062 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 4612 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 5262 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 6012 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 6400 out of 6400 | elapsed: 19.7min finished


In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.00773425390770469
Best Params:  {'criterion': 'mse', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 9}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.051486780984233205

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.08403082471073176

In [None]:
# find r2 score
r2_score(y_test,y_pred)

-0.091339400115092

# ================================================================

### XGBoost Regressor

In [None]:
from sklearn.multioutput import MultiOutputRegressor

In [None]:
xgbtree = MultiOutputRegressor(XGBRegressor())

In [None]:
# find optimal parameters with grid search
param_grid = {
              'estimator__learning_rate': [0.001,0.01,0.1,1],
              'estimator__max_depth': range(2,10),
              'estimator__subsample':[0.01,0.1,1] ,
              'estimator__colsample_bytree': [0.01,0.1,1],
              'estimator__n_estimators': [100,200,500,1000]}
grid = GridSearchCV(estimator=xgbtree, param_grid=param_grid, cv=5,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 18.8min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 28.8min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 38.1min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 50.5min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 64.6min
[Parallel(n_jobs=-1)]: Done 4996 tasks      | elapsed: 82.9min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 99.7min finished




In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.05736183594793019
Best Params:  {'estimator__colsample_bytree': 0.01, 'estimator__learning_rate': 0.01, 'estimator__max_depth': 8, 'estimator__n_estimators': 1000, 'estimator__subsample': 0.01}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.073711019955124

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.0992570475717765

In [None]:
# find r2 score
r2_score(y_test,y_pred)

-0.3250485495553004

# ================================================================

### AdaBoost Regressor

In [None]:
abreg = MultiOutputRegressor(AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0))

In [None]:
# find optimal parameters with grid search
param_grid = { 'estimator__loss' : ['linear', 'square', 'exponential'],
              'estimator__learning_rate': [0.001,0.01,0.1,1],
              'estimator__base_estimator__max_depth': range(2,10),
              'estimator__n_estimators': [100,200,500,1000]}
grid = GridSearchCV(estimator=abreg, param_grid=param_grid, cv=5,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 28.4min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 48.8min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 80.0min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 84.8min finished


In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.05696000343719722
Best Params:  {'estimator__base_estimator__max_depth': 9, 'estimator__learning_rate': 0.001, 'estimator__loss': 'exponential', 'estimator__n_estimators': 100}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.049232930801555316

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.08178335355032659

In [None]:
# find r2 score
r2_score(y_test,y_pred)

-0.053764952172182356

# ================================================================

### MLP Regressor

In [104]:
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

In [105]:
import datetime

In [106]:
# Creating a Neural Network Model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam

In [107]:
model = Sequential()
model.add(Dense(1024,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1024,activation='relu'))
#model.add(Dropout(0.2))
model.add(Dense(128,activation='relu'))
#model.add(Dropout(0.2))
model.add(Dense(128,activation='relu'))
#model.add(Dropout(0.2))
model.add(Dense(128,activation='relu'))
model.add(Dense(2))

In [108]:
import tensorflow.keras.backend as K

In [109]:

def rmsle(y_true, y_pred):
    msle = tf.keras.losses.MeanSquaredLogarithmicError()
    return K.sqrt(msle(y_true, y_pred)) 

In [110]:
model.compile(optimizer='Adam',loss=rmsle,metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [111]:
model.fit(x=X_train,y=y_train,
          validation_data=(X_test,y_test),
          batch_size=64,epochs=100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100


Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x1436bec7c10>

In [112]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              3072      
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dense_2 (Dense)              (None, 128)               131200    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 2

In [113]:
K.clear_session()

In [114]:
y_pred = model.predict(X_test)

In [115]:
y_pred.shape

(1321, 2)

In [116]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.05627708154857054

In [117]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.07571663797547266

In [118]:
# find r2 score
r2_score(y_test,y_pred)

0.08660359814715346

# ================================================================

# =============================================================
# =============================================================
## ============================= Rivers =====================================
# =============================================================
# =============================================================

In [119]:
rivers.head()

Unnamed: 0,Date,Mean_Rainfall,Mean_Temp,Actual_Hydrometry
0,0,2.789049,16.605386,1.602951
1,1,2.789049,16.605386,1.602951
2,2,2.789049,16.605386,1.602951
3,3,2.789049,16.605386,1.602951
4,4,2.789049,16.605386,1.602951


In [120]:
# Normalizing the columns to remove negative values
rivers['Mean_Rainfall'] = (rivers['Mean_Rainfall'] - rivers['Mean_Rainfall'].min()) / (rivers['Mean_Rainfall'].max() - rivers['Mean_Rainfall'].min())
rivers['Mean_Temp'] = (rivers['Mean_Temp'] - rivers['Mean_Temp'].min()) / (rivers['Mean_Temp'].max() - rivers['Mean_Temp'].min())
rivers['Actual_Hydrometry'] = (rivers['Actual_Hydrometry'] - rivers['Actual_Hydrometry'].min()) / (rivers['Actual_Hydrometry'].max() - rivers['Actual_Hydrometry'].min())

In [121]:
rivers.head()

Unnamed: 0,Date,Mean_Rainfall,Mean_Temp,Actual_Hydrometry
0,0,0.060726,0.513441,0.260642
1,1,0.060726,0.513441,0.260642
2,2,0.060726,0.513441,0.260642
3,3,0.060726,0.513441,0.260642
4,4,0.060726,0.513441,0.260642


In [122]:
# separating the date and target feature
X = rivers.drop(['Date','Actual_Hydrometry'],axis=1)
y = rivers['Actual_Hydrometry']

In [123]:
#Dividing the dataset into train and test for features as well as labels
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [124]:
#checking the number of rows in training data
len(X_train)

6573

In [125]:
#checking the number of rows in training labels
len(y_train)

6573

In [126]:
#checking the number of rows in testing data
len(X_test)

1644

In [127]:
#checking the number of rows in testing labels
len(y_test)

1644

# ===============================================================

### knn

In [128]:
# create a knn regressor
neigh = KNeighborsRegressor()

In [129]:
# find optimal parameters with grid search
param_grid = {
    'n_neighbors': [3,5,7,9,11,19],
    'weights': ['uniform', 'distance'],
    'metric' : ['euclidean','manhattan']
}
grid = GridSearchCV(estimator=neigh, param_grid=param_grid,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [130]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [131]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.0044697518815531015
Best Params:  {'metric': 'euclidean', 'n_neighbors': 19, 'weights': 'uniform'}


In [132]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [133]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [134]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.19775886224766392

In [135]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.1810277615039372

In [136]:
# find r2 score
r2_score(y_test,y_pred)

-6.605017025038958

# ===============================================================

### Multiple Linear Regression

In [137]:
# create a linear regressor
lr = LinearRegression()

In [138]:
# fit the lr on the train data
lr = lr.fit(X_train, y_train)

In [139]:
# find the predictions
y_pred = lr.predict(X_test)

In [140]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [141]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.2475937752746636

In [142]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.2236055865817873

In [143]:
# find r2 score
r2_score(y_test,y_pred)

-11.111931767657907

# ===============================================================

### Random Forest

In [144]:
# create a random forest regressor
rf = RandomForestRegressor()

In [145]:
# find optimal parameters with grid search
param_grid = {
'bootstrap': [True, False],
 'max_depth': [None,10, 20, 50, 100],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100,200,500, 1000]
             }
grid = GridSearchCV(estimator=rf, param_grid=param_grid,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 28.7min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 47.0min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 67.6min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 99.5min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 140.0min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 163.1min finished


In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.004418392261406096
Best Params:  {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.2872661248004256

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.2101381173298768

In [None]:
# find r2 score
r2_score(y_test,y_pred)

-9.331726982797875

# ===============================================================

### SGRegression

In [146]:
# create a linear regressor
lr = SGDRegressor()

In [147]:
# find optimal parameters with grid search
param_grid = {
    
    'loss' : ['squared_loss','huber'],
    'alpha': [0.0001,0.001,0.01,0.1],
     'eta0': [0.01,0.1,1,10,100],
    'tol' : [0.00001,0.0001,0.001,0.01,0.1]    
 }
grid = GridSearchCV(estimator=lr, param_grid=param_grid,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [148]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [149]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.03492612920285247
Best Params:  {'alpha': 0.001, 'eta0': 0.1, 'loss': 'huber', 'tol': 0.1}


In [150]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [151]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [152]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.2486989779219191

In [153]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.2241543323981015

In [154]:
# find r2 score
r2_score(y_test,y_pred)

-11.1807894557034

# ===============================================================

### Decision Tree Regressor

In [155]:
dt = DecisionTreeRegressor()

In [156]:
# find optimal parameters with grid search
param_grid = {
    
    'criterion' : ['mse','mae'],
    'max_depth': range(10,20),
     'min_samples_split': range(2,10),
    'min_samples_leaf' : range(1,5)    
 }
grid = GridSearchCV(estimator=dt, param_grid=param_grid, cv=10,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [157]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 10 folds for each of 640 candidates, totalling 6400 fits


In [158]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.00484721745060179
Best Params:  {'criterion': 'mae', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 7}


In [159]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [160]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [161]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.2239422598307616

In [162]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.18900328356148613

In [163]:
# find r2 score
r2_score(y_test,y_pred)

-6.969489312348588

# ===============================================================

### XGBoost Regressor

In [164]:
xgbtree = XGBRegressor()

In [165]:
# find optimal parameters with grid search
param_grid = {
              'learning_rate': [0.001,0.01,0.1,1],
              'max_depth': range(2,10),
              'subsample':[0.01,0.1,1] ,
              'colsample_bytree': [0.01,0.1,1],
              'n_estimators': [100,200,500,1000]}
grid = GridSearchCV(estimator=xgbtree, param_grid=param_grid, cv=5,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 19.3min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 26.0min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 34.2min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 44.0min
[Parallel(n_jobs=-1)]: Done 4996 tasks      | elapsed: 56.7min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 67.7min finished




In [166]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.00484721745060179
Best Params:  {'criterion': 'mae', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 7}


In [167]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [168]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [169]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.2239422598307616

In [170]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.18900328356148613

In [171]:
# find r2 score
r2_score(y_test,y_pred)

-6.969489312348588

# ===============================================================

### AdaBoost Regressor

In [172]:
abreg = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)

In [None]:
# find optimal parameters with grid search
param_grid = { 'loss' : ['linear', 'square', 'exponential'],
              'learning_rate': [0.001,0.01,0.1,1],
              'base_estimator__max_depth': range(2,10),
              'n_estimators': [100,200,500,1000]}
grid = GridSearchCV(estimator=abreg, param_grid=param_grid, cv=5,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 35.2min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 58.1min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 61.6min finished


In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.0347410677510596
Best Params:  {'base_estimator__max_depth': 9, 'learning_rate': 0.001, 'loss': 'linear', 'n_estimators': 100}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.19729867696620906

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.1536971298773078

In [None]:
# find r2 score
r2_score(y_test,y_pred)

-4.110286046096933

# ===============================================================

### MLP Regressor

In [173]:
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

In [174]:
import datetime

In [175]:
# Creating a Neural Network Model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam

In [176]:
model = Sequential()
model.add(Dense(1200,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1200,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))

In [177]:
import tensorflow.keras.backend as K

In [178]:

def rmsle(y_true, y_pred):
    msle = tf.keras.losses.MeanSquaredLogarithmicError()
    return K.sqrt(msle(y_true, y_pred)) 

In [179]:
model.compile(optimizer='Adam',loss=rmsle,metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [180]:
model.fit(x=X_train,y=y_train,
          validation_data=(X_test,y_test),
          batch_size=64,epochs=100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x1436f853d60>

In [181]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1200)              3600      
_________________________________________________________________
dropout (Dropout)            (None, 1200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1200)              1441200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1200)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               153728    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               1

In [182]:
K.clear_session()

In [183]:
y_pred = model.predict(X_test)

In [184]:
y_pred.shape

(1644, 1)

In [185]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.034375390010636026

In [186]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.06645822429299328

In [187]:
# find r2 score
r2_score(y_test,y_pred)

0.10452870396800462

# ===============================================================

# =============================================================
# =============================================================
## ============================= Springs =====================================
# =============================================================
# =============================================================

In [188]:
springs.head()

Unnamed: 0,Mean_Rainfall,Mean_Temp,Actual_Depth,Actual_Flow_Rate,Date
0,4.125714,-0.084921,-131.779048,-7.393259,2000-01-01
1,4.737143,6.701587,-132.510476,-7.393882,2000-01-02
2,6.824762,5.405556,-130.98619,-7.597644,2000-01-03
3,5.326667,4.813492,-129.141111,-6.168792,2000-01-04
4,4.177143,14.011905,-130.089524,-7.34417,2000-01-05


In [189]:
# Normalizing the columns to remove negative values
springs['Mean_Rainfall'] = (springs['Mean_Rainfall'] - springs['Mean_Rainfall'].min()) / (springs['Mean_Rainfall'].max() - springs['Mean_Rainfall'].min())
springs['Mean_Temp'] = (springs['Mean_Temp'] - springs['Mean_Temp'].min()) / (springs['Mean_Temp'].max() - springs['Mean_Temp'].min())
springs['Actual_Depth'] = (springs['Actual_Depth'] - springs['Actual_Depth'].min()) / (springs['Actual_Depth'].max() - springs['Actual_Depth'].min())
springs['Actual_Flow_Rate'] = (springs['Actual_Flow_Rate'] - springs['Actual_Flow_Rate'].min()) / (springs['Actual_Flow_Rate'].max() - springs['Actual_Flow_Rate'].min())

In [190]:
springs.head()

Unnamed: 0,Mean_Rainfall,Mean_Temp,Actual_Depth,Actual_Flow_Rate,Date
0,0.076309,0.169289,0.416992,0.530379,2000-01-01
1,0.087618,0.357043,0.346262,0.530375,2000-01-02
2,0.12623,0.321187,0.493661,0.529121,2000-01-03
3,0.098522,0.304807,0.67208,0.537916,2000-01-04
4,0.07726,0.559289,0.580368,0.530681,2000-01-05


In [191]:
# separating the date and target feature
X = springs.drop(['Date','Actual_Flow_Rate'],axis=1)
y = springs['Actual_Flow_Rate']

In [192]:
#Dividing the dataset into train and test for features as well as labels
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [193]:
#checking the number of rows in training data
len(X_train)

5989

In [194]:
#checking the number of rows in training labels
len(y_train)

5989

In [195]:
#checking the number of rows in testing data
len(X_test)

1498

In [196]:
#checking the number of rows in testing labels
len(y_test)

1498

# =============================================================

### knn

In [197]:
# create a knn regressor
neigh = KNeighborsRegressor()

In [198]:
# find optimal parameters with grid search
param_grid = {
    'n_neighbors': [3,5,7,9,11,19],
    'weights': ['uniform', 'distance'],
    'metric' : ['euclidean','manhattan']
}
grid = GridSearchCV(estimator=neigh, param_grid=param_grid,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [199]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [200]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.021191998046753598
Best Params:  {'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'distance'}


In [202]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [203]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [204]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.16653555878093784

In [205]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.16545173015424464

In [206]:
# find r2 score
r2_score(y_test,y_pred)

-0.1472021736660043

# =============================================================

### Multiple Linear Regression

In [207]:
# create a linear regressor
lr = LinearRegression()

In [208]:
# fit the lr on train data
lr = lr.fit(X_train, y_train)

In [209]:
# find the predictions
y_pred = lr.predict(X_test)

In [210]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [211]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.3106472486430929

In [212]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.2848226752175971

In [213]:
# find r2 score
r2_score(y_test,y_pred)

-2.1411975666849132

R2  is negative only when the chosen model does not follow the trend of the data, so fits worse than a horizontal line.

# =============================================================

### Random Forest

In [None]:
# create a random forest regressor
rf = RandomForestRegressor()

In [None]:
# find optimal parameters with grid search
param_grid = {
'bootstrap': [True, False],
 'max_depth': [None,10, 20, 50, 100],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100,200,500, 1000]
             }
grid = GridSearchCV(estimator=rf, param_grid=param_grid,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 25.3min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 41.6min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 68.6min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 97.1min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 147.2min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 209.0min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 242.5min finished


In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.016208164634010154
Best Params:  {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.06339626104495508

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.12466759306708458

In [None]:
# find r2 score
r2_score(y_test,y_pred)

0.3552887850735922

# =============================================================

### SGRegression

In [214]:
# create a linear regressor
lr = SGDRegressor()

In [215]:
# find optimal parameters with grid search
param_grid = {
    
    'loss' : ['squared_loss','huber'],
    'alpha': [0.0001,0.001,0.01,0.1],
     'eta0': [0.01,0.1,1,10,100],
    'tol' : [0.00001,0.0001,0.001,0.01,0.1]    
 }
grid = GridSearchCV(estimator=lr, param_grid=param_grid,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [216]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [217]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.19886521836378318
Best Params:  {'alpha': 0.0001, 'eta0': 10, 'loss': 'squared_loss', 'tol': 0.1}


In [218]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [219]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [220]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.1578245789394696

In [221]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.2169134895048725

In [222]:
# find r2 score
r2_score(y_test,y_pred)

-0.8937296652360205

# =============================================================

### Decision Tree Regressor

In [223]:
dt = DecisionTreeRegressor()

In [224]:
# find optimal parameters with grid search
param_grid = {
    
    'criterion' : ['mse','mae'],
    'max_depth': range(10,20),
     'min_samples_split': range(2,10),
    'min_samples_leaf' : range(1,5)    
 }
grid = GridSearchCV(estimator=dt, param_grid=param_grid, cv=10,
                    scoring=['neg_median_absolute_error','neg_mean_squared_log_error','r2'], refit = 'neg_mean_squared_log_error',
                    verbose=1, n_jobs=-1)

In [225]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 10 folds for each of 640 candidates, totalling 6400 fits


In [232]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.021627997495911627
Best Params:  {'criterion': 'mse', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 9}


In [233]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [234]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [235]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.08793477057526328

In [236]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.13827971812096929

In [237]:
# find r2 score
r2_score(y_test,y_pred)

0.21134319535530133

# =============================================================

### XGBoost Regressor

In [238]:
xgbtree = XGBRegressor()

In [239]:
# find optimal parameters with grid search
param_grid = {
              'learning_rate': [0.001,0.01,0.1,1],
              'max_depth': range(2,10),
              'subsample':[0.01,0.1,1] ,
              'colsample_bytree': [0.01,0.1,1],
              'n_estimators': [100,200,500,1000]}
grid = GridSearchCV(estimator=xgbtree, param_grid=param_grid, cv=5,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 27.1min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 35.2min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 44.9min
[Parallel(n_jobs=-1)]: Done 4996 tasks      | elapsed: 60.7min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 73.3min finished




In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.07022984297011255
Best Params:  {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 1000, 'subsample': 1}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.07750636324936641

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.12503603225652735

In [None]:
# find r2 score
r2_score(y_test,y_pred)

0.3485884002530737

# =============================================================

### AdaBoost Regressor

In [None]:
abreg = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(),
                              random_state=0)

In [None]:
# find optimal parameters with grid search
param_grid = { 'loss' : ['linear', 'square', 'exponential'],
              'learning_rate': [0.001,0.01,0.1,1],
              'base_estimator__max_depth': range(2,10),
              'n_estimators': [100,200,500,1000]}
grid = GridSearchCV(estimator=abreg, param_grid=param_grid, cv=5,
                    scoring=['neg_median_absolute_error','r2'], refit = 'neg_median_absolute_error',
                    verbose=1, n_jobs=-1)

In [None]:
# fit the grid on the train data
grid_result = grid.fit(X_train, y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 25.5min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 48.1min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 86.3min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 95.1min finished


In [None]:
# print the best score and parameters found
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  -0.10607506721892213
Best Params:  {'base_estimator__max_depth': 9, 'learning_rate': 0.001, 'loss': 'linear', 'n_estimators': 100}


In [None]:
# find predictions for test data
y_pred = grid_result.best_estimator_.predict(X_test)

In [None]:
#normalizing y_pred
y_pred = (y_pred - y_pred.min())/(y_pred.max() - y_pred.min())

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.09586101757730897

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.13071360759079187

In [None]:
# find r2 score
r2_score(y_test,y_pred)

0.30911404242888374

# =============================================================

### MLP Regressor

In [240]:
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

In [241]:
# Creating a Neural Network Model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.optimizers import Adam

In [242]:
model = Sequential()
model.add(Dense(512,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(512,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1))

In [243]:
import tensorflow.keras.backend as K

In [244]:

def rmsle(y_true, y_pred):
    msle = tf.keras.losses.MeanSquaredLogarithmicError()
    return K.sqrt(msle(y_true, y_pred)) 

In [245]:
model.compile(optimizer='Adam',loss=rmsle,metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [246]:
model.fit(x=X_train,y=y_train,
          validation_data=(X_test,y_test),
          batch_size=64,epochs=100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x143720f5cd0>

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               2048      
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               1

In [None]:
K.clear_session()

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred.shape

(1498, 1)

In [None]:
# find median absolute error
median_absolute_error(y_test,y_pred)

0.1378374286099205

In [None]:
# find root mean square log error
np.sqrt(mean_squared_log_error(y_test,y_pred))

0.14250678100449088

In [None]:
# find r2 score
r2_score(y_test,y_pred)

0.14410067815385863

# =============================================================

# Summary

In [247]:
from prettytable import PrettyTable

In [248]:
# Table for summarizing water body type and their R2 score with different models
finalTable = PrettyTable(["Model","Aquifers", "Lakes", "Rivers", "Springs"])
  
# Add rows
finalTable.add_row(["knn", "0.96", "-17.851,-1.254", "-5.839", "-0.147"])
finalTable.add_row(["Linear Regression", "-0.361", "-156.95,-0.821", "-11.11", "-2.141"])
finalTable.add_row(["Random Forest", "0.966", "-25.89,-0.684", "-9.331", "0.355"])
finalTable.add_row(["SGRegression", "-0.171", "-0.781", "-11.21", "-1.11"])
finalTable.add_row(["Decision Tree", "0.956", "-0.091", "-7.012", "0.211"])
finalTable.add_row(["XGBoost", "0.958", "-0.325", "-11.698", "0.348"])
finalTable.add_row(["AdaBoost", "0.964", "-0.053", "-4.11", "0.309"])
finalTable.add_row(["MLP", "0.958","0.076", "0.106", "0.181"])
  
print(finalTable)

+-------------------+----------+----------------+---------+---------+
|       Model       | Aquifers |     Lakes      |  Rivers | Springs |
+-------------------+----------+----------------+---------+---------+
|        knn        |   0.96   | -17.851,-1.254 |  -5.839 |  -0.147 |
| Linear Regression |  -0.361  | -156.95,-0.821 |  -11.11 |  -2.141 |
|   Random Forest   |  0.966   | -25.89,-0.684  |  -9.331 |  0.355  |
|    SGRegression   |  -0.171  |     -0.781     |  -11.21 |  -1.11  |
|   Decision Tree   |  0.956   |     -0.091     |  -7.012 |  0.211  |
|      XGBoost      |  0.958   |     -0.325     | -11.698 |  0.348  |
|      AdaBoost     |  0.964   |     -0.053     |  -4.11  |  0.309  |
|        MLP        |  0.958   |     0.076      |  0.106  |  0.181  |
+-------------------+----------+----------------+---------+---------+
