In [5]:
import numpy as np
import pandas as pd
import sqlite3
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

In [6]:
# Slice that selects everything along an axis/level (what a bare `:` means in an indexer).
All = slice(None)

# Numeric ids kept next to the `trwr` and `efr` names below
# (presumably the data source's variable ids -- TODO confirm).
trwr_id, efr_id = 4188, 4549

# Candidate predictors, referred to by their long variable names.
cult = 'Cultivated area (arable land + permanent crops)'
gdp = 'Gross Domestic Product (GDP)'
pop = 'Total population'
trsw = 'Total renewable surface water'

# Remaining variable names.
tfw = 'Total freshwater withdrawal (primary and secondary)'
trwr = 'Total renewable water resources'
efr = 'Environmental Flow Requirements'
wsi = 'SDG 6.4.2. Water Stress'

# Predictor list, in the order: cultivated area, GDP, population, surface water.
preds = [cult, gdp, pop, trsw]

In [7]:
# Read the training table once. `df_model` keeps the flat column layout used for
# modelling; `data` is a separate frame indexed by the (Area, YearBin) pair.
df_model = pd.read_csv('training_data_arima.csv')
data = df_model.set_index(['Area', 'YearBin'])

In [8]:
# Give the modelling columns short names, clear the index name, and discard
# every row that still has at least one missing value.
df_model = (
    df_model
    .rename(columns={
        'Cultivated area (arable land + permanent crops)': 'cultivated_area',
        'Gross Domestic Product (GDP)': 'GDP',
        'Total freshwater withdrawal (primary and secondary)': 'TFW',
        'Total population': 'total_population',
        'Total renewable surface water': 'TRSW',
    })
    .rename_axis(None)
    .dropna()
)

# Number of complete rows left for modelling
print(len(df_model))

1336


In [9]:
## Random forest regression: predict TFW from the four renamed predictor columns
x_data = df_model[['cultivated_area', 'GDP', 'total_population', 'TRSW']]
y_data = df_model['TFW']

## Split data (75 % training / 25 % test, shuffled with a fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, train_size=0.75, random_state=100, shuffle=True)

## Build a random forest regressor
# `criterion` is deliberately left at its default (mean squared error).  The
# literal 'mse' was renamed to 'squared_error' in scikit-learn 1.0 and is
# rejected from 1.2 onwards, so omitting the argument selects the same split
# criterion on old and new versions alike.
random = 100
RF_clf = RandomForestRegressor(bootstrap=True, random_state=random)
# The grid search below fits its own clones; this fit only makes RF_clf itself
# a usable baseline model.
RF_clf.fit(x_train, y_train)
RF_param_grid = {
    "n_estimators": [10, 20, 30, 50, 100],
    "max_depth": [5, 10, 15, 20, None],
}

## Optimize the randomForest regressor (3-fold CV over the grid above)
RF_grid_search = GridSearchCV(RF_clf, param_grid=RF_param_grid, n_jobs=-1, cv=3)
RF_grid_search.fit(x_train, y_train)
print(RF_grid_search.best_params_)
print(RF_grid_search.best_score_)

## Define optimum randomForest regressor
# best_params_ holds exactly the two tuned keys: n_estimators and max_depth.
opt_RF = RandomForestRegressor(bootstrap=True, random_state=random,
                               **RF_grid_search.best_params_)
opt_RF.fit(x_train, y_train)
RF_tr_pred = opt_RF.predict(x_train)
print("Training MSE: %.2f" %mean_squared_error(y_train, RF_tr_pred))
RF_test_pred = opt_RF.predict(x_test)
print("Test MSE: %.2f" %mean_squared_error(y_test, RF_test_pred))


## Calculate adjusted R-squared
def _adjusted_r2(y_true, y_pred, n_predictors):
    """Return R-squared adjusted for the number of predictors.

    Computes 1 - (1 - R2) * (n - 1) / (n - n_predictors - 1), where n is
    the number of observations in `y_true`.
    """
    n = len(y_true)
    return 1 - (1 - r2_score(y_true, y_pred)) * (n - 1) / (n - n_predictors - 1)


# n1 = number of data points for training set
# n2 = number of data points for test set
# p = number of predictors / attributes (taken from the data, not hard-coded)
n1 = len(y_train)
p = x_train.shape[1]
n2 = len(y_test)
train_adj_r2 = _adjusted_r2(y_train, RF_tr_pred, p)
test_adj_r2 = _adjusted_r2(y_test, RF_test_pred, p)
print("Training adj R2: %.2f" %train_adj_r2)
print("Test adj R2: %.2f" %test_adj_r2)



{'max_depth': 15, 'n_estimators': 30}
0.9385050033887423
Training MSE: 31.90
Test MSE: 284.07
Training adj R2: 0.99
Test adj R2: 0.95


In [11]:
## SVM regressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, normalize

# Standardise the predictors: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler_fit = scaler.fit(x_train)
s_x_train, s_x_test = (scaler_fit.transform(split) for split in (x_train, x_test))

# Baseline SVR with default settings
SVM = SVR()
SVM.fit(s_x_train, y_train)

# Hyperparameter tuning: 10-fold CV grid search over C, kernel and epsilon
SVM_grid = {
    "C": [0.01, 0.1, 1.0, 10, 100.0],
    "kernel": ['linear', 'rbf', 'poly', 'sigmoid'],
    "epsilon": [0.01, 0.05, 0.1, 0.15],
}
SVM_search = GridSearchCV(SVM, param_grid=SVM_grid, n_jobs=-1, cv=10)
SVM_search.fit(s_x_train, y_train)
print(SVM_search.best_score_)
print(SVM_search.best_params_)

0.8470083971930287
{'C': 1.0, 'epsilon': 0.15, 'kernel': 'linear'}




In [14]:
# Rebuild an SVR from the grid-search winner; best_params_ carries exactly the
# three tuned settings (C, kernel, epsilon).
opt_SVM = SVR(**SVM_search.best_params_)
opt_SVM.fit(s_x_train, y_train)

# Predict on the scaled training and test splits
opt_SVM_y_tr_pred = opt_SVM.predict(s_x_train)
opt_SVM_y_test_pred = opt_SVM.predict(s_x_test)

# Report the mean squared error on each split
print("Training MSE: %.2f" % mean_squared_error(y_train, opt_SVM_y_tr_pred))
print("Test MSE: %.2f" % mean_squared_error(y_test, opt_SVM_y_test_pred))

Training MSE: 499.69
Test MSE: 772.72


In [20]:
# ANN
from sklearn.neural_network import MLPRegressor

# Baseline multilayer-perceptron regressor with a fixed seed
ANN = MLPRegressor(random_state=100)
ANN.fit(s_x_train, y_train)

# Hyperparameter tuning
# one hidden layer: 10-fold CV grid search on the scaled training split
ANN_grid_1 = {
    "activation": ['logistic', 'tanh', 'relu'],
    "solver": ['lbfgs', 'sgd', 'adam'],
    "alpha": [0.0001, 10 ** -5, 10 ** -3],
    "learning_rate": ['constant', 'invscaling', 'adaptive'],
    "hidden_layer_sizes": [(5,), (3,), (7,), (10,)],
}
ANN_search = GridSearchCV(ANN, param_grid=ANN_grid_1, n_jobs=-1, cv=10)
ANN_search.fit(s_x_train, y_train)
print(ANN_search.best_score_)
print(ANN_search.best_params_)





0.8914547968703199
{'activation': 'relu', 'alpha': 1e-05, 'hidden_layer_sizes': (5,), 'learning_rate': 'constant', 'solver': 'lbfgs'}




In [22]:
# Hyperparameter tuning
# two hidden layers: same search settings as the one-layer case, applied to
# the existing `ANN` estimator with two-layer architectures
ANN_grid_2 = {
    "activation": ['logistic', 'tanh', 'relu'],
    "solver": ['lbfgs', 'sgd', 'adam'],
    "alpha": [0.0001, 10 ** -5, 10 ** -3],
    "learning_rate": ['constant', 'invscaling', 'adaptive'],
    "hidden_layer_sizes": [(5, 3), (5, 2), (5, 1), (7, 1), (3, 1)],
}
ANN_search2 = GridSearchCV(ANN, param_grid=ANN_grid_2, n_jobs=-1, cv=10)
ANN_search2.fit(s_x_train, y_train)
print(ANN_search2.best_score_)
print(ANN_search2.best_params_)

0.8717033008608985
{'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (5, 3), 'learning_rate': 'constant', 'solver': 'lbfgs'}




In [18]:
# Hyperparameter tuning
# three hidden layers: 10-fold CV grid search on the scaled training split
ANN_grid_3 = {
    "activation": ['logistic', 'tanh', 'relu'],
    "solver": ['lbfgs', 'sgd', 'adam'],
    "alpha": [0.0001, 10**(-5), 10**(-3)],
    "learning_rate": ['constant', 'invscaling', 'adaptive'],
    "hidden_layer_sizes": [(5, 3, 2), (5, 3, 3), (5, 3, 1), (5, 2, 1), (5, 2, 2), (5, 1, 1)],
}
# GridSearchCV has no `random_state` parameter -- passing one raises TypeError
# before any fitting happens.  The seed belongs to the estimator, and `ANN`
# was already created with random_state=100.
ANN_search3 = GridSearchCV(ANN, param_grid=ANN_grid_3, n_jobs=-1, cv=10)
ANN_search3.fit(s_x_train, y_train)
print(ANN_search3.best_score_)
print(ANN_search3.best_params_)



0.8691051784450886
{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (5, 2, 1), 'learning_rate': 'invscaling', 'solver': 'sgd'}


In [23]:
# Build the final ANN from the one-hidden-layer search (`ANN_search`).
# best_params_ carries exactly the five tuned settings: activation, alpha,
# hidden_layer_sizes, learning_rate and solver.
opt_ANN = MLPRegressor(random_state=100, **ANN_search.best_params_)
opt_ANN.fit(s_x_train, y_train)

# Predict on the scaled training and test splits
opt_ANN_y_tr_pred = opt_ANN.predict(s_x_train)
opt_ANN_y_test_pred = opt_ANN.predict(s_x_test)

# Report the mean squared error on each split
print("Training MSE: %.2f" % mean_squared_error(y_train, opt_ANN_y_tr_pred))
print("Test MSE: %.2f" % mean_squared_error(y_test, opt_ANN_y_test_pred))

Training MSE: 246.92
Test MSE: 236.64
