In [1]:
import numpy as np
import pandas as pd
import sqlite3
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

In [81]:
# Open the project SQLite database.
# NOTE(review): the path is absolute and machine-specific, and the connection
# is left open after the read — confirm nothing else needs it before closing.
conn = sqlite3.connect(
    r'C:\Users\Spencer\Environments\aquastaat\project.db'
)
# TEXT values arrive as raw bytes; decode them as Latin-1 rather than the
# default UTF-8.
conn.text_factory = lambda raw: str(raw, 'latin1')

# Pull the whole `parameter` table into a DataFrame.
water = pd.read_sql(sql="SELECT * FROM parameter", con=conn)

In [89]:
# Full slice: selects every label on one MultiIndex level inside a .loc tuple.
All = slice(None, None)

# --- Column labels of the predictor variables --------------------------------
cult = "Cultivated area (arable land + permanent crops)"
gdp = "Gross Domestic Product (GDP)"
pop = "Total population"
trsw = "Total renewable surface water"

# --- Column labels of the target and the derived / looked-up quantities ------
tfw = "Total freshwater withdrawal (primary and secondary)"
trwr = "Total renewable water resources"
efr = "Environmental Flow Requirements"
wsi = "SDG 6.4.2. Water Stress"

# Predictor columns, in the order they are handed to the model.
preds = [cult, gdp, pop, trsw]

# VariableId values used to look up `trwr` and `efr` in the `water` table.
trwr_id, efr_id = 4188, 4549

In [3]:
# Flat copy of the CSV, used for model fitting.
df_model = pd.read_csv('training_data_arima.csv')
# Independent frame keyed by (Area, YearBin), used for forecasting and the
# water-stress calculation; set_index returns a new object, so later edits to
# df_model do not touch it.
data = df_model.set_index(['Area', 'YearBin'])

In [4]:
# Short, code-friendly aliases for the columns the model reads.
model_column_names = {
    'Cultivated area (arable land + permanent crops)': 'cultivated_area',
    'Gross Domestic Product (GDP)': 'GDP',
    'Total freshwater withdrawal (primary and secondary)': 'TFW',
    'Total population': 'total_population',
    'Total renewable surface water': 'TRSW',
}
df_model = df_model.rename(columns=model_column_names).rename_axis(None)

# Remove rows that contain NaN in a modelling column. Restricting the check to
# those columns keeps otherwise valid training rows whose only gap is in a
# column the model never reads (e.g. the water-stress column).
df_model = df_model.dropna(subset=list(model_column_names.values()))
print(len(df_model.index))

1336


In [5]:
## Select predictors and target
feature_cols = ['cultivated_area', 'GDP', 'total_population', 'TRSW']
x_data = df_model[feature_cols]
y_data = df_model['TFW']

## Split data
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, train_size=0.75, random_state=100, shuffle=True)

## Build a random forest regressor
# The split criterion is left at its default (mean squared error). Passing
# criterion='mse' explicitly fails on scikit-learn >= 1.2, where the value was
# renamed to 'squared_error'; omitting it behaves the same on every version.
random = 100
RF_clf = RandomForestRegressor(bootstrap=True, random_state=random)
# Baseline fit with default hyper-parameters. GridSearchCV clones the
# estimator, so the search below does not depend on this call.
RF_clf.fit(x_train, y_train)
RF_param_grid = {"n_estimators": [10, 20, 30, 50, 100], "max_depth": [5, 10, 15, 20, None]}

## Optimize the randomForest regressor
RF_grid_search = GridSearchCV(RF_clf, param_grid=RF_param_grid, n_jobs=-1, cv=3)
RF_grid_search.fit(x_train, y_train)
print(RF_grid_search.best_params_)
print(RF_grid_search.best_score_)

## Define optimum randomForest regressor
# best_params_ holds exactly the keys of RF_param_grid (n_estimators, max_depth).
opt_RF = RandomForestRegressor(bootstrap=True, random_state=random, **RF_grid_search.best_params_)
opt_RF.fit(x_train, y_train)
RF_tr_pred = opt_RF.predict(x_train)
print("Training MSE: %.2f" %mean_squared_error(y_train, RF_tr_pred))
RF_test_pred = opt_RF.predict(x_test)
print("Test MSE: %.2f" %mean_squared_error(y_test, RF_test_pred))

## Calculate adjusted R-squared
# n1 = number of data points for training set
# n2 = number of data points for test set
# p = number of predictors / attributes (taken from the data, not hard-coded)
n1 = len(y_train)
n2 = len(y_test)
p = x_train.shape[1]
train_adj_r2 = 1-(1-r2_score(y_train, RF_tr_pred))*(n1-1)/(n1-p-1)
test_adj_r2 = 1-(1-r2_score(y_test, RF_test_pred))*(n2-1)/(n2-p-1)
print("Training adj R2: %.2f" %train_adj_r2) 
print("Test adj R2: %.2f" %test_adj_r2) 



{'max_depth': 15, 'n_estimators': 30}
0.9385050033887423
Training MSE: 31.90
Test MSE: 284.07
Training adj R2: 0.99
Test adj R2: 0.95


In [6]:
# Rows to forecast: every area, but only the 2020 / 2025 / 2030 year bins.
future_rows = (All, [2020, 2025, 2030])

# Predict withdrawal from the predictor columns (as a plain array) and write
# the result back into the withdrawal column of the same rows.
future_features = data.loc[future_rows, preds].values
tfw_preds = opt_RF.predict(future_features)
data.loc[future_rows, tfw] = tfw_preds

In [59]:
# Display `data` to inspect the withdrawal values written into the forecast rows.
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Total population,Cultivated area (arable land + permanent crops),Gross Domestic Product (GDP),Total renewable surface water,SDG 6.4.2. Water Stress,Total freshwater withdrawal (primary and secondary),Total renewable water resources,Environmental Flow Requirements
Area,YearBin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,1980,12750.000000,8054.000000,3.480000e+09,55.68,70.490000,26.110000,,
Afghanistan,1985,11503.000000,8045.000000,3.642500e+09,55.68,70.490000,26.110000,,
Afghanistan,1990,13981.000000,8030.000000,3.805000e+09,55.68,65.243333,24.166667,,
Afghanistan,1995,18382.000000,7790.000000,3.967500e+09,55.68,59.996667,22.223333,,
Afghanistan,2000,21980.000000,7753.000000,4.130000e+09,55.68,54.750000,20.280000,,
Afghanistan,2005,26617.000000,7910.000000,9.840000e+09,55.68,54.750000,20.280000,,
Afghanistan,2010,30697.000000,7910.000000,2.050000e+10,55.68,54.750000,20.280000,,
Afghanistan,2015,35530.000000,7910.000000,1.920000e+10,55.68,54.750000,20.280000,,
Afghanistan,2020,40022.760330,7920.311940,2.151193e+10,55.68,,10.788403,,
Afghanistan,2025,45083.629170,7928.710700,2.410225e+10,55.68,,15.042920,,


In [82]:
# Key the table by (Area, VariableId). The guard makes the cell safe to re-run:
# once the columns have been moved into the index they are no longer in
# `water.columns`, and an unconditional set_index would raise KeyError.
if {'Area', 'VariableId'}.issubset(water.columns):
    water = water.set_index(['Area', 'VariableId'])
water = water.sort_index()

# Keep only the two variables needed for the water-stress calculation.
# .copy() detaches the slice so the assignment below cannot hit
# SettingWithCopy semantics.
water = water.loc[(All, [trwr_id, efr_id]), ['Value']].copy()

# Force a numeric column: empty strings, missing values and any other
# non-numeric text become 0, and numeric strings become numbers, so later
# max() calls compare numbers rather than text.
water['Value'] = pd.to_numeric(water['Value'], errors='coerce').fillna(0)

In [87]:
# Attach, to every row of an area, the largest recorded value of
#   - total renewable water resources (VariableId == trwr_id)
#   - environmental flow requirements (VariableId == efr_id)
# Areas with no record for a variable get NaN.
#
# Computed in one pass instead of per-area `.at[...]` lookups: `.at` returns a
# bare scalar when the (area, variable) key is unique, on which `.max()` can
# fail, and initialising the columns with None left them as object dtype.
water_values = pd.to_numeric(water['Value'], errors='coerce')
water_max = water_values.groupby(level=['Area', 'VariableId']).max()
variable_ids = water_max.index.get_level_values('VariableId')
row_areas = data.index.get_level_values('Area')

for column, variable_id in ((trwr, trwr_id), (efr, efr_id)):
    # One value per area for this variable (empty if the variable is absent).
    per_area = water_max[variable_ids == variable_id].droplevel('VariableId')
    # Broadcast the per-area value onto every (Area, YearBin) row, by position.
    data[column] = per_area.reindex(row_areas).values

In [91]:
# Water stress (%) = 100 * withdrawal / (renewable resources - environmental flow).
# Coerce both inputs to float first: if they are object-dtype columns, a zero
# denominator raises ZeroDivisionError instead of producing a float result.
available_water = (
    pd.to_numeric(data[trwr], errors='coerce')
    - pd.to_numeric(data[efr], errors='coerce')
)
# The ratio is undefined when no water is available (denominator <= 0); mask
# those rows to NaN so they are treated as missing rather than becoming
# +/-inf or a negative stress value.
available_water = available_water.where(available_water > 0)
data[wsi] = 100*data[tfw]/available_water

In [93]:
# Discard, in place, every row that still has a missing value in any column.
data.dropna(axis='index', how='any', inplace=True)

In [96]:
# Display the resulting table: water stress recomputed, incomplete rows dropped.
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Total population,Cultivated area (arable land + permanent crops),Gross Domestic Product (GDP),Total renewable surface water,SDG 6.4.2. Water Stress,Total freshwater withdrawal (primary and secondary),Total renewable water resources,Environmental Flow Requirements
Area,YearBin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,1980,12750.000000,8054.000000,3.480000e+09,55.68,70.4914,26.110000,65.33,28.29
Afghanistan,1985,11503.000000,8045.000000,3.642500e+09,55.68,70.4914,26.110000,65.33,28.29
Afghanistan,1990,13981.000000,8030.000000,3.805000e+09,55.68,65.2448,24.166667,65.33,28.29
Afghanistan,1995,18382.000000,7790.000000,3.967500e+09,55.68,59.9982,22.223333,65.33,28.29
Afghanistan,2000,21980.000000,7753.000000,4.130000e+09,55.68,54.7516,20.280000,65.33,28.29
Afghanistan,2005,26617.000000,7910.000000,9.840000e+09,55.68,54.7516,20.280000,65.33,28.29
Afghanistan,2010,30697.000000,7910.000000,2.050000e+10,55.68,54.7516,20.280000,65.33,28.29
Afghanistan,2015,35530.000000,7910.000000,1.920000e+10,55.68,54.7516,20.280000,65.33,28.29
Afghanistan,2020,40022.760330,7920.311940,2.151193e+10,55.68,29.1264,10.788403,65.33,28.29
Afghanistan,2025,45083.629170,7928.710700,2.410225e+10,55.68,40.6126,15.042920,65.33,28.29
