In [135]:
import numpy as np
import pandas as pd
import sqlite3
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [136]:
# Random forest for TFW
# Open the project database; text columns are decoded as ISO-8859-1.
conn = sqlite3.connect('project.db')
conn.text_factory = lambda raw: str(raw, 'iso-8859-1')
cur = conn.cursor()

### Long-format rows for the five variables used by the model.
get_variables = """
Select Year, AreaId, Area, VariableName, Value FROM parameter
where VariableId = 4112 or VariableId = 4263 or VariableId = 4103 or VariableId = 4104 or VariableId = 4185
ORDER by Area
"""

df = pd.read_sql(get_variables, conn)

# Pivot so that each (Area, AreaId, Year) combination becomes one row and
# each VariableName becomes one column under the 'Value' header.
key_cols = ['Area', 'AreaId', 'Year', 'VariableName']
pivoted = df.set_index(key_cols).unstack(level=3).reset_index()

# reset_index() puts the three identifier columns first; they are discarded
# so that only the variable columns remain.
id_cols = list(pivoted.columns[:3])
df_long = pivoted.drop(columns=id_cols)
df_model = df_long
df_model[:10]

Unnamed: 0_level_0,Value,Value,Value,Value,Value
VariableName,Cultivated area (arable land + permanent crops),Gross Domestic Product (GDP),Total freshwater withdrawal (primary and secondary),Total population,Total renewable surface water
0,7760.0,547000000.0,,9346.0,55.68
1,7979.0,1670000000.0,,10373.0,55.68
2,8046.0,1600000000.0,,11722.0,55.68
3,,,10.7,,
4,8050.0,2950000000.0,,13068.0,55.68
5,,3480000000.0,,,
6,8054.0,,,12750.0,55.68
7,8045.0,,26.11,11503.0,55.68
8,8030.0,,,13981.0,55.68
9,7790.0,,,18382.0,55.68


In [137]:
# Flatten the two-level column header, give the variables short names and
# keep only rows where every variable is present.
#
# The frame is rebuilt from df_long without mutating it, so this cell can be
# re-run on its own: the previous in-place droplevel() raised on a second run
# because the header had already been flattened.
COLUMN_NAMES = {
    'Cultivated area (arable land + permanent crops)': 'cultivated_area',
    'Gross Domestic Product (GDP)': 'GDP',
    'Total freshwater withdrawal (primary and secondary)': 'TFW',
    'Total population': 'total_population',
    'Total renewable surface water': 'TRSW',
}

df_model = df_long.copy()
# Drop the outer 'Value' level, leaving the variable names as column labels.
df_model.columns = df_model.columns.droplevel(0)
df_model = (
    df_model
    .rename(columns=COLUMN_NAMES)
    # Clear the leftover 'VariableName' label on the columns axis. The earlier
    # rename_axis(None) acted on the row index instead, which appears to have
    # been unintended since that index carries no name.
    .rename_axis(None, axis='columns')
    # Remove rows that contain NaN in any variable.
    .dropna()
)
print(len(df_model.index))

188


In [138]:
## Split data
# Single seed shared by the split and the forests so the whole cell is
# reproducible from one value.
random = 100
FEATURES = ['cultivated_area', 'GDP', 'total_population', 'TRSW']
x_data = df_model[FEATURES]
y_data = df_model['TFW']
# NOTE(review): the split is a plain random shuffle over rows. If several rows
# belong to the same country, that country can appear in both the training and
# the test set, which may make the test scores optimistic -- confirm whether a
# grouped split is needed.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, train_size=0.75, random_state=random, shuffle=True)


def adjusted_r2(y_true, y_pred, n_predictors):
    """Return R-squared adjusted for the number of predictors.

    Computes 1 - (1 - R2) * (n - 1) / (n - p - 1), where n is the number of
    observations in ``y_true`` and p is ``n_predictors``.

    Raises:
        ValueError: if there are not more than ``n_predictors + 1``
            observations, in which case the adjustment is undefined.
    """
    n = len(y_true)
    dof = n - n_predictors - 1
    if dof <= 0:
        raise ValueError(
            "adjusted R2 needs more than %d observations, got %d" % (n_predictors + 1, n)
        )
    return 1 - (1 - r2_score(y_true, y_pred)) * (n - 1) / dof


## Build a random forest regressor
# `criterion` is left at its default, which is the mean-squared-error
# criterion. Its spelling 'mse' was deprecated in scikit-learn 1.0 and removed
# in 1.2 (renamed 'squared_error'), so naming it explicitly ties the notebook
# to one side of that change.
RF_clf = RandomForestRegressor(bootstrap=True, random_state=random)
# GridSearchCV works on clones of RF_clf, so this fit does not influence the
# search; it only leaves RF_clf itself fitted with the default settings.
RF_clf.fit(x_train, y_train)
RF_param_grid = {"n_estimators": [10, 20, 30, 50, 100], "max_depth": [5, 10, 15, 20, None]}

## Optimize the randomForest regressor
RF_grid_search = GridSearchCV(RF_clf, param_grid=RF_param_grid, n_jobs=-1, cv=10)
RF_grid_search.fit(x_train, y_train)
print(RF_grid_search.best_params_)
# With no `scoring` given, this is the mean cross-validated score from the
# estimator's own score() method (R2 for regressors).
print(RF_grid_search.best_score_)

## Define optimum randomForest regressor
# GridSearchCV (refit=True by default) has already re-fitted a clone of RF_clf
# with the best parameters on the full training set, so it is reused instead
# of building and fitting an identical forest a second time.
opt_RF = RF_grid_search.best_estimator_
RF_tr_pred = opt_RF.predict(x_train)
print("Training MSE: %.2f" %mean_squared_error(y_train, RF_tr_pred))
RF_test_pred = opt_RF.predict(x_test)
print("Test MSE: %.2f" %mean_squared_error(y_test, RF_test_pred))

## Calculate adjusted R-squared
# n1 = number of data points for training set
# n2 = number of data points for test set
# p = number of predictors / attributes (taken from the feature matrix)
n1 = len(y_train)
p = x_train.shape[1]
n2 = len(y_test)
train_adj_r2 = adjusted_r2(y_train, RF_tr_pred, p)
test_adj_r2 = adjusted_r2(y_test, RF_test_pred, p)
print("Training adj R2: %.2f" %train_adj_r2)
print("Test adj R2: %.2f" %test_adj_r2)



{'max_depth': 10, 'n_estimators': 100}
0.5174083200118571
Training MSE: 125.61
Test MSE: 39.00
Training adj R2: 0.97
Test adj R2: 0.88


# Total renewable surface water (TRSW)

In [247]:
# Fetch every stored observation for VariableId 4185 (presumably total
# renewable surface water, per the section heading) with its year and area.
get_TRSW = """
Select Year, AreaId, Area, Value FROM parameter
where VariableId = 4185
ORDER by Area
"""

df_TRSW = pd.read_sql(get_TRSW, conn)
# Preview the first 20 rows.
df_TRSW.head(20)

Unnamed: 0,Year,AreaId,Area,Value
0,1962,2,Afghanistan,55.68
1,1967,2,Afghanistan,55.68
2,1972,2,Afghanistan,55.68
3,1977,2,Afghanistan,55.68
4,1982,2,Afghanistan,55.68
5,1987,2,Afghanistan,55.68
6,1992,2,Afghanistan,55.68
7,1997,2,Afghanistan,55.68
8,2002,2,Afghanistan,55.68
9,2007,2,Afghanistan,55.68


In [252]:
# Check whether TRSW is static (a single value across all years) per country.
#
# Duplicates are removed per (Area, Value) pair. De-duplicating on "Value"
# alone also discarded rows of any country whose value coincided with one
# already seen for a different country, so a country with varying TRSW could
# be hidden from this check.
df_TRSW = df_TRSW.drop_duplicates(subset=["Area", "Value"], keep="first")
# After de-duplication the row count per country equals its number of
# distinct TRSW values; 1 means the value never changes.
temp = df_TRSW.groupby("Area").count()[['Value']]
# Countries whose TRSW is not a single constant value. (The earlier run, made
# with the Value-only de-duplication, listed only Bhutan.)
temp.loc[temp['Value'] != 1]

Unnamed: 0_level_0,Value
Area,Unnamed: 1_level_1
Bhutan,3
