In [1]:
import numpy as np
import pandas as pd
import math
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn import linear_model

# Initial Data


*   x1: Total Population *(in 1000s)*
*   x2: Population Density *(inhabitants/square km)*
*   x3: GDP per capita *(USD/inhabitant)*
*   x4: Long term average annual precipitation in depth *(mm/year)*
*   x5: Total renewable resources per capita *($m^{3}$/inhabitant /year)*
*   x6: Total water withdrawal per capita *($m^{3}$/inhabitant /year)*
*   x7: Agricultural water withdrawal *(%age of x6)*
*   x8: Industrial water withdrawal *(%age of x6)*
*   x9: Municipal water withdrawal *(%age of x6)*
*   x10: Collected Municipal wastewater  *($km^{3}$/year)*
*   x11: Capacity of the municipal wastewater treatment facilities *($km^{3}$/year)*
*   x12: Not treated municipal wastewater *($km^{3}$/year)*
*   y: Water Stress *(%age)*

In [2]:
#Read and prepare the data
data = pd.read_csv('https://www.dropbox.com/s/j07u99pg44kbfvm/final_data.csv?dl=1')
data

Unnamed: 0,Area,Variable Name,Year,Value
0,Afghanistan,Total population,1962,9351.441000
1,Afghanistan,Total population,1967,10399.926000
2,Afghanistan,Total population,1972,11791.215000
3,Afghanistan,Total population,1977,13171.306000
4,Afghanistan,Total population,1982,12882.528000
...,...,...,...,...
18698,Zimbabwe,SDG 6.4.2. Water Stress,2002,39.476155
18699,Zimbabwe,SDG 6.4.2. Water Stress,2007,33.514833
18700,Zimbabwe,SDG 6.4.2. Water Stress,2012,32.488030
18701,Zimbabwe,SDG 6.4.2. Water Stress,2017,31.346226


In [3]:
#data_x1_x5.pivot(columns='Variable Name', index='Area')['Value']
data = data.groupby(['Year','Variable Name', 'Area'])['Value'].aggregate('mean').unstack(1)
data

Unnamed: 0_level_0,Variable Name,Agricultural water withdrawal as % of total water withdrawal,Capacity of the municipal wastewater treatment facilities,Collected municipal wastewater,GDP per capita,Industrial water withdrawal as % of total water withdrawal,Long-term average annual precipitation in volume,Municipal water withdrawal as % of total withdrawal,Not treated municipal wastewater,Population density,SDG 6.4.2. Water Stress,Total population,Total renewable water resources per capita,Total water withdrawal per capita
Year,Area,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1962,Afghanistan,,,,,,213.48522,,,14.323808,,9351.441,6986.089096,
1962,Albania,,,,,,42.69375,,,60.441252,,1737.686,17379.434490,
1962,Algeria,,,,,,211.97486,,,4.878714,,11619.828,1004.059613,
1962,Andorra,,,,,,0.47240,,,32.702128,,15.370,20533.506830,
1962,Angola,,,,,,1259.16700,,,4.498708,,5608.539,26459.653750,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,Venezuela (Bolivarian Republic of),73.852110,,,6433.928949,3.506097,1864.23020,22.641793,,31.672735,7.540367,28887.118,45868.196340,783.266091
2018,Viet Nam,94.782397,0.423,0.19700,2538.068147,3.747409,603.16983,1.470194,1.775,288.458056,18.130315,95545.962,9253.347619,858.539684
2018,Yemen,90.743338,,,922.404203,1.823282,88.17099,7.433380,,53.977845,169.761905,28498.683,73.687616,125.093500
2018,Zambia,73.282443,,,1549.354235,8.269720,767.66220,18.447837,,23.055378,2.835498,17351.708,6039.751245,90.596269


In [4]:
data.isna().sum()

Variable Name
Agricultural water withdrawal as % of total water withdrawal    1180
Capacity of the municipal wastewater treatment facilities       2206
Collected municipal wastewater                                  2170
GDP per capita                                                   409
Industrial water withdrawal as % of total water withdrawal      1204
Long-term average annual precipitation in volume                 210
Municipal water withdrawal as % of total withdrawal             1182
Not treated municipal wastewater                                2082
Population density                                                27
SDG 6.4.2. Water Stress                                         1104
Total population                                                  11
Total renewable water resources per capita                        41
Total water withdrawal per capita                               1090
dtype: int64

## Exploratory Data Analysis



*   Three variables, Capacity of the municipal wastewater treatment facilities (x11), Collected municipal wastewater (x10) and Not treated municipal wastewater (x12) are missing around 75% to 80% of the values. Therefore it can be assumed that they do not impact the water stress (y) values and can be droped from the dataset.
*   All the rows with NaN values can be dropped.
*   Variables x6, x7 and x8 sum up to 100 as they are percentages of x6. Therefore we need only two of these three variables and one of them should be dropped to avoid redundancy.
*   Total population and Population density are related variables. So one of them can be dropped. Total population being a more absolute variable can be dropped.





# Cleaning the dataset

In [5]:
#drop the rows that are missing "y" values i.e. missing SDG 6.4.2. Water Stress
print("Dimensions of the original dataset: ", data.shape)
data.dropna(subset=["Population density","GDP per capita","SDG 6.4.2. Water Stress","Agricultural water withdrawal as % of total water withdrawal","Municipal water withdrawal as % of total withdrawal"], inplace=True)
data.drop(["Total population","Capacity of the municipal wastewater treatment facilities","Collected municipal wastewater","Not treated municipal wastewater","Industrial water withdrawal as % of total water withdrawal"], axis=1, inplace=True)
print("Dimensions of the new dataset: ", data.shape)

Dimensions of the original dataset:  (2431, 13)
Dimensions of the new dataset:  (1205, 8)


In [6]:
data.isna().sum()

Variable Name
Agricultural water withdrawal as % of total water withdrawal    0
GDP per capita                                                  0
Long-term average annual precipitation in volume                0
Municipal water withdrawal as % of total withdrawal             0
Population density                                              0
SDG 6.4.2. Water Stress                                         0
Total renewable water resources per capita                      0
Total water withdrawal per capita                               0
dtype: int64

In [7]:
data.reset_index(inplace=True)

In [8]:
data

Variable Name,Year,Area,Agricultural water withdrawal as % of total water withdrawal,GDP per capita,Long-term average annual precipitation in volume,Municipal water withdrawal as % of total withdrawal,Population density,SDG 6.4.2. Water Stress,Total renewable water resources per capita,Total water withdrawal per capita
0,1972,Denmark,31.947484,4655.290175,30.29227,60.831510,115.816802,24.716063,1202.273258,183.146293
1,1972,Ghana,54.727904,493.336524,283.14698,32.902354,38.692173,1.505958,6089.084168,37.430171
2,1972,Hungary,22.379496,782.184870,54.79467,20.569268,112.072299,5.828256,9974.980064,323.626719
3,1972,Israel,78.318250,2762.062183,9.60045,16.779789,135.771726,135.122077,594.029932,531.022713
4,1972,Italy,59.201104,2674.044383,250.71488,16.811125,180.058210,37.599562,3525.699205,786.449189
...,...,...,...,...,...,...,...,...,...,...
1200,2018,Venezuela (Bolivarian Republic of),73.852110,6433.928949,1864.23020,22.641793,31.672735,7.540367,45868.196340,783.266091
1201,2018,Viet Nam,94.782397,2538.068147,603.16983,1.470194,288.458056,18.130315,9253.347619,858.539684
1202,2018,Yemen,90.743338,922.404203,88.17099,7.433380,53.977845,169.761905,73.687616,125.093500
1203,2018,Zambia,73.282443,1549.354235,767.66220,18.447837,23.055378,2.835498,6039.751245,90.596269


In [9]:
data.corr()

Variable Name,Year,Agricultural water withdrawal as % of total water withdrawal,GDP per capita,Long-term average annual precipitation in volume,Municipal water withdrawal as % of total withdrawal,Population density,SDG 6.4.2. Water Stress,Total renewable water resources per capita,Total water withdrawal per capita
Variable Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Year,1.0,-0.060807,0.223303,0.024027,0.078427,0.023163,0.030757,-0.0689,-0.051328
Agricultural water withdrawal as % of total water withdrawal,-0.060807,1.0,-0.415532,0.018947,-0.675577,-0.180608,0.085287,-0.089993,0.296968
GDP per capita,0.223303,-0.415532,1.0,0.026815,0.250539,0.165558,0.170544,0.093298,0.035759
Long-term average annual precipitation in volume,0.024027,0.018947,0.026815,1.0,-0.094149,-0.07301,-0.066559,0.054745,0.066076
Municipal water withdrawal as % of total withdrawal,0.078427,-0.675577,0.250539,-0.094149,1.0,0.148729,-0.011202,0.099943,-0.42291
Population density,0.023163,-0.180608,0.165558,-0.07301,0.148729,1.0,0.045586,-0.090589,-0.069448
SDG 6.4.2. Water Stress,0.030757,0.085287,0.170544,-0.066559,-0.011202,0.045586,1.0,-0.078604,0.079976
Total renewable water resources per capita,-0.0689,-0.089993,0.093298,0.054745,0.099943,-0.090589,-0.078604,1.0,0.094333
Total water withdrawal per capita,-0.051328,0.296968,0.035759,0.066076,-0.42291,-0.069448,0.079976,0.094333,1.0


There is no direct linear relation between any of the variables and the water stress values. We need to build a prediction model that analyze complex relations between variables such as Neural Networks.

# Scaling the dataset

In [10]:
#Create X_data and Y_data
X_data= data[["Agricultural water withdrawal as % of total water withdrawal", "GDP per capita", "Long-term average annual precipitation in volume", "Municipal water withdrawal as % of total withdrawal", "Total renewable water resources per capita", "Total water withdrawal per capita", "Population density"]]
Y_data= data["SDG 6.4.2. Water Stress"]
X_scale= X_data.apply(lambda x: (x-x.min(axis=0)) / (x.max(axis=0)-x.min(axis=0)))
#Change the percentage to proportion
Y_scale=Y_data.apply(lambda x: x/100)

# Neural Network Model

## Splitting the dataset

In [11]:
print(X_scale.shape,Y_scale.shape)

(1205, 7) (1205,)


In [12]:
#convert dataframes to array
X=pd.DataFrame(X_scale).to_numpy()
Y=pd.DataFrame(Y_scale).to_numpy()

In [13]:
X

array([[3.20902724e-01, 3.84416071e-02, 2.01606512e-03, ...,
        1.64310175e-03, 3.06699357e-02, 1.43787686e-02],
       [5.49725098e-01, 3.78738908e-03, 1.88773038e-02, ...,
        8.34868443e-03, 5.25083958e-03, 4.67859864e-03],
       [2.24795207e-01, 6.19246477e-03, 3.64997106e-03, ...,
        1.36808319e-02, 5.51757034e-02, 1.39078125e-02],
       ...,
       [9.11489140e-01, 7.35999131e-03, 5.87562111e-03, ...,
        9.44794555e-05, 2.05430567e-02, 6.60111833e-03],
       [7.36099775e-01, 1.25802472e-02, 5.11864775e-02, ...,
        8.28099080e-03, 1.45252708e-02, 2.71191726e-03],
       [8.09958693e-01, 1.16482843e-02, 1.71156816e-02, ...,
        1.89405038e-03, 4.42854017e-02, 4.45955169e-03]])

In [14]:
Y

array([[0.24716063],
       [0.01505958],
       [0.05828256],
       ...,
       [1.69761905],
       [0.02835498],
       [0.3540537 ]])

In [15]:
#split the data for testing and training
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(964, 7) (241, 7) (964, 1) (241, 1)


## Building the Neural Network Model

In [16]:
#Create a neutral network model
model=tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten()),
model.add(tf.keras.layers.Dense(35, activation="relu")),
model.add(tf.keras.layers.Dense(30, activation="sigmoid")),
model.add(tf.keras.layers.Dense(25, activation="sigmoid")),
model.add(tf.keras.layers.Dense(1, activation="linear")),

#compile the model
model.compile(optimizer="adam",loss="mean_squared_error", metrics="accuracy")

In [17]:
#train the model
model.fit(X_train, Y_train, epochs=450, batch_size=32)

Epoch 1/450
Epoch 2/450
Epoch 3/450
Epoch 4/450
Epoch 5/450
Epoch 6/450
Epoch 7/450
Epoch 8/450
Epoch 9/450
Epoch 10/450
Epoch 11/450
Epoch 12/450
Epoch 13/450
Epoch 14/450
Epoch 15/450
Epoch 16/450
Epoch 17/450
Epoch 18/450
Epoch 19/450
Epoch 20/450
Epoch 21/450
Epoch 22/450
Epoch 23/450
Epoch 24/450
Epoch 25/450
Epoch 26/450
Epoch 27/450
Epoch 28/450
Epoch 29/450
Epoch 30/450
Epoch 31/450
Epoch 32/450
Epoch 33/450
Epoch 34/450
Epoch 35/450
Epoch 36/450
Epoch 37/450
Epoch 38/450
Epoch 39/450
Epoch 40/450
Epoch 41/450
Epoch 42/450
Epoch 43/450
Epoch 44/450
Epoch 45/450
Epoch 46/450
Epoch 47/450
Epoch 48/450
Epoch 49/450
Epoch 50/450
Epoch 51/450
Epoch 52/450
Epoch 53/450
Epoch 54/450
Epoch 55/450
Epoch 56/450
Epoch 57/450
Epoch 58/450
Epoch 59/450
Epoch 60/450
Epoch 61/450
Epoch 62/450
Epoch 63/450
Epoch 64/450
Epoch 65/450
Epoch 66/450
Epoch 67/450
Epoch 68/450
Epoch 69/450
Epoch 70/450
Epoch 71/450
Epoch 72/450
Epoch 73/450
Epoch 74/450
Epoch 75/450
Epoch 76/450
Epoch 77/450
Epoch 78

Epoch 80/450
Epoch 81/450
Epoch 82/450
Epoch 83/450
Epoch 84/450
Epoch 85/450
Epoch 86/450
Epoch 87/450
Epoch 88/450
Epoch 89/450
Epoch 90/450
Epoch 91/450
Epoch 92/450
Epoch 93/450
Epoch 94/450
Epoch 95/450
Epoch 96/450
Epoch 97/450
Epoch 98/450
Epoch 99/450
Epoch 100/450
Epoch 101/450
Epoch 102/450
Epoch 103/450
Epoch 104/450
Epoch 105/450
Epoch 106/450
Epoch 107/450
Epoch 108/450
Epoch 109/450
Epoch 110/450
Epoch 111/450
Epoch 112/450
Epoch 113/450
Epoch 114/450
Epoch 115/450
Epoch 116/450
Epoch 117/450
Epoch 118/450
Epoch 119/450
Epoch 120/450
Epoch 121/450
Epoch 122/450
Epoch 123/450
Epoch 124/450
Epoch 125/450
Epoch 126/450
Epoch 127/450
Epoch 128/450
Epoch 129/450
Epoch 130/450
Epoch 131/450
Epoch 132/450
Epoch 133/450
Epoch 134/450
Epoch 135/450
Epoch 136/450
Epoch 137/450
Epoch 138/450
Epoch 139/450
Epoch 140/450
Epoch 141/450
Epoch 142/450
Epoch 143/450
Epoch 144/450
Epoch 145/450
Epoch 146/450
Epoch 147/450
Epoch 148/450
Epoch 149/450
Epoch 150/450
Epoch 151/450
Epoch 152/45

Epoch 158/450
Epoch 159/450
Epoch 160/450
Epoch 161/450
Epoch 162/450
Epoch 163/450
Epoch 164/450
Epoch 165/450
Epoch 166/450
Epoch 167/450
Epoch 168/450
Epoch 169/450
Epoch 170/450
Epoch 171/450
Epoch 172/450
Epoch 173/450
Epoch 174/450
Epoch 175/450
Epoch 176/450
Epoch 177/450
Epoch 178/450
Epoch 179/450
Epoch 180/450
Epoch 181/450
Epoch 182/450
Epoch 183/450
Epoch 184/450
Epoch 185/450
Epoch 186/450
Epoch 187/450
Epoch 188/450
Epoch 189/450
Epoch 190/450
Epoch 191/450
Epoch 192/450
Epoch 193/450
Epoch 194/450
Epoch 195/450
Epoch 196/450
Epoch 197/450
Epoch 198/450
Epoch 199/450
Epoch 200/450
Epoch 201/450
Epoch 202/450
Epoch 203/450
Epoch 204/450
Epoch 205/450
Epoch 206/450
Epoch 207/450
Epoch 208/450
Epoch 209/450
Epoch 210/450
Epoch 211/450
Epoch 212/450
Epoch 213/450
Epoch 214/450
Epoch 215/450
Epoch 216/450
Epoch 217/450
Epoch 218/450
Epoch 219/450
Epoch 220/450
Epoch 221/450
Epoch 222/450
Epoch 223/450
Epoch 224/450
Epoch 225/450
Epoch 226/450
Epoch 227/450
Epoch 228/450
Epoch 

Epoch 236/450
Epoch 237/450
Epoch 238/450
Epoch 239/450
Epoch 240/450
Epoch 241/450
Epoch 242/450
Epoch 243/450
Epoch 244/450
Epoch 245/450
Epoch 246/450
Epoch 247/450
Epoch 248/450
Epoch 249/450
Epoch 250/450
Epoch 251/450
Epoch 252/450
Epoch 253/450
Epoch 254/450
Epoch 255/450
Epoch 256/450
Epoch 257/450
Epoch 258/450
Epoch 259/450
Epoch 260/450
Epoch 261/450
Epoch 262/450
Epoch 263/450
Epoch 264/450
Epoch 265/450
Epoch 266/450
Epoch 267/450
Epoch 268/450
Epoch 269/450
Epoch 270/450
Epoch 271/450
Epoch 272/450
Epoch 273/450
Epoch 274/450
Epoch 275/450
Epoch 276/450
Epoch 277/450
Epoch 278/450
Epoch 279/450
Epoch 280/450
Epoch 281/450
Epoch 282/450
Epoch 283/450
Epoch 284/450
Epoch 285/450
Epoch 286/450
Epoch 287/450
Epoch 288/450
Epoch 289/450
Epoch 290/450
Epoch 291/450
Epoch 292/450
Epoch 293/450
Epoch 294/450
Epoch 295/450
Epoch 296/450
Epoch 297/450
Epoch 298/450
Epoch 299/450
Epoch 300/450
Epoch 301/450
Epoch 302/450
Epoch 303/450
Epoch 304/450
Epoch 305/450
Epoch 306/450
Epoch 

Epoch 314/450
Epoch 315/450
Epoch 316/450
Epoch 317/450
Epoch 318/450
Epoch 319/450
Epoch 320/450
Epoch 321/450
Epoch 322/450
Epoch 323/450
Epoch 324/450
Epoch 325/450
Epoch 326/450
Epoch 327/450
Epoch 328/450
Epoch 329/450
Epoch 330/450
Epoch 331/450
Epoch 332/450
Epoch 333/450
Epoch 334/450
Epoch 335/450
Epoch 336/450
Epoch 337/450
Epoch 338/450
Epoch 339/450
Epoch 340/450
Epoch 341/450
Epoch 342/450
Epoch 343/450
Epoch 344/450
Epoch 345/450
Epoch 346/450
Epoch 347/450
Epoch 348/450
Epoch 349/450
Epoch 350/450
Epoch 351/450
Epoch 352/450
Epoch 353/450
Epoch 354/450
Epoch 355/450
Epoch 356/450
Epoch 357/450
Epoch 358/450
Epoch 359/450
Epoch 360/450
Epoch 361/450
Epoch 362/450
Epoch 363/450
Epoch 364/450
Epoch 365/450
Epoch 366/450
Epoch 367/450
Epoch 368/450
Epoch 369/450
Epoch 370/450
Epoch 371/450
Epoch 372/450
Epoch 373/450
Epoch 374/450
Epoch 375/450
Epoch 376/450
Epoch 377/450
Epoch 378/450
Epoch 379/450
Epoch 380/450
Epoch 381/450
Epoch 382/450
Epoch 383/450
Epoch 384/450
Epoch 

Epoch 392/450
Epoch 393/450
Epoch 394/450
Epoch 395/450
Epoch 396/450
Epoch 397/450
Epoch 398/450
Epoch 399/450
Epoch 400/450
Epoch 401/450
Epoch 402/450
Epoch 403/450
Epoch 404/450
Epoch 405/450
Epoch 406/450
Epoch 407/450
Epoch 408/450
Epoch 409/450
Epoch 410/450
Epoch 411/450
Epoch 412/450
Epoch 413/450
Epoch 414/450
Epoch 415/450
Epoch 416/450
Epoch 417/450
Epoch 418/450
Epoch 419/450
Epoch 420/450
Epoch 421/450
Epoch 422/450
Epoch 423/450
Epoch 424/450
Epoch 425/450
Epoch 426/450
Epoch 427/450
Epoch 428/450
Epoch 429/450
Epoch 430/450
Epoch 431/450
Epoch 432/450
Epoch 433/450
Epoch 434/450
Epoch 435/450
Epoch 436/450
Epoch 437/450
Epoch 438/450
Epoch 439/450
Epoch 440/450
Epoch 441/450
Epoch 442/450
Epoch 443/450
Epoch 444/450
Epoch 445/450
Epoch 446/450
Epoch 447/450
Epoch 448/450
Epoch 449/450
Epoch 450/450


<keras.callbacks.History at 0x25a6cc43880>

## Predicting the test data

In [18]:
Y_hat=model.predict(X_test)
Y_hat



array([[ 0.7179318 ],
       [ 0.14259958],
       [ 0.6663942 ],
       [ 0.07807833],
       [ 0.08353722],
       [ 0.07509935],
       [ 0.19101822],
       [ 0.16766697],
       [ 0.07509935],
       [ 0.20958626],
       [ 0.39574462],
       [ 0.14912128],
       [ 0.21232665],
       [ 0.33833754],
       [ 0.8213795 ],
       [ 0.07871932],
       [ 0.08050418],
       [ 0.23019129],
       [ 0.09562194],
       [ 0.34592122],
       [ 0.16889066],
       [ 0.07509959],
       [ 0.1844911 ],
       [ 0.2929303 ],
       [ 0.40083086],
       [ 0.37147468],
       [ 0.56945115],
       [ 0.23851901],
       [ 0.07509935],
       [ 0.19429195],
       [ 0.32545173],
       [ 0.07625467],
       [ 0.46853036],
       [ 0.07721889],
       [ 0.19393581],
       [ 0.09648085],
       [ 0.4816519 ],
       [ 0.14166659],
       [ 0.4939857 ],
       [ 0.10705298],
       [ 0.568876  ],
       [ 0.1018458 ],
       [ 5.967721  ],
       [ 0.07520735],
       [ 0.07974213],
       [ 0

In [19]:
Y_test

array([[1.16714286e+00],
       [8.17057212e-02],
       [7.87544531e-01],
       [1.49572650e-02],
       [5.91313131e-02],
       [2.47898551e-03],
       [6.27029448e-02],
       [5.08756567e-02],
       [5.02166065e-03],
       [5.73452485e-02],
       [5.48754749e-01],
       [5.74209708e-02],
       [1.40845753e-01],
       [2.71312265e-01],
       [9.10891089e-01],
       [3.43948718e-02],
       [6.78235294e-02],
       [2.14908880e-01],
       [6.63979132e-02],
       [1.74084416e-01],
       [5.08756567e-02],
       [2.30433000e-02],
       [1.35503388e-01],
       [8.46153846e-02],
       [6.84512428e-02],
       [3.67736151e-01],
       [6.41022433e-01],
       [2.07942170e-01],
       [1.32242833e-03],
       [1.75048381e-01],
       [4.02076574e-01],
       [3.35648148e-03],
       [1.01250000e+00],
       [3.03260459e-02],
       [3.39163498e-02],
       [3.39500734e-02],
       [6.20557157e-01],
       [1.94195387e-01],
       [1.43000000e-01],
       [3.46646571e-02],


Problems to be addressed:
*   Analyze the performance of the model.
*   Prediction of coutries that will run out of water by 2050. (can be verified by checking if the Water stress will reach 100% or 1.0)
*   No justification behind chossing the certain number of hidden layers and certain number of neurons.
*   Further use this model to predict values of water stress in the app.











# Predict the x variables for future

In [20]:
#Read and prepare the data
data_ori = pd.read_csv('https://www.dropbox.com/s/j07u99pg44kbfvm/final_data.csv?dl=1')
data_ori

Unnamed: 0,Area,Variable Name,Year,Value
0,Afghanistan,Total population,1962,9351.441000
1,Afghanistan,Total population,1967,10399.926000
2,Afghanistan,Total population,1972,11791.215000
3,Afghanistan,Total population,1977,13171.306000
4,Afghanistan,Total population,1982,12882.528000
...,...,...,...,...
18698,Zimbabwe,SDG 6.4.2. Water Stress,2002,39.476155
18699,Zimbabwe,SDG 6.4.2. Water Stress,2007,33.514833
18700,Zimbabwe,SDG 6.4.2. Water Stress,2012,32.488030
18701,Zimbabwe,SDG 6.4.2. Water Stress,2017,31.346226


In [21]:
data_ori = data_ori.groupby(['Year','Variable Name', 'Area'])['Value'].aggregate('mean').unstack(1)
data_ori.reset_index(inplace=True)
print("Dimensions of the original dataset: ", data_ori.shape)
data_ori.dropna(subset=["Population density","GDP per capita","SDG 6.4.2. Water Stress","Agricultural water withdrawal as % of total water withdrawal","Municipal water withdrawal as % of total withdrawal"], inplace=True)
data_ori.drop(["Total population","Capacity of the municipal wastewater treatment facilities","Collected municipal wastewater","Not treated municipal wastewater","Industrial water withdrawal as % of total water withdrawal"], axis=1, inplace=True)
print("Dimensions of the new dataset: ", data_ori.shape)
data_ori


Dimensions of the original dataset:  (2431, 15)
Dimensions of the new dataset:  (1205, 10)


Variable Name,Year,Area,Agricultural water withdrawal as % of total water withdrawal,GDP per capita,Long-term average annual precipitation in volume,Municipal water withdrawal as % of total withdrawal,Population density,SDG 6.4.2. Water Stress,Total renewable water resources per capita,Total water withdrawal per capita
388,1972,Denmark,31.947484,4655.290175,30.29227,60.831510,115.816802,24.716063,1202.273258,183.146293
404,1972,Ghana,54.727904,493.336524,283.14698,32.902354,38.692173,1.505958,6089.084168,37.430171
414,1972,Hungary,22.379496,782.184870,54.79467,20.569268,112.072299,5.828256,9974.980064,323.626719
421,1972,Israel,78.318250,2762.062183,9.60045,16.779789,135.771726,135.122077,594.029932,531.022713
422,1972,Italy,59.201104,2674.044383,250.71488,16.811125,180.058210,37.599562,3525.699205,786.449189
...,...,...,...,...,...,...,...,...,...,...
2426,2018,Venezuela (Bolivarian Republic of),73.852110,6433.928949,1864.23020,22.641793,31.672735,7.540367,45868.196340,783.266091
2427,2018,Viet Nam,94.782397,2538.068147,603.16983,1.470194,288.458056,18.130315,9253.347619,858.539684
2428,2018,Yemen,90.743338,922.404203,88.17099,7.433380,53.977845,169.761905,73.687616,125.093500
2429,2018,Zambia,73.282443,1549.354235,767.66220,18.447837,23.055378,2.835498,6039.751245,90.596269


In [22]:
def extrapolate(country,variable):
  poly = np.polyfit(data_ori[data_ori['Area']==country]['Year'], data_ori[data_ori['Area']==country][variable], deg=1)
  y_int  = np.polyval(poly, 2022)
  return y_int

In [23]:
newdata = data_ori[data_ori['Area']=='country']

In [24]:
ind=0
for i in np.unique(np.array(data_ori['Area'])):
  #print(ind)
  #print(i)
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
                                'Agricultural water withdrawal as % of total water withdrawal':extrapolate(i,'Agricultural water withdrawal as % of total water withdrawal'),
                                'GDP per capita':extrapolate(i,'GDP per capita'), 
                                'Long-term average annual precipitation in volume':extrapolate(i,'Long-term average annual precipitation in volume'),
                                'Municipal water withdrawal as % of total withdrawal':extrapolate(i,'Municipal water withdrawal as % of total withdrawal'),
                                'Population density':extrapolate(i,'Population density'), 
                                'SDG 6.4.2. Water Stress':extrapolate(i,'SDG 6.4.2. Water Stress'),
                                'Total renewable water resources per capita':extrapolate(i,'Total renewable water resources per capita'),
                                'Total water withdrawal per capita':extrapolate(i,'Total water withdrawal per capita')})
  #print(newdata.loc[ind])
  ind=ind+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.loc[ind] = pd.Series({'Year':2022, 'Area':i,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newdata.l

In [25]:
newdata

Variable Name,Year,Area,Agricultural water withdrawal as % of total water withdrawal,GDP per capita,Long-term average annual precipitation in volume,Municipal water withdrawal as % of total withdrawal,Population density,SDG 6.4.2. Water Stress,Total renewable water resources per capita,Total water withdrawal per capita
0,2022,Afghanistan,97.941104,656.180969,213.48522,0.993969,61.257508,50.562994,969.975594,176.892287
1,2022,Albania,57.542793,5786.599868,41.381326,30.362635,101.094659,6.154082,10689.844269,363.92748
2,2022,Algeria,58.977252,4659.461837,211.974923,38.004788,18.40922,142.115966,233.342039,255.148373
3,2022,Angola,5.890727,4653.147601,1259.167,53.707643,25.809087,2.03677,3238.309847,21.052376
4,2022,Antigua and Barbuda,14.236588,16491.180458,0.4532,63.205424,231.527633,9.855674,485.886409,131.255451
...,...,...,...,...,...,...,...,...,...,...
172,2022,Venezuela (Bolivarian Republic of),76.424646,9129.766093,1864.069566,21.187347,35.293496,8.25744,32080.107666,819.597845
173,2022,Viet Nam,95.87318,2400.104517,602.640497,0.828757,304.226958,20.510049,8166.541004,944.226983
174,2022,Yemen,90.544375,1217.420441,88.17099,7.490926,57.992638,177.354961,56.084185,106.871156
175,2022,Zambia,72.242013,1861.535819,767.6622,18.899592,24.287317,2.745366,4918.65272,67.947275


In [26]:
newdata.to_csv('validation_data.csv', encoding = 'utf-8-sig') 