In [1]:
import numpy as np
import pandas as pd
import math
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn import linear_model

ModuleNotFoundError: No module named 'tensorflow'

In [2]:
!pip install tensorflow

^C


# Initial Data


*   x1: Total Population *(in 1000s)*
*   x2: Population Density *(inhabitants/square km)*
*   x3: GDP per capita *(USD/inhabitant)*
*   x4: Long term average annual precipitation in depth *(mm/year)*
*   x5: Total renewable resources per capita *($m^{3}$/inhabitant /year)*
*   x6: Total water withdrawal per capita *($m^{3}$/inhabitant /year)*
*   x7: Agricultural water withdrawal *(%age of x6)*
*   x8: Industrial water withdrawal *(%age of x6)*
*   x9: Municipal water withdrawal *(%age of x6)*
*   x10: Collected Municipal wastewater  *($km^{3}$/year)*
*   x11: Capacity of the municipal wastewater treatment facilities *($km^{3}$/year)*
*   x12: Not treated municipal wastewater *($km^{3}$/year)*
*   y: Water Stress *(%age)*

In [None]:
# Download the raw CSV into a DataFrame and display it
data = pd.read_csv(
    'https://www.dropbox.com/s/j07u99pg44kbfvm/final_data.csv?dl=1'
)
data

In [None]:
# Average the 'Value' entries that share the same (Year, Variable Name, Area),
# then move the variable names into columns so every row is one (Year, Area).
data = (
    data.groupby(['Year', 'Variable Name', 'Area'])['Value']
    .mean()
    .unstack('Variable Name')
)
data

In [None]:
# Number of missing values in each column of the pivoted table
data.isnull().sum()

## Exploratory Data Analysis



*   Three variables, Capacity of the municipal wastewater treatment facilities (x11), Collected municipal wastewater (x10) and Not treated municipal wastewater (x12), are missing around 75% to 80% of their values. With so few observations they cannot be used reliably, so they are dropped from the dataset (this assumes they are not essential for predicting the water stress (y) values).
*   All the rows with NaN values can be dropped.
*   Variables x7, x8 and x9 sum up to 100 as they are percentages of x6. Therefore only two of these three variables are needed, and one of them (x8, industrial water withdrawal) is dropped to avoid redundancy.
*   Total population and Population density are related variables, so one of them can be dropped. Total population, being the more absolute measure, is the one that is dropped.





# Cleaning the dataset

In [None]:
# Keep only the rows that have the target (SDG 6.4.2. Water Stress) and the
# four predictors listed below, then remove the columns that are not used.
print("Dimensions of the original dataset: ", data.shape)
data = data.dropna(
    subset=[
        "Population density",
        "GDP per capita",
        "SDG 6.4.2. Water Stress",
        "Agricultural water withdrawal as % of total water withdrawal",
        "Municipal water withdrawal as % of total withdrawal",
    ]
)
data = data.drop(
    columns=[
        "Total population",
        "Capacity of the municipal wastewater treatment facilities",
        "Collected municipal wastewater",
        "Not treated municipal wastewater",
        "Industrial water withdrawal as % of total water withdrawal",
    ]
)
print("Dimensions of the new dataset: ", data.shape)

In [None]:
# Re-check the per-column missing-value counts after cleaning
data.isnull().sum()

In [None]:
# Turn the index levels back into ordinary columns
data = data.reset_index()

In [None]:
# Display the current contents of `data`
data

In [None]:
# Pairwise correlation between the numeric columns only.
# `data` also holds the text column 'Area' after the index reset, and
# DataFrame.corr() raises on non-numeric columns in pandas 2.x unless
# numeric_only=True is passed (the keyword is available from pandas 1.5).
data.corr(numeric_only=True)

There is no direct linear relation between any of the variables and the water stress values. We need to build a prediction model that can capture complex relations between variables, such as a neural network.

# Scaling the dataset

In [None]:
# Select the seven feature columns (X_data) and the target column (Y_data)
X_data = data[
    [
        "Agricultural water withdrawal as % of total water withdrawal",
        "GDP per capita",
        "Long-term average annual precipitation in volume",
        "Municipal water withdrawal as % of total withdrawal",
        "Total renewable water resources per capita",
        "Total water withdrawal per capita",
        "Population density",
    ]
]
Y_data = data["SDG 6.4.2. Water Stress"]
# Min-max scale every feature column: (value - column min) / (column max - column min).
# NOTE(review): min and max are taken over the whole of X_data, i.e. before any train/test split.
X_scale = (X_data - X_data.min(axis=0)) / (X_data.max(axis=0) - X_data.min(axis=0))
# Convert the target from a percentage to a proportion
Y_scale = Y_data / 100

# Neural Network Model

## Splitting the dataset

In [None]:
# Print the shapes of the scaled features and the scaled target
print(X_scale.shape,Y_scale.shape)

In [None]:
# Turn the scaled pandas objects into NumPy arrays.
# The target goes through a one-column frame so that Y is 2-D, shape (n, 1).
X = X_scale.to_numpy()
Y = Y_scale.to_frame().to_numpy()

In [None]:
# Display the feature array
X

In [None]:
# Display the target array
Y

In [None]:
# Split the data for training and testing: 20% of the rows are held out.
# random_state is fixed so that the same rows land in each split on every run;
# without it the split (and every downstream number) changes per execution.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

## Building the Neural Network Model

In [None]:
# Create a neural network model: three hidden Dense layers followed by a
# single linear output unit, because the target is a continuous value.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),  # kept from the original architecture
    tf.keras.layers.Dense(35, activation="relu"),
    tf.keras.layers.Dense(30, activation="sigmoid"),
    tf.keras.layers.Dense(25, activation="sigmoid"),
    tf.keras.layers.Dense(1, activation="linear"),
])

# Compile the model.
# `metrics` is passed as a list, and accuracy is replaced by mean absolute
# error: accuracy counts exact class matches and carries no information for a
# continuous target trained with a mean-squared-error loss.
model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mean_absolute_error"])

In [None]:
# Fit the network on the training split (450 epochs, mini-batches of 32)
model.fit(x=X_train, y=Y_train, batch_size=32, epochs=450)

## Predicting the test data

In [None]:
# Run the trained model on the held-out features and display its predictions
Y_hat = model.predict(x=X_test)
Y_hat

In [None]:
# Display the held-out target values for comparison with Y_hat
Y_test

Problems to be addressed:
*   Analyze the performance of the model.
*   Prediction of countries that will run out of water by 2050. (This can be verified by checking whether the water stress will reach 100% or 1.0.)
*   No justification behind choosing this particular number of hidden layers and number of neurons.
*   Further use this model to predict values of water stress in the app.











# Predict the x variables for the future

In [None]:
# Download the raw CSV again into a separate frame used for extrapolation
data_ori = pd.read_csv(
    'https://www.dropbox.com/s/j07u99pg44kbfvm/final_data.csv?dl=1'
)
data_ori

In [None]:
# Pivot to one row per (Year, Area) with one column per variable, and turn
# Year/Area back into ordinary columns.
data_ori = (
    data_ori.groupby(['Year', 'Variable Name', 'Area'])['Value']
    .mean()
    .unstack('Variable Name')
    .reset_index()
)
print("Dimensions of the original dataset: ", data_ori.shape)
# Discard rows lacking the target or any of the four listed predictors
data_ori = data_ori.dropna(
    subset=[
        "Population density",
        "GDP per capita",
        "SDG 6.4.2. Water Stress",
        "Agricultural water withdrawal as % of total water withdrawal",
        "Municipal water withdrawal as % of total withdrawal",
    ]
)
# Remove the columns that are not used further on
data_ori = data_ori.drop(
    columns=[
        "Total population",
        "Capacity of the municipal wastewater treatment facilities",
        "Collected municipal wastewater",
        "Not treated municipal wastewater",
        "Industrial water withdrawal as % of total water withdrawal",
    ]
)
print("Dimensions of the new dataset: ", data_ori.shape)
data_ori


In [None]:
def extrapolate(country, variable, year=2022, data=None):
  """Linearly extrapolate one variable of one country to a target year.

  A straight line (degree-1 least-squares fit) is fitted to the
  (Year, variable) observations of `country` and evaluated at `year`.

  Args:
    country: value to match in the 'Area' column.
    variable: name of the column to extrapolate.
    year: year at which the fitted line is evaluated (default 2022).
    data: frame with 'Area', 'Year' and `variable` columns; when None the
      module-level `data_ori` frame is used.

  Returns:
    The extrapolated value as a float. If the country has observations for
    only one distinct year, the mean of those observations is returned
    (a line cannot be fitted); if it has none, NaN is returned.
  """
  frame = data_ori if data is None else data
  # Filter once, and drop rows where the year or the value is missing so
  # that NaN never reaches the least-squares solver.
  rows = frame.loc[frame['Area'] == country, ['Year', variable]].dropna()
  if rows.empty:
    return float('nan')
  years = rows['Year'].to_numpy(dtype=float)
  values = rows[variable].to_numpy(dtype=float)
  # A degree-1 fit needs at least two distinct x values; otherwise the
  # problem is rank deficient and the fitted slope is meaningless.
  if np.unique(years).size < 2:
    return float(values.mean())
  poly = np.polyfit(years, values, deg=1)
  return np.polyval(poly, year)

In [None]:
# Empty frame with the same columns as data_ori, to be filled with the
# extrapolated rows. Taking an explicit zero-row copy avoids relying on the
# literal 'country' matching no Area, and avoids writing into a slice of
# data_ori later on.
newdata = data_ori.iloc[0:0].copy()

In [None]:
# Build one extrapolated row per country for the year 2022.
# The rows are collected as plain dicts and turned into a frame once, instead
# of growing `newdata` row by row with .loc (which writes into a filtered
# slice of data_ori, upcasts dtypes and reallocates on every insert).
variables = [
    'Agricultural water withdrawal as % of total water withdrawal',
    'GDP per capita',
    'Long-term average annual precipitation in volume',
    'Municipal water withdrawal as % of total withdrawal',
    'Population density',
    'SDG 6.4.2. Water Stress',
    'Total renewable water resources per capita',
    'Total water withdrawal per capita',
]
records = []
for area in np.unique(np.array(data_ori['Area'])):
  # 2022 is the year extrapolate() evaluates its fitted line at.
  record = {'Year': 2022, 'Area': area}
  record.update({variable: extrapolate(area, variable) for variable in variables})
  records.append(record)
# Same column layout as data_ori; rows are indexed 0..n-1 in sorted Area order.
newdata = pd.DataFrame(records, columns=data_ori.columns)

In [None]:
# Display the extrapolated rows
newdata

In [None]:
# Write the extrapolated rows to disk as UTF-8 with a byte-order mark
newdata.to_csv(path_or_buf='validation_data.csv', encoding='utf-8-sig')