In [1]:
import pandas as pd
import numpy as np
import sqlite3
from itertools import product
# Raise pandas' 'display.max_columns' option to 500 for DataFrame output.
pd.set_option('display.max_columns', 500)

import matplotlib.pyplot as plt

# Machine-specific location of the SQLite database file.
DB_PATH = r'C:\Users\Spencer\Environments\aquastaat\project.db'


def _decode_latin1(raw):
    """Decode a raw bytes value handed over by sqlite3 as Latin-1 text."""
    return str(raw, 'latin1')


conn = sqlite3.connect(DB_PATH)
# Text values coming out of the database are decoded as Latin-1.
conn.text_factory = _decode_latin1

# Load every row and column of the `parameter` table into a DataFrame.
data = pd.read_sql("SELECT * FROM parameter", conn)

In [2]:
def remove_blanks(data):
    """Drop incomplete rows from ``data`` in place and report the shape change.

    Empty-string entries in the ``'Value'`` column are first replaced with
    ``None`` so that they count as missing; every row that contains a missing
    value in any column is then removed. The frame's shape is printed before
    and after the clean-up. Nothing is returned.
    """
    print('Before: ', data.shape)

    is_blank = data['Value'] == ''
    data.loc[is_blank, 'Value'] = None

    # how='any': a single missing cell is enough to discard the whole row.
    data.dropna(axis=0, how='any', inplace=True)

    print('After: ', data.shape)
    
def label_bin(year, year_bins, bin_labels):
    """Return the label of the bin that ``year`` falls into.

    ``year_bins`` holds the bin edges and ``bin_labels`` the label of each
    bin: ``bin_labels[i]`` is returned when exactly ``i + 1`` edges lie
    strictly below ``year``. For ascending edges this means
    ``year_bins[i] < year <= year_bins[i + 1]``.

    Returns ``None`` when ``year`` lies outside the labelled range, i.e. when
    no edge is below it or when more edges are below it than there are labels.
    """
    # Number of edges strictly below `year`, minus one, is the bin position.
    position = sum(1 for edge in year_bins if edge < year) - 1
    # Reject out-of-range positions explicitly: a position of -1 must not wrap
    # around to the last label, and a position past the end must not raise.
    if 0 <= position < len(bin_labels):
        return bin_labels[position]
    return None
    
def bin_year(data):
    """Add a ``'YearBin'`` column to ``data`` in place.

    Every value of the ``'Year'`` column is passed to ``label_bin`` together
    with the edges 1978, 1983, ..., 2018 and the labels 1980, 1985, ..., 2015;
    the results are stored in ``data['YearBin']``. Nothing is returned.
    """
    edges = list(range(1978, 2020, 5))
    labels = list(range(1980, 2020, 5))
    # `args` supplies the two extra positional arguments after each year.
    data['YearBin'] = data['Year'].apply(label_bin, args=(edges, labels))

In [3]:
# Drop rows with blank/missing values; mutates `data` in place and prints the
# shape before and after.
remove_blanks(data)
# Add the 'YearBin' column derived from 'Year' (also in place).
bin_year(data)
# Convert 'Value' to float after the blank entries have been removed.
data['Value'] = data['Value'].astype(float)

Before:  (122853, 6)
After:  (122740, 6)


In [4]:
# Remove the two identifier columns and the raw year from `data` in place.
data.drop(columns=['VariableId', 'AreaId', 'Year'], inplace=True)

In [5]:
# Full slice (equivalent to `:`), usable as a "select everything" placeholder.
# NOTE(review): not referenced in the cells visible here — confirm it is still needed.
All = slice(None)
# Variable names used as column labels when the training set is assembled.
WSI = 'SDG 6.4.2. Water Stress'
TFW = 'Total freshwater withdrawal (primary and secondary)'

In [6]:
# Reshape to wide form: one row per (Area, YearBin), one column per
# VariableName, each cell holding the mean of the 'Value' entries in that group.
# The aggregation is named by string ('mean') instead of passing np.mean: the
# result is the same, and recent pandas releases emit a FutureWarning when a
# NumPy reducer is supplied as the callable.
data = data.pivot_table(index=['Area', 'YearBin'], columns='VariableName', values='Value', aggfunc='mean')

In [7]:
# Keep only the rows that have a water-stress value.
wsi_null = data['SDG 6.4.2. Water Stress'].isnull()
data = data.loc[~wsi_null, :]

# Round-trip the index through ordinary columns so that the (Area, YearBin)
# MultiIndex is rebuilt from the rows that remain.
data.reset_index(inplace=True)
data.set_index(['Area', 'YearBin'], inplace=True)

# Count the missing values of every column once and tabulate them.
null_counts = data.isnull().sum()
null_cols = pd.DataFrame({'column': null_counts.index, 'nulls': null_counts.values})

# A column is usable when it is missing in fewer than 10% of the remaining rows.
max_nulls = data.shape[0] * .1
usable_cols = null_cols.loc[null_cols['nulls'] < max_nulls, 'column'].values
data = data.loc[:, data.columns.isin(usable_cols)]

In [8]:
# Name fragments of variables to exclude: in the following step a column is
# dropped when any of these strings occurs inside its name (substring match).
# NOTE(review): presumably these are quantities that do not vary between
# year bins — confirm.
static_cols = [
    'Dependency ratio',
    'Groundwater',
    'Long-term',
    'Overlap',
    'Surface',
    'Total area of the country (excl. coastal waters)',
    'Total internal renewable water resources (IRWR)',
    'Total renewable groundwater',
    'Total renewable water resources',
    'Water resources: total external renewable',
]

In [9]:
cols = data.columns
# Keep the columns whose name contains none of the `static_cols` fragments.
dyn_cols = [
    name for name in cols
    if not any(fragment in name for fragment in static_cols)
]

data = data.loc[:, dyn_cols]

In [10]:
areas = data.index.levels[0]
bin_labels = list(range(1980, 2020, 5))

# Build the complete Area x YearBin grid, then left-join the observed values
# onto it so that combinations without data appear as rows of missing values.
grid_rows = list(product(areas, bin_labels))
data_viz = (
    pd.DataFrame(grid_rows, columns=['Area', 'YearBin'])
    .set_index(['Area', 'YearBin'])
    .join(data, how='left')
)

In [11]:
# Columns selected, together with WSI and TFW, when the training set is built.
# NOTE(review): the name suggests these are the predictors for total
# freshwater withdrawal (TFW) — confirm.
tfw_predictors = [
    'Total population',
    'Cultivated area (arable land + permanent crops)',
    'Gross Domestic Product (GDP)',
    'Total renewable surface water',
]

In [21]:
# Restrict the grid to the predictor columns plus the WSI and TFW columns.
training_data = data_viz.loc[:, tfw_predictors + [WSI, TFW]]
# Fill gaps separately within each area by interpolation;
# limit_direction='both' also fills gaps at the start and end of a series.
# group_keys=False keeps the original (Area, YearBin) index: since pandas 2.0
# the default would prepend the group key as an additional 'Area' level even
# though the applied function returns a like-indexed frame.
training_data = training_data.groupby('Area', group_keys=False).apply(
    lambda area_rows: area_rows.interpolate(limit_direction='both')
)

In [26]:
# Write the training set to 'training_data.csv' (relative path) using
# to_csv's default settings.
training_data.to_csv('training_data.csv')