# Produce `usage.json` (per-supplier water usage data) for the app's use

Authors: everett@bayes.org, mehdi@bayes.org

In [1]:
from collections import defaultdict
import datetime as datetime
import json
import random

import numpy as np
import pandas as pd

%matplotlib inline

DATA_DIR = "../data/"

### Load a reduced version of the water conservation dataset

#### The reduced file was edited by hand by me (Everett) from the full dataset (which is in the same folder); ideally this step would be scripted (e.g. via a Makefile and code) so that it is reproducible.

In [2]:
# Read the reduced supplier report and normalise its header names.
csv_path = DATA_DIR + "uw_supplier_data110116_edited.csv"
df = pd.read_csv(csv_path, encoding="latin-1")

# Strip leading/trailing whitespace from every column name so that later
# lookups by exact name succeed.
stripped_names = {col: col.strip() for col in df.columns}
df = df.rename(columns=stripped_names)

# Show the available columns as a bulleted list.
column_listing = "* " + "\n* ".join(df.columns)
print(column_listing)

* Supplier Name
* Mandatory Restrictions
* Reporting Month
* Total Population Served
* Conservation Standard (starting in June 2015) *Adjusted in March 2016 **Revised in June 2016
* REPORTED Total Monthly Potable Water Production Reporting Month
* REPORTED Monthly CII
* REPORTED Monthly Ag Use Reporting Month
* CALCULATED Total Monthly Potable Water Production Reporting Month Gallons
* CALCULATED Total Monthly Potable Water Production 2013 Gallons
* CALCULATED Monthly CII Reporting Month
* CALCULATED R-GPCD Reporting Month
* % Residential Use
* Hydrologic Region


In [3]:
def numberize(i):
    """Convert a string like '100,000' into a float, like 100000.0

    Since this data has a lot of variants for zero (missing data,
    null, strings like 'n/a'), we regard any failed float conversion
    as an indicator that the value is 0. This seems to be correct
    based on examining the data.

    NaN is treated as a failed conversion as well: a missing cell shows
    up as NaN, and ``float('nan')`` succeeds, so without the explicit
    check missing data would come back as NaN instead of 0.

    Args:
        i: Any value; it is stringified and thousands separators (',')
            are removed before conversion.

    Returns:
        The parsed value as a float, or 0.0 when the value cannot be
        parsed or is NaN.
    """
    try:
        value = float(str(i).replace(',', ''))
    except (TypeError, ValueError):
        # Only conversion failures mean "zero"; anything else (e.g. a
        # KeyboardInterrupt) must still propagate.
        return 0.0
    return 0.0 if np.isnan(value) else value

In [4]:
# Map the verbose report headers we care about onto short snake_case names.
# Any column that is not a key of this mapping is discarded below.
# The raw "REPORTED ..." volume columns and the calculated R-GPCD column are
# deliberately left out (their would-be short names are kept for reference):
#     'REPORTED Total Monthly Potable Water Production Reporting Month': 'reported_total_gal',
#     'REPORTED Monthly CII': 'reported_cii_gal',
#     'REPORTED Monthly Ag Use Reporting Month': 'reported_ag_gal',
#     "CALCULATED R-GPCD Reporting Month": "r_gpcd",
col_rename_dict = {
    'Supplier Name': 'supplier_name',
    "CALCULATED Total Monthly Potable Water Production Reporting Month Gallons": "total_potable_gal",
    "CALCULATED Monthly CII Reporting Month": "cii_gal",
    'Reporting Month': 'reporting_month',
    'Total Population Served': 'total_population_served',
    'REPORTED Residential Gallons-per-Capita-Day (R-GPCD) (starting in September 2014)': 'r_gpc',
    '% Residential Use': 'percent_residential_use',
    'Hydrologic Region': 'hydrologic_region',
    'Conservation Standard (starting in June 2015) *Adjusted in March 2016 **Revised in June 2016': 'conservation_standard'
}

# Drop everything that has no short name, then apply the renaming.
unmapped_columns = [col for col in df.columns if col not in col_rename_dict]
df = df.drop(unmapped_columns, axis=1).rename(columns=col_rename_dict)

In [5]:
# Derive calendar fields and the per-category water volumes for every report row.

df['reporting_month'] = pd.to_datetime(df['reporting_month'])
df['month'] = df['reporting_month'].map(lambda stamp: stamp.month)
df['year'] = df['reporting_month'].map(lambda stamp: stamp.year)
# Reports dated after September are counted toward the following water year.
rolls_into_next_year = (df['month'] > 9).astype(int)
df['water_year'] = df['year'] + rolls_into_next_year

# Note on the source columns: the REPORTED total potable figure includes
# agricultural water and comes in a variety of units, whereas the CALCULATED
# total excludes agricultural water and is already expressed in gallons.
# Only the CALCULATED figure is used here.

df['residential_gal'] = df['total_potable_gal'] * (df['percent_residential_use'] / 100.0)
# cii_gal must be cleaned to floats before it is subtracted below.
df['cii_gal'] = df['cii_gal'].map(numberize)
df['other_gal'] = df['total_potable_gal'] * (1 - df['percent_residential_use'] / 100.0) - df['cii_gal']

# Turn percentage strings such as '16%' into fractions (0.16); missing -> 0.0.
df['conservation_standard'] = df['conservation_standard'].apply(
    lambda pct: float(pct.strip('%')) / 100.0 if not pd.isnull(pct) else 0.0)

df.head()

Unnamed: 0,supplier_name,reporting_month,total_population_served,conservation_standard,total_potable_gal,cii_gal,percent_residential_use,hydrologic_region,month,year,water_year,residential_gal,other_gal
0,East Bay Municipal Utilities District,2016-09-15,1400000,0.0,5636200000,912000000,61,San Francisco Bay,9,2016,2016,3438082000,1286118000
1,East Bay Municipal Utilities District,2016-08-15,1400000,0.0,6007500000,1141000000,61,San Francisco Bay,8,2016,2016,3664575000,1201925000
2,East Bay Municipal Utilities District,2016-07-15,1400000,0.0,6056600000,994000000,60,San Francisco Bay,7,2016,2016,3633960000,1428640000
3,East Bay Municipal Utilities District,2016-06-15,1400000,0.0,5675900000,839000000,61,San Francisco Bay,6,2016,2016,3462299000,1374601000
4,East Bay Municipal Utilities District,2016-05-15,1400000,0.16,4959300000,955000000,60,San Francisco Bay,5,2016,2016,2975580000,1028720000


In [6]:
def computeUsage(df, rng=None):
    '''Given a dataframe of rows for a single provider, create usage dict.

    Args:
        df: Rows for one supplier. Each row must provide 'reporting_month'
            (a datetime-like value), 'total_population_served' and the
            gallon columns 'total_potable_gal', 'residential_gal',
            'cii_gal' and 'other_gal'.
        rng: Optional source of randomness exposing `choice` and `uniform`
            (e.g. a seeded `random.Random`). Defaults to the `random`
            module, which matches the previous behaviour.

    Returns:
        A plain dict with the keys 'totalPerCapita', 'residentialPerCapita',
        'commercialIndustrialPerCapita' and 'otherPotablePerCapita'. Each
        value maps a 'YYYY-MM' string to that month's gallons divided by
        the population served. For an empty dataframe every category maps
        to an empty dict.
    '''
    if rng is None:
        rng = random

    # Output category -> dataframe column holding that month's gallons.
    columns_by_category = (
        ('totalPerCapita', 'total_potable_gal'),
        ('residentialPerCapita', 'residential_gal'),
        ('commercialIndustrialPerCapita', 'cii_gal'),
        ('otherPotablePerCapita', 'other_gal'),
    )
    usage = dict((category, {}) for category, _column in columns_by_category)

    for _index, row in df.iterrows():
        m = row['reporting_month'].strftime('%Y-%m')
        pop = float(row['total_population_served'])
        for category, column in columns_by_category:
            usage[category][m] = numberize(row[column]) / pop

    # For demonstration purposes, add fake data for Oct 2016.
    # Will swap with real data once it's available.
    # The placeholder is only added when there is no real value for that
    # month (so real data is never clobbered) and when there is at least
    # one month to derive it from.
    fake_month = '2016-10'
    total = usage['totalPerCapita']
    if total and fake_month not in total:
        # Prefer the same month of the previous year; otherwise fall back
        # to a randomly chosen month that we do have.
        reference_month = '2015-10'
        if reference_month not in total:
            reference_month = rng.choice(list(total.keys()))
        # One shared perturbation of up to +/-10% keeps the categories
        # consistent with each other.
        factor = 1 + rng.uniform(-0.1, 0.1)
        for per_capita in usage.values():
            per_capita[fake_month] = per_capita[reference_month] * factor

    return usage

def computePredictions(usage):
    '''Compute monthly usage predictions for the upcoming 12 months.

    Args:
        usage: Dict mapping 'YYYY-MM' strings to a numeric usage value.

    Returns:
        Dict mapping each two-digit month ('01' .. '12') to the predicted
        value, or to None when that calendar month never appears in
        `usage`.
    '''
    predictions = {}
    for month in range(1, 13):
        month_key = '%02d' % month
        # Sorting 'YYYY-MM' strings puts the observations in chronological order.
        matching_keys = sorted(k for k in usage if k.endswith(month_key))
        if not matching_keys:
            # We have never seen data for this month, can't predict anything
            result = None
        elif len(matching_keys) == 1:
            # Only one occurrence of this month, just predict homeostasis
            result = usage[matching_keys[0]]
        else:
            # Assume the year-over-year growth for this month is static.
            latest = usage[matching_keys[-1]]
            previous = usage[matching_keys[-2]]
            if previous == 0:
                # A zero baseline (e.g. missing data recorded as 0) makes the
                # growth ratio undefined; fall back to homeostasis rather
                # than raising ZeroDivisionError.
                result = latest
            else:
                growth = latest / previous
                result = latest * growth
        predictions[month_key] = result
    return predictions

In [7]:
# The usage data refers to suppliers by name only, while the other dataset
# keys them by ID. Build a lookup from lower-cased supplier name to ID so
# the two can be joined.

providers = pd.read_csv(DATA_DIR + 'provider_ids.tsv', sep='\t')
provider_id_lookup = {
    provider_name.lower(): provider_id
    for provider_name, provider_id in zip(providers['Provider'], providers['ID'])
}

In [8]:
# Assemble the per-supplier payload, keyed by supplier ID.
js = {}
supplier_groups = df.groupby("supplier_name").groups
for i, (name, indices) in enumerate(supplier_groups.items()):
    print(i, name)
    # The ID lookup is keyed by lower-cased name; the lower-cased form is
    # also what ends up in "agencyName" below.
    name = name.lower()
    if name not in provider_id_lookup:
        print("Can't find supplier ID for '%s', skipping..." % name)
        continue

    # Index by report date so a specific month can be looked up directly.
    supplier_df = df.loc[indices].set_index('reporting_month', drop=False)
    usage = computeUsage(supplier_df)
    # The total is reported separately from the per-category breakdown.
    total = usage.pop('totalPerCapita')
    preds = computePredictions(total)

    # A target can only be computed from a full water year (12 monthly reports).
    supplier_2016 = supplier_df[supplier_df['water_year'] == 2016]
    if len(supplier_2016) < 12:
        print("Don't have complete data for water year 2016 for '%s', skipping..." % name)
        continue

    # Conservation standard taken from the report dated 2016-09-15.
    reduction = supplier_df.loc[datetime.datetime(2016, 9, 15), 'conservation_standard']
    per_capita_2016 = supplier_2016['total_potable_gal'] / supplier_2016['total_population_served']
    used_2016 = per_capita_2016.sum()
    target = (1 - reduction) * used_2016

    supplier_id = provider_id_lookup[name]
    js[supplier_id] = {
        "agencyName": name,
        "totalUsage": total,
        "usageByCategory": usage,
        "monthlyPrediction": preds,
        "annualTarget": target,
    }

0 Suisun-Solano Water Authority
1 Crestline Village Water District
2 Beaumont-Cherry Valley Water District
3 Paradise Irrigation District
4 Kingsburg, City of
5 Fountain Valley  City of
6 Redlands  City of
7 Pleasanton  City of
8 Galt  City of
9 Delano  City of
10 California Water Service Company Oroville
11 Calexico  City of
Don't have complete data for water year 2016 for 'calexico  city of', skipping...
12 Redding  City of
13 Santa Maria  City of
14 Monrovia  City of
Don't have complete data for water year 2016 for 'monrovia  city of', skipping...
15 Ukiah  City of
Don't have complete data for water year 2016 for 'ukiah  city of', skipping...
16 Hesperia Water District City of
17 Red Bluff  City of
18 La Habra  City of Public Works
19 Sacramento Suburban Water District
20 Citrus Heights Water District
21 Castaic Lake Water Agency Santa Clarita Water Division
22 San Juan Capistrano  City of
23 Monte Vista Water District
24 Livermore  City of Division of Water Resources
25 Mission Spr

In [9]:
# Spot-check the generated entry for one well-known supplier.
sample_agency = 'East Bay Municipal Utilities District'.lower()
js[provider_id_lookup[sample_agency]]

{'agencyName': 'east bay municipal utilities district',
 'annualTarget': 39940.788195133602,
 'monthlyPrediction': {'01': 2417.0425868076827,
  '02': 2307.3946062486384,
  '03': 2162.496771084676,
  '04': 2569.9142125418007,
  '05': 3744.794391501367,
  '06': 4781.496866924954,
  '07': 5052.842907325889,
  '08': 4874.674539013777,
  '09': 4451.649133761039,
  '10': 3126.39045587951,
  '11': 2555.6796736092665,
  '12': 2564.2099279337895},
 'totalUsage': {'2014-06': 4786.015037593985,
  '2014-07': 4954.962406015037,
  '2014-08': 4511.965192168238,
  '2014-09': 4121.02973168963,
  '2014-10': 3894.5612762871647,
  '2014-11': 3042.3495286439447,
  '2014-12': 2783.901377810007,
  '2015-01': 2946.906474820144,
  '2015-02': 2709.2805755395684,
  '2015-03': 3345.4676258992804,
  '2015-04': 3206.4028776978416,
  '2015-05': 3350.863309352518,
  '2015-06': 3437.5539568345325,
  '2015-07': 3703.956834532374,
  '2015-08': 3777.338129496403,
  '2015-09': 3640.7913669064747,
  '2015-10': 3511.5107913

In [10]:
# Persist the assembled per-supplier data as pretty-printed JSON.
output_path = DATA_DIR + 'usage.json'
with open(output_path, 'w') as out_file:
    serialized = json.dumps(js, indent=4)
    out_file.write(serialized)