# Produce `usage.json` (the water usage data file) for the app's use

Authors: everett@bayes.org, mehdi@bayes.org

In [1]:
from collections import defaultdict
import datetime as datetime
import json
import random

import numpy as np
import pandas as pd

%matplotlib inline

# Relative path of the directory that holds the input CSV/TSV files and
# receives the generated JSON file.
DATA_DIR = "../data/"

### Load a reduced version of the water conservation dataset

#### This was manually edited by me (Everett) from the full dataset (also in the same folder), though in an ideal world I'd do it via a Makefile and code.

In [2]:
# Read the hand-edited supplier report and strip surrounding whitespace
# from every column label before listing the labels.
raw_path = DATA_DIR + 'uw_supplier_data1201616_edited.csv'
df = pd.read_csv(raw_path, encoding="latin-1")
df = df.rename(columns=lambda label: label.strip())
print("* " + "\n* ".join(df.columns))
df.head()

* Supplier Name
* Mandatory Restrictions
* Reporting Month
* Total Population Served
* REPORTED Total Monthly Potable Water Production Reporting Month
* REPORTED Total Monthly Potable Water Production 2013
* REPORTED Monthly CII
* REPORTED Monthly Ag Use Reporting Month
* Conservation Standard (starting in June 2015) *Adjusted in March 2016 **Revised in June 2016
* CALCULATED Total Monthly Potable Water Production Reporting Month Gallons
* CALCULATED Total Monthly Potable Water Production 2013 Gallons
* CALCULATED Monthly CII Reporting Month
* % Residential Use
* Hydrologic Region


Unnamed: 0,Supplier Name,Mandatory Restrictions,Reporting Month,Total Population Served,REPORTED Total Monthly Potable Water Production Reporting Month,REPORTED Total Monthly Potable Water Production 2013,REPORTED Monthly CII,REPORTED Monthly Ag Use Reporting Month,Conservation Standard (starting in June 2015) *Adjusted in March 2016 **Revised in June 2016,CALCULATED Total Monthly Potable Water Production Reporting Month Gallons,CALCULATED Total Monthly Potable Water Production 2013 Gallons,CALCULATED Monthly CII Reporting Month,% Residential Use,Hydrologic Region
0,East Bay Municipal Utilities District,No,10/15/16,1400000,4689.9,6175.0,956,,0%,4689900000,6175000000,956000000,60,San Francisco Bay
1,East Bay Municipal Utilities District,No,9/15/16,1400000,5636.2,6528.4,912,,0%,5636200000,6528400000,912000000,61,San Francisco Bay
2,East Bay Municipal Utilities District,No,8/15/16,1400000,6007.5,7172.3,1141,,0%,6007500000,7172300000,1141000000,61,San Francisco Bay
3,East Bay Municipal Utilities District,No,7/15/16,1400000,6056.6,7452.2,994,,0%,6056600000,7452200000,994000000,60,San Francisco Bay
4,East Bay Municipal Utilities District,Yes,6/15/16,1400000,5675.9,6927.5,839,,0%,5675900000,6927500000,839000000,61,San Francisco Bay


In [3]:
def numberize(i):
    """Convert a string like '100,000' into a float, like 100000.0

    Commas are removed before parsing, so thousands separators are
    accepted. Since this data has a lot of variants for zero (null,
    strings like 'n/a'), we regard any failed float conversion as an
    indicator that the value is 0. This seems to be correct based on
    examining the data.

    Note that a real float NaN is *not* a failed conversion: its string
    form 'nan' parses successfully, so NaN is returned unchanged.

    Only conversion errors are treated as zero; anything else (for
    example a KeyboardInterrupt while the notebook is running) is
    allowed to propagate instead of being silently turned into 0.0.
    """
    try:
        return float(str(i).replace(',', ''))
    except (TypeError, ValueError):
        return 0.0

In [4]:
# Rename wordy columns to more palatable strings
col_rename_dict = {
    'Supplier Name': 'supplier_name',
    "CALCULATED Total Monthly Potable Water Production Reporting Month Gallons": "total_potable_gal",
    "CALCULATED Total Monthly Potable Water Production 2013 Gallons": "total_potable_gal_2013",
    "CALCULATED Monthly CII Reporting Month": "cii_gal",
    'Reporting Month': 'reporting_month',
    'Total Population Served': 'total_population_served',
    '% Residential Use': 'percent_residential_use',
    'Hydrologic Region': 'hydrologic_region',
    'Conservation Standard (starting in June 2015) *Adjusted in March 2016 **Revised in June 2016': 'conservation_standard'
}

# Drop every column we do not use, then apply the short names.
# Both the original and the already-renamed labels count as "wanted", so
# running this cell a second time is harmless. Previously a re-run deleted
# every column, because the short names are not keys of col_rename_dict.
wanted_columns = set(col_rename_dict) | set(col_rename_dict.values())
unused_columns = [c for c in df.columns if c not in wanted_columns]
df = df.drop(columns=unused_columns).rename(columns=col_rename_dict)

In [5]:
# Convert numerical columns from their comma-delimited and other funky formats
gallon_columns = [name for name in df.columns if '_gal' in name]
numerical_columns = gallon_columns + ['percent_residential_use', 'total_population_served']
for column in numerical_columns:
    df[column] = df[column].apply(numberize)

In [6]:
# Compute a bunch of useful columns. Water usage breakdowns, etc.

# Calendar parts of the reporting date. The water year is the calendar year,
# bumped by one for months after September (October through December).
report_dates = pd.to_datetime(df['reporting_month'])
df['reporting_month'] = report_dates
df['month'] = report_dates.apply(lambda stamp: stamp.month)
df['year'] = report_dates.apply(lambda stamp: stamp.year)
after_september = df['month'] > 9
df['water_year'] = df['year'] + after_september.astype(int)

# Weirdly, the "total potable water REPORTED" number includes agricultural
# water, whereas the "total potable gallons CALCULATED" does not. The former
# is also reported in a range of units, while the latter converts to gallons.

# Split the calculated total into residential use and "everything else
# that is not commercial/industrial/institutional (CII)".
residential_fraction = df['percent_residential_use'] / 100.0
df['residential_gal'] = df['total_potable_gal'] * residential_fraction
df['other_gal'] = df['total_potable_gal'] * (1 - residential_fraction) - df['cii_gal']

# Percent strings such as '8%' become fractions (0.08); missing values become 0.
df['conservation_standard'] = df['conservation_standard'].apply(
    lambda raw: float(raw.strip('%')) / 100.0 if not pd.isnull(raw) else 0.0)

df.head()

Unnamed: 0,supplier_name,reporting_month,total_population_served,conservation_standard,total_potable_gal,total_potable_gal_2013,cii_gal,percent_residential_use,hydrologic_region,month,year,water_year,residential_gal,other_gal
0,East Bay Municipal Utilities District,2016-10-15,1400000,0,4689900000,6175000000,956000000,60,San Francisco Bay,10,2016,2017,2813940000,919960000
1,East Bay Municipal Utilities District,2016-09-15,1400000,0,5636200000,6528400000,912000000,61,San Francisco Bay,9,2016,2016,3438082000,1286118000
2,East Bay Municipal Utilities District,2016-08-15,1400000,0,6007500000,7172300000,1141000000,61,San Francisco Bay,8,2016,2016,3664575000,1201925000
3,East Bay Municipal Utilities District,2016-07-15,1400000,0,6056600000,7452200000,994000000,60,San Francisco Bay,7,2016,2016,3633960000,1428640000
4,East Bay Municipal Utilities District,2016-06-15,1400000,0,5675900000,6927500000,839000000,61,San Francisco Bay,6,2016,2016,3462299000,1374601000


In [7]:
def computeUsage(df):
    '''Given a dataframe for a single provider, create usage dict.

    The result always has the keys 'totalPerCapita', 'residentialPerCapita',
    'commercialIndustrialPerCapita' and 'otherPotablePerCapita'. Each maps a
    'YYYY-MM' reporting month to that month's gallons divided by the
    population served.

    Rows whose population is not a positive number (0, negative or NaN) are
    skipped: a per-capita figure is undefined for them, and dividing by a
    zero population would raise ZeroDivisionError and abort the whole export.
    '''
    # Output category -> source column holding that category's gallons.
    gallon_columns = [
        ('totalPerCapita', 'total_potable_gal'),
        ('residentialPerCapita', 'residential_gal'),
        ('commercialIndustrialPerCapita', 'cii_gal'),
        ('otherPotablePerCapita', 'other_gal'),
    ]
    # Pre-populate every category so callers can rely on the keys existing
    # even when no row yields a usable value.
    usage = dict((category, {}) for category, _ in gallon_columns)

    for _, row in df.iterrows():
        pop = float(row['total_population_served'])
        if not pop > 0:  # also catches NaN, for which every comparison is False
            continue
        m = row['reporting_month'].strftime('%Y-%m')
        for category, column in gallon_columns:
            usage[category][m] = numberize(row[column]) / pop

    return usage

def computePredictions(usage):
    '''Compute monthly usage predictions for the upcoming 12 months.

    `usage` maps 'YYYY-MM' strings to a usage number. The result maps each
    zero-padded month ('01' .. '12') to a prediction:

    * None if that month never appears in `usage`;
    * the single observed value if the month appears exactly once;
    * otherwise the latest value scaled by the growth between the two most
      recent occurrences of that month. If the earlier of those two values
      is 0 the growth ratio is undefined, so the latest value is used as-is
      instead of raising ZeroDivisionError.
    '''
    predictions = {}
    for month in range(1, 13):
        suffix = '%02d' % month
        # Sorted 'YYYY-MM' strings are in chronological order.
        matching_keys = sorted(k for k in usage if k.endswith(suffix))
        if not matching_keys:
            # We have never seen data for this month, can't predict anything
            result = None
        elif len(matching_keys) == 1:
            # Only one occurrence of this month, just predict homeostasis
            result = usage[matching_keys[0]]
        else:
            latest = usage[matching_keys[-1]]
            previous = usage[matching_keys[-2]]
            if previous == 0:
                # No meaningful growth ratio from a zero baseline.
                result = latest
            else:
                # Assume the year-over-year growth for this month is static.
                result = latest * (latest / previous)
        predictions[suffix] = result
    return predictions

def computeTargetUsage(df, year):
    '''Given a dataframe for a single provider, calculate its usage target for the year.

    `df` must be indexed by reporting date. Returns None (after printing a
    notice) when fewer than 12 rows belong to water year `year`. Otherwise
    returns the sum over that water year of the 2013 per-capita production,
    reduced by the conservation standard found on the row dated
    October 15 of the preceding calendar year.
    '''
    in_water_year = df['water_year'] == year
    df_year = df[in_water_year]
    if len(df_year) < 12:
        print("Don't have complete data for water year %d, skipping..." % (year))
        return None

    # The water year opens in October of the previous calendar year.
    first_report_date = datetime.datetime(year - 1, 10, 15)
    reduction = df.loc[first_report_date, 'conservation_standard']

    baseline_per_capita = df_year['total_potable_gal_2013'] / df_year['total_population_served']
    used_year = baseline_per_capita.sum()
    return (1 - reduction) * used_year

In [8]:
# One dataset identifies water suppliers by ID, the other merely by name. We need to pair them.

providers = pd.read_csv(DATA_DIR + 'provider_ids.tsv', sep='\t')
# Lower-cased provider name -> provider ID. If a name occurs more than once,
# the last row wins.
provider_id_lookup = {
    provider.lower(): provider_id
    for provider, provider_id in zip(providers['Provider'], providers['ID'])
}

In [9]:
# Assemble one JSON record per supplier, keyed by supplier ID.
js = {}
supplier_groups = df.groupby("supplier_name").groups
for i, (name, indices) in enumerate(supplier_groups.items()):
    print(i, name)
    name = name.lower()
    if name not in provider_id_lookup:
        print("Can't find supplier ID for '%s', skipping..." % name)
        continue

    # Index the supplier's rows by reporting date (keeping the column too),
    # because computeTargetUsage looks a row up by date.
    supplier_df = df.loc[indices].set_index('reporting_month', drop=False)

    # Separate the total series from the per-category breakdown.
    usage = computeUsage(supplier_df)
    total = usage.pop('totalPerCapita')
    preds = computePredictions(total)

    target_2016 = computeTargetUsage(supplier_df, 2016)
    target_2015 = computeTargetUsage(supplier_df, 2015)
    # Suppliers lacking a complete water year for either target are left out.
    if any(target is None for target in (target_2016, target_2015)):
        continue

    supplier_id = provider_id_lookup[name]
    js[supplier_id] = {
        "agencyName": name,
        "totalUsage": total,
        "usageByCategory": usage,
        "monthlyPrediction": preds,
        "annualTarget": target_2016,
        "previousTarget": target_2015,
    }

0 Santa Barbara  City of
1 San Gabriel Valley Fontana Water Company
2 Del Oro Water Company
3 Lincoln Avenue Water Company
4 San Jacinto  City of
5 San Buenaventura  City of
6 California Water Service Company Willows
7 Pismo Beach  City of
8 Estero Municipal Improvement District
9 Poway  City of
10 Corona  City of
11 La Palma  City of
12 Los Angeles County Public Works Waterworks District 29
13 Trabuco Canyon Water District
14 Tulare, City of
15 Bellflower-Somerset Mutual Water Company
16 Imperial, City of
17 Lemoore  City of
18 California Water Service Company Westlake
19 Phelan Pinon Hills Community Services District
20 Dublin San Ramon Services District
21 Beaumont-Cherry Valley Water District
22 Buena Park  City of
23 Las Virgenes Municipal Water District
24 Yuba City  City of
25 Valley County Water District
26 Golden State Water Company S Arcadia
27 Burbank  City of
28 Fresno  City of
29 California Water Service Company Selma
30 Rosamond Community Service District
31 Padre Dam Mun

In [10]:
# Spot-check the generated record for a single supplier.
sample_supplier = 'East Bay Municipal Utilities District'
js[provider_id_lookup[sample_supplier.lower()]]

{'agencyName': 'east bay municipal utilities district',
 'annualTarget': 41681.397410071942,
 'monthlyPrediction': {'01': 2417.0425868076827,
  '02': 2307.3946062486384,
  '03': 2162.496771084676,
  '04': 2569.9142125418007,
  '05': 3744.794391501367,
  '06': 4781.496866924954,
  '07': 5052.842907325889,
  '08': 4874.674539013777,
  '09': 4451.649133761039,
  '10': 3195.7815596816476,
  '11': 2555.6796736092665,
  '12': 2564.2099279337895},
 'previousTarget': 49986.756329526652,
 'totalUsage': {'2014-06': 4786.015037593985,
  '2014-07': 4954.962406015037,
  '2014-08': 4511.965192168238,
  '2014-09': 4121.02973168963,
  '2014-10': 3894.5612762871647,
  '2014-11': 3042.3495286439447,
  '2014-12': 2783.901377810007,
  '2015-01': 2946.906474820144,
  '2015-02': 2709.2805755395684,
  '2015-03': 3345.4676258992804,
  '2015-04': 3206.4028776978416,
  '2015-05': 3350.863309352518,
  '2015-06': 3437.5539568345325,
  '2015-07': 3703.956834532374,
  '2015-08': 3777.338129496403,
  '2015-09': 3640

In [11]:
# Write all supplier records to disk as indented JSON.
output_path = DATA_DIR + 'usage.json'
with open(output_path, 'w') as out_file:
    serialized = json.dumps(js, indent=4)
    out_file.write(serialized)