# Produce `water_usage.json` for the app's use

Authors: everett@bayes.org, mehdi@bayes.org

In [6]:
from collections import defaultdict
import datetime as datetime
import json

import numpy as np
import pandas as pd

%matplotlib inline

DATA_DIR = "../data/"

### Load a reduced version of the water conservation dataset

#### This was manually edited by me (Everett) from the full dataset (also in the same folder), though in an ideal world I'd do it via a Makefile and code.

In [7]:
# Read the hand-edited CSV, then strip surrounding whitespace from every
# column header so later lookups by name are exact.
reduced_csv_path = DATA_DIR + "uw_supplier_data110116_edited.csv"
df = pd.read_csv(reduced_csv_path, encoding="latin-1")
df = df.rename(columns={header: header.strip() for header in df.columns})
# Show the cleaned headers as a bullet list.
bullet_separator = "\n* "
print("* " + bullet_separator.join(df.columns))

* Supplier Name
* Mandatory Restrictions
* Reporting Month
* Total Population Served
* Conservation Standard (starting in June 2015) *Adjusted in March 2016 **Revised in June 2016
* REPORTED Total Monthly Potable Water Production Reporting Month
* REPORTED Monthly CII
* REPORTED Monthly Ag Use Reporting Month
* CALCULATED Total Monthly Potable Water Production Reporting Month Gallons
* CALCULATED Total Monthly Potable Water Production 2013 Gallons
* CALCULATED Monthly CII Reporting Month
* CALCULATED R-GPCD Reporting Month
* % Residential Use
* Hydrologic Region


In [8]:
def numberize(i):
    """Convert a string like '100,000' into a float, like 100000.0

    Commas are removed from ``str(i)`` before parsing, so values that are
    already numeric are accepted as well as strings.

    Since this data has a lot of variants for zero (missing data,
    null, strings like 'n/a'), we regard any failed float conversion
    as an indicator that the value is 0. This seems to be correct
    based on examining the data.

    Note that NaN is not a failed conversion: ``float('nan')`` parses
    successfully, so a NaN input is returned as NaN and has to be
    handled by the caller.
    """
    try:
        return float(str(i).replace(',', ''))
    except ValueError:
        # Only an unparseable value means "zero". A bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return 0.0

In [9]:
# Swap the dataset's verbose headers for short snake_case names.
# Any column without an entry in this mapping is dropped.
col_rename_dict = {
    'Supplier Name': 'supplier_name',
    'REPORTED Total Monthly Potable Water Production Reporting Month': 'reported_total_gal',
    'REPORTED Monthly CII': 'reported_cii_gal',
    'REPORTED Monthly Ag Use Reporting Month': 'reported_ag_gal',
    'CALCULATED Total Monthly Potable Water Production Reporting Month Gallons': 'total_gal',
    'CALCULATED Monthly CII Reporting Month': 'cii_gal',
    'CALCULATED R-GPCD Reporting Month': 'r_gpcd',
    'Reporting Month': 'reporting_month',
    'Total Population Served': 'total_population_served',
    'REPORTED Residential Gallons-per-Capita-Day (R-GPCD) (starting in September 2014)': 'r_gpc',
    '% Residential Use': 'percent_residential_use',
    'Hydrologic Region': 'hydrologic_region',
    'Conservation Standard (starting in June 2015) *Adjusted in March 2016 **Revised in June 2016': 'conservation_standard',
}
cols = list(df.columns)
unmapped = [c for c in cols if c not in col_rename_dict]
df = df.drop(unmapped, axis=1).rename(columns=col_rename_dict)

In [10]:
# Compute a bunch of useful columns. Water usage breakdowns, etc.

# Parse the reporting date, then split out its calendar month and year.
df['reporting_month'] = pd.to_datetime(df['reporting_month'])
df["month"] = df["reporting_month"].apply(lambda x: x.month)
df["year"] = df["reporting_month"].apply(lambda x: x.year)

# numberize() lets NaN through (float('nan') parses successfully), so
# fillna zeroes whatever missing values remain.
df['reported_ag_gal'] = df['reported_ag_gal'].apply(numberize).fillna(0.0)
# Ratio of calculated gallons to reported production net of agricultural use.
# NOTE(review): presumably this converts the supplier's reporting units
# to gallons — confirm against the dataset documentation.
df['conversion_factor'] = df['total_gal'] / (df['reported_total_gal'] - df['reported_ag_gal'])
# Residential share of the calculated total (percent_residential_use is 0-100).
df['r_gal'] = df['total_gal'] * (df['percent_residential_use'] / 100.0)
# Reported agricultural use scaled by the same conversion factor.
df['ag_gal'] = df['reported_ag_gal'] * df['conversion_factor']
# Residential gallons divided by r_gpcd; computeUsage divides every
# category by this value.
# NOTE(review): if r_gpcd is gallons per capita per day this is
# population x days — confirm.
df['person_days'] = df['r_gal'] / df['r_gpcd']
# Overwrite cii_gal with its numeric form ('1,000'-style strings become floats).
df['cii_gal'] = df['cii_gal'].apply(numberize).astype(float)
# Everything in total_gal that is not residential; not referenced again in
# the visible part of this notebook.
df['cii_gal2'] = df['total_gal'] * (1 - df['percent_residential_use'] / 100.0)

# Turn percentage strings such as '16%' into fractions (0.16); missing
# values become 0.
df['conservation_standard'] = df['conservation_standard'].apply(
    lambda s: 0 if pd.isnull(s) else float(s.strip('%')) / 100.0)

df.head()

Unnamed: 0,supplier_name,reporting_month,total_population_served,conservation_standard,reported_total_gal,reported_cii_gal,reported_ag_gal,total_gal,cii_gal,r_gpcd,percent_residential_use,hydrologic_region,month,year,conversion_factor,r_gal,ag_gal,person_days,cii_gal2
0,East Bay Municipal Utilities District,2016-09-15,1400000,0.0,5636.2,912,0,5636200000,912000000,81.9,61,San Francisco Bay,9,2016,1000000,3438082000,0,41979023.199023,2198118000
1,East Bay Municipal Utilities District,2016-08-15,1400000,0.0,6007.5,1141,0,6007500000,1141000000,84.4,61,San Francisco Bay,8,2016,1000000,3664575000,0,43419135.07109,2342925000
2,East Bay Municipal Utilities District,2016-07-15,1400000,0.0,6056.6,994,0,6056600000,994000000,83.7,60,San Francisco Bay,7,2016,1000000,3633960000,0,43416487.455197,2422640000
3,East Bay Municipal Utilities District,2016-06-15,1400000,0.0,5675.9,839,0,5675900000,839000000,82.4,61,San Francisco Bay,6,2016,1000000,3462299000,0,42018191.747573,2213601000
4,East Bay Municipal Utilities District,2016-05-15,1400000,0.16,4959.3,955,0,4959300000,955000000,68.6,60,San Francisco Bay,5,2016,1000000,2975580000,0,43375801.749271,1983720000


In [11]:
def computeUsage(df):
    """Build usage per person-day, by category and month, for the rows of *df*.

    Every row contributes one value per category, stored under the row's
    'YYYY-MM' reporting month. Each value is the category's gallons column
    (run through ``numberize``) divided by the row's ``person_days``.

    Returns a ``defaultdict(dict)`` with the keys 'total', 'residential',
    'agricultural' and 'commercialIndustrial', each mapping month to value.
    Only the 'commercialIndustrial' value is replaced by 0.0 when it is
    null; the other categories are stored as computed.
    """
    # Output category -> gallons column it is derived from.
    category_columns = (
        ('total', 'total_gal'),
        ('residential', 'r_gal'),
        ('agricultural', 'ag_gal'),
        ('commercialIndustrial', 'cii_gal'),
    )
    usage = defaultdict(dict)
    for _, record in df.iterrows():
        month_key = record['reporting_month'].strftime('%Y-%m')
        person_days = record['person_days']
        for category, column in category_columns:
            usage[category][month_key] = numberize(record[column]) / person_days
        cii_by_month = usage['commercialIndustrial']
        if pd.isnull(cii_by_month[month_key]):
            cii_by_month[month_key] = 0.0
    return usage

def computeMonthShare(usage, water_year):
    """Return each month's fraction of a twelve-month usage total.

    The window runs from October of ``water_year - 1`` through September
    of ``water_year`` and is walked backwards, starting at September.

    *usage* maps 'YYYY-MM' strings to usage values; a missing month raises
    ``KeyError``.

    Returns a ``(share, total)`` tuple: ``share`` maps the integer month
    number (1-12) to that month's usage divided by ``total``, and ``total``
    is the sum over the twelve months.
    """
    # September back to January of the water year, then December back to
    # October of the year before.
    window = [(water_year, m) for m in range(9, 0, -1)]
    window += [(water_year - 1, m) for m in (12, 11, 10)]

    by_month = {}
    running_total = 0
    for yr, mo in window:
        amount = usage['%d-%02d' % (yr, mo)]
        by_month[mo] = amount
        running_total += amount

    share = {mo: amount / running_total for mo, amount in by_month.items()}
    return share, running_total

def computePredictions(usage, year=2016, month=9):
    """Project twelve monthly values by applying year-over-year growth.

    Starting at (*year*, *month*) and stepping back one calendar month at a
    time for twelve steps, each step:

    * computes ``growth`` as the usage of the current month divided by the
      usage of the same month one year earlier, and
    * stores ``growth`` times the usage of the preceding calendar month,
      taken one year before that preceding month, under the preceding
      month's two-digit key ('01'..'12').

    Args:
        usage: mapping of 'YYYY-MM' strings to usage values.
        year: year of the most recent month to start from (default 2016).
        month: calendar month (1-12) to start from (default 9).

    Returns:
        A dict mapping '01'..'12' to the projected values, or ``None`` as
        soon as any month the computation needs is absent from *usage*.
    """
    predictions = {}
    for _ in range(12):
        # Step one calendar month back, wrapping January to December of
        # the previous year.
        if month == 1:
            prev_month, prev_year = 12, year - 1
        else:
            prev_month, prev_year = month - 1, year

        current_key = '%d-%02d' % (year, month)
        last_year_key = '%d-%02d' % (year - 1, month)
        prev_month_last_year_key = '%d-%02d' % (prev_year - 1, prev_month)
        required = (current_key, last_year_key, prev_month_last_year_key)
        if any(key not in usage for key in required):
            return None

        growth = usage[current_key] / usage[last_year_key]
        predictions['%02d' % prev_month] = growth * usage[prev_month_last_year_key]
        year, month = prev_year, prev_month
    return predictions

In [13]:
# Suppliers are identified by ID in one dataset and only by name in the
# other. Index the IDs by lower-cased provider name so the two can be paired.

providers = pd.read_csv(DATA_DIR + 'provider_ids.tsv', sep='\t')
provider_id_lookup = {
    provider.lower(): provider_id
    for provider, provider_id in zip(providers['Provider'], providers['ID'])
}

In [14]:
# Build the output payload: one entry per supplier that has a known provider
# ID and enough monthly history for computePredictions to succeed.
js = {}
supplier_groups = df.groupby("supplier_name").groups
for i, (name, indices) in enumerate(supplier_groups.items()):
    print(i, name)
    name = name.lower()
    if name not in provider_id_lookup:
        continue

    group_frame = df.loc[indices]
    usage = computeUsage(group_frame)
    # 'total' is reported on its own; the rest stays as the category breakdown.
    total = usage.pop('total')
    preds = computePredictions(total)
    if not preds:
        continue

    month_share, year_total = computeMonthShare(total, 2016)
    # Look up the conservation standard on the 2016-09-15 reporting row.
    group_frame = group_frame.set_index('reporting_month', drop=False)
    september_2016 = datetime.datetime(2016, 9, 15)
    reduction = group_frame.loc[september_2016, 'conservation_standard']
    target = (1 - reduction) * year_total

    provider_id = provider_id_lookup[name]
    js[provider_id] = {
        "agencyName": name,
        "totalUsage": total,
        "usageByCategory": usage,
        "monthlyPrediction": preds,
        "annualTarget": target,
    }

0 California Water Service Company Hermosa/Redondo
1 Millbrae  City of
2 Downey  City of
3 Pico Rivera  City of
4 Grover Beach  City of
5 Anderson, City of
6 Lake Hemet Municipal Water District
7 El Centro  City of
8 Tahoe City Public Utilities District
9 Bakersfield  City of
10 West Kern Water District
11 Golden State Water Company Orcutt
12 Olivehurst Public Utility District
13 Vallejo  City of
14 Imperial, City of
15 Petaluma  City of
16 California Water Service Company Kern River Valley
17 El Toro Water District
18 Rancho California Water District
19 Banning  City of
20 Redding  City of
21 San Bernardino  City of
22 Gilroy  City of
23 Corcoran, City of
24 Glendale  City of
25 Nipomo Community Services District
26 Fountain Valley  City of
27 Oakdale  City of
28 East Palo Alto, City of
29 Mountain House Community Services District
30 Hi-Desert Water District
31 Ukiah  City of
32 Healdsburg  City of
33 Nevada Irrigation District
34 Manhattan Beach  City of
35 Golden State Water Compan

In [15]:
# Spot-check the first entry of the payload.
js[list(js)[0]]

{'agencyName': 'yreka, city of',
 'annualTarget': 2684.5349399646834,
 'monthlyPrediction': {'01': 123.7307642936555,
  '02': 113.26516680118374,
  '03': 121.14337024842861,
  '04': 182.03074750960855,
  '05': 225.89825255832764,
  '06': 364.72232054225276,
  '07': 410.39231027682024,
  '08': 379.0939822492886,
  '09': 375.81218051701484,
  '10': 189.53260296141744,
  '11': 123.13287037037034,
  '12': 128.8636363636364},
 'totalUsage': {'2014-06': 380.40816326530614,
  '2014-07': 440.4,
  '2014-08': 357.1153846153846,
  '2014-09': 316.45833333333337,
  '2014-10': 189.77272727272725,
  '2014-11': 126.9047619047619,
  '2014-12': 128.57142857142858,
  '2015-01': 130.95238095238093,
  '2015-02': 127.25,
  '2015-03': 136.15384615384616,
  '2015-04': 183.57142857142856,
  '2015-05': 236.53061224489798,
  '2015-06': 354.88888888888886,
  '2015-07': 369.5744680851064,
  '2015-08': 353.09523809523813,
  '2015-09': 303.9024390243902,
  '2015-10': 225.3658536585366,
  '2015-11': 126.7441860465116

In [16]:
# Serialize the payload with 4-space indentation and write it out.
with open('water_usage.json', 'w') as f:
    serialized = json.dumps(js, indent=4)
    f.write(serialized)