# Allocation of Demand based on Disaggregation and Re-aggregation of Data

This notebook disaggregates demand from the FERC-714 dataset down to census tracts in proportion to census population, then re-aggregates it to other geometries such as ReEDS balancing areas, counties, or states.

## Datasets and inputs used

1. FERC-714 form: Energy sales timeseries data for every planning area

2. 2010 US Census data: Census tract geometries, and tract-level population and characteristics

3. ReEDS balancing geometries: ReEDS geometries containing county level data

4. US Planning Areas: Contains 97 planning area geometries

## Core functions

1. Function to find intersection of the large and small geometries

2. Function to normalize and redistribute from area to another attribute e.g. population

3. Map functions:

4. Timeseries functions: Functions for allocation

## Disposable parts of analysis

1. Cells doing auxiliary analysis like multiple-counted areas, visualizations for state, county and census tracts 

## Intermediate and Final Datasets

Intermediate datasets are required to limit the number of time-consuming calculations. Primarily, the overlay calculation takes excessive time.

### Intermediate Datasets

Area mapping of planning area with census tracts.

### Final Datasets

ReEDS aggregated demand timeseries data

## Workflow

1. There are generally two geometries: one containing primary smaller non-intersecting geometries like tracts, and the other with larger intersecting geometries

### Disaggregation of data

1. 

### Reaggregation of data

1. 


In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import pathlib
import time
import requests
import json
import datetime
import pickle

import pandas as pd
import numpy as np
import scipy.stats

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import colors
from matplotlib.legend import Legend
import matplotlib.patches as mpatches
import seaborn as sns

import pyproj
from geopandas import gpd
from shapely.geometry import Point
import geopandas
import fiona
from geopandas import GeoDataFrame

import pudl
from pudl.analysis.demand_mapping import (create_intersection_matrix,
                                          create_stacked_intersection_df,
                                          extract_multiple_tracts_demand_ratios,
                                          extract_time_series_demand_multiple_tracts,
                                          matrix_linear_scaling)

from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [3]:
import logging
# Configure the root logger so INFO-level messages from this notebook and from
# imported libraries are written to the cell output (stdout).
logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
# Timestamp, right-aligned level name, logger name and source line number.
log_format = '%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s'
formatter = logging.Formatter(log_format)
handler.setFormatter(formatter)
# Replace (rather than append to) the handler list so that re-running this
# cell does not produce duplicated log lines.
logger.handlers = [handler]

In [4]:
# Default PUDL workspace settings (input/output directories and database URLs).
pudl_settings = pudl.workspace.setup.get_defaults()
# Directory for inputs read directly by this notebook: a "data" folder that is
# a sibling of the directory the notebook is run from.
data_path = pathlib.Path.cwd().parent / 'data'
pudl_settings

{'pudl_in': '/home/zane/code/catalyst/pudl-work',
 'data_dir': '/home/zane/code/catalyst/pudl-work/data',
 'settings_dir': '/home/zane/code/catalyst/pudl-work/settings',
 'pudl_out': '/home/zane/code/catalyst/pudl-work',
 'sqlite_dir': '/home/zane/code/catalyst/pudl-work/sqlite',
 'parquet_dir': '/home/zane/code/catalyst/pudl-work/parquet',
 'datapkg_dir': '/home/zane/code/catalyst/pudl-work/datapkg',
 'notebook_dir': '/home/zane/code/catalyst/pudl-work/notebook',
 'ferc1_db': 'sqlite:////home/zane/code/catalyst/pudl-work/sqlite/ferc1.sqlite',
 'pudl_db': 'sqlite:////home/zane/code/catalyst/pudl-work/sqlite/pudl.sqlite'}

# Obtain non-PUDL data
Some of the data we're using for this analysis has not yet been fully integrated into PUDL, so we are managing it ad-hoc in a directory at `pudl_settings["data_dir"]/local`

## FERC Form 714
* Download the raw file if we don't have it already.
* Run the draft Extract and Transform steps on it.
* Merge the respondent ID and Planning Area Hourly Demand dataframes.

In [5]:
%%time
def download_zip_url(url, save_path, chunk_size=128):
    """Stream the file at ``url`` to ``save_path``.

    Args:
        url (str): Address of the file to download.
        save_path (pathlib.Path): Destination file; it is overwritten if it
            already exists.
        chunk_size (int): Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as r:
        # Check the status before creating the file, so that an HTTP error
        # page is never saved under the archive's name (callers only test
        # whether the file exists before skipping the download).
        r.raise_for_status()
        with save_path.open(mode='wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)

# Ad-hoc inputs that are not yet managed by PUDL live under <data_dir>/local.
local_data = pathlib.Path(pudl_settings["data_dir"]) / "local"

ferc714_url = "https://www.ferc.gov/docs-filing/forms/form-714/data/form714-database.zip"
ferc714_dir = local_data / "ferc714"
ferc714_dir.mkdir(parents=True, exist_ok=True)

ferc714_save_path = ferc714_dir / "ferc714.zip"
# Only the existence of the archive is checked. Delete the file by hand to
# force a fresh download (e.g. after an interrupted or corrupted download).
if ferc714_save_path.exists():
    logger.info("Already have FERC 714 data, not downloading.")
else:
    logger.info("Downloading fresh FERC 714 data.")
    download_zip_url(ferc714_url, ferc714_save_path)

# Draft PUDL extract + transform steps; both return a collection of tables
# indexed by table name.
raw_ferc714_dfs = pudl.extract.ferc714.extract(pudl_settings=pudl_settings)
tfr_ferc714_dfs = pudl.transform.ferc714.transform(raw_ferc714_dfs)
# Attach respondent information to every hourly demand record. No `on=` is
# given, so pandas joins on whatever columns the two tables share
# (presumably the FERC-714 respondent ID -- TODO confirm).
pa_demand_ferc714_df = pd.merge(
    tfr_ferc714_dfs["pa_demand_hourly_ferc714"],
    tfr_ferc714_dfs["respondent_id_ferc714"]
)

2020-05-13 13:49:24,051 [    INFO] root:15 Already have FERC 714 data, not downloading.
2020-05-13 13:49:24,052 [    INFO] pudl.extract.ferc714:75 Reading respondent_id_ferc714 from CSV into pandas DataFrame.
2020-05-13 13:49:24,057 [    INFO] pudl.extract.ferc714:75 Reading id_certification_ferc714 from CSV into pandas DataFrame.
2020-05-13 13:49:24,087 [    INFO] pudl.extract.ferc714:75 Reading ba_gen_plants_ferc714 from CSV into pandas DataFrame.
2020-05-13 13:49:24,235 [    INFO] pudl.extract.ferc714:75 Reading ba_demand_monthly_ferc714 from CSV into pandas DataFrame.
2020-05-13 13:49:24,256 [    INFO] pudl.extract.ferc714:75 Reading ba_net_energy_load_ferc714 from CSV into pandas DataFrame.
2020-05-13 13:49:24,293 [    INFO] pudl.extract.ferc714:75 Reading adjacent_bas_ferc714 from CSV into pandas DataFrame.
2020-05-13 13:49:24,394 [    INFO] pudl.extract.ferc714:75 Reading ba_interchange_ferc714 from CSV into pandas DataFrame.
2020-05-13 13:49:24,419 [    INFO] pudl.extract.ferc7

## US Census Tract Geometries and Population
* This US census tract data comes from: https://www.arcgis.com/home/item.html?id=ca1316dba1b442d99cb76bc2436b9fdb
* The 7Zip file from that page must be extracted into the `esri_dir` defined below: `PUDL_IN/data/local/esri`
* The download has to be done by hand because the link is hidden behind some JavaScript nonsense.
* This proprietary dataset will need to be replaced with something open, potentially straight from the US Census.

In [6]:
%%time
# Location of the manually downloaded and extracted ESRI census tract GeoDB.
esri_dir = local_data / "esri"
esri_dir.mkdir(parents=True, exist_ok=True)
esri_tract_path = esri_dir / "USA_Census_Tract_Boundaries/v10/tracts.gdb"
census_tract_gdf = (
    gpd.read_file(esri_tract_path, driver='FileGDB', layer='tracts')
    # THIS CREATES INVALID FIPS CODES: LEADING ZEROS ARE REQUIRED.
    # STATE_FIPS is made numeric so the range filter below can be applied.
    .assign(STATE_FIPS=lambda x: pd.to_numeric(x.STATE_FIPS))
    # Remove all islands and non-mainland states and territories
    # NOTE(review): FIPS 2 is Alaska and 15 is Hawaii, but 44 is Rhode Island,
    # which is a mainland state. Confirm whether excluding it is intended;
    # changing this filter also invalidates the cached overlay pickle that is
    # built from this GeoDataFrame.
    .query("STATE_FIPS<=56 & STATE_FIPS not in (2, 15, 44)")
    # Project to US Albers conic equal-area projection
    .to_crs("ESRI:102003")
)
# Show a few random rows as a sanity check of the loaded attributes.
census_tract_gdf.sample(5)

CPU times: user 1min 25s, sys: 734 ms, total: 1min 26s
Wall time: 1min 26s


Unnamed: 0,STATE_FIPS,CNTY_FIPS,STCOFIPS,TRACT,FIPS,POPULATION,POP_SQMI,POP2010,POP10_SQMI,WHITE,...,FAMILIES,AVE_FAM_SZ,HSE_UNITS,VACANT,OWNER_OCC,RENTER_OCC,SQMI,Shape_Length,Shape_Area,geometry
56454,26,163,26163,532700,26163532700,850.0,3695.7,962.0,4182.6,19.0,...,207.0,3.23,686.0,294.0,147.0,245.0,0.23,0.033087,6.4e-05,"MULTIPOLYGON (((1052750.336 617175.348, 105274..."
30687,21,111,21111,1400,21111001400,2582.0,4965.4,2601.0,5001.9,24.0,...,722.0,3.02,1131.0,98.0,457.0,576.0,0.52,0.053771,0.000138,"MULTIPOLYGON (((881642.475 129392.338, 881722...."
9009,6,71,6071,11401,6071011401,5074.0,1417.3,4394.0,1227.4,3367.0,...,1125.0,3.07,4530.0,2802.0,1030.0,698.0,3.58,0.200024,0.000909,"MULTIPOLYGON (((-1889147.047 -153406.247, -188..."
4182,6,37,6037,503402,6037503402,4497.0,7253.2,4273.0,6891.9,3134.0,...,1139.0,3.34,1475.0,59.0,1115.0,301.0,0.62,0.054773,0.000156,"MULTIPOLYGON (((-2000161.442 -165801.222, -200..."
42961,13,173,13173,950100,13173950100,1846.0,20.3,1936.0,21.3,1672.0,...,539.0,3.05,859.0,112.0,587.0,160.0,90.79,1.166579,0.022216,"MULTIPOLYGON (((1231089.382 -645233.495, 12310..."


## Electricity Planning Area Geometries:
* For now planning areas come from this DHS open dataset: https://hifld-geoplatform.opendata.arcgis.com/datasets/electric-planning-areas

In [7]:
import zipfile
hifld_pa_url = "https://opendata.arcgis.com/datasets/7d35521e3b2c48ab8048330e14a4d2d1_0.gdb"
hifld_dir = local_data / "hifld"
hifld_dir.mkdir(parents=True, exist_ok=True)
hifld_pa_zipfile = hifld_dir / "hifld_electric_planning_areas.gdb.zip"
hifld_pa_gdb_dir = hifld_dir / "hifld_electric_planning_areas.gdb"
if not hifld_pa_gdb_dir.is_dir():
    logger.info("No Planning Area GeoDB found. Downloading from HIFLD.")
    # Download to appropriate location
    download_zip_url(hifld_pa_url, hifld_pa_zipfile)
    # Unzip because we can't use zipfile paths with geopandas
    with zipfile.ZipFile(hifld_pa_zipfile, 'r') as zip_ref:
        zip_ref.extractall(hifld_dir)
        # Grab the UUID based directory name so we can change it:
        extract_root = hifld_dir / pathlib.Path(zip_ref.filelist[0].filename).parent
    # Give the extracted GeoDB a stable name so the check above finds it on
    # later runs.
    extract_root.rename(hifld_pa_gdb_dir)
else:
    logger.info("We've already got the planning area GeoDB.")

logger.info("Extracting the GeoDB into a GeoDataFrame")
epas_gdf = pudl.transform.ferc714.electricity_planning_areas(pudl_settings)

# Each progress message is logged immediately before the step it describes,
# so the timestamps in the log reflect what was actually running.
# * 3522 = Chugach Electric Association (AK)
# * 19547 = Hawaii Electric Co
ak_hi_planning_area_ids = [3522, 19547]
logger.info("Dropping Planning Areas in AK and HI.")
epas_gdf = epas_gdf.query("ID not in @ak_hi_planning_area_ids")

# Project to US Albers conic equal-area projection
logger.info("Reprojecting to US Albers Conic Equal Area projection.")
epas_gdf = epas_gdf.to_crs("ESRI:102003")

2020-05-13 13:57:31,341 [    INFO] root:18 We've already got the planning area GeoDB.
2020-05-13 13:57:31,342 [    INFO] root:20 Extracting the GeoDB into a GeoDataFrame
2020-05-13 13:57:34,314 [    INFO] root:22 Dropping Planning Areas in AK and HI.
2020-05-13 13:57:34,315 [    INFO] root:25 Reprojecting to US Albers Conic Equal Area projection.


In [8]:
# Hourly planning-area demand records for report year 2018 only.
pa_demand_2018 = pa_demand_ferc714_df.query("report_year==2018")

# Total 2018 demand per planning area, keyed by EIA utility ID. The column
# keeps the name "demand_mwh"; later cells refer to it by that name.
ferc_demand_sum = (
    pa_demand_2018
    .groupby("utility_id_eia")[["demand_mwh"]]
    .sum()
    .reset_index()
)

# Side-by-side listing of planning-area geometries and FERC-714 respondents.
# The outer join keeps areas that appear in only one of the two sources.
# option_context uses the fully qualified option name (the bare "max_rows"
# pattern is ambiguous on newer pandas) and restores the previous value even
# if display() raises.
with pd.option_context("display.max_rows", 200):
    display(
        epas_gdf[["ID", "NAME"]]
        .drop_duplicates()
        .merge(
            pa_demand_2018[["utility_name_ferc714", "utility_id_eia"]].drop_duplicates(),
            how="outer", left_on="ID", right_on="utility_id_eia"
        )
    )

# Attach the annual demand total to each planning-area geometry. The inner
# join drops geometries for which no 2018 demand was reported.
epas_gdf = (
    epas_gdf
    # Makes the cell safe to re-run: a demand column left by an earlier
    # execution is replaced instead of producing demand_mwh_x / demand_mwh_y.
    .drop(columns="demand_mwh", errors="ignore")
    .merge(ferc_demand_sum, how="inner", left_on="ID", right_on="utility_id_eia")
    .drop("utility_id_eia", axis=1)
)

Unnamed: 0,ID,NAME,utility_name_ferc714,utility_id_eia
0,195.0,ALABAMA POWER COMPANY,Alabama Power Company,195.0
1,796.0,"ARIZONA ELECTRIC POWER COOPERATIVE, INC.","Arizona Electric Power Cooperative, Inc.",796.0
2,1307.0,BASIN ELECTRIC POWER COOP,Basin Electric Power Cooperative,1307.0
3,2507.0,CITY OF BURBANK,City of Burbank,2507.0
4,3408.0,ELECTRIC POWER BOARD OF CHATTANOOGA,Electric Power Board of Chattanooga,3408.0
5,3989.0,COLORADO SPRINGS UTILITIES,Colorado Springs Utilities,3989.0
6,4922.0,DAYTON POWER & LIGHT CO,"Dayton Power & Light Company, The",4922.0
7,4958.0,DECATUR UTILITIES,Decatur Utilities,4958.0
8,6022.0,EUGENE WATER & ELECTRIC BOARD,Eugene Water & Electric Board,6022.0
9,6567.0,FLORIDA MUNICIPAL POWER AGENCY,Florida Municipal Power Agency,6567.0


## Finding intersection area of each census tract with each planning area for demand allocation 

In [9]:
%%time
# The tract / planning-area overlay is slow to compute, so the result is
# cached as a pickle in the directory the notebook is run from.
tracts_pa_ratios_stacked_pickle = pathlib.Path.cwd() / "tracts_planning_area_ratios_stacked.pkl"
if not tracts_pa_ratios_stacked_pickle.is_file():
    # No cache yet: compute the overlay, report how long it took, and store it.
    start_time = time.time()
    gdf_intersection = create_stacked_intersection_df(census_tract_gdf, epas_gdf)
    print("--- %s seconds ---" % (time.time() - start_time))
    with tracts_pa_ratios_stacked_pickle.open(mode="wb") as f:
        pickle.dump(gdf_intersection, f)
else:
    logger.info("Found pre-existing intersection file.")
    # Unpickling runs arbitrary code: only load cache files created locally.
    with tracts_pa_ratios_stacked_pickle.open(mode="rb") as f:
        gdf_intersection = pickle.load(f)

2020-05-13 13:57:41,893 [    INFO] root:3 Found pre-existing intersection file.
CPU times: user 15.9 ms, sys: 3.99 ms, total: 19.9 ms
Wall time: 20 ms


## Pivoting the mapping to a matrix, scaling/normalizing it by tract population, then allocating demand accordingly

In [10]:
%%time
# Pivot the stacked overlay table into a matrix. Judging by the displayed
# pop_norm_matrix, rows are tract FIPS codes and columns are planning-area IDs.
intersection_matrix = create_intersection_matrix(gdf_intersection)
# Rescale using the census tract table with the helper's default scaling
# column (presumably tract population -- TODO confirm against
# pudl.analysis.demand_mapping.matrix_linear_scaling).
pop_norm_matrix = matrix_linear_scaling(intersection_matrix, gdf_scale=census_tract_gdf)
# Rescale again along axis 0 using each planning area's annual "demand_mwh"
# (presumably so that each column sums to that area's demand -- TODO confirm).
demand_norm_matrix = matrix_linear_scaling(
    pop_norm_matrix, gdf_scale=epas_gdf,
    gdf_scale_col="demand_mwh", axis_scale=0
)

CPU times: user 1.4 s, sys: 91.9 ms, total: 1.49 s
Wall time: 1.53 s


In [11]:
# Areas in demand dataframe that do not have a shape geometry
(
    ferc_demand_sum[~ferc_demand_sum["utility_id_eia"].isin(pop_norm_matrix.columns)]
    .merge(
        # Reduce the hourly table to one row per (ID, name) pair *before*
        # joining, instead of joining every hourly record and de-duplicating
        # the much larger result afterwards.
        pa_demand_2018[["utility_id_eia", "utility_name_ferc714"]].drop_duplicates(),
        on="utility_id_eia",
    )
)

Unnamed: 0,utility_id_eia,demand_mwh,utility_name_ferc714
0,3522,1258622.0,"Chugach Electric Association, Inc."
8760,13100,11132980.0,Municipal Electric Authority of Georgia
17520,13670,3365884.0,Northeast Texas Electric Cooperative
26280,17867,807656.0,City of St. Cloud
35040,19547,7002252.63,"Hawaiian Electric Company, Inc"


## Grouping demand data at the county and state level

In [12]:
def aggregate_tract_demand(tracts_gdf, group_col):
    """Attach group-level demand totals and intensities to tract geometries.

    Args:
        tracts_gdf: Tract table with ``group_col``, "geometry", "demand_mwh",
            "SQMI" and "POPULATION" columns.
        group_col (str): Column identifying the group each tract belongs to
            (e.g. "STCOFIPS" for counties, "STATE_FIPS" for states).

    Returns:
        One row per tract (the geometries are NOT dissolved) holding the
        tract geometry together with the summed demand, area and population
        of the tract's group, plus "energy_demand_per_sqmi" and
        "demand_per_capita" computed from those totals.
    """
    group_totals = (
        tracts_gdf
        .groupby(group_col)[["demand_mwh", "SQMI", "POPULATION"]]
        .sum()
        .reset_index()
    )
    grouped = tracts_gdf[[group_col, "geometry"]].merge(group_totals, on=group_col)
    grouped["energy_demand_per_sqmi"] = grouped["demand_mwh"] / grouped["SQMI"]
    grouped["demand_per_capita"] = grouped["demand_mwh"] / grouped["POPULATION"]
    return grouped


# Demand allocated to each tract = its row total in the demand matrix.
tract_demand = (
    demand_norm_matrix
    .sum(axis=1)
    .reset_index()
    .rename(columns={0: "demand_mwh"})
)

census_tract_gdf = (
    census_tract_gdf
    # Makes the cell safe to re-run: a demand column left by an earlier
    # execution is replaced instead of becoming an extra (float) join key.
    .drop(columns="demand_mwh", errors="ignore")
    # No `on=`: joins on the column the two tables share (the tract ID).
    .merge(tract_demand)
)

## Tract demand aggregated by county
county_gdf = aggregate_tract_demand(census_tract_gdf, "STCOFIPS")

## Tract demand aggregated by state
state_gdf = aggregate_tract_demand(census_tract_gdf, "STATE_FIPS")

## Incorporating state annual sales for residential, industrial comparison

In [13]:
# EIA annual retail sales by state. skiprows=[0] drops the first sheet row so
# that the second row is used as the header.
eia_sales_raw = pd.read_excel("https://www.eia.gov/electricity/data/state/sales_annual.xlsx", skiprows=[0])

# Keep 2018 all-sector totals, excluding the national total and the states
# not covered by this analysis.
eia_sales_2018 = eia_sales_raw[
    (eia_sales_raw["Year"] == 2018)
    & ~eia_sales_raw["State"].isin(["US", "HI", "AK", "DC"])
    & (eia_sales_raw["Industry Sector Category"] == "Total Electric Industry")
]

# Postal code <-> FIPS lookup; the last row of the scraped table is dropped
# (presumably a footer row -- TODO confirm).
states_fips_lookup = pd.read_html("https://www.nrcs.usda.gov/wps/portal/nrcs/detail/?cid=nrcs143_013696")[0].iloc[:-1]
states_fips_lookup = states_fips_lookup.astype({
    "FIPS": "int32"
})

eia_sales_2018 = eia_sales_2018.merge(
    states_fips_lookup, left_on="State", right_on="Postal Code"
)[["State", "Total", "FIPS", "Residential", "Commercial", "Industrial"]]

# state_gdf has one row per tract, so reduce it to one row per state first.
sales_df = (
    state_gdf[["STATE_FIPS", "demand_mwh"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(eia_sales_2018, left_on="STATE_FIPS", right_on="FIPS")
    .drop("FIPS", axis=1)
)

# Relative (fractional, not percent) absolute error of allocated demand
# against reported sales.
sales_df["pc_error"] = (sales_df["Total"] - sales_df["demand_mwh"]).abs() / sales_df["Total"]

# Worst-matching states first. (An earlier sort by demand was immediately
# overridden by this one and has been removed.)
sales_df = sales_df.sort_values("pc_error", ascending=False).reset_index(drop=True)

# Rescale allocated demand by a single constant so that its total equals the
# total reported sales.
sales_sum = sales_df["Total"].sum()
allocated_demand_sum = sales_df["demand_mwh"].sum()
sales_df["adjusted_demand"] = sales_df["demand_mwh"] * sales_sum / allocated_demand_sum

sales_df.head()

Unnamed: 0,STATE_FIPS,demand_mwh,State,Total,Residential,Commercial,Industrial,pc_error,adjusted_demand
0,31,54426490.0,NE,30939492,10412008,9553396,10974088,0.759127,48961560.0
1,46,21345790.0,SD,12856938,5018360,4903243,2935335,0.660255,19202470.0
2,20,66379590.0,KS,42036979,14187192,16168750,11681037,0.579076,59714450.0
3,30,22664710.0,MT,14838845,5197686,4921231,4719928,0.527391,20388960.0
4,34,115319400.0,NJ,76016762,29530689,38807065,7369106,0.517027,103740300.0


## Allocating demand time series at the state level

In [36]:
# Tract FIPS codes belonging to each state, keyed by state FIPS.
dict_state_tracts = (
    census_tract_gdf
    .groupby("STATE_FIPS")["FIPS"]
    .agg(list)
    .to_dict()
)

# State FIPS -> state name, with DC added by hand.
dict_statefips_names = states_fips_lookup.set_index("FIPS").loc[:, "Name"].to_dict()
dict_statefips_names[11] = "DC"

# Show the two inputs handed to the allocation helper.
display(pa_demand_2018.head())
display(pop_norm_matrix.head())

# One allocated demand time series per state, in dict_state_tracts order.
state_series = [
    extract_time_series_demand_multiple_tracts(
        ferc_df=pa_demand_2018,
        pop_norm_df=pop_norm_matrix,
        ferc_df_col="utility_id_eia",
        intermediate_ids=state_tracts,
        time_col="utc_datetime",
        demand_col="demand_mwh"
    )
    for state_tracts in tqdm(dict_state_tracts.values())
]

# Combine into one frame with a "<state name>_Sales" column per state.
state_column_names = [
    dict_statefips_names[state_fips] + "_Sales"
    for state_fips in dict_state_tracts.keys()
]
df_state_sales = pd.concat(state_series, axis=1, keys=state_column_names)

Unnamed: 0,report_year,utility_id_ferc714,utc_datetime,timezone,demand_mwh,utility_name_ferc714,utility_id_eia
105192,2018,101,2018-01-01 06:00:00,America/Chicago,1565.0,PowerSouth Energy Cooperative (Alabama Electri...,189
105193,2018,101,2018-01-01 07:00:00,America/Chicago,1602.0,PowerSouth Energy Cooperative (Alabama Electri...,189
105194,2018,101,2018-01-01 08:00:00,America/Chicago,1648.0,PowerSouth Energy Cooperative (Alabama Electri...,189
105195,2018,101,2018-01-01 09:00:00,America/Chicago,1702.0,PowerSouth Energy Cooperative (Alabama Electri...,189
105196,2018,101,2018-01-01 10:00:00,America/Chicago,1774.0,PowerSouth Energy Cooperative (Alabama Electri...,189


ID,189,195,229,796,803,924,1307,1738,2507,3046,...,21554,24211,25471,28503,30151,39347,40211,40218,40233,56669
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001020100,899.5,899.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001020200,1032.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001020300,1739.0,1739.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001020400,2132.0,2132.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001020500,5675.5,5675.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


  0%|          | 0/48 [00:00<?, ?it/s]


ValueError: matrices are not aligned

In [32]:


# Sum the allocated demand into calendar-month totals.
df_state_sales_monthly = df_state_sales.groupby(pd.Grouper(freq="M")).sum()

# Keep the months of 2018 and reshape to one row per (month, state).
in_2018 = (
    (df_state_sales_monthly.index >= "2018-01-01")
    & (df_state_sales_monthly.index <= "2018-12-31")
)
df_state_sales_2018 = (
    df_state_sales_monthly[in_2018]
    .stack()
    .reset_index()
    .rename(columns={"level_1": "State", 0: "GWh_allocated"})
)

# MWh -> GWh
df_state_sales_2018["GWh_allocated"] = df_state_sales_2018["GWh_allocated"] / 1000

# Encode the month as the integer YYYYMM so it can be joined to the EIA data.
df_state_sales_2018["utc_datetime"] = (df_state_sales_2018["utc_datetime"].dt.year * 100 +
                                     df_state_sales_2018["utc_datetime"].dt.month)

# Remove the "_Sales" suffix added when the per-state columns were built.
# str.rstrip('Sales') would strip any trailing run of the characters
# S/a/l/e/s rather than the literal suffix, so an anchored pattern is used.
df_state_sales_2018["State"] = (
    df_state_sales_2018["State"]
    .str.replace(r"_Sales$", "", regex=True)
    .str.strip()
)

  0%|          | 0/48 [00:00<?, ?it/s]


ValueError: matrices are not aligned

## Extracting state sales from EIA

In [15]:
# The EIA API key is read from the environment so that the credential is not
# stored in (or published with) the notebook.
eia_api_key = os.environ.get("EIA_API_KEY")
if not eia_api_key:
    raise RuntimeError("Set the EIA_API_KEY environment variable to query the EIA API.")


def _eia_api_get(endpoint, **params):
    """Return the decoded JSON body of a GET request to an EIA API endpoint.

    Args:
        endpoint (str): API path component, e.g. "category" or "series".
        **params: Query parameters sent in addition to the API key.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    # NOTE(review): this is the v1 URL layout used by the original cell;
    # confirm the endpoint is still served before relying on it.
    query = {"api_key": eia_api_key}
    query.update(params)
    response = requests.get("https://api.eia.gov/" + endpoint + "/", params=query, timeout=60)
    response.raise_for_status()
    return response.json()


sales_data = _eia_api_get("category", category_id=38)

# Keep only the series whose ID ends in "M".
monthly_series = [
    series
    for series in sales_data["category"]["childseries"]
    if series["series_id"][-1] == "M"
]
sales_data_urls = [series["series_id"] for series in monthly_series]
# The region name is what remains after removing the first 30 and last 23
# characters of the series title (assumes every title shares the same
# prefix and suffix -- TODO confirm).
sales_data_states_names = [series["name"][30:-23] for series in monthly_series]

# One single-column frame per region, indexed by the period label.
eia_series_frames = [
    pd.DataFrame(_eia_api_get("series", series_id=series_id)["series"][0]["data"])
    .rename(columns={0: "utc_datetime", 1: region_name})
    .set_index("utc_datetime")
    for series_id, region_name in tqdm(list(zip(sales_data_urls, sales_data_states_names)))
]

df_sales_eia = pd.concat(eia_series_frames, axis=1).reset_index()
# Periods become integers of the form YYYYMM.
df_sales_eia["utc_datetime"] = pd.to_numeric(df_sales_eia["utc_datetime"])

# Keep 2018 and reshape to one row per (month, region).
df_sales_eia_2018 = df_sales_eia[df_sales_eia["utc_datetime"] // 100 == 2018]
df_sales_eia_2018 = (
    df_sales_eia_2018
    .set_index("utc_datetime")
    .stack()
    .reset_index()
    .rename(columns={"level_1": "State", 0: "GWh"})
)
df_sales_eia_2018["State"] = df_sales_eia_2018["State"].str.strip()


## Scaling Demand Time Series by a constant factor to make the total sums equal
# NOTE: requires df_state_sales_2018 from the state allocation cells above.
df_check = df_state_sales_2018.merge(df_sales_eia_2018, how="inner", on=["utc_datetime", "State"])

df_check["GWh_adjusted"] = df_check["GWh_allocated"] * df_check["GWh"].sum() / df_check["GWh_allocated"].sum()

100%|██████████| 62/62 [00:36<00:00,  1.70it/s]


NameError: name 'df_state_sales_2018' is not defined

## Extracting 2018 REEDS data

In [None]:
path_reeds = str(data_path / "NREL/us_canada_reeds_map_files")
gdf_reeds = gpd.read_file(path_reeds)
# Drop the first character of the PCA label and parse the rest as a number.
gdf_reeds["pca_num"] = pd.to_numeric(gdf_reeds["pca"].str.slice_replace(stop=1, repl=""))
# Keep PCAs numbered up to 134 (presumably the US ones, since the source
# files cover the US and Canada -- TODO confirm).
gdf_reeds = gdf_reeds[gdf_reeds["pca_num"]<=134]
gdf_reeds = gdf_reeds.to_crs("ESRI:102003")

# Cache of the tract / ReEDS overlay. The cache is now written to the same
# location it is read from; previously it was looked up in data_path but
# written to the working directory, so it could never be found again.
file_save = "tracts_reeds_areas_ratios_stacked.pkl"
reeds_cache_path = data_path / file_save

if reeds_cache_path.is_file():
    # Unpickling runs arbitrary code: only load cache files created locally.
    with reeds_cache_path.open("rb") as f:
        gdf_reeds_intersection = pickle.load(f)
else:
    start_time = time.time()
    # The census tract table is the primary geometry, mirroring the
    # planning-area overlay (the name `gdf` used here before is never defined).
    gdf_reeds_intersection = create_stacked_intersection_df(
        census_tract_gdf, gdf_reeds, gdf_secondary_col="OBJECTID"
    )
    # `with` closes the file handle once the dump has been written.
    with reeds_cache_path.open("wb") as f:
        pickle.dump(gdf_reeds_intersection, f)

    print("--- %s seconds ---" % (time.time() - start_time))

reeds_intersection_matrix = create_intersection_matrix(gdf_reeds_intersection,
                                                       gdf_source_col="OBJECTID",
                                                       gdf_intersection_col='gdf_primary_intersection_fraction')

pop_norm_reeds = matrix_linear_scaling(reeds_intersection_matrix, census_tract_gdf)

# The tract-level demand column is named "demand_mwh" in this notebook (the
# rename to "demand_mwh_2018" is commented out where the totals are built).
reeds_demand_2018 = matrix_linear_scaling(pop_norm_reeds,
                                          gdf_scale=census_tract_gdf,
                                          gdf_scale_col="demand_mwh",
                                          axis_scale=1)

# Column totals of the scaled matrix are assigned by OBJECTID.
gdf_reeds = gdf_reeds.set_index("OBJECTID")
gdf_reeds["demand_2018_mwh"] = reeds_demand_2018.sum(axis=0)
gdf_reeds = gdf_reeds.reset_index()

## Aggregating REEDS data at PCA level
# Every geometry row receives the summed demand of the PCA it belongs to.
reeds_pca = (gdf_reeds
             .drop("demand_2018_mwh", axis=1)
             .merge(gdf_reeds.groupby("pca_num")["demand_2018_mwh"].sum().reset_index()))

## Mapping from Census-Planning Areas & Census-REEDS to REEDS-Planning Areas

In [None]:
# Tracts whose row sums to zero in either matrix carry no weight and would
# cause a division by zero in the normalisation below.
drop_fips = (set(pop_norm_reeds[pop_norm_reeds.sum(axis=1) == 0].index)
             .union(set(pop_norm_matrix[pop_norm_matrix.sum(axis=1) == 0].index)))

common_fips = (set(pop_norm_matrix.index)
               .intersection(set(pop_norm_reeds.index))
               .difference(drop_fips))

# pandas does not accept a set as a .loc indexer; a sorted list also gives a
# deterministic row order.
common_fips_sorted = sorted(common_fips)

# Normalise each tract's row of the ReEDS matrix to sum to 1, then combine
# with the planning-area matrix over the shared tracts. The result has one
# row per ReEDS geometry and one column per planning area.
reeds_rows = pop_norm_reeds.loc[common_fips_sorted]
pop_ferc_reeds = (
    reeds_rows.T.divide(reeds_rows.sum(axis=1))
) @ pop_norm_matrix.loc[common_fips_sorted]

## Mapping FERC714 demand time series to pca areas

In [None]:
# OBJECTIDs of the ReEDS geometries that make up each PCA.
dict_reeds_tracts = gdf_reeds.groupby("pca").agg({"OBJECTID": list}).to_dict()["OBJECTID"]

# One allocated demand time series per PCA. The 2018 hourly demand table is
# passed as the first argument, as in the state-level allocation (the name
# `ferc_df` used here before is never defined in this notebook).
reeds_series = [
    extract_time_series_demand_multiple_tracts(pa_demand_2018, pop_ferc_reeds, ferc_df_col="utility_id_eia",
                                               intermediate_ids=objectids, time_col="utc_datetime",
                                               demand_col="demand_mwh")
    for objectids in tqdm(dict_reeds_tracts.values())
]

df_reeds_sales = pd.concat(reeds_series, axis=1, keys=list(dict_reeds_tracts.keys()))
# Calendar-month totals.
df_reeds_sales_monthly = df_reeds_sales.groupby(pd.Grouper(freq="M")).sum()

In [None]:
# Number of non-zero entries in each row of the intersection matrix
# (presumably how many planning areas a tract overlaps), largest first.
nonzero_per_row = (
    (intersection_matrix != 0)
    .sum(axis=1)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(nonzero_per_row.tolist(), 'ro', ms=2)

plt.show()

In [None]:
# Row sums of the intersection matrix built with normalization=0, largest first.
unnormalized_row_sums = (
    create_intersection_matrix(gdf_intersection, normalization=0)
    .sum(axis=1)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(unnormalized_row_sums.tolist(), 'ro', ms=2)

plt.show()

In [None]:
# Choropleth of county-level demand per square mile, binned by percentiles.
county_plot_options = {
    "column": "energy_demand_per_sqmi",
    "cmap": "cividis_r",
    "legend": True,
    "scheme": "percentiles",
}
fig, ax = plt.subplots(figsize=(28, 16))
county_gdf.plot(ax=ax, **county_plot_options)
plt.show()

In [None]:
## State demand figure
fig, ax = plt.subplots(figsize=(28, 16))
# The allocated demand column of state_gdf is "demand_mwh"; there is no
# "demand_mwh_2018" column, so the old name would fail at plot time.
state_gdf.plot(column="demand_mwh", cmap="cividis_r", ax=ax, legend=True, scheme="NaturalBreaks")
plt.show()

In [None]:
# 6 figures of 8 states with allocated demand and FERC sales

# for i in range(6):

#     fig, ax = plt.subplots(figsize=(8, 6))

#     for idx, row in sales_df.iloc[8*i: 8*(i+1)].iterrows():

#         ax.plot([row["demand_mwh_2018"], row["Total"]], marker="o", label=row["State"])
        
#     plt.xticks([0, 1], ["Allocated Demand", "Sales"])
#     plt.legend()    
#     plt.show()
    

In [None]:
# Columns of sales_df compared in this figure.
pred = 'adjusted_demand'
actual = "Total"


fig, ax = plt.subplots(figsize=(6, 6))

def r2(x, y):
    """Return the Pearson correlation result for the two sequences passed in.

    The arguments are used as given; the function no longer ignores them in
    favour of notebook-level columns.
    """
    return scipy.stats.pearsonr(x, y)


slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(sales_df[actual], sales_df[pred])

# linregress returns the correlation coefficient r; square it so that the
# legend value matches its "R2" label.
sns.regplot(x=actual, y=pred, data=sales_df, ax = ax,
            line_kws={'label':"y={0:.2f}x + {1:.1f} (R2 = {2:.2f})".format(slope,intercept,r_value ** 2)})


min_lim, max_lim = 0, sales_df[[actual, pred]].max().max()


# Dashed y = x reference line: points on it are perfectly allocated.
ax.plot((min_lim, max_lim), (min_lim, max_lim), ls="--")

# ax = sns.jointplot(x="Residential", y="Demand", data=sales_df, stats_func=r2)


ax.legend()
plt.show()

## Comparison with monthly data using EIA 



In [None]:
# Disabled request for EIA category 40213. The API key has been replaced by a
# placeholder so the credential is not stored in the notebook.
# data_eia_states = (json
#                    .loads(requests
#                           .get("http://api.eia.gov/category/?api_key=<EIA_API_KEY>&category_id=40213")
#                           .text))




In [None]:
# Ratio of total allocated demand to total EIA sales over all merged
# state/month rows.
df_check["GWh_allocated"].sum() / df_check["GWh"].sum()

In [None]:
# One regression panel per state: EIA monthly sales vs. adjusted allocation.
pred = 'GWh_adjusted'
actual = "GWh"


g = sns.FacetGrid(df_check, col="State", col_wrap=5, sharey=False, sharex=False)
g.map(sns.regplot, actual, pred)

# g.col_names lists the facet values in the same order as g.axes.flat, so
# every panel is annotated with the statistics of the data it shows.
for ax, state in zip(g.axes.flat, g.col_names):

    df_temp = df_check[df_check["State"] == state]

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df_temp[actual], df_temp[pred])

    # Shared square limits taken from the two plotted columns.
    min_lim, max_lim = 0, df_temp[[actual, pred]].max().max()

    # Dashed y = x reference line.
    ax.plot((min_lim, max_lim), (min_lim, max_lim), ls="--")

    # linregress returns r; square it so the value matches the "R²" label.
    ax.text(max_lim-10, max_lim-10,"y={0:.2f}x + {1:.1f} (R² = {2:.2f})".format(slope,intercept,r_value ** 2),
            horizontalalignment='right', verticalalignment="top")

    ax.set_ylim(min_lim, max_lim)
    ax.set_xlim(min_lim, max_lim)

In [None]:
# Per-state correction factor: mean EIA sales divided by mean adjusted
# allocated demand over that state's rows.
state_means = df_check.groupby("State").mean()
state_demand_factors = (state_means["GWh"] / state_means["GWh_adjusted"]).to_dict()

# Apply each state's factor to its adjusted allocation.
df_check["state_GWh_adjusted"] = df_check["GWh_adjusted"] * df_check["State"].map(state_demand_factors)
df_check

In [None]:
# One regression panel per state: EIA monthly sales vs. the allocation after
# the per-state correction factor.
pred = 'state_GWh_adjusted'
actual = "GWh"


g = sns.FacetGrid(df_check, col="State", col_wrap=5, sharey=False, sharex=False)
g.map(sns.regplot, actual, pred)

# g.col_names lists the facet values in the same order as g.axes.flat, so
# every panel is annotated with the statistics of the data it shows.
for ax, state in zip(g.axes.flat, g.col_names):

    df_temp = df_check[df_check["State"] == state]

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df_temp[actual], df_temp[pred])

    # Limits come from the columns actually plotted; they were previously
    # taken from "GWh_adjusted", which is not the column shown in this figure.
    min_lim, max_lim = 0, df_temp[[actual, pred]].max().max()

    # Dashed y = x reference line.
    ax.plot((min_lim, max_lim), (min_lim, max_lim), ls="--")

    # linregress returns r; square it so the value matches the "R²" label.
    ax.text(max_lim-10, max_lim-10,"y={0:.2f}x + {1:.1f} (R² = {2:.2f})".format(slope,intercept,r_value ** 2),
            horizontalalignment='right', verticalalignment="top")

    ax.set_ylim(min_lim, max_lim)
    ax.set_xlim(min_lim, max_lim)

In [None]:
# One regression panel per month: EIA sales vs. adjusted allocation across states.
pred = 'GWh_adjusted'
actual = "GWh"

g = sns.FacetGrid(df_check, col="utc_datetime", col_wrap=4, sharey=False, sharex=False)
g.map(sns.regplot, actual, pred)

# g.col_names lists the facet values in the order the panels are drawn,
# which can differ from the order of first appearance in df_check when the
# facet column is numeric.
for ax, month in zip(g.axes.flat, g.col_names):

    df_temp = df_check[df_check["utc_datetime"] == month]

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df_temp[actual], df_temp[pred])

    # Shared square limits taken from the two plotted columns.
    min_lim, max_lim = 0, df_temp[[actual, pred]].max().max()

    # Dashed y = x reference line.
    ax.plot((min_lim, max_lim), (min_lim, max_lim), ls="--")
    # linregress returns r; square it so the value matches the "R²" label.
    ax.text(max_lim-10, max_lim-10,"y={0:.2f}x + {1:.1f} (R² = {2:.2f})".format(slope,intercept,r_value ** 2),
            horizontalalignment='right', verticalalignment="top")

    ax.set_ylim(min_lim, max_lim)
    ax.set_xlim(min_lim, max_lim)

In [None]:
# One regression panel per month: EIA sales vs. the allocation after the
# per-state correction factor.
pred = 'state_GWh_adjusted'
actual = "GWh"

g = sns.FacetGrid(df_check, col="utc_datetime", col_wrap=4, sharey=False, sharex=False)
g.map(sns.regplot, actual, pred)

# g.col_names lists the facet values in the order the panels are drawn,
# which can differ from the order of first appearance in df_check when the
# facet column is numeric.
for ax, month in zip(g.axes.flat, g.col_names):

    df_temp = df_check[df_check["utc_datetime"] == month]

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df_temp[actual], df_temp[pred])

    # Limits come from the columns actually plotted; they were previously
    # taken from "GWh_adjusted", which is not the column shown in this figure.
    min_lim, max_lim = 0, df_temp[[actual, pred]].max().max()

    # Dashed y = x reference line.
    ax.plot((min_lim, max_lim), (min_lim, max_lim), ls="--")
    # linregress returns r; square it so the value matches the "R²" label.
    ax.text(max_lim-10, max_lim-10,"y={0:.2f}x + {1:.1f} (R² = {2:.2f})".format(slope,intercept,r_value ** 2),
            horizontalalignment='right', verticalalignment="top")

    ax.set_ylim(min_lim, max_lim)
    ax.set_xlim(min_lim, max_lim)

In [None]:
# Choropleth of the 2018 demand aggregated to ReEDS PCAs.
fig, ax = plt.subplots(figsize=(30, 20))
reeds_pca.plot(
    column="demand_2018_mwh",
    ax=ax,
    legend=True,
    cmap="cividis",
    scheme="NaturalBreaks",
)
plt.show()

## State of Texas

In [None]:
# Correction factor for Texas (mean EIA GWh divided by mean adjusted
# allocated GWh).
state_demand_factors["Texas"]

In [None]:
# Time Series of individual
# TODO: unfinished note that was left as bare text in a code cell; it is kept
# as a comment so that the notebook can be run from top to bottom.

In [None]:
# Distinct (state, county) FIPS pairs for Texas (state FIPS 48). The tract
# table is named census_tract_gdf; `gdf` is never defined in this notebook.
census_tract_gdf[census_tract_gdf["STATE_FIPS"]==48][["STATE_FIPS", "CNTY_FIPS"]].drop_duplicates()