# Test Old Years

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Standard libraries
import logging
import os
import pathlib
import sys

# 3rd party libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa
from functools import reduce

# Local libraries
import pudl
import pudl.constants as pc

In [3]:
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [4]:
# Look up the default PUDL workspace settings and open a datastore rooted at
# the configured input directory.
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_in = pathlib.Path(pudl_settings['pudl_in'])
# NOTE(review): sandbox=True presumably points the datastore at a sandbox
# archive rather than the production one -- confirm against pudl.workspace.datastore.
ds = pudl.workspace.datastore.Datastore(pudl_in, sandbox=True)
# EIA 861 extraction is currently disabled:
#eia861_raw_dfs = pudl.extract.eia861.Extractor(ds).extract([2019])

In [5]:
# Directory the data package CSVs are written to. Set the PUDL_DATAPKG_DIR
# environment variable to point this at your own workspace; the original
# author's location is kept as the fallback so existing runs are unchanged.
datapkg_dir = os.environ.get(
    'PUDL_DATAPKG_DIR',
    '/Users/aesharpe/Desktop/Work/Catalyst_Coop/PUDL_DIR/datapkg/pudl-2008/eia-example/',
)

## Toggle Settings

In [46]:
# Years and tables to push through the EIA ETL. Edit the year lists to toggle
# which reporting years are exercised; the table lists come straight from
# pudl.constants (imported above as `pc`).
eia_inputs = {
    "eia860_years": [2009, 2006],
    "eia860_tables": pc.pudl_tables["eia860"],
    "eia861_years": [],
    "eia861_tables": pc.pudl_tables["eia861"],
    "eia923_years": [2009],
    "eia923_tables": pc.pudl_tables["eia923"],
}

# Unpack into individual names used by the cells below.
eia860_tables, eia860_years = eia_inputs["eia860_tables"], eia_inputs["eia860_years"]
eia861_tables, eia861_years = eia_inputs["eia861_tables"], eia_inputs["eia861_years"]
eia923_tables, eia923_years = eia_inputs["eia923_tables"], eia_inputs["eia923_years"]

In [47]:
# Generate CSVs for the static EIA tables under datapkg_dir and keep the
# returned list of tables.
# NOTE(review): _load_static_tables_eia is a private pudl.etl helper (leading
# underscore), so its signature may change between pudl versions.
static_tables = pudl.etl._load_static_tables_eia(datapkg_dir)

Loading Static EIA Tables fuel_type_eia923 dataframe into CSV
Loading Static EIA Tables prime_movers_eia923 dataframe into CSV
Loading Static EIA Tables fuel_type_aer_eia923 dataframe into CSV
Loading Static EIA Tables energy_source_eia923 dataframe into CSV
Loading Static EIA Tables transport_modes_eia923 dataframe into CSV


In [48]:
# Extract EIA forms 923 and 860 for the years selected in the settings cell.
# EIA 861 extraction is currently disabled (commented out).
eia860_raw_dfs = pudl.extract.eia860.Extractor(ds).extract(eia860_years)
#eia861_raw_dfs = pudl.extract.eia861.Extractor(ds).extract(eia861_years)
eia923_raw_dfs = pudl.extract.eia923.Extractor(ds).extract(eia923_years)

Extracting eia860 spreadsheet data.
Columns for boiler_generator_assn are off: should be 4 but got 6
Columns for generator are off: should be 65 but got 66
Columns for generator_existing are off: should be 76 but got 77
Columns for generator_proposed are off: should be 55 but got 56
Columns for generator_retired are off: should be 75 but got 76
Columns for ownership are off: should be 14 but got 15
Columns for plant are off: should be 46 but got 47
Columns for utility are off: should be 20 but got 21
Extracting eia923 spreadsheet data.


In [50]:
# Test that the column you added made it into the raw table grab here.
# Take a look at what the data type is supposed to be and whether it needs transformation.

# Selecting the column raises KeyError if it did not make it into the raw
# table. The sample uses a fixed seed so the rows inspected (and therefore the
# values displayed below) are the same on every run.
test = eia860_raw_dfs['generator'][['ferc_cogen_status']].sample(20, random_state=0)

# Only the last expression of a cell is displayed: the distinct raw values
# found in the sampled rows.
test['ferc_cogen_status'].unique()

array(['N', nan, 'Y'], dtype=object)

In [51]:
# Transform EIA forms 860 and 923, limited to the tables selected in the
# settings cell. The EIA 861 transform is currently disabled (commented out).
eia860_transformed_dfs = pudl.transform.eia860.transform(eia860_raw_dfs, eia860_tables=eia860_tables)
#eia861_transformed_dfs = pudl.transform.eia861.transform(eia861_raw_dfs, eia861_tables=eia861_tables)
eia923_transformed_dfs = pudl.transform.eia923.transform(eia923_raw_dfs, eia923_tables=eia923_tables)

Transforming raw EIA 860 DataFrames for ownership_eia860 concatenated across all years.
Transforming raw EIA 860 DataFrames for generators_eia860 concatenated across all years.
Transforming raw EIA 860 DataFrames for plants_eia860 concatenated across all years.
Transforming raw EIA 860 DataFrames for boiler_generator_assn_eia860 concatenated across all years.
Transforming raw EIA 860 DataFrames for utilities_eia860 concatenated across all years.
Transforming raw EIA 923 DataFrames for generation_fuel_eia923 concatenated across all years.
Transforming raw EIA 923 DataFrames for boiler_fuel_eia923 concatenated across all years.
Transforming raw EIA 923 DataFrames for generation_eia923 concatenated across all years.
Transforming raw EIA 923 DataFrames for coalmine_eia923 concatenated across all years.
Transforming raw EIA 923 DataFrames for fuel_receipts_costs_eia923 concatenated across all years.


In [53]:
# See if your transformation fixed the column: display ferc_cogen_status from
# the transformed generators_eia860 table.
eia860_transformed_dfs['generators_eia860'][['ferc_cogen_status']]

Unnamed: 0,ferc_cogen_status
0,
1,
2,
3,
4,
...,...
18349,
18350,True
18351,False
18352,False


In [68]:
# Create a single dictionary of transformed EIA tables: start from a copy of
# the EIA 860 results, then add the EIA 923 tables to it. Working on copies
# leaves the per-form dictionaries themselves unmodified by this cell.
eia_transformed_dfs = eia860_transformed_dfs.copy()
# EIA 861 is currently disabled:
#eia_transformed_dfs.update(eia861_transformed_dfs.copy())
eia_transformed_dfs.update(eia923_transformed_dfs.copy())

In [69]:
# Convert types: rebinds eia_transformed_dfs to the output of the dtype helper.
# NOTE(review): presumably casts each table's columns to the dtypes pudl
# defines for the 'eia' data source -- confirm in pudl.helpers.
eia_transformed_dfs = pudl.helpers.convert_dfs_dict_dtypes(eia_transformed_dfs, 'eia')

  mask = arr == x
  mask = arr == x


In [70]:
# See if your datatype worked: bind the converted generators_eia860 table to
# `test` for inspection (the trailing comment is a leftover column selector).
test = eia_transformed_dfs['generators_eia860'] #['ferc_cogen_status']

In [128]:
# For plant-years that report more than one generator, count how often the
# generators do NOT share a single value for each FERC status/docket column.
ferc_cols = [
    'ferc_cogen_status',
    'ferc_cogen_docket_no',
    'ferc_small_power_producer',
    'ferc_small_power_producer_docket_no',
    'ferc_exempt_wholesale_generator',
    'ferc_exempt_wholesale_generator_docket_no',
]
test = test[['plant_id_eia', 'generator_id', 'report_date'] + ferc_cols]

# Number of distinct values of every column within each (plant, report date).
test2 = test.groupby(['plant_id_eia', 'report_date']).nunique()
# Keep only plant-years with more than one distinct generator_id.
test2 = test2[test2['generator_id'] > 1]
full_len = len(test2)

# A per-group nunique above 1 means the generators of that plant-year report
# differing values for the column.
inconsistent_counts = {col: int((test2[col] > 1).sum()) for col in ferc_cols}

# One "column: inconsistent / total" line per FERC column.
print(' \n'.join(
    f'{col}: {n_inconsistent} / {full_len}'
    for col, n_inconsistent in inconsistent_counts.items()
))

#test[test['plant_id_eia']==10123]

ferc_cogen_status: 36 / 8667 
ferc_cogen_docket_no: 43 / 8667 
ferc_small_power_producer: 15 / 8667 
ferc_small_power_producer_docket_no: 30 / 8667 
ferc_exempt_wholesale_generator: 8 / 8667 
ferc_exempt_wholesale_generator_docket_no: 38 / 8667


In [129]:
# NOTE: you can't run this twice in a row -- need to reload eia_transformed_dfs.
# This cell rebinds eia_transformed_dfs to the transform's output, so the
# input it needs is gone after one run; re-run the cells that build
# eia_transformed_dfs before executing this again.
# Returns two dictionaries: the entity tables and the updated EIA tables.
entities_dfs, eia_transformed_dfs = pudl.transform.eia.transform(
    eia_transformed_dfs,
    eia860_years=eia860_years,
    eia923_years=eia923_years,
)

Harvesting IDs & consistently static attributes for EIA plants
Average consistency of static plants values is 99.56%
Harvesting IDs & consistently static attributes for EIA generators
Average consistency of static generators values is 99.98%
Harvesting IDs & consistently static attributes for EIA utilities
Average consistency of static utilities values is 100.00%
Harvesting IDs & consistently static attributes for EIA boilers
Average consistency of static boilers values is 99.92%
Inferring complete EIA boiler-generator associations.
Multiple EIA unit codes:plant_id_eia=10725, unit_id_pudl=1, unit_id_eia=['F801' 'F802']
Multiple EIA unit codes:plant_id_eia=56309, unit_id_pudl=1, unit_id_eia=['G401' 'G402']


In [130]:
# Apply the same 'eia' dtype conversion helper to the entity tables.
entities_dfs = pudl.helpers.convert_dfs_dict_dtypes(entities_dfs, 'eia')

  mask = arr == x


In [131]:
# Group the two table dictionaries under the labels used by the load step.
transformed_dfs = {"Entities": entities_dfs, "EIA": eia_transformed_dfs}

In [156]:
# Double check that the column is still in there
# NOTE(review): this only binds the plants_eia860 table to `test`; nothing is
# displayed by this cell.
test = transformed_dfs['EIA']['plants_eia860']

In [158]:
# LOAD step: dump each group of tables ("Entities", then "EIA") to CSV files
# in datapkg_dir. Each value in transformed_dfs is itself a dictionary of
# DataFrames, which is what dict_dump receives.
for source_label, dfs_by_table in transformed_dfs.items():
    pudl.load.csv.dict_dump(
        dfs_by_table,
        source_label,
        datapkg_dir=datapkg_dir,
    )
# return (
#     list(eia_transformed_dfs.keys())
#     + list(entities_dfs.keys())
#     + static_tables)

Loading Entities plants_entity_eia dataframe into CSV
Loading Entities generators_entity_eia dataframe into CSV
Loading Entities utilities_entity_eia dataframe into CSV
Loading Entities boilers_entity_eia dataframe into CSV
Loading EIA ownership_eia860 dataframe into CSV
Loading EIA generators_eia860 dataframe into CSV
Loading EIA plants_eia860 dataframe into CSV
Loading EIA boiler_generator_assn_eia860 dataframe into CSV
Loading EIA utilities_eia860 dataframe into CSV
Loading EIA generation_fuel_eia923 dataframe into CSV
Loading EIA boiler_fuel_eia923 dataframe into CSV
Loading EIA generation_eia923 dataframe into CSV
Loading EIA coalmine_eia923 dataframe into CSV
Loading EIA fuel_receipts_costs_eia923 dataframe into CSV


In [231]:
# List the names of the transformed EIA tables.
transformed_dfs['EIA'].keys()

dict_keys(['ownership_eia860', 'generators_eia860', 'plants_eia860', 'boiler_generator_assn_eia860', 'utilities_eia860', 'generation_fuel_eia923', 'boiler_fuel_eia923', 'generation_eia923', 'coalmine_eia923', 'fuel_receipts_costs_eia923'])

In [149]:
# Aaaaand make sure it loaded properly
# NOTE(review): this inspects the in-memory plants_eia860 table from
# transformed_dfs, not the CSV written by the load step.
test = transformed_dfs['EIA']['plants_eia860']
test['ferc_cogen_docket_no']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
12443    NaN
12444    NaN
12445    NaN
12446    NaN
12447    NaN
Name: ferc_cogen_docket_no, Length: 12448, dtype: object

In [130]:
# Before running the cells below, load the data package into SQLite by
# running this command from a shell:
#   datapkg_to_sqlite PUDL_DIR/datapkg/pudl-2008/eia-example/datapackage.json -c

In [90]:
# little helper function
def get_full_sql_table(table_name, engine):
    """Read an entire SQL table into a DataFrame.

    Args:
        table_name (str): Name of the table to read. It must exist in the
            database that ``engine`` is connected to.
        engine (sqlalchemy.engine.Engine): Connectable used both to reflect
            the database schema and to run the query.

    Returns:
        pandas.DataFrame: Every row and column of ``table_name``.

    Raises:
        KeyError: If ``table_name`` is not among the reflected tables.
    """
    # Reflect the database schema so the table's columns are known.
    md = sa.MetaData()
    md.reflect(engine)
    table = md.tables[table_name]
    # Table.select() builds the same "select everything from this table"
    # statement as the list-style select([table]) call, which SQLAlchemy 1.4
    # deprecated and 2.0 removed; the method form works on both.
    select = table.select()
    # Run the statement and collect the result set into a DataFrame.
    return pd.read_sql(select, engine)
# Generate a SQLAlchemy database engine from the "pudl_db" entry of the
# default PUDL settings:
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings["pudl_db"])
# Read the whole utilities_eia860 table back out of the database.
table_name='utilities_eia860'
table_df = get_full_sql_table(table_name=table_name, engine=pudl_engine)

In [91]:
table_df

Unnamed: 0,id,utility_id_eia,report_date,street_address,city,state,zip_code,plants_reported_owner,plants_reported_operator,plants_reported_asset_manager,...,entity_type,attention_line,address_2,zip_code_4,contact_firstname,contact_lastname,contact_title,contact_firstname_2,contact_lastname_2,contact_title_2
0,1,62789,2018-01-01,300 Spectrum Center Drive Suit,Irvine,CA,92618,True,,,...,Independent Power Producer,,,,,,,,,
1,2,62788,2018-01-01,300 Spectrum Center Drive Suit,Irvine,CA,92618,True,,,...,Independent Power Producer,,,,,,,,,
2,3,62787,2018-01-01,"1900 East Golf Road, Suite 103",Schaumburg,IL,60173,True,,True,...,Independent Power Producer,,,,,,,,,
3,4,62784,2018-01-01,"11550 Ash St, Ste 300",Leawood,KS,66211,True,,,...,Independent Power Producer,,,,,,,,,
4,5,62778,2018-01-01,100 California St Suite 400,San Francisco,CA,94118,True,,,...,Independent Power Producer,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61182,61183,23,2007-01-01,5527 Paseo Arnaldo,Yorba Linda,CA,92886,,,,...,,,,,A,Giusiana,Vice President,A,Giusiana,Vice President
61183,61184,21,2007-01-01,,Panama,OK,74951,,,,...,,,P O Box 1740,,Lundy,Kiger,Vice President,Ruben,Soroeta,President
61184,61185,20,2007-01-01,1001 Nineteenth St 20th Floor,Arlington,VA,22209,,,,...,,,,,Paul,Burdick,Vice President,,,
61185,61186,8,2007-01-01,2245 Sagamore Parkway North,Lafayette,IN,47904,,,,...,,,,,Scott A,Market,Utility Resource,William D,Coats,Utilities Manager
