In [1]:
import pandas as pd
from pathlib import Path

# Explore EIA API  and EIA Bulk data
API query:

(reproduciblilty note: replace `api_key=<REDACTED>` at the top)
```
https://api.eia.gov/v2/electricity/electric-power-operational-data/data/?api_key=<REDACTED>&frequency=quarterly&data[0]=cost-per-btu&data[1]=receipts-btu&facets[fueltypeid][]=ANT&facets[fueltypeid][]=BIS&facets[fueltypeid][]=DFO&facets[fueltypeid][]=FOS&facets[fueltypeid][]=LFG&facets[fueltypeid][]=MLG&facets[fueltypeid][]=NGO&facets[fueltypeid][]=OOG&facets[fueltypeid][]=PET&facets[fueltypeid][]=RC&facets[fueltypeid][]=RFO&facets[fueltypeid][]=WOC&facets[fueltypeid][]=WOO&facets[location][]=90&facets[location][]=AK&facets[location][]=AL&facets[location][]=AR&facets[location][]=AZ&facets[location][]=CA&facets[location][]=CO&facets[location][]=CT&facets[location][]=DC&facets[location][]=DE&facets[location][]=ENC&facets[location][]=ESC&facets[location][]=FL&facets[location][]=GA&facets[location][]=HI&facets[location][]=IA&facets[location][]=ID&facets[location][]=IL&facets[location][]=IN&facets[location][]=KS&facets[location][]=KY&facets[location][]=LA&facets[location][]=MA&facets[location][]=MAT&facets[location][]=MD&facets[location][]=ME&facets[location][]=MI&facets[location][]=MN&facets[location][]=MO&facets[location][]=MS&facets[location][]=MT&facets[location][]=MTN&facets[location][]=NC&facets[location][]=ND&facets[location][]=NE&facets[location][]=NEW&facets[location][]=NH&facets[location][]=NJ&facets[location][]=NM&facets[location][]=NV&facets[location][]=NY&facets[location][]=OH&facets[location][]=OK&facets[location][]=OR&facets[location][]=PA&facets[location][]=PCC&facets[location][]=PCN&facets[location][]=RI&facets[location][]=SAT&facets[location][]=SC&facets[location][]=SD&facets[location][]=TN&facets[location][]=TX&facets[location][]=US&facets[location][]=UT&facets[location][]=VA&facets[location][]=VT&facets[location][]=WA&facets[location][]=WI&facets[location][]=WNC&facets[location][]=WSC&facets[location][]=WV&facets[location][]=WY&facets[sectorid][]=1&facets[sectorid][]=2&facets[sectorid][]=3&facets[sectorid][]=4&facets[sectorid][]=5&facets[sectorid][]=6&facets[sectorid][]=7&facets[sectorid][]=90&facets[sectorid][]=94&facets[sectorid][]=96&facets[sectorid][]=97&facets[sectorid][]=98&facets[sectorid][]=99&start=2015-Q1&end=2015-Q1&sort[0][column]=period&sort[0][direction]=asc&offset=0&length=5000
```

In [5]:
# I manually edited the file to remove metadata and make it line-delimited for easier parsing.
path_api_stuff = Path('./api.eia.gov2015.json')
assert path_api_stuff.exists()

In [22]:
api_raw = pd.read_json(path_api_stuff, lines=True)

In [23]:
api_raw.head(3)

Unnamed: 0,period,location,stateDescription,sectorid,sectorDescription,fueltypeid,fuelTypeDescription,cost-per-btu,cost-per-btu-units,receipts-btu,receipts-btu-units
0,2015-Q1,ESC,East South Central,3,IPP CHP,BIT,bituminous coal,,dollars per million Btu,0.0,billion Btu
1,2015-Q1,ESC,East South Central,3,IPP CHP,COW,all coal products,,dollars per million Btu,0.0,billion Btu
2,2015-Q1,WSC,West South Central,4,Commercial Non-CHP,NG,natural gas,,dollars per million Btu,0.0,billion Btu


In [13]:
import sqlalchemy as sa

In [14]:
pudl_path = Path('../workspace/sqlite/pudl.sqlite')
assert pudl_path.exists()

In [17]:
engine = sa.create_engine("sqlite:///" + str(pudl_path.absolute()))

In [18]:
bulk = pd.read_sql(sql="""
select * from fuel_receipts_costs_aggs_eia
where report_date = date('2015-01-01')
    and temporal_agg = 'quarterly'
;
""",
                  con=engine)
bulk.head(3)

Unnamed: 0,fuel_agg,geo_agg,sector_agg,temporal_agg,report_date,fuel_received_mmbtu,fuel_cost_per_mmbtu
0,all_coal,AK,all_commercial,quarterly,2015-01-01,0.0,
1,all_coal,AK,all_electric_power,quarterly,2015-01-01,0.0,0.0
2,all_coal,AK,all_ipp,quarterly,2015-01-01,0.0,


In [24]:
api_raw.shape, bulk.shape

((2457, 11), (2029, 7))

## Transform API data to look like bulk data

In [21]:
from pudl.transform.eia_bulk_elec import _map_key_codes_to_readable_values

In [31]:
rename_dict = {'location': "geo_agg", "fueltypeid": 'fuel_agg', 'sectorid': 'sector_agg'}
api_keys = api_raw.loc[:, rename_dict.keys()].rename(columns=rename_dict).assign(temporal_agg='Q').astype(pd.StringDtype())

In [43]:
api = pd.concat(
    [_map_key_codes_to_readable_values(api_keys),
     api_raw.loc[:, ['cost-per-btu', 'receipts-btu']].rename(columns={'cost-per-btu': "fuel_cost_per_mmbtu", 'receipts-btu': "fuel_received_mmbtu"})
    ],
    #ignore_index=True,
    axis=1,
)

  keys.loc[:, col_name] = keys.loc[:, col_name].map(mapping)
  keys.loc[:, col_name] = keys.loc[:, col_name].map(mapping)
  keys.loc[:, col_name] = keys.loc[:, col_name].map(mapping)


In [44]:
api.head(3)

Unnamed: 0,geo_agg,fuel_agg,sector_agg,temporal_agg,fuel_cost_per_mmbtu,fuel_received_mmbtu
0,ESC,bituminous_coal,ipp_cogen,quarterly,,0.0
1,ESC,all_coal,ipp_cogen,quarterly,,0.0
2,WSC,natural_gas,commercial_non_cogen,quarterly,,0.0


In [45]:
api.shape

(2457, 6)

## Compare
### Missingness

In [52]:
# temporal_agg is constant in this case
key_columns = list(rename_dict.values())
value_columns = ['fuel_cost_per_mmbtu', "fuel_received_mmbtu"]
key_columns

['geo_agg', 'fuel_agg', 'sector_agg']

In [61]:
api[value_columns].isna().all(axis=1).sum()

0

In [62]:
bulk[value_columns].isna().all(axis=1).sum()

0

In [48]:
api.isna().agg(['sum', 'mean'])

Unnamed: 0,geo_agg,fuel_agg,sector_agg,temporal_agg,fuel_cost_per_mmbtu,fuel_received_mmbtu
sum,0.0,0.0,0.0,0.0,1649.0,0.0
mean,0.0,0.0,0.0,0.0,0.671144,0.0


In [49]:
bulk.isna().agg(['sum', 'mean'])

Unnamed: 0,fuel_agg,geo_agg,sector_agg,temporal_agg,report_date,fuel_received_mmbtu,fuel_cost_per_mmbtu
sum,0.0,0.0,0.0,0.0,0.0,12.0,1485.0
mean,0.0,0.0,0.0,0.0,0.0,0.005914,0.731888


In [50]:
api['fuel_cost_per_mmbtu'].notna().sum() - bulk['fuel_cost_per_mmbtu'].notna().sum()

264

In [58]:
joined = api[key_columns + value_columns].merge(bulk[key_columns+value_columns], on=key_columns, how='outer', suffixes = ['_api', '_bulk'])

In [59]:
joined.head()

Unnamed: 0,geo_agg,fuel_agg,sector_agg,fuel_cost_per_mmbtu_api,fuel_received_mmbtu_api,fuel_cost_per_mmbtu_bulk,fuel_received_mmbtu_bulk
0,ESC,bituminous_coal,ipp_cogen,,0.0,,0.0
1,ESC,all_coal,ipp_cogen,,0.0,,0.0
2,WSC,natural_gas,commercial_non_cogen,,0.0,,
3,WSC,petroleum_liquids,commercial_non_cogen,,0.0,,
4,ESC,all_coal,all_ipp,,9144.73092,,9144730.92


In [55]:
joined.shape

(2481, 7)

In [56]:
# number of rows unique to bulk data
joined.shape[0] - api.shape[0]

24

In [57]:
# number of rows unique to API data
joined.shape[0] - bulk.shape[0]

452

In [60]:
bulk_value_columns = [col+'_bulk' for col in value_columns]
api_value_columns = [col+'_api' for col in value_columns]

In [63]:
joined[bulk_value_columns].isna().all(axis=1).sum()

452

In [64]:
joined['api_only'] = joined[bulk_value_columns].isna().all(axis=1)
joined['bulk_only'] = joined[api_value_columns].isna().all(axis=1)

In [72]:
unique_bulk_data = joined.loc[joined['bulk_only'], key_columns + bulk_value_columns].sort_values(key_columns)
unique_bulk_data

Unnamed: 0,geo_agg,fuel_agg,sector_agg,fuel_cost_per_mmbtu_bulk,fuel_received_mmbtu_bulk
2467,DC,natural_gas,all_sectors,,0.0
2470,DC,petroleum_liquids,all_sectors,,0.0
2462,ESC,bituminous_coal,ipp_non_cogen,,1245188.4
2471,ESC,petroleum_liquids,industrial_non_cogen,,0.0
2476,ESC,sub_bituminous_coal,all_ipp,,784887.43
2477,ESC,sub_bituminous_coal,ipp_non_cogen,,784887.43
2457,KY,all_coal,all_ipp,,784887.43
2458,KY,all_coal,ipp_non_cogen,,784887.43
2478,KY,sub_bituminous_coal,all_ipp,,784887.43
2479,KY,sub_bituminous_coal,ipp_non_cogen,,784887.43


In [70]:
unique_api_data = joined.loc[joined['api_only'], key_columns + api_value_columns].sort_values(key_columns)

In [71]:
unique_api_data[key_columns].describe()

Unnamed: 0,geo_agg,fuel_agg,sector_agg
count,452,452,452
unique,59,7,12
top,MO,petroleum_liquids,all_commercial
freq,19,180,82


In [73]:
unique_api_data.describe()

Unnamed: 0,fuel_cost_per_mmbtu_api,fuel_received_mmbtu_api
count,134.0,452.0
mean,0.0,0.0
std,0.0,0.0
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,0.0,0.0


In [74]:
unique_api_data.sample(10)

Unnamed: 0,geo_agg,fuel_agg,sector_agg,fuel_cost_per_mmbtu_api,fuel_received_mmbtu_api
1348,NEW,natural_gas,industrial_non_cogen,,0.0
1090,AK,petroleum_liquids,ipp_cogen,,0.0
1464,MI,petroleum_liquids,commercial_non_cogen,,0.0
804,IA,petroleum_liquids,ipp_non_cogen,,0.0
369,LA,petroleum_liquids,all_industrial,0.0,0.0
2138,UT,petroleum_liquids,industrial_cogen,,0.0
149,WNC,lignite_coal,industrial_cogen,,0.0
991,IL,petroleum_liquids,ipp_cogen,,0.0
185,ESC,bituminous_coal,all_commercial,,0.0
1211,WI,all_coal,ipp_non_cogen,,0.0


In [78]:
[df.query('fuel_received_mmbtu != 0').shape for df in (api, bulk)]

[(1510, 6), (1534, 7)]

In [92]:
[df.query('fuel_received_mmbtu != 0 and not fuel_cost_per_mmbtu.isna()').shape for df in (api, bulk)]

[(459, 6), (467, 7)]

In [82]:
joined_no_zeros = (api.query('fuel_received_mmbtu != 0')[key_columns + value_columns]
                   .merge(bulk.query('fuel_received_mmbtu != 0')[key_columns+value_columns],
                          on=key_columns, how='outer', suffixes = ['_api', '_bulk']
                         )
                  )
joined_no_zeros.shape

(1534, 7)

In [83]:
joined_no_zeros['api_only'] = joined_no_zeros[bulk_value_columns].isna().all(axis=1)
joined_no_zeros['bulk_only'] = joined_no_zeros[api_value_columns].isna().all(axis=1)

In [85]:
joined_no_zeros[['api_only', 'bulk_only']].agg(['count', 'sum', 'mean'])

Unnamed: 0,api_only,bulk_only
count,1534.0,1534.0
sum,0.0,24.0
mean,0.0,0.015645


In [88]:
# zero price information in these records
joined_no_zeros[joined_no_zeros['bulk_only']].drop(columns=api_value_columns) # all NaN

Unnamed: 0,geo_agg,fuel_agg,sector_agg,fuel_cost_per_mmbtu_bulk,fuel_received_mmbtu_bulk,api_only,bulk_only
1510,KY,all_coal,all_ipp,,784887.43,False,True
1511,KY,all_coal,ipp_non_cogen,,784887.43,False,True
1512,MN,all_coal,industrial_non_cogen,,270441.72,False,True
1513,ESC,bituminous_coal,ipp_non_cogen,,1245188.4,False,True
1514,MS,bituminous_coal,all_ipp,,1245188.4,False,True
1515,MS,bituminous_coal,ipp_non_cogen,,1245188.4,False,True
1516,MAT,natural_gas,all_commercial,0.0,,False,True
1517,NC,natural_gas,ipp_cogen,,13858934.15,False,True
1518,US,natural_gas,commercial_non_cogen,0.0,,False,True
1519,DE,petroleum_liquids,electric_utility,0.0,,False,True
