# EIA923 Column Changes
This notebook reimplements the Excel extractor process so that each sheet of each Excel file is extracted separately. This preserves the original structure for easier comparison.

In [1]:
%load_ext autoreload
%autoreload 2
import pudl
from pudl import constants as pc
from pathlib import Path
import pandas as pd
# Raise pandas' display limits so wide/long comparison tables are shown in full.
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 150)

In [2]:
# Make the notebook full width so wide DataFrames are readable.
# `display` and `HTML` come from the public `IPython.display` module: importing
# `display` from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [4]:
# The 'eia923' entry of pudl.constants.PUDL_TABLES.
eia923_tables = pc.PUDL_TABLES['eia923']
# Report years to inspect: 2001 through 2019 inclusive.
eia923_years = [*range(2001, 2020)]

In [5]:
# PudlPaths supplies the data directory that is used below as the local cache path.
from pudl.workspace.setup import PudlPaths

# Datastore whose local cache path is the `data_dir` reported by PudlPaths.
ds = pudl.workspace.datastore.Datastore(local_cache_path=PudlPaths().data_dir)

In [6]:
# EIA-923 extractor built on the datastore `ds`; its metadata and helper
# methods drive the per-sheet reads in the extraction loop.
eia923_extractor = pudl.extract.eia923.Extractor(ds)

In [7]:
# Per-sheet extraction, lightly adapted from
# pudl.extract.excel.GenericExtractor.extract. Each (year, page) sheet is kept
# as its own DataFrame instead of being concatenated prematurely, giving
# dfs[year][page] -> DataFrame.
dfs = {}
metadata = eia923_extractor._metadata
for partition in pudl.helpers.iterate_multivalue_dict(year=eia923_years):
    year_pages = {}
    dfs[partition['year']] = year_pages
    for page in metadata.get_all_pages():
        # Pages whose filename is the '-1' sentinel are skipped
        # (presumably: the page does not exist for this partition).
        if eia923_extractor.excel_filename(page, **partition) == '-1':
            continue
        excel_file = eia923_extractor.load_excel_file(page, **partition)
        read_options = dict(
            sheet_name=metadata.get_sheet_name(page, **partition),
            skiprows=metadata.get_skiprows(page, **partition),
            skipfooter=metadata.get_skipfooter(page, **partition),
            dtype=eia923_extractor.get_dtypes(page, **partition),
            # Only the first 20 rows are read; presumably enough because
            # only the column names are compared later on.
            nrows=20,
        )
        # Same post-processing steps as before, applied in the same order.
        year_pages[page] = (
            pd.read_excel(excel_file, **read_options)
            .pipe(pudl.helpers.simplify_columns)
            .pipe(eia923_extractor.process_raw, page, **partition)
            .pipe(eia923_extractor.process_renamed, page, **partition)
        )

In [8]:
# Show which pages were extracted for each year.
for year, pages in dfs.items():
    print(year, pages.keys())

2001 dict_keys(['generation_fuel', 'stocks'])
2002 dict_keys(['generation_fuel', 'stocks'])
2003 dict_keys(['generation_fuel', 'stocks'])
2004 dict_keys(['generation_fuel', 'stocks'])
2005 dict_keys(['generation_fuel', 'stocks'])
2006 dict_keys(['generation_fuel', 'stocks'])
2007 dict_keys(['generation_fuel', 'stocks'])
2008 dict_keys(['boiler_fuel', 'fuel_receipts_costs', 'generation_fuel', 'generator', 'stocks'])
2009 dict_keys(['boiler_fuel', 'fuel_receipts_costs', 'generation_fuel', 'generator', 'stocks'])
2010 dict_keys(['boiler_fuel', 'fuel_receipts_costs', 'generation_fuel', 'generator', 'stocks'])
2011 dict_keys(['boiler_fuel', 'fuel_receipts_costs', 'generation_fuel', 'generator', 'plant_frame', 'stocks'])
2012 dict_keys(['boiler_fuel', 'fuel_receipts_costs', 'generation_fuel', 'generator', 'plant_frame', 'stocks'])
2013 dict_keys(['boiler_fuel', 'fuel_receipts_costs', 'generation_fuel', 'generator', 'plant_frame', 'stocks'])
2014 dict_keys(['boiler_fuel', 'fuel_receipts_costs

In [23]:
# Gather the column names of every extracted sheet: for each page, a list with
# one single-column frame per year (indexed by column name, labelled with the year).
from collections import defaultdict
col_dfs = defaultdict(list)
for page in eia923_extractor._metadata.get_all_pages():
    for year in eia923_years:
        page_frames = col_dfs[page]
        try:
            year_columns = dfs[year][page].columns.to_frame()
            page_frames.append(year_columns.rename(columns={0: year}))
        except KeyError:
            # No sheet was extracted for this (year, page) combination.
            continue

In [24]:
# Turn each page's list of per-year frames into one DataFrame with one row per
# year and one column per column name (NaN where a year lacks that column).
# NOTE: this rebinds `col_dfs` from lists to DataFrames, so the cell that builds
# the lists must be re-run before this one is run again.
col_dfs = {
    page: pd.concat(year_frames, axis=1).T
    for page, year_frames in col_dfs.items()
}

This shows the column names for each year of each sheet. In any year where a column is absent (not yet introduced, or dropped), its entry is NaN. This particular page only has data from 2011 on.

In [35]:
# Full year-by-column table for the plant_frame page.
col_dfs['plant_frame']

Unnamed: 0,report_year,plant_id_eia,plant_name_eia,plant_state,eia_sector,sector_name,naics_code,combined_heat_power,reporting_frequency,nameplate_capacity_mw
2011,report_year,plant_id_eia,plant_name_eia,plant_state,eia_sector,sector_name,naics_code,combined_heat_power,reporting_frequency,nameplate_capacity_mw
2012,report_year,plant_id_eia,plant_name_eia,plant_state,eia_sector,,naics_code,combined_heat_power,reporting_frequency,
2013,report_year,plant_id_eia,plant_name_eia,plant_state,eia_sector,sector_name,naics_code,combined_heat_power,reporting_frequency,
2014,report_year,plant_id_eia,plant_name_eia,plant_state,eia_sector,,naics_code,combined_heat_power,reporting_frequency,
2015,report_year,plant_id_eia,plant_name_eia,plant_state,eia_sector,,naics_code,combined_heat_power,reporting_frequency,
2016,report_year,plant_id_eia,plant_name_eia,plant_state,eia_sector,,naics_code,combined_heat_power,reporting_frequency,
2017,report_year,plant_id_eia,plant_name_eia,plant_state,eia_sector,,naics_code,combined_heat_power,reporting_frequency,
2018,report_year,plant_id_eia,plant_name_eia,plant_state,eia_sector,,naics_code,combined_heat_power,reporting_frequency,
2019,report_year,plant_id_eia,plant_name_eia,plant_state,eia_sector,,naics_code,combined_heat_power,reporting_frequency,


Filter for columns with any NaNs to avoid filling the screen with unchanged columns

In [36]:
# Keep only the columns that are missing (NaN) in at least one year.
plant_frame_cols = col_dfs['plant_frame']
plant_frame_cols.loc[:, plant_frame_cols.isna().any()]

Unnamed: 0,sector_name,nameplate_capacity_mw
2011,sector_name,nameplate_capacity_mw
2012,,
2013,sector_name,
2014,,
2015,,
2016,,
2017,,
2018,,
2019,,


Look at all the dataframes this way

In [25]:
# Stateful iterator over (page, frame) pairs; each following cell consumes one
# item with next(a), so re-run this cell to start again from the first page.
a = iter(col_dfs.items())

In [29]:
# Advance the iterator `a` to the next page and show only the columns
# that are missing (NaN) in at least one year.
page_name, page_columns = next(a)
print(page_name)
missing_in_some_year = page_columns.isna().any()
page_columns.loc[:, missing_in_some_year]

boiler_fuel


2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018


fuel_receipts_costs has a weird column dropout: `mercury_content_ppm` is missing in 2013. But I checked the raw Excel sheet and the dropout is real.

In [30]:
# Advance the iterator `a` to the next page and show only the columns
# that are missing (NaN) in at least one year.
page_name, page_columns = next(a)
print(page_name)
missing_in_some_year = page_columns.isna().any()
page_columns.loc[:, missing_in_some_year]

fuel_receipts_costs


Unnamed: 0,mercury_content_ppm,natural_gas_delivery_contract_type_code,moisture_content_pct,chlorine_content_ppm
2008,,,,
2009,,,,
2010,,,,
2011,,,,
2012,mercury_content_ppm,,,
2013,,,,
2014,mercury_content_ppm,,,
2015,mercury_content_ppm,,,
2016,mercury_content_ppm,natural_gas_delivery_contract_type_code,moisture_content_pct,chlorine_content_ppm
2017,mercury_content_ppm,natural_gas_delivery_contract_type_code,moisture_content_pct,chlorine_content_ppm


In [31]:
# Advance the iterator `a` to the next page and show only the columns
# that are missing (NaN) in at least one year.
page_name, page_columns = next(a)
print(page_name)
missing_in_some_year = page_columns.isna().any()
page_columns.loc[:, missing_in_some_year]

generation_fuel


2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011


In [32]:
# Advance the iterator `a` to the next page and show only the columns
# that are missing (NaN) in at least one year.
page_name, page_columns = next(a)
print(page_name)
missing_in_some_year = page_columns.isna().any()
page_columns.loc[:, missing_in_some_year]

generator


2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018


plant_frame has a few ephemeral columns. Again, checking the raw EIA Excel sheets shows that they really are short-lived: `sector_name` exists only in 2011 and 2013, and `nameplate_capacity_mw` only in 2011.

In [33]:
# Advance the iterator `a` to the next page and show only the columns
# that are missing (NaN) in at least one year.
page_name, page_columns = next(a)
print(page_name)
missing_in_some_year = page_columns.isna().any()
page_columns.loc[:, missing_in_some_year]

plant_frame


Unnamed: 0,sector_name,nameplate_capacity_mw
2011,sector_name,nameplate_capacity_mw
2012,,
2013,sector_name,
2014,,
2015,,
2016,,
2017,,
2018,,
2019,,


In [34]:
# Advance the iterator `a` to the next page and show only the columns
# that are missing (NaN) in at least one year.
page_name, page_columns = next(a)
print(page_name)
missing_in_some_year = page_columns.isna().any()
page_columns.loc[:, missing_in_some_year]

stocks


2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
