# EIA923 Column Changes
This notebook reimplements the Excel extractor process so that each sheet of each Excel file is extracted separately. This preserves the original structure of every sheet, which makes year-to-year comparison easier.

In [None]:
%load_ext autoreload
%autoreload 2
import pudl
from pudl import constants as pc
from pathlib import Path
import pandas as pd
# Show up to 150 rows and 150 columns so the wide per-year column tables are not truncated.
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 150)

In [None]:
# Make the notebook full width by injecting a CSS override for the page container.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from the internal `IPython.core.display` path is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# Years to compare; the range end is exclusive, so this covers 2001 through 2019.
eia923_years = [*range(2001, 2020)]
# The 'eia923' entry of PUDL_TABLES (not referenced again in the cells shown here).
eia923_tables = pc.PUDL_TABLES['eia923']

In [None]:
from pudl.workspace.setup import PudlPaths

# Datastore whose local cache path is the `data_dir` attribute of a PudlPaths instance.
ds = pudl.workspace.datastore.Datastore(
    local_cache_path=PudlPaths().data_dir,
)

In [None]:
# EIA-923 Excel extractor built on the datastore `ds`. Later cells call its
# methods directly and read its private `_metadata` attribute.
eia923_extractor = pudl.extract.eia923.Extractor(ds)

In [None]:
# Adapted from pudl.extract.excel.GenericExtractor.extract, but every sheet is kept as
# its own dataframe in dfs[year][page] instead of being concatenated across years.
metadata = eia923_extractor._metadata
dfs = {}
for partition in pudl.helpers.iterate_multivalue_dict(year=eia923_years):
    pages_for_year = {}
    dfs[partition['year']] = pages_for_year
    for page in metadata.get_all_pages():
        # Pages whose filename is the string '-1' are skipped
        # (presumably the marker for "no file for this year" -- TODO confirm).
        if eia923_extractor.excel_filename(page, **partition) == '-1':
            continue
        excel_file = eia923_extractor.load_excel_file(page, **partition)
        sheet_df = pd.read_excel(
            excel_file,
            sheet_name=metadata.get_sheet_name(page, **partition),
            skiprows=metadata.get_skiprows(page, **partition),
            skipfooter=metadata.get_skipfooter(page, **partition),
            dtype=eia923_extractor.get_dtypes(page, **partition),
            # Only the column labels are used by the later cells, so a few rows suffice.
            nrows=20,
        )
        sheet_df = pudl.helpers.simplify_columns(sheet_df)
        sheet_df = eia923_extractor.process_raw(sheet_df, page, **partition)
        pages_for_year[page] = eia923_extractor.process_renamed(sheet_df, page, **partition)

In [None]:
# List which pages were extracted for each year.
for year, pages in dfs.items():
    print(year, pages.keys())

In [None]:
# Collect, per Excel sheet (page), one single-column frame per year holding that
# year's column labels. The frames are combined into one table per page later on.
from collections import defaultdict

col_dfs = defaultdict(list)
for page in eia923_extractor._metadata.get_all_pages():
    for year in eia923_years:
        # Look the sheet up *before* touching col_dfs: indexing a defaultdict
        # creates the key, so doing `col_dfs[page].append(...)` inside the try
        # would leave an empty list behind for a page that has no data in any
        # year, and pd.concat raises ValueError on an empty list.
        try:
            sheet = dfs[year][page]
        except KeyError:
            # This page was not extracted for this year.
            continue
        # Index = column labels, single column named after the year.
        col_dfs[page].append(sheet.columns.to_frame().rename(columns={0: year}))

In [None]:
# Place each page's yearly frames side by side (one column per year), then
# transpose so that every row is a year and every column is a column label.
col_dfs = {
    page: pd.concat(year_frames, axis=1).T
    for page, year_frames in col_dfs.items()
}

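This shows the state of the columns for each year for each sheet: one row per year, one column per column name. An entry is NaN for every year in which that column is absent from the sheet (before it was introduced, or after it disappeared). This particular page only has data from 2011 on.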

In [None]:
# Display the per-year column table for the 'plant_frame' page.
col_dfs['plant_frame']

Filter for columns that contain at least one NaN, to avoid filling the screen with columns that never change.

In [None]:
# Keep only the columns that are missing (NaN) in at least one year.
plant_frame_cols = col_dfs['plant_frame']
plant_frame_cols.loc[:, plant_frame_cols.isna().any()]

Look at all the dataframes this way

In [None]:
# Iterator over (page, column table) pairs. Each of the following cells consumes
# one item with next(a), so they must be run once each and in order; re-run this
# cell to start again from the first page.
a = iter(col_dfs.items())

In [None]:
# Advance to the next page and show only the columns missing in at least one year.
# Each run consumes one item from `a`, so re-running this cell moves on to another page.
k, v = next(a)
print(k)
has_gap = v.isna().any()
v.loc[:, has_gap]

`fuel_receipts_costs` has an odd column dropout: the mercury content column is missing in 2013. I checked the raw Excel sheet, and the gap is real rather than an extraction artifact.

In [None]:
# Advance to the next page and show only the columns missing in at least one year.
# Each run consumes one item from `a`, so re-running this cell moves on to another page.
k, v = next(a)
print(k)
has_gap = v.isna().any()
v.loc[:, has_gap]

In [None]:
# Advance to the next page and show only the columns missing in at least one year.
# Each run consumes one item from `a`, so re-running this cell moves on to another page.
k, v = next(a)
print(k)
has_gap = v.isna().any()
v.loc[:, has_gap]

In [None]:
# Advance to the next page and show only the columns missing in at least one year.
# Each run consumes one item from `a`, so re-running this cell moves on to another page.
k, v = next(a)
print(k)
has_gap = v.isna().any()
v.loc[:, has_gap]

`plant_frame` has a few ephemeral columns. Again, checking the raw EIA Excel sheets shows that they really do exist only for 2011 and 2013.

In [None]:
# Advance to the next page and show only the columns missing in at least one year.
# Each run consumes one item from `a`, so re-running this cell moves on to another page.
k, v = next(a)
print(k)
has_gap = v.isna().any()
v.loc[:, has_gap]

In [None]:
# Advance to the next page and show only the columns missing in at least one year.
# Each run consumes one item from `a`, so re-running this cell moves on to another page.
k, v = next(a)
print(k)
has_gap = v.isna().any()
v.loc[:, has_gap]