In [None]:
# Load IPython's autoreload extension (configured in the next cell).
%load_ext autoreload

In [None]:
# Mode 2: re-import modules before each cell runs, so edits to the imported source are picked up live.
%autoreload 2

In [None]:
import os
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd

import pudl
import pudl.extract.eia_bulk_elec as ebulk
import pudl.transform.eia_bulk_elec as tbulk

In [None]:
# Location of the bulk electricity archive. Set the EIA_BULK_ELEC_ZIP environment
# variable to point the notebook at a different copy; the original developer
# path remains the default so existing setups keep working.
fpath = Path(
    os.environ.get(
        'EIA_BULK_ELEC_ZIP',
        '/home/tpb/catalyst/workspace/data/eia_bulk_data/ELEC.zip',
    )
)
# Fail early, and say which path was tried.
assert fpath.exists(), f"bulk electricity archive not found: {fpath}"

In [None]:
%%time
# The cell magic above times this cell. Run the project's extract step on the archive at fpath.
raw = ebulk.extract(fpath)

In [None]:
# Pull the two extracted tables out of the result by key.
raw_meta, raw_ts = raw['metadata'], raw['timeseries']

In [None]:
# Peek at the first two rows of the raw metadata.
raw_meta.head(2)

In [None]:
# Preview the first two rows returned by the project's metadata transform helper.
tbulk._transform_metadata(raw_meta).head(2)

In [None]:
# Parse 'last_updated' as datetimes; the result is only displayed, not stored.
pd.to_datetime(raw_meta['last_updated'])

In [None]:
# Build the working metadata table: the raw columns (minus 'name') placed side by
# side with the outputs of the two `_extract_keys_*` project helpers.
meta_parts = [
    raw_meta.drop(columns=['name']),
    tbulk._extract_keys_from_name(raw_meta),
    tbulk._extract_keys_from_series_id(raw_meta),
]
meta = pd.concat(meta_parts, axis=1)

In [None]:
# Spot-check five random metadata rows (unseeded, so the rows differ between runs).
meta.sample(5)

In [None]:
# Compare the dimensions of the metadata and raw timeseries tables.
meta.shape, raw_ts.shape

In [None]:
# Another unseeded random sample of five metadata rows.
meta.sample(5)

In [None]:
# `ts` is used from this cell onward but no earlier cell assigns it, so a fresh
# kernel (Restart & Run All) stopped here with a NameError. Bind it explicitly.
# NOTE(review): assumes `ts` was meant to be the untransformed timeseries frame,
# since it is later passed to the project's timeseries transform — confirm.
ts = raw_ts
ts.head(5)

In [None]:
# Fraction of missing values in each timeseries column.
ts.isna().mean()

In [None]:
# Share of missing 'value' entries, grouped by the last character of each series id.
series_id_suffix = ts['series_id'].str[-1]
is_missing = ts.isna()
is_missing.groupby(series_id_suffix)['value'].mean()

In [None]:
# Per-column memory footprint in MiB (bytes / 2**20); deep=True also counts the contents of object columns.
ts.memory_usage(deep=True) / 2**20

In [None]:
# Summary statistics for the metadata table.
meta.describe()

In [None]:
# Same summary, restricted to columns that hold at least one non-null value.
has_data = meta.notna().any()
meta.loc[:, has_data].describe()

In [None]:
# Flag columns that carry no information: entirely null, or entirely the
# literal string 'None'.
all_nan = meta.isnull().all(axis='index')
all_none = (meta == 'None').all(axis='index')
to_drop = all_nan | all_none
dropped_col_names = meta.columns[to_drop]

In [None]:
# Show the flagged column names as a set.
set(dropped_col_names)

In [None]:
# Project helper; judging by its name it reports the empty columns, for comparison with the manual set above.
tbulk._get_empty_col_names(meta)

In [None]:
# Project helper; judging by its name it reports redundant frequency columns.
tbulk._get_redundant_frequency_col_names(meta)

In [None]:
# Project helper; judging by its name it reports columns holding a single constant value.
tbulk._get_constant_col_names(meta)

In [None]:
# Number of metadata rows per fuel code.
meta['fuel_code'].value_counts()

In [None]:
# Columns with exactly one distinct value (nunique ignores nulls by default).
meta.columns[meta.nunique() == 1]

In [None]:
# Split 'geoset_id' on '-' into one column per piece.
# NOTE(review): geo_parts is not referenced by any later cell visible here.
geo_parts = meta['geoset_id'].str.split('-', expand=True)

In [None]:
# Project helper that returns the column names to drop; its result is passed to meta.drop(columns=...) in a later cell.
tbulk._get_col_names_to_drop(meta)

In [None]:
# Remove the column-width limit so long text columns are shown in full.
pd.options.display.max_colwidth = None

In [None]:
# Reproducible sample of five rows, showing only the series id and its description.
id_and_description = meta[['series_id', 'description']]
id_and_description.sample(5, random_state=2)

In [None]:
# Metadata with the columns named by the project helper removed.
cols_to_drop = tbulk._get_col_names_to_drop(meta)
dmeta = meta.drop(columns=cols_to_drop)

In [None]:
# Reproducible sample of five rows from the reduced metadata.
dmeta.sample(5, random_state=3)

In [None]:
# Confirm Connecticut is in "geography" for region code NEW, because it is
# missing from "description".
is_region_new = dmeta['region_code'] == "NEW"
dmeta[is_region_new].head(3)

In [None]:
# First description per (fuel, fuel_code) pair, keeping only the leading
# ';'-separated segment (column 0).
first_desc_by_fuel = dmeta.groupby(["fuel", "fuel_code"])['description'].first()
first_desc_by_fuel.str.strip('; ').str.split(';', expand=True)[[0]]

In [None]:
# First description per (sector, sector_code) pair, keeping only the second
# ';'-separated segment (column 1).
first_desc_by_sector = dmeta.groupby(["sector", "sector_code"])['description'].first()
first_desc_by_sector.str.strip('; ').str.split(';', expand=True)[[1]]

In [None]:
# Hand-written reference table describing each fuel code.
# Built directly from records instead of being parsed from tab-separated text,
# so it does not depend on invisible tab characters surviving copy/paste.
# Spelling errors in the descriptions ("petroluem", "distallte", "kerosense")
# are corrected, and kerosene / waste oil are listed as separate fuels.
fuel_descriptions = pd.DataFrame(
    [
        ("bituminous coal", "BIT", "Bituminous Coal"),
        ("coal", "COW", "Summation of all types of coal"),
        ("lignite coal", "LIG", "Lignite Coal"),
        ("natural gas", "NG", "Natural Gas"),
        ("petroleum coke", "PC", "Petroleum Coke (solid residual petroleum)"),
        (
            "petroleum liquids",
            "PEL",
            "Summation of all petroleum liquids (distillate fuel oil, jet fuel, "
            "residual fuel oil, kerosene, waste oil and other petroleum liquids)",
        ),
        ("subbituminous coal", "SUB", "Subbituminous Coal"),
    ],
    columns=["fuel", "fuel_code", "description"],
)
fuel_descriptions

In [None]:
# Write the third ';'-separated description segment (column 2) for each
# (region, region_code) pair into an in-memory CSV buffer.
out = StringIO()
first_desc_by_region = dmeta.groupby(["region", "region_code"])['description'].first()
region_segment = first_desc_by_region.str.strip('; ').str.split(';', expand=True)[[2]]
region_segment.to_csv(out, sep=",")

In [None]:
# Print the buffered CSV text as plain text.
print(out.getvalue())

In [None]:
# Attach the descriptive metadata columns to every observation, then drop the join key.
meta_cols = ['series_id', 'units', 'fuel', 'region', 'sector', 'frequency']
wide_ts = ts.merge(dmeta[meta_cols], on='series_id').drop(columns='series_id')

In [None]:
# Index by every identifying column, then pivot the 'units' level out into columns.
index_cols = ['units', 'fuel', 'region', 'sector', 'frequency', 'date']
wide_ts = wide_ts.set_index(index_cols).unstack(level='units')

In [None]:
# After the unstack the columns should be a two-level index whose outer level is
# the original, unnamed column level. The previous `droplevel(level=None)` only
# worked because pandas looks levels up by name and that level's name is None.
# Drop it by position instead, and skip the step when the columns are already
# flat so that re-running this cell does not raise.
if wide_ts.columns.nlevels > 1:
    wide_ts.columns = wide_ts.columns.droplevel(level=0)

In [None]:
# Replace the unit labels with descriptive column names. The result is rebound
# rather than renamed with inplace=True, which gives no performance benefit and
# mutates an object that earlier cells may already have displayed.
wide_ts = wide_ts.rename(
    columns={
        'billion Btu': 'receipts_billion_btu',
        'dollars per million Btu': 'cost_dollars_per_mmbtu',
    }
)

In [None]:
# Spot-check five random rows of the reshaped table (unseeded).
wide_ts.sample(5)

In [None]:
# Clear the name attached to the columns axis (presumably 'units', left over from the unstack — confirm).
wide_ts.columns.name = None

In [None]:
# Inspect the final column labels.
wide_ts.columns

In [None]:
# Run the project's timeseries transform on ts and dmeta and sample five rows,
# presumably to compare against the manual reshaping in the preceding cells.
tbulk._transform_timeseries(ts, dmeta).sample(5)

In [None]:
# Lookup table of region name, region code and the states each region includes,
# parsed from inline CSV text. The quoted third field is a comma-separated list.
# Fixed: "Massachusetts" was misspelled, which would break any join on state name.
# NOTE(review): "the District of Columbia" keeps its leading "the " — confirm
# whether downstream consumers expect the bare name.
cen_reg = pd.read_csv(
    StringIO(
        """
region,region_code,included_states
East North Central,ENC,"Illinois,Indiana,Michigan,Ohio,Wisconsin"
East South Central,ESC,"Alabama,Kentucky,Mississippi,Tennessee"
Middle Atlantic,MAT,"New Jersey,New York,Pennsylvania"
Mountain,MTN,"Arizona,Colorado,Idaho,Montana,Nevada,New Mexico,Utah,Wyoming"
New England,NEW,"Maine,Massachusetts,New Hampshire,Rhode Island,Vermont,Connecticut"
Pacific Contiguous,PCC,"California,Oregon,Washington"
Pacific Noncontiguous,PCN,"Alaska,Hawaii"
South Atlantic,SAT,"Delaware,the District of Columbia,Florida,Georgia,Maryland,North Carolina,South Carolina,Virginia,West Virginia"
West North Central,WNC,"Iowa,Kansas,Minnesota,Missouri,Nebraska,North Dakota,South Dakota"
West South Central,WSC,"Arkansas,Louisiana,Oklahoma,Texas"
    """
    ),
)

In [None]:
# Display the parsed region lookup table.
cen_reg

In [None]:
# Spread each region's state list into one column per state, then stack those
# columns into a long (variable, value) table.
state_columns = cen_reg['included_states'].str.split(',', expand=True)
state_columns.melt()

In [None]:
# One row per (region, state): spread the comma-separated state list into
# columns next to the region identifiers, melt those columns into rows, then
# discard the padding rows and the helper column.
(
    pd.concat(
        [
            cen_reg[['region', 'region_code']],
            cen_reg['included_states'].str.split(',', expand=True),
        ],
        axis=1,
    )
    .melt(id_vars=['region', 'region_code'], value_name='state')
    # Regions with fewer states are padded with missing values by the split.
    # `subset` is given as a list because a bare string is only accepted by
    # pandas 1.5 and later.
    .dropna(subset=['state'])
    .drop(columns='variable')
    .sort_values('region')
)