In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path

In [3]:
zip_dir = Path('./eia_epm_zips/')
assert zip_dir.exists()

In [4]:
from eia_epm import *

In [5]:
%%time
dfs = Extractor(zip_dir).extract()

CPU times: user 46.9 s, sys: 143 ms, total: 47 s
Wall time: 47.4 s


In [6]:
[key.split('_')[0] for key in dfs.keys()]

['coal', 'oil', 'petcoke', 'gas', 'coal', 'oil', 'petcoke', 'gas']

In [7]:
transformed = add_fuel_cols_and_combine(dfs)

In [8]:
m = transformed['monthly']
y = transformed['ytd']

In [9]:
m.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Electric Power Sector,Electric Power Sector,Electric Utilities,Electric Utilities,Independent Power Producers,Independent Power Producers,yoy_pct_change
Unnamed: 0_level_1,Unnamed: 1_level_1,is_revised,False,True,False,True,False,True,False
region,date,fuel,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Alabama,2006-09-01,coal,,2.11,,2.11,,--,
Alabama,2006-09-01,gas,,6.36,,7.15,,5.54,


In [10]:
idx = pd.IndexSlice

In [11]:
# don't care about non-revised data
m_rev = m.loc[:, idx[:,True]]
m_rev.shape

(45212, 3)

In [14]:
m_rev.columns = m_rev.columns.droplevel('is_revised') # all True

In [15]:
m_rev.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Electric Power Sector,Electric Utilities,Independent Power Producers
region,date,fuel,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,2006-09-01,coal,2.11,2.11,--
Alabama,2006-09-01,gas,6.36,7.15,5.54


In [17]:
from pudl.metadata.enums import US_STATES
US_STATES

{'AK': 'Alaska',
 'AL': 'Alabama',
 'AR': 'Arkansas',
 'AZ': 'Arizona',
 'CA': 'California',
 'CO': 'Colorado',
 'CT': 'Connecticut',
 'DE': 'Delaware',
 'FL': 'Florida',
 'GA': 'Georgia',
 'HI': 'Hawaii',
 'IA': 'Iowa',
 'ID': 'Idaho',
 'IL': 'Illinois',
 'IN': 'Indiana',
 'KS': 'Kansas',
 'KY': 'Kentucky',
 'LA': 'Louisiana',
 'MA': 'Massachusetts',
 'MD': 'Maryland',
 'ME': 'Maine',
 'MI': 'Michigan',
 'MN': 'Minnesota',
 'MO': 'Missouri',
 'MS': 'Mississippi',
 'MT': 'Montana',
 'NC': 'North Carolina',
 'ND': 'North Dakota',
 'NE': 'Nebraska',
 'NH': 'New Hampshire',
 'NJ': 'New Jersey',
 'NM': 'New Mexico',
 'NV': 'Nevada',
 'NY': 'New York',
 'OH': 'Ohio',
 'OK': 'Oklahoma',
 'OR': 'Oregon',
 'PA': 'Pennsylvania',
 'RI': 'Rhode Island',
 'SC': 'South Carolina',
 'SD': 'South Dakota',
 'TN': 'Tennessee',
 'TX': 'Texas',
 'UT': 'Utah',
 'VA': 'Virginia',
 'VT': 'Vermont',
 'WA': 'Washington',
 'WI': 'Wisconsin',
 'WV': 'West Virginia',
 'WY': 'Wyoming'}

In [26]:
m.index.levels[0]

Index(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'East North Central', 'East South Central', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Middle Atlantic', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Mountain', 'Nebraska', 'Nevada', 'New England', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'North Dakota[1]', 'Ohio', 'Oklahoma', 'Oregon', 'Pacific', 'Pacific Contiguous', 'Pacific Noncontiguous', 'Pennsylvania', 'Rhode Island', 'South Atlantic', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'U.S. Total', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West North Central', 'West South Central', 'West Virginia', 'Wisconsin', 'Wyoming'], dtype='object', name='region')

In [27]:
m_rev.loc[idx['North Dakota[1]',:,:],:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Electric Power Sector,Electric Utilities,Independent Power Producers
region,date,fuel,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
North Dakota[1],2006-09-01,gas,6.17,6.17,--
North Dakota[1],2006-10-01,gas,15.53,15.53,--
North Dakota[1],2006-11-01,gas,6.7,6.7,--
North Dakota[1],2007-01-01,gas,6.04,6.04,--
North Dakota[1],2007-09-01,gas,,,
North Dakota[1],2007-10-01,gas,,,
North Dakota[1],2007-11-01,gas,,,
North Dakota[1],2008-01-01,gas,,,


In [30]:
# fix 'North Dakota[1]' in gas data <= jan 2008
m_rev.index = pd.MultiIndex.from_frame(m_rev.index.to_frame().replace({'North Dakota[1]': 'North Dakota'}))

In [37]:
# break into state, census region, and national levels
states = [state for state in US_STATES.values()] + ['District of Columbia']
national = ['U.S. Total']
regions = list(m_rev.index.levels[0].difference(pd.Index(states + national)))

In [43]:
regions

['East North Central',
 'East South Central',
 'Middle Atlantic',
 'Mountain',
 'New England',
 'Pacific',
 'Pacific Contiguous',
 'Pacific Noncontiguous',
 'South Atlantic',
 'West North Central',
 'West South Central']

In [45]:
m_states = m_rev.loc[idx[states,:,:], :]
m_national = m_rev.loc[idx[national, :,:], :]
m_regions = m_rev.loc[idx[regions, :,:], :]

In [None]:
# same for ytd


In [39]:
# don't care about non-revised data
y_rev = y.loc[:, idx[:,True]]
y_rev.shape

(45208, 3)

In [40]:
y_rev.columns = y_rev.columns.droplevel('is_revised') # all True

In [41]:
y_rev.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Electric Power Sector,Electric Utilities,Independent Power Producers
region,date,fuel,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,2006-09-01,coal,2.07,2.07,--
Alabama,2006-09-01,gas,7.17,7.54,6.9


In [42]:
y_rev.index.levels[0]

Index(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'East North Central', 'East South Central', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Middle Atlantic', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Mountain', 'Nebraska', 'Nevada', 'New England', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pacific', 'Pacific Contiguous', 'Pacific Noncontiguous', 'Pennsylvania', 'Rhode Island', 'South Atlantic', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'U.S. Total', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West North Central', 'West South Central', 'West Virginia', 'Wisconsin', 'Wyoming'], dtype='object', name='region')

In [44]:
# should be KeyError
y_rev.loc[idx['North Dakota[1]',:,:],:]

KeyError: 'North Dakota[1]'

In [46]:
y_states = y_rev.loc[idx[states,:,:], :]
y_national = y_rev.loc[idx[national, :,:], :]
y_regions = y_rev.loc[idx[regions, :,:], :]

In [47]:
root = 'eia_epm_prices_'
for name, df in {
    "states_ytd.csv": y_states,
    "national_ytd.csv": y_national,
    "regions_ytd.csv": y_regions,
    "states.csv": m_states,
    "national.csv": m_national,
    "regions.csv": m_regions,
}.items():
    filepath = root + name
    df.to_csv(filepath, index=True)