In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
dfg = pd.read_excel('./data/annual_generation_state.xls')

In [3]:
# Resetting column headings
dfg.columns = dfg.iloc[0]
dfg.drop([0], inplace = True)

In [4]:
dfg.head()

Unnamed: 0,YEAR,STATE,TYPE OF PRODUCER,ENERGY SOURCE,GENERATION (Megawatthours)
1,1990,AK,Total Electric Power Industry,Total,5599506
2,1990,AK,Total Electric Power Industry,Coal,510573
3,1990,AK,Total Electric Power Industry,Hydroelectric Conventional,974521
4,1990,AK,Total Electric Power Industry,Natural Gas,3466261
5,1990,AK,Total Electric Power Industry,Petroleum,497116


EIA data is quite complete:

In [5]:
dfg.isnull().sum()

0
YEAR                          0
STATE                         0
TYPE OF PRODUCER              0
ENERGY SOURCE                 0
GENERATION (Megawatthours)    0
dtype: int64

All-caps column names will be difficult to work with, so here we rename:

In [6]:
column_rename = {"YEAR": "Year", 
                 "STATE": "State", 
                 "TYPE OF PRODUCER": "Producer Type",
                 "ENERGY SOURCE": "Source", 
                 "GENERATION (Megawatthours)": "Gen MWh"}

dfg.rename(columns=column_rename, inplace=True)

However not all our data types are as expected. "Year" and "GENERATION (Megawatthours)" are expected to be numeric but they are not. We do not (yet) need Year as datetime, so we will convert both these columns to int.

In [7]:
dfg.dtypes

0
Year             object
State            object
Producer Type    object
Source           object
Gen MWh          object
dtype: object

In [8]:
dfg['Gen MWh'] = dfg['Gen MWh'].astype(int)
dfg['Year'] = dfg['Year'].astype(int)

We have 29 years of data, 1990 to 2018, just like our rates data:

In [9]:
dfg['Year'].unique()

array([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018])

There are 54 State categories, which is unexpected and worth investigation:

In [10]:
dfg['State'].nunique()

54

Beyond the 50 states plus DC, we have some blanks (' '), and two total US categories, "US-TOTAL" and "US-Total":

In [11]:
dfg['State'].unique()

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'US-TOTAL', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', '  ',
       'US-Total'], dtype=object)

All three blank State items represent 0 MWh of generation, which is not meaningfu data. We lose nothing by dropping these rows.

In [12]:
dfg.loc[dfg['State'] == "  "]

Unnamed: 0,Year,State,Producer Type,Source,Gen MWh
20577,2003,,Total Electric Power Industry,Coal,0
20578,2003,,Total Electric Power Industry,Natural Gas,0
20579,2003,,Total Electric Power Industry,Petroleum,0


In [13]:
dfg = dfg[dfg['State'] != "  "]

We should also drop the US-TOTAL / US-Total data. We are not modeling the entire country, so the state data are all we need.

In [14]:
dfg = dfg[dfg['State'] != "US-TOTAL"]
dfg = dfg[dfg['State'] != "US-Total"]

It makes sense to boil our data down to the key rows: **Producer Type** "Total Electric Power Industry", each possible **Source**, along with the **Source**:Total from all reported generation sources. Before doing this, which will mean dropping many rows, let's confirm that the sum of all sources equals the total in the data. 

In [15]:
def gen_totals(Year, State):
    
    sources =    dfg[(dfg['Year'] == Year) & 
                (dfg['State'] == State) & 
                (dfg['Producer Type'] == "Total Electric Power Industry")]['Gen MWh']
    
    gen_total =  int(dfg[(dfg['Year'] == Year) &
                (dfg['State'] == State) & 
                (dfg['Producer Type'] == "Total Electric Power Industry")]['Gen MWh'][0:1].values)

    gen_sum =   dfg[(dfg['Year'] == Year) & 
               (dfg['State'] == State) & 
               (dfg['Producer Type'] == "Total Electric Power Industry")]['Gen MWh'][1:len(sources)].sum() 
        
    # We'll define a threshhold bewteen the reported total and the sum of 10
    # Considering even a very small state generates millions of MHW/yr, 
    # anything this small represents a rounding error.
    if abs(gen_total - gen_sum) < 10:
        return True
    else:
        return False

Here is an example of the output for once year/State:

In [16]:
gen_totals(2016, "RI")

True

The for loop below will confirm that we have no mistakes in our data (spoiler alert - there are none).<br><br> **NOTE:** this will take around 30 seconds to run on a newer computer:

In [17]:
for year in dfg['Year'].unique():
    for state in dfg['State'].unique():
        if gen_totals(year, state) == True:
            pass
        else:
            print(year, state)

We no longer need "Producer Type" at all as it only has one value. Here we drop the column:

In [18]:
dfg.drop(["Producer Type"], axis = 1, inplace = True)

We have 51 States and 29 years, or 1479 combinations of State and Year. There are an average of around 9 rows per State and Year combination, which is in the expected range of one Total row and an average of 8 generation types for each.

In [19]:
dfg.shape

(49659, 4)

In [20]:
dfg.head()

Unnamed: 0,Year,State,Source,Gen MWh
1,1990,AK,Total,5599506
2,1990,AK,Coal,510573
3,1990,AK,Hydroelectric Conventional,974521
4,1990,AK,Natural Gas,3466261
5,1990,AK,Petroleum,497116


In [21]:
dfg['Source'].unique()

array(['Total', 'Coal', 'Hydroelectric Conventional', 'Natural Gas',
       'Petroleum', 'Wind', 'Wood and Wood Derived Fuels', 'Nuclear',
       'Other Biomass', 'Other Gases', 'Pumped Storage', 'Geothermal',
       'Other', 'Solar Thermal and Photovoltaic'], dtype=object)

Let's shorten some of the Source names:

In [22]:
dfg.replace({"Source": {"Hydroelectric Conventional": "Hydroelectric", 
                        "Solar Thermal and Photovoltaic": "Solar Thermal/PV"}})

Unnamed: 0,Year,State,Source,Gen MWh
1,1990,AK,Total,5599506
2,1990,AK,Coal,510573
3,1990,AK,Hydroelectric,974521
4,1990,AK,Natural Gas,3466261
5,1990,AK,Petroleum,497116
...,...,...,...,...
51629,2018,WY,Coal,38641538
51630,2018,WY,Hydroelectric,966509
51631,2018,WY,Natural Gas,232851
51632,2018,WY,Petroleum,40084


Let's pivot our dataframe, so each Year / State combination is a row, and each generation source is a column with Gen MWh as the value:

In [23]:
dfg = pd.pivot_table(data=dfg,index=["Year", "State"], columns = "Source", values = "Gen MWh")
dfg.reset_index(level=[0,1], inplace = True)
dfg.head()

Source,Year,State,Coal,Geothermal,Hydroelectric Conventional,Natural Gas,Nuclear,Other,Other Biomass,Other Gases,Petroleum,Pumped Storage,Solar Thermal and Photovoltaic,Total,Wind,Wood and Wood Derived Fuels
0,1990,AK,340382.0,,974521.0,2310841.0,,,,,248558.0,,,2799753.0,0.0,151035.0
1,1990,AL,26829060.0,,10366507.0,510357.0,12051882.0,,31668.666667,269476.0,92059.33,,,31860850.0,,1049923.5
2,1990,AR,12805290.0,,3654653.0,1789286.0,11282053.0,,10259.333333,,39989.5,42972.0,,19549800.0,,1238044.0
3,1990,AZ,21277070.0,,7417576.0,1166950.0,20597689.0,,,,75933.5,249767.0,,31387150.0,,107888.0
4,1990,CA,1318838.0,9680836.0,11896283.5,24722770.0,32692807.0,0.0,705971.666667,1073371.0,1824617.0,986252.0,244445.333333,55261640.0,1839254.0,2060993.0


In [24]:
dfg.fillna(0, inplace=True)

In [25]:
dfg["Biomass"] = dfg["Wood and Wood Derived Fuels"] + dfg["Other Biomass"]

We can also drop "Wood and Wood Derived Fuels" and "Other Biomass" now that we have the combined "Biomass" column:

In [26]:
dfg.drop(["Wood and Wood Derived Fuels", "Other Biomass"], axis = 1, inplace = True)

Changing our float display format will make the data easier to interpret visually:

In [28]:
pd.options.display.float_format = '{:,.0f}'.format

In [30]:
dfg.head()

Source,Year,State,Coal,Geothermal,Hydroelectric Conventional,Natural Gas,Nuclear,Other,Other Gases,Petroleum,Pumped Storage,Solar Thermal and Photovoltaic,Total,Wind,Biomass
0,1990,AK,340382,0,974521,2310841,0,0,0,248558,0,0,2799753,0,151035
1,1990,AL,26829058,0,10366507,510357,12051882,0,269476,92059,0,0,31860853,0,1081592
2,1990,AR,12805290,0,3654653,1789286,11282053,0,0,39990,42972,0,19549799,0,1248303
3,1990,AZ,21277073,0,7417576,1166950,20597689,0,0,75934,249767,0,31387148,0,107888
4,1990,CA,1318838,9680836,11896284,24722769,32692807,0,1073371,1824617,986252,244445,55261636,1839254,2766965


In [33]:
dfg.to_csv('./data/electricity-generation.csv')