# Clean FERC Form 1

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Standard libraries
import logging
import sys
import os
import pathlib

# 3rd party libraries
import geopandas as gpd
import dask.dataframe as dd
from dask.distributed import Client
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa

# Local libraries
import pudl

In [3]:
# Enable viewing of logging outputs
logger=logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter('%(message)s')
handler.setFormatter(formatter)
logger.handlers = [handler]

In [4]:
# Establish connection to pudl database
pudl_settings = pudl.workspace.setup.get_defaults()
pudl_engine = sa.create_engine(pudl_settings['pudl_db'])
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine=pudl_engine,
    freq='AS'
)

In [5]:
ferc1_years = pudl.constants.working_partitions['ferc1']['years']
ferc1_tables = ['fuel_ferc1','plants_steam_ferc1'] 
ferc1_raw_dfs = pudl.extract.ferc1.extract(
        ferc1_tables=ferc1_tables,
        ferc1_years=ferc1_years,
        pudl_settings=pudl_settings)

Converting extracted FERC Form 1 table fuel_ferc1 into a pandas DataFrame.
Converting extracted FERC Form 1 table plants_steam_ferc1 into a pandas DataFrame.


In [6]:
ferc1_transformed_dfs = pudl.transform.ferc1.transform(
         ferc1_raw_dfs, ferc1_tables=ferc1_tables)

Transforming raw FERC Form 1 dataframe for loading into fuel_ferc1
Transforming raw FERC Form 1 dataframe for loading into plants_steam_ferc1
Identifying distinct large FERC plants for ID assignment.
slimming fuel table
running fbp table
merging fractions with steam
filling nulls with zeros
traning classifier with default weights
generating groups of similar records
Successfully associated 22138 of 29270 (75.63%) FERC Form 1 plant records with multi-year plant entities.
Assigning IDs to multi-year FERC plant entities.
Identified 4695 orphaned FERC plant records. Adding orphans to list of plant entities.
Successfully Identified 2074 multi-year plant entities.
Found report_year=1994 3 times in plant_id_ferc1=93
Found report_year=1994 2 times in plant_id_ferc1=316
Found report_year=1995 2 times in plant_id_ferc1=316
Found report_year=1996 2 times in plant_id_ferc1=316
Found report_year=1997 2 times in plant_id_ferc1=316
Found report_year=1998 2 times in plant_id_ferc1=316
Found report_yea

## Part 1: Fill in Missing Fuel

* **Step1.0**
* **Step1.1**
* **Step1.2**
* **Step1.3**
* **Step1.4**
* **Step1.5**
* **Step1.6**
* **Step1.7**
* **Step1.8**
* **Step1.9**

In [693]:
# Useful merge columns
ferc_merge_cols = ['report_year', 'utility_id_ferc1', 'plant_name_ferc1']
eia_merge_cols = ['report_date', 'plant_id_pudl', 'generator_id']

# Load the tables you'll need with some basic alterations
steam = ferc1_transformed_dfs['plants_steam_ferc1'].copy()
glue_dicts = pudl.glue.ferc1_eia.glue(ferc1=True) # For steam you'll have to mimic the glue process to get plant_id_pudl and add a column for primary fuel
steam = pd.merge(steam, glue_dicts['plants_ferc1'], on=['plant_name_ferc1', 'utility_id_ferc1'], how='left')
steam = pd.merge(steam, glue_dicts['utilities_ferc1'][['utility_id_ferc1', 'utility_name_ferc1']], on=['utility_id_ferc1'], how='left')
steam['primary_fuel'] = np.nan # Add column to document primary fuel

fuel = ferc1_transformed_dfs['fuel_ferc1']
fbp = pudl.transform.ferc1.fuel_by_plant_ferc1(fuel)
fbp_small = fbp[ferc_merge_cols + ['primary_fuel_by_mmbtu', 'primary_fuel_by_cost']]

plants = pudl_out.plants_eia860()
gens = pudl_out.gens_eia860()
net_gen = pudl_out.gen_eia923()

In [694]:
def test_for_duplicates(df, subset):
    test = fuel.copy()
    test['dup'] = test.duplicated(subset=subset)
    return f"there are {len(test[fuel_test['dup']==True])} duplicates"

print('number of duplicate index values for fuel table:', test_for_duplicates(fuel, ferc_merge_cols))
print('number of duplicate index values for steam table:', test_for_duplicates(steam, ferc_merge_cols))

number of duplicate index values for fuel table: there are 0 duplicates
number of duplicate index values for steam table: there are 0 duplicates


In [695]:
# FLAGS
flag1 = 'primary fuel by mmbtu'
flag2 = 'single fuel by net generation from eia923' ### DON'T FORGET TO CHECK IF NET GEN CALC IS EVEN NECESSARY LOL
flag3 = 'primary fuel by cost'
flag4 = 'ferc pudl map all'# Make better one
flag5 = 'similar heat rate to other years'
flag6 = 'ferc plant id has one fuel'
flag7 = 'pudl plant id has one fuel'
flag8 = 'manually filled in'
flag9 = 'fuel in name (solar/wind)'

In [696]:
def _add_new_fuel_and_flag(df, new_fuel_col, flag):
    """Add new fuels to the primary fuel column and flag where they came from."""
    df.loc[(df['primary_fuel'].isna()) & (df[f'{new_fuel_col}'].notna()), 'primary_fuel_flag'] = flag
    df['primary_fuel'] = df['primary_fuel'].fillna(df[f'{new_fuel_col}'])
    return df

def _check_flags(df):
    fuel = df[df['primary_fuel'].notna()]
    flag = df[df['primary_fuel_flag'].notna()]
    assert len(fuel) == len(flag), 'imputed fuels must be associated with a flag'
    return df

### **Step 1.0:** Remove rows with all NA values

In [697]:
value_cols_no_cap = [
    'net_generation_mwh','avg_num_employees',
    'capex_land', 'capex_equipment', 'capex_structures', 'capex_total', 'asset_retirement_cost',
    'opex_operations', 'opex_fuel', 'opex_coolants', 'opex_steam', 'opex_steam_other', 'opex_transfer', 
    'opex_electric', 'opex_misc_power', 'opex_rents', 'opex_allowances', 'opex_engineering', 'opex_structures', 
    'opex_boiler', 'opex_plants', 'opex_misc_steam', 'opex_production_total'
]

steam0 = steam.dropna(subset=value_cols_no_cap, how='all').copy()

print('rows with no fuel / total rows')
print(len(steam0[steam0['primary_fuel'].isna()]), '/', len(steam))

rows with no fuel / total rows
28552 / 29270


### **Step 1.1:** Primary fuel by mmbtu

In [698]:
# Merge steam and fbp table on steam table, clean primary_fuel_by_mmbtu col, add new fuel and flag
steam1 = (
    pd.merge(steam0, fbp_small, on=ferc_merge_cols, how='left')
    .assign(primary_fuel_by_mmbtu=lambda x: x.primary_fuel_by_mmbtu.replace({'':np.nan, 'unknown': np.nan}))
    .pipe(_add_new_fuel_and_flag, 'primary_fuel_by_mmbtu', flag1)
    .pipe(_check_flags)
)

In [699]:
print('rows with no fuel / total rows')
print(len(steam1[steam1['primary_fuel'].isna()]), '/', len(steam))

rows with no fuel / total rows
4497 / 29270


### **Step 1.2:** Net generation-based fuel from EIA

In [700]:
# Combine fuel type from 860 with net generation from 923 to get fuel type and net generation
net_gen_fuel = (
    pd.merge(gens, net_gen[eia_merge_cols+['net_generation_mwh']], on=eia_merge_cols, how='outer')
    .assign(
        # Get plant net generation sums
        plant_net_gen=lambda x: (
            x.groupby(['report_date', 'plant_id_pudl']).net_generation_mwh.transform('sum')),
        # Calculate the portion of net generation that each row is responsible for
        gen_pct=lambda x: (
            x.net_generation_mwh / x.plant_net_gen * 100)
    )
)

# Calculate the sum portion of net generation attributable to each fuel type 
net_gen_fuel_sum = (
    net_gen_fuel.groupby(['report_date', 'plant_id_pudl', 'fuel_type_code_pudl'])['gen_pct']
    .sum()
    .reset_index()
    # Create flag to distinguish between rows that have more than one fuel (duplicates)
    .assign(dup=lambda x: x.duplicated(subset=['report_date', 'plant_id_pudl'], keep=False))
)

# Keep rows with only one fuel
one_fuel_eia = (
    net_gen_fuel_sum[~net_gen_fuel_sum['dup']].copy()
    .rename(columns={'fuel_type_code_pudl': 'primary_fuel_type_eia'})
    .assign(
        report_year=lambda x: x.report_date.dt.year,
        primary_fuel_type_eia=lambda x: x.primary_fuel_type_eia.replace({'':np.nan, 'unknown': np.nan, 'other': np.nan}))
    .drop(['report_date', 'gen_pct', 'dup'], axis=1)
)

# Combine steam table with EIA plant primary fuel type table, add flag to note fuel type location
steam2 = (
    pd.merge(steam1, one_fuel_eia, on=['report_year', 'plant_id_pudl'], how='left')
    .pipe(_add_new_fuel_and_flag, 'primary_fuel_type_eia', flag2)
    .pipe(_check_flags)
)

In [701]:
print('rows with no fuel / total rows')
print(len(steam2[steam2['primary_fuel'].isna()]), '/', len(steam))

rows with no fuel / total rows
2808 / 29270


### **Step 1.3:** Primary fuel by cost

In [702]:
steam3 = (
    steam2.assign(primary_fuel_by_cost=lambda x: x.primary_fuel_by_cost.replace({'':np.nan, 'unknown': np.nan, 'other': np.nan}))
    .pipe(_add_new_fuel_and_flag, 'primary_fuel_by_cost', flag3)
    .pipe(_check_flags)
)

In [703]:
print('rows with no fuel / total rows')
print(len(steam3[steam3['primary_fuel'].isna()]), '/', len(steam))

rows with no fuel / total rows
2398 / 29270


### **Step 1.4:** raw FERC fuel

In [704]:
# Identify duplicate columns
fuel_dupes = (
    fuel.loc[fuel['fuel_type_code_pudl']!='unknown'].copy()
    .assign(dup=lambda x: x.duplicated(subset=ferc_merge_cols, keep=False))
)

# Only take fuels from plants without duplicate name/utility/year combos
fuel_ferc_no_dup = (
    fuel_dupes[~fuel_dupes['dup']][['report_year', 'utility_id_ferc1', 'plant_name_ferc1', 'fuel_type_code_pudl', 'fuel_avg_heat_raw', 'fuel_qty_burned']].copy() # keep fuel_avg_heat_raw and fuel_qty_burned in there for next round
    .rename(columns={'fuel_type_code_pudl':'fuel_type_code_pudl_ferc'})
)

steam4 = (
    pd.merge(steam3, fuel_ferc_no_dup, on=ferc_merge_cols, how='left')
    .pipe(_add_new_fuel_and_flag, 'fuel_type_code_pudl_ferc', flag4)
    .pipe(_check_flags)
)

In [705]:
print('rows with no fuel / total rows')
print(len(steam4[steam4['primary_fuel'].isna()]), '/', len(steam))

rows with no fuel / total rows
2080 / 29270


### **Step 1.5:** FERC heat rate -- for now doesn't seem to do anything

In [706]:
def _create_dict(df):
    """Create a dict of fuel types and fuel_avg_heat ranges within 1% of median."""
    no_unk = df[(df['primary_fuel'].notna()) & (df['fuel_avg_heat_raw'].notna())]
    median_df = no_unk.groupby(['plant_name_ferc1', 'primary_fuel'])['fuel_avg_heat_raw'].median().reset_index()

    # Create dictionary of plant name, fuel, and heat rate
    fuel_dict = {}
    for i in median_df['plant_name_ferc1'].unique():
        fuel_dict[i] = dict(zip(
            median_df.loc[median_df['plant_name_ferc1'] == i]['primary_fuel'], 
            median_df.loc[median_df['plant_name_ferc1'] == i]['fuel_avg_heat_raw']
        ))
    # Turn the median values into ranges based on 1% buffer
    for k,v in fuel_dict.items():
        for kk,vv in v.items():
            v[kk] = range(int(vv-vv*0.01), int(vv+vv*0.01))
    return fuel_dict

def _test_for_overlap(fuel_dict):
    """See if there are any overlapping heat rate ranges for the same plant."""
    for k,v in fuel_dict.items():
        overlap_list = []
        rr = tuple(v.values())
        overlap = set(rr[0]).intersection(rr[1:])
        if overlap:
            overlap_list = overlap_list.append(k)
    return overlap_list

In [707]:
steam5 = (
    steam4.copy()
    .assign(ferc_fuel_by_heat_rate=np.nan)
)

fuel_dict = _create_dict(steam5)
overlap_list = _test_for_overlap(fuel_dict)
if overlap_list:
    print('The following plants have fuels with overlapping heat rates')
    print(overlap_list)
    
for plant_name, hr_dict in fuel_dict.items():
    # Get all heat rate values for a given plant name
    plant_df = steam5.loc[steam5['plant_name_ferc1']==plant_name].copy()
    # If the heat rate is within the range of one of the given fuels in the dictionary associated
    # with it's name, then give it that fuel label.
    plant_df['fuel_by_heat'] = (
        plant_df['fuel_avg_heat_raw']
        .apply(lambda x: next((fuel for fuel, heat_rate in hr_dict.items() if x in heat_rate), np.nan)))
    #print(plant_df['fuel_by_heat'])
    # Add these new fuels to the full table
    steam5['ferc_fuel_by_heat_rate'].update(plant_df['fuel_by_heat'])
    
steam5 = (
    steam5
    .pipe(_add_new_fuel_and_flag, 'ferc_fuel_by_heat_rate', flag5)
    .pipe(_check_flags)
)

In [708]:
print('rows with no fuel / total rows')
print(len(steam5[steam5['primary_fuel'].isna()]), '/', len(steam))

rows with no fuel / total rows
2080 / 29270


### **Step 1.6:** Group by FERC plant id and fill in if all one fuel

In [709]:
plant_df = (
    steam6.groupby(['plant_id_ferc1'])['primary_fuel']
    .apply(lambda x: x.dropna().unique()[0] if len(x.dropna().unique()) ==1 else np.nan)
    .reset_index()
    .rename(columns={'primary_fuel':'ferc1_id_has_one_fuel'})
)

steam6 = (
    pd.merge(steam5, plant_df, on=['plant_id_ferc1'], how='left')
    .pipe(_add_new_fuel_and_flag, 'ferc1_id_has_one_fuel', flag6)
    .pipe(_check_flags)
)

In [710]:
print('rows with no fuel / total rows')
print(len(steam6[steam6['primary_fuel'].isna()]), '/', len(steam))

rows with no fuel / total rows
1271 / 29270


### **Step 1.7:** Group by PUDL plant id and fill in if all one fuel

In [711]:
plant_df = (
    steam6.groupby(['plant_id_pudl'])['primary_fuel']
    .apply(lambda x: x.dropna().unique()[0] if len(x.dropna().unique()) ==1 else np.nan)
    .reset_index()
    .rename(columns={'primary_fuel':'pudl_id_has_one_fuel'})
)

steam7 = (
    pd.merge(steam6, plant_df, on=['plant_id_pudl'], how='left')
    .pipe(_add_new_fuel_and_flag, 'pudl_id_has_one_fuel', flag7)
    .pipe(_check_flags)
)

In [712]:
print('rows with no fuel / total rows')
print(len(steam7[steam7['primary_fuel'].isna()]), '/', len(steam))

rows with no fuel / total rows
1121 / 29270


### **Step 1.8:** Manually filled fuel

In [713]:
manually_filled_plants = (
    pd.read_csv('/Users/aesharpe/Desktop/fill_in_fuel.csv')
    .rename(columns={'fuel': 'manual_fill_in_fuel'})
)

steam8 = (
    pd.merge(steam7, manually_filled_plants, on=ferc_merge_cols, how='left')
     .pipe(_add_new_fuel_and_flag, 'manual_fill_in_fuel', flag8)
     .pipe(_check_flags)
)

In [714]:
print('rows with no fuel / total rows')
print(len(steam8[steam8['primary_fuel'].isna()]), '/', len(steam))

rows with no fuel / total rows
875 / 29270


In [715]:
# Find bad plants
def tt(bad_list):
    new_lis = []
    for x in li:
        test = steam7[steam7['plant_id_pudl']==x]
        na = test[test['primary_fuel'].isna()].sort_values('report_year')
        if len(test) != len(na):
            new_lis.append(x)
    return new_lis

li = list(steam7[steam7['primary_fuel'].isna()].plant_id_pudl.unique())
zz = tt(li)
zz.sort()
#zz

### **Step 1.9:** Get ones with obvious names

In [716]:
steam9 = steam8.copy()

steam9.loc[steam9['plant_name_ferc1'].str.contains('solar'), 'name_based'] = 'solar'
steam9.loc[steam9['plant_name_ferc1'].str.contains('wind'), 'name_based'] = 'wind'

steam9 = (
    steam9.pipe(_add_new_fuel_and_flag, 'name_based', flag9)
    .pipe(_check_flags)
)

In [717]:
print('rows with no fuel / total rows')
print(len(steam9[steam9['primary_fuel'].isna()]), '/', len(steam))

rows with no fuel / total rows
857 / 29270


## Part 2: Flag double counting columns

Current flags: 
* plant total
* utility owned total
* unit total

In [875]:
def flag_totals(steam_table):
    """Preliminarily mark the rows with total indicated in the plant name."""
    regex1 = r'(?i)tot[a-z]*' # Find anything that would resembles TOTAL in the plant name
    regex2 = '100%' # Find all names with 100% in them
    regex3 = 'ttl ' # the space at the end is important
    regex4 = r'\(all' # find any names with '(all' in them
    
    steam_table['total'] = steam_table['plant_name_ferc1'].str.contains('|'.join([regex1, regex2, regex3])) #regex4
    ser_no_octo = steam_table[steam_table['plant_name_ferc1'].str.contains('octotillo')].total == False
    steam_table['total'].update(ser_no_octo) # remove octotillio from total = True because it accidentally fits in the 'tot' regex
    
    return steam_table

In [876]:
def flag_plant_totals(df, col_name):
    
    def is_plant_total(row):
        if 'total plant' in row:
            return 'plant total'
        elif 'plant total' in row:
            return 'plant total'
        elif 'total plt' in row:
            return 'plant total'
        elif 'ttl plt' in row:
            return 'plant total'
        elif 'tot. plt.' in row:
            return 'plant total'
        elif '100%' in row:
            return 'plant total'
        elif 'general' in row:
            return 'utility owned total'
        else:
            return None
    
    df[col_name] = df.apply(lambda x: is_plant_total(x.plant_name_ferc1), axis=1)
    
    return df

In [877]:
def backfill_years_by_capacity(df, col_name, replace, replace_with):
    """Backfill rows based on capacity."""
    
    only_totals_df = df[df[col_name]==replace_with]
    plant_groups = only_totals_df.groupby('plant_id_pudl')
    capacity_dict = plant_groups['capacity_mw'].apply(lambda x: [x for x in list(x.unique()) if x !=0]).to_dict() # no zeros
    plants_with_totals_list = list(capacity_dict.keys())
    
    for plant_id in plants_with_totals_list:
        for capacity in capacity_dict[plant_id]:
            one_plant_df = df[df['plant_id_pudl']==plant_id].copy()
            cap_match_df = one_plant_df[one_plant_df['capacity_mw'].isin(capacity_dict[plant_id])]
            series_update = cap_match_df[col_name].replace({replace: replace_with})
            df[col_name].update(series_update) 
            
    return df

In [878]:
def categorize_bad_rows(df, f_list): # could probably make this faster...
    """Flag bad rows."""
    for fix_dict in f_list:
        for year in fix_dict['years']:
            df.loc[df['record_id']==f"f1_steam_{year}{fix_dict['id_suffix']}", 'total_type'] = fix_dict['total_type']
    
    return df

In [879]:
def compare_totals(flag_df, comp_col):
    """Sum non-total utility-plant reported values and compare to reported totals when applicable.
    
    This function takes in a DataFrame and calculates the sum value for both the plant and utility-plant
    groups on an annual basis and compares them with any reported totals. First, this function calculates
    group totals by excluding any rows flagged as totals or extraneous in the total_types row (i.e.:
    total_type.isna()). Then, it separates the DataFrame into two small DataFrames representing all
    the utility owned total rows and all of the plant total rows reported and flagged. Next, it
    compares the calculated group total against the reported group total and outputs a True/False
    boolean. The boolean flags from these small table comparisons are then subsumed into the larger 
    DataBase under the column names: utility_owned_total_flag and plant_total_flag.
    
    These columns will show users where there might be a reporting discrepancy and allows them to
    choose which value they might want to reply on for further calculation. This flag is not yet
    incorporated into the aggregation function below, but it could be!
    
    """
    def sum_no_totals(df, col):
        no_totals_df = df.loc[df['total_type'].isna()]
        return no_totals_df[col].sum()

    # Groupby utility-plant and plant
    plant_util_groups = flag_df.groupby(['report_year', 'utility_id_ferc1', 'plant_id_pudl'])
    plant_groups = flag_df.groupby(['report_year', 'plant_id_pudl'])

    # Get the sum of the values in each group that are not totals
    plant_util_total_series = plant_util_groups.apply(lambda x: sum_no_totals(x, comp_col))
    plant_total_series = plant_groups.apply(lambda x: sum_no_totals(x, comp_col))

    # Take that series of sums, reset the index, and give the sum column an informative name
    util_plant_df = pd.DataFrame(plant_util_total_series).reset_index().rename(columns={0:'plant_util_total'})
    plant_df = pd.DataFrame(plant_total_series).reset_index().rename(columns={0:'plant_total'})

    # Merge the two sum columns together on utility-plant
    df = pd.merge(plant_df, util_plant_df, on=['report_year', 'plant_id_pudl'], how='outer')
    comp_totals = pd.merge(flag_df, df, on=['report_year', 'utility_id_ferc1', 'plant_id_pudl'], how='outer')
    
    # Get slices of the dataframe that represent the utility owned total rows and the plant total rows to compare with calculated totals
    util_ot = comp_totals[comp_totals['total_type']=='utility owned total'].copy()
    plant_ot = comp_totals[comp_totals['total_type']=='plant total'].copy()

    # For each dataframe, flag whether the sum of the components (calculated above) equals the reported total
    # Could make this a little more flexible (+/- 1)
    util_ot['utility_owned_total_flag'] = np.where(util_ot['capacity_mw'] == util_ot['plant_util_total'], True, False)
    plant_ot['plant_total_flag'] = np.where(plant_ot['capacity_mw'] == plant_ot['plant_total'], True, False)
    
    # Create blank columns in the original dataframe
    comp_totals[f'{comp_col}_utility_owned_total_flag'] = None
    comp_totals[f'{comp_col}_plant_total_flag'] = None

    # Update those blank columns so that the True/False values from the prior calculations are added to the correct column
    comp_totals[f'{comp_col}_utility_owned_total_flag'].update(util_ot['utility_owned_total_flag'])
    comp_totals[f'{comp_col}_plant_total_flag'].update(plant_ot['plant_total_flag'])

    return comp_totals

In [880]:
# Manual fix of certain total type columns

fix_list = [
    # Rockport AEP
    {'id_suffix': '_12_1_0_3', 'total_type': 'utility owned total', 'years': range(1994,2020)}, #pudl id 530
    # Rockport IMP
    {'id_suffix': '_12_73_1_3', 'total_type': 'utility owned total', 'years': range(1994,1997)}, #pudl id 530
    {'id_suffix': '_12_73_0_3', 'total_type': 'utility owned total', 'years': range(1997,2020)}, # pudl id 530
    # Amos APC
    {'id_suffix': '_12_6_0_3', 'total_type': 'plant total', 'years': range(1994,2002)}, #pudl id 16
    # Conesville 4 - Columbus Southern Power Company then Ohio Power Company then AEP
    {'id_suffix': '_12_31_0_3', 'total_type': 'unit total', 'years': range(1994,2011)}, # pudl id 128
    {'id_suffix': '_12_127_4_1', 'total_type': 'unit total', 'years': range(2011,2014)}, # pudl id 128
    {'id_suffix': '_12_452_1_2', 'total_type': 'unit total', 'years': range(2014,2015)}, # pudl id 128
    # Conesville 4 - Duke 
    {'id_suffix': '_12_27_1_3', 'total_type': 'unit total', 'years': range(1994,2003)}, # pudl if 128  # was plant total
    # Belle River - DTE
    {'id_suffix': '_12_44_0_1', 'total_type': 'utility owned total', 'years': range(1994, 2020)}, # pudl id 44  # also plant total, doesn't add up first year
    # Mitchell - Kentucky Power
    {'id_suffix': '_12_81_0_3', 'total_type': 'plant total', 'years': range(2014,2020)}, # pudl id 382  
    # Mitchell - AEP then Wheeling Power
    {'id_suffix': '_12_452_3_3', 'total_type': 'plant total', 'years': range(2014,2015)}, # pudl id 382
    {'id_suffix': '_12_192_0_2', 'total_type': 'plant total', 'years': range(2015,2020)}, # pudl id 382
    # Iatan 1 - Kansas City Power and Light
    {'id_suffix': '_12_79_1_1', 'total_type': 'unit total', 'years': range(2010,2020)}, # pudl id 295  # was plant total
    # Iatan 2 - Kansas Ciry Power and Light
    {'id_suffix': '_12_79_1_3', 'total_type': 'unit total', 'years': range(2010,2020)}, # pudl id 295  # was plant total
    # La Cygne - Kansas  
    #{'id_suffix': '_12_80_0_3', 'total_type': 'plant total', 'years': range(1994,2010)}, # pudl id 336  # very weird, nums don't add up
    # Jeffrey - Kansas Gas and Electric
    {'id_suffix': '_12_80_1_1', 'total_type': 'plant total', 'years': range(1994,1995)}, # pudl id 307
    {'id_suffix': '_12_80_1_3', 'total_type': 'plant total', 'years': range(1995,2002)}, # pudl id 307
    {'id_suffix': '_12_80_1_1', 'total_type': 'plant total', 'years': range(2002,2010)},
    # Jeffrey - Westar Energy
    {'id_suffix': '_12_191_1_4', 'total_type': 'plant total', 'years': range(1994,1995)}, # pudl id 307
    {'id_suffix': '_12_191_1_5', 'total_type': 'plant total', 'years': range(1995,2005)}, # pudl id 307
    {'id_suffix': '_12_191_1_3', 'total_type': 'plant total', 'years': range(2005,2006)}, # pudl id 307
    {'id_suffix': '_12_191_1_5', 'total_type': 'plant total', 'years': range(2006,2010)}, # pudl id 307
    # JM Stuart - Duke Energy
    {'id_suffix': '_12_27_1_1', 'total_type': 'plant total', 'years': range(1994,2003)}, # pudl id 288
    # JM Stuart - Dayton Power and Light
    {'id_suffix': '_12_42_2_1', 'total_type': 'plant total', 'years': range(1994,1998)}, # pudl id 288
    {'id_suffix': '_12_42_1_1', 'total_type': 'plant total', 'years': range(1998,2001)}, # pudl id 288
    # Valley - Wisconsin Power and Electric
    {'id_suffix': '_12_193_0_3', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 603  # could also be plant total
    # Pt. Wash - Wisconsin Electric Power
    {'id_suffix': '_12_193_1_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 470  # other weird value -- see below
    # Pt. Wash (gas) - Wisconsin Electric Power
    {'id_suffix': '_12_193_7_1', 'total_type': 'combustion turbine extra', 'years': range(1994,1996)}, # pudl id 470
    {'id_suffix': '_12_193_1_3', 'total_type': 'combustion turbine extra', 'years': range(1996,2004)}, # pudl id 470  # 2004 might not be right -- confusing
    
    {'id_suffix': '_12_193_3_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 469  # also plant total
    
    {'id_suffix': '_12_193_4_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 542  # also plant total
    {'id_suffix': '_12_193_0_4', 'total_type': 'utility owned total', 'years': range(1996,2008)}, # has 1 unit only here on...
    {'id_suffix': '_12_193_1_3', 'total_type': 'utility owned total', 'years': range(2008,2015)}, # pudl id 542  # technically becomes just one row in 2010
    
    {'id_suffix': '_12_193_5_2', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 1216 # also plant total
    #{'id_suffix': '_12_193_0_5', 'total_type': 'utility owned total', 'years': range(1996,2008)}, # pudl id 1216 # has 1 unit only
    
    {'id_suffix': '_12_193_5_5', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 458 # also plant total
    
    {'id_suffix': '_12_193_6_5', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 216  # also plant total
    
    {'id_suffix': '12_193_8_4', 'total_type': 'utility owned total', 'years': range(1994,1996)}, # pudl id 127  # also plant total
    
    {'id_suffix': '_12_194_0_4', 'total_type': 'unit total', 'years': range(1994,2012)}, # pudl id 123  # was labeled plant total b/c of 100%
    {'id_suffix': '_12_194_0_1', 'total_type': 'unit total', 'years': range(2012,2019)}, # pudl id 123
    # 
    {'id_suffix': '_12_194_0_5', 'total_type': 'unit total', 'years': range(1994,2012)}, # pudl id 123  # was labeled plant total b/c of 100%
    {'id_suffix': '_12_194_0_2', 'total_type': 'unit total', 'years': range(2012,2019)}, # pudl id 123
    
    {'id_suffix': '_12_194_2_3', 'total_type': 'utility owned total', 'years': range(1994,1998)}, # pudl id 123 
    {'id_suffix': '_12_194_1_4', 'total_type': 'utility owned total', 'years': range(1998,2012)}, # pudl id 123  # continuation of before 2_3 to 1_4
    {'id_suffix': '_12_194_1_1', 'total_type': 'utility owned total', 'years': range(2012,2014)}, # pudl id 123 
    
    {'id_suffix': '_12_89_1_5', 'total_type': 'utility owned total', 'years': range(1994,1998)},# pudl id 123
    {'id_suffix': '_12_89_1_3', 'total_type': 'utility owned total', 'years': range(1998,2002)}, # pudl id 123  # continuation of before 1_5 to 1_3
    {'id_suffix': '_12_89_1_5', 'total_type': 'utility owned total', 'years': range(2002,2007)}, # pudl id 123  # record id went back from 1_3 to 1_5
    {'id_suffix': '_12_89_0_4', 'total_type': 'utility owned total', 'years': range(2007,2008)}, # pudl id 123  # 1_5 to 0_4
    {'id_suffix': '_12_89_0_5', 'total_type': 'utility owned total', 'years': range(2008,2009)}, # pudl id 123  # 0_4 to 0_5
    {'id_suffix': '_12_89_1_4', 'total_type': 'utility owned total', 'years': range(2010,2012)}, # pudl id 123  # 0_5 to 1_4
    {'id_suffix': '_12_89_0_5', 'total_type': 'utility owned total', 'years': range(2012,2014)}, # pudl id 123
    {'id_suffix': '_12_89_1_4', 'total_type': 'utility owned total', 'years': range(2014,2015)}, # pudl id 123
    {'id_suffix': '_12_89_1_5', 'total_type': 'utility owned total', 'years': range(2015,2016)}, # pudl id 123
    {'id_suffix': '_12_89_0_5', 'total_type': 'utility owned total', 'years': range(2016,2019)}, # pudl id 123
    {'id_suffix': '_12_89_0_4', 'total_type': 'utility owned total', 'years': range(2019,2020)}, # pudl id 123
    
    {'id_suffix': '_12_194_2_5', 'total_type': 'unit total', 'years': range(1994,1998)}, # pudl id 171  # was plant total
    {'id_suffix': '_12_194_2_1', 'total_type': 'unit total', 'years': range(1998,2012)}, # pudl id 171
    {'id_suffix': '_12_194_1_3', 'total_type': 'unit total', 'years': range(2012,2014)}, # pudl id 171
    {'id_suffix': '_12_194_1_1', 'total_type': 'unit total', 'years': range(2014,2016)}, # pudl id 171
    {'id_suffix': '_12_194_0_5', 'total_type': 'unit total', 'years': range(2016,2019)}, # pudl id 171
    
    {'id_suffix': '_12_194_3_4', 'total_type': 'unit total', 'years': range(1994,1998)}, # pudl id 171  # contains unit-1, was plant total
    {'id_suffix': '_12_194_2_3', 'total_type': 'unit total', 'years': range(1998,2012)}, # pudl id 171
    {'id_suffix': '_12_194_1_5', 'total_type': None, 'years': range(2012,2014)}, # pudl id 171 -- was 2_3 but now 1_5 and owned by one utility   
    
    {'id_suffix': '_12_134_0_5', 'total_type': 'utility owned total', 'years': range(1994,2001)}, # pudl id 281  # also plant total
    {'id_suffix': '_12_134_1_5', 'total_type': 'utility owned total', 'years': range(2001,2016)},
    {'id_suffix': '_12_134_1_4', 'total_type': 'utility owned total', 'years': range(2016,2020)},
    
    {'id_suffix': '_12_138_0_5', 'total_type': 'utility owned total', 'years': range(1994,)}, # pudl id 2281  # also plant total
    
    {'id_suffix': '_12_138_2_3', 'total_type': 'combustion turbine extra', 'years': range(1994,1997)}, # pudl id 2281  # pause for 2 years
    {'id_suffix': '_12_138_4_1', 'total_type': 'combustion turbine extra', 'years': range(1999,2000)}, # pudl id 2281
    {'id_suffix': '_12_138_2_3', 'total_type': 'combustion turbine extra', 'years': range(2000,2001)}, # pudl id 2281

    {'id_suffix': '_12_195_1_5', 'total_type': 'utility owned total', 'years': range(1994,2008)}, # pudl id 503  # was plant total, is also technically plant total...
    {'id_suffix': '_12_195_1_3', 'total_type': 'utility owned total', 'years': range(2008,2011)}, # pudl id 503
    {'id_suffix': '_12_195_1_1', 'total_type': 'utility owned total', 'years': range(2011,2019)}, # pudl id 503
   
    {'id_suffix': '_12_195_3_4', 'total_type': 'utility owned total', 'years': range(1994,2011)}, # pudl id 473  # was plant total, might also be plant total
    {'id_suffix': '_12_195_2_3', 'total_type': 'utility owned total', 'years': range(2011,2018)}, # pudl id 473
    {'id_suffix': '_12_195_2_2', 'total_type': 'utility owned total', 'years': range(2018,2019)}, # pudl id 473 # but there is wierdness with the w31 and w32
    
    {'id_suffix': '_12_195_2_5', 'total_type': 'unit total', 'years': range(2008,2011)}, # pudl id 473
    {'id_suffix': '_12_195_2_1', 'total_type': 'unit total', 'years': range(2011,2018)}, # pudl id 473
    
    {'id_suffix': '_12_195_3_5', 'total_type': 'plant total', 'years': range(1994,2006)}, # pudl id 1166
    
    {'id_suffix': '_12_195_5_5', 'total_type': 'unit total', 'years': range(2004,2006)}, # pudl id 343
    {'id_suffix': '_12_195_5_3', 'total_type': 'unit total', 'years': range(2006,2011)}, # pudl id 343
    {'id_suffix': '_12_195_3_3', 'total_type': 'plant total', 'years': range(2018,2019)}, # pudl id 343

    {'id_suffix': '_12_57_5_1', 'total_type': 'utility owned total', 'years': range(1994,1995)}, # pudl id 661  # was plant total  # doesn't add up
    {'id_suffix': '_12_57_4_1', 'total_type': 'utility owned total', 'years': range(1995,1998)}, # pudl id 661  # skips a year
    {'id_suffix': '_12_57_4_1', 'total_type': 'utility owned total', 'years': range(1999,2009)}, # pudl id 661

    {'id_suffix': '_12_57_5_3', 'total_type': 'utility owned total', 'years': range(1994,1995)}, # pudl id 257  # was plant total
    {'id_suffix': '_12_57_5_1', 'total_type': 'utility owned total', 'years': range(1995,2009)}, # pudl id 257  # doesn't add up

    {'id_suffix': '_12_193_9_4', 'total_type': 'utility owned total', 'years': range(1995,1996)}, # pudl id 443

    {'id_suffix': '_12_281_0_2', 'total_type': 'utility owned total', 'years': range(2002,2007)}, # pudl id 1110  # was plant total and maybe is

    {'id_suffix': '_12_89_2_5', 'total_type': 'utility owned total', 'years': range(2011,2019)},
    {'id_suffix': '_12_89_1_2', 'total_type': 'utility owned total', 'years': range(2019,2020)},
    # Coit peaker units - South Caroline EG
    {'id_suffix': '_12_159_4_4', 'total_type': 'utility owned total', 'years': range(1994,2002)}, # pudl id 121
    {'id_suffix': '_12_159_5_1', 'total_type': 'utility owned total', 'years': range(2002,2003)},
    {'id_suffix': '_12_159_4_4', 'total_type': 'utility owned total', 'years': range(2003,2007)},
    {'id_suffix': '_12_159_4_3', 'total_type': 'utility owned total', 'years': range(2007,2020)},
    # Jeffrey - Kansas Gas & Electric
    {'id_suffix': '_12_80_1_1', 'total_type': 'plant total', 'years': range(1994,1995)}, # pudl id 307
    {'id_suffix': '_12_191_1_4', 'total_type': 'plant total', 'years': range(1994,1995)},
    # Blackhawk units 3&4 - XXX
    {'id_suffix': '_12_194_0_3', 'total_type': 'utility owned total', 'years': range(1994,2010)}, # pudl id 1080
]

# pudl id 336 unclear which is the total
# pudl id 652 is fishy and kind of seems like a duplicate?
# pudl id 40 needs some attention....
# pudl id 410 unsure role of cge
# pudl id 167 unsure role of cge
# pudl id 316 unsure role of cge
# pudl id 611 unsure role of cge
# pudl id 470 in 2008 two totals?
# pudl id 363 gets confusing around 2008 
# pudl id 281 unt 2 in year 1999 might get double counted
# pudl id 1209 components don't add up
# pudl id 503 pulliam-common? with capcity 0 and in ~2004 pulliam 31 shows up
# pudl id 473 has "communal" row as well and in ~1997 w31, w32
# pudl id 661 is confusing which values are which
# pudl id 529 doesn't add up
# pudl id 610 confused by what this 100% ownership thing is...
# pudl id 90 confusing
# pudl id 183 confusing total value in 2011

### **Step 2.0:** Add a simple totals flag to the steam table

In [881]:
# Create copy of the steam table with fuel types merged in
steam_test = steam9.copy()

# Flag bad/total rows
flagged_steam = (
     steam_test
     .pipe(flag_totals)
     .pipe(backfill_years_by_capacity, col_name='total', replace=False, replace_with=True)
     .pipe(flag_plant_totals, col_name='total_type')
     .pipe(backfill_years_by_capacity, col_name='total_type', replace=None, replace_with='plant total')
     .pipe(categorize_bad_rows, f_list=fix_list)
     .drop('total', axis=1)
     .assign(primary_fuel = lambda x: x.primary_fuel.fillna('unknown'))
)

In [958]:
# Experiment with ones yet to be flagged
# common, combined, cbd, cmbd, general, all plants, &, exps.-all combustion, unknown....look for these and # 1 unit

test = flagged_steam[flagged_steam['plant_name_ferc1'].str.contains('&') & (flagged_steam['report_year']==1994)]
#test = flagged_steam[flagged_steam['plant_name_ferc1'].str.contains('saguaro')]
#flagged_steam.plant_name_ferc1.unique() # extract to excel and look for outliers.
test = flagged_steam[(flagged_steam['plant_id_pudl']==1209) & (flagged_steam['report_year'].isin(range(1994,2005)))]
#test = flagged_steam[(flagged_steam['plant_id_ferc1']==1023) & (flagged_steam['report_year']==1994)]
#test = flagged_steam[(flagged_steam['utility_id_ferc1']==138) & (flagged_steam['report_year']==1994)]

test[ferc_merge_cols + ['record_id', 'plant_id_pudl', 'plant_id_ferc1', 'plant_type', 'total_type', 'capacity_mw', 'utility_name_ferc1']]
#test
#flagged_steam[flagged_steam['record_id'].str.contains('_12_138_4_1')]

Unnamed: 0,report_year,utility_id_ferc1,plant_name_ferc1,record_id,plant_id_pudl,plant_id_ferc1,plant_type,total_type,capacity_mw,utility_name_ferc1
45,1994,9,pch bt 2&3,f1_steam_1994_12_9_0_3,1209,35,nuclear,,173.0,Atlantic City Electric Company
639,1994,135,p bottom- peco share,f1_steam_1994_12_135_1_2,1209,461,nuclear,,979.0,PECO Energy Company
688,1994,135,p bottom 100% (u),f1_steam_1994_12_135_3_5,1209,503,unknown,plant total,2373.0,PECO Energy Company
1427,1995,149,peach bottom,f1_steam_1995_12_149_3_5,1209,2365,nuclear,,979.0,Public Service Electric and Gas Company
1436,1995,149,peach bottom,f1_steam_1995_12_149_9_2,1209,2374,nuclear,,979.0,Public Service Electric and Gas Company
1676,1995,9,peach bt 2&3,f1_steam_1995_12_9_0_3,1209,35,nuclear,,183.4,Atlantic City Electric Company
2563,1995,135,p. bottom - peco shr,f1_steam_1995_12_135_1_2,1209,461,nuclear,,985.0,PECO Energy Company
3189,1996,135,p. bottom - peco shr,f1_steam_1996_12_135_1_2,1209,461,nuclear,,985.0,PECO Energy Company
3631,1996,149,peach bottom,f1_steam_1996_12_149_3_5,1209,2631,nuclear,,979.0,Public Service Electric and Gas Company
3868,1996,9,peach bt 2&3,f1_steam_1996_12_9_0_3,1209,35,nuclear,,183.4,Atlantic City Electric Company


### **Step 2.1 (optional):** Check reported totals against sum of available components
For utility-owned plant portions and entire plants

In [402]:
# Compare the reported totals with the sum of the reported components
flagged_steam_total_comp = compare_totals(flagged_steam, 'capacity_mw')

In [403]:
# Find miss-matching utility owned totals
uot = flagged_steam_total_comp[flagged_steam_total_comp['total_type']=='utility owned total']
uot_bad = uot[uot['capacity_mw_utility_owned_total_flag']==False]
print(int((len(uot_bad)/len(uot)*100)),'% of reported utility owned total values differ from the sum of their reported components')
uot_bad['plant_id_pudl'].unique()

49 % of reported utility owned total values differ from the sum of their reported components


array([  44, 1216,  121,  473,  661,  257,  542, 1110,  503,  281,  123])

In [404]:
# Find miss-matching plant totals
pt = flagged_steam_total_comp[flagged_steam_total_comp['total_type']=='plant total']
pt_bad = pt[pt['capacity_mw_plant_total_flag']==False]
print(int(len(pt_bad)/len(pt)*100),'% of reported plant total values differ from the sum of their reported components')
pt_bad['plant_id_pudl'].unique()

63 % of reported plant total values differ from the sum of their reported components


array([  16,  450,  288, 1087,  295,  307,  652, 1665,  123, 1209, 1166,
        529,  610,  336,  343,  473])

### **Step 2.2:** Custom aggregation based on the presense of nulls and/or totals rows

In [407]:
def col_aggregator(flag_df, agg_col):
    """
    Remove total rows from aggregation; use when there are nulls present in non-total rows. 
    
    This function looks at a table grouped by year, utility, and plant and determines whether to use any of the
    information from the total rows in an aggregation based on the column agg_col specified as a parameter.
    If there are no total rows, this function simply takes the sum of each group. If there are totals rows, 
    this function first looks to see if there is a utility owned total reported and then looks to see if there is
    a plant total reported.
    
    If the aggregation must resort to using a total row, a flag is created and returned alongside the aggregated value.
    These values will later get split apart and set as seperate columns (in the build_col_agg_df function).
    
    Args: 
        df (pandas.DataFrame): A flagged version of the cleaned ferc1_steam table with the column name 'total_type'
            specifying wither it is a utility owned total, unit total, or plant total.
        agg_col (str): The name of the column you'd like to aggregate by.
    
    """
    if flag_df.loc[flag_df['total_type'].isna()][agg_col].notna().all() & len(flag_df.loc[flag_df['total_type'].isna()]) > 0: 
            flag = None
            agg_value = flag_df.loc[flag_df['total_type'].isna()][agg_col].sum()
            return [agg_value, flag]
    else:
        if flag_df['total_type'].str.contains('utility owned total').any() & flag_df.loc[flag_df['total_type']=='utility owned total'][agg_col].notna().all():
            flag = 'used utility owned total'
            agg_value = flag_df.loc[flag_df['total_type']=='utility owned total'][agg_col].unique()[0]
            return [agg_value, flag]
        elif flag_df['total_type'].str.contains('plant total').any() & flag_df.loc[flag_df['total_type']=='plant total'][agg_col].notna().all():
            flag = 'used plant total pertains to more than one utility'
            agg_value = flag_df.loc[flag_df['total_type']=='plant total'][agg_col].unique()[0]
            return [agg_value, flag]
        elif flag_df.loc[flag_df['total_type'].isna()][agg_col].isna().all():
            flag = None
            agg_value = np.nan
            return [agg_value, flag]
        else:
            flag = 'aggregated with some null values'
            agg_value = flag_df.loc[flag_df['total_type'].isna()][agg_col].sum()
            return [agg_value, flag]

In [408]:
def build_col_agg_df(flag_df, agg_col):
    """Sort by field level.
    
    This function creates a mini aggregated dataframe based on a column specified in the parameters. 
    It runs the col_aggregator function so that the aggregations exclude total values unless there are
    gaps in the subcomponents--in which case it will first try and use a reported utility total and 
    then a plant total. The col_aggregator function returns both the aggregated value and flag to
    indicate whether it was aggregated based on a value from one of the total rows. This function turns
    the value and the flag (returned as a list within one column) to seperate columns.
    
    These column-based data aggregations can later be merged to form one large data aggregation column.
    
    Args: 
        df (pandas.DataFrame): A DataFrame....
        agg_col (str): The name of the column you'd like to aggregate by.
    Returns:
        pandas.DataFrame: A DataFrame with rows selected based on priority.
    
    """

    plant_util_group = flag_df.groupby(['report_year', 'utility_id_ferc1', 'plant_id_pudl', 'primary_fuel'])
    col_agg_series = plant_util_group.apply(lambda x: col_aggregator(x, agg_col))
    col_agg_df = pd.DataFrame(col_agg_series).reset_index()
    col_agg_df[[agg_col, f'{agg_col}_flag']] = pd.DataFrame(col_agg_df[0].tolist(), index=col_agg_df.index)
    col_agg_df = col_agg_df.drop(columns=[0])
    
    def combine_flags(ser):
        if ser.isna().all():
            return None
        else:
            return ', '.join([flag for flag in ser.unique() if flag != None])

    #Group by utility and fuel type
    util_fuel_df = (
        col_agg_df
        .groupby(['report_year', 'utility_id_ferc1', 'primary_fuel'])
        .agg({agg_col: 'sum',
              f'{agg_col}_flag': lambda x: combine_flags(x)})
        .reset_index()
    )
    
    return util_fuel_df

In [409]:
def aggregate_all_columns(df, col_list):
    agg_df = pd.DataFrame(columns=['report_year', 'utility_id_ferc1', 'primary_fuel'])
    for col in col_list:
        col_df = build_col_agg_df(df, col)
        agg_df = pd.merge(agg_df, col_df, on=['report_year', 'utility_id_ferc1', 'primary_fuel'], how='outer')
    return agg_df

In [410]:
complete_util_agg = aggregate_all_columns(flagged_steam, value_cols_no_cap+['capacity_mw'])

In [446]:
#complete_util_agg[complete_util_agg['primary_fuel']=='unknown']
complete_util_agg

Unnamed: 0,report_year,utility_id_ferc1,primary_fuel,net_generation_mwh,net_generation_mwh_flag,avg_num_employees,avg_num_employees_flag,capex_land,capex_land_flag,capex_equipment,...,opex_boiler,opex_boiler_flag,opex_plants,opex_plants_flag,opex_misc_steam,opex_misc_steam_flag,opex_production_total,opex_production_total_flag,capacity_mw,capacity_mw_flag
0,1994,1,coal,9.119496e+06,used utility owned total,462.0,used plant total pertains to more than one uti...,6469962.0,used utility owned total,5.300235e+08,...,6560762.0,used utility owned total,1150468.0,used utility owned total,1036572.0,used utility owned total,1.839901e+08,used utility owned total,1300.0,used utility owned total
1,1994,2,coal,5.028391e+07,,2031.0,,11310431.0,,3.979206e+09,...,52555239.0,,12821236.0,,3462156.0,,1.216481e+09,,10148.0,
2,1994,2,gas,3.388883e+06,,154.0,,331497.0,,1.251836e+08,...,2866852.0,,692054.0,,458325.0,,3.882226e+07,,614.0,
3,1994,2,nuclear,1.320825e+07,,894.0,,1454214.0,,1.397899e+09,...,18601315.0,,8937024.0,,8814524.0,,1.902891e+08,,1777.0,
4,1994,2,unknown,4.703416e+05,,0.0,,183261.0,,9.493402e+07,...,1361835.0,,252704.0,,207816.0,,6.494924e+06,,115.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7738,2019,519,gas,6.447320e+05,,16.0,,2121938.0,,1.539476e+08,...,0.0,,1852337.0,,0.0,,1.736698e+07,,188.0,
7739,2019,529,coal,0.000000e+00,,0.0,,34510845.0,aggregated with some null values,1.483039e+09,...,0.0,,0.0,,0.0,,0.000000e+00,,0.0,aggregated with some null values
7740,2019,531,coal,1.611511e+07,,670.0,,10225928.0,,3.096751e+09,...,76179710.0,,18827008.0,,2372785.0,,4.756402e+08,,3671.0,
7741,2019,531,gas,2.469732e+06,,42.0,,2852666.0,,1.135614e+09,...,658775.0,,10723561.0,,0.0,,8.997181e+07,,1224.0,
