# Test ETL for New Years of Data

## Setup

In [1]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [23]:
# Standard libraries
import logging
import os
import pathlib
import sys

# 3rd party libraries
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy as sa
import yaml

# Local libraries
import pudl

In [3]:
sns.set()
%matplotlib inline
mpl.rcParams['figure.figsize'] = (10,4)
mpl.rcParams['figure.dpi'] = 150
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [4]:
# Build a single stdout handler that prints only the log message text.
formatter = logging.Formatter('%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger: INFO and above, with the handler list replaced
# outright so re-running this cell does not stack duplicate handlers.
logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

In [20]:
# File name of the ETL settings YAML to test; it is looked up inside the
# workspace's settings directory when the settings are loaded below.
name_of_your_settings_file = 'etl_full_no_cems.yml'

In [49]:
# Look up the PUDL workspace defaults, then read the chosen ETL settings YAML
# from the workspace's settings directory.
pudl_settings = pudl.workspace.setup.get_defaults()

# Join with pathlib's `/` operator instead of string concatenation: this works
# whether settings_dir is a str or a Path, and does not hard-code the separator.
settings_path = pathlib.Path(pudl_settings['settings_dir']) / name_of_your_settings_file
with settings_path.open() as f:
    script_settings = yaml.safe_load(f)

# Only the first data package bundle in the settings file is exercised here.
etl_settings = script_settings['datapkg_bundle_settings'][0]
#pudl_engine = sa.create_engine(pudl_settings['pudl_db'])

## Test Extract

In [73]:
# Pull the FERC Form 1 parameters out of the first dataset entry of the ETL
# settings loaded above.
ferc1_inputs = etl_settings['datasets'][0]['ferc1']
ferc1_years, ferc1_tables = (
    ferc1_inputs[key] for key in ('ferc1_years', 'ferc1_tables')
)

# Informational only: flag a settings file that requests no years or no tables.
if not (ferc1_years and ferc1_tables):
    print('Not loading FERC1')

In [75]:
# Run the FERC Form 1 extract step for the tables and years requested in the
# settings file. The result is indexed by table name further down
# (e.g. ferc1_raw_dfs['purchased_power_ferc1']).
ferc1_raw_dfs = pudl.extract.ferc1.extract(
    ferc1_tables=ferc1_tables,
    ferc1_years=ferc1_years,
    pudl_settings=pudl_settings
)

Converting extracted FERC Form 1 table fuel_ferc1 into a pandas DataFrame.
Converting extracted FERC Form 1 table plants_steam_ferc1 into a pandas DataFrame.
Converting extracted FERC Form 1 table plants_small_ferc1 into a pandas DataFrame.
Converting extracted FERC Form 1 table plants_hydro_ferc1 into a pandas DataFrame.
Converting extracted FERC Form 1 table plants_pumped_storage_ferc1 into a pandas DataFrame.
Converting extracted FERC Form 1 table plant_in_service_ferc1 into a pandas DataFrame.
Converting extracted FERC Form 1 table purchased_power_ferc1 into a pandas DataFrame.


## Test Transform

In [76]:
# Run the FERC Form 1 transform step on the raw extracted tables. The result is
# indexed by table name further down
# (e.g. ferc1_transformed_dfs['purchased_power_ferc1']).
ferc1_transformed_dfs = pudl.transform.ferc1.transform(
    ferc1_raw_dfs, ferc1_tables=ferc1_tables
)

Transforming raw FERC Form 1 dataframe for loading into fuel_ferc1
Transforming raw FERC Form 1 dataframe for loading into plants_steam_ferc1
Identifying distinct large FERC plants for ID assignment.


  df.sum(level=0, axis=1).loc[:, 'fuel_mmbtu'],
  df.sum(level=0, axis=1).loc[:, 'fuel_cost'], right_index=True, left_index=True).


Successfully associated 21508 of 28380 (75.79%) FERC Form 1 plant records with multi-year plant entities.
Assigning IDs to multi-year FERC plant entities.
Identified 4387 orphaned FERC plant records. Adding orphans to list of plant entities.
Successfully Identified 1942 multi-year plant entities.
Found report_year=1998 2 times in plant_id_ferc1=203
Found report_year=1994 2 times in plant_id_ferc1=308
Found report_year=1995 2 times in plant_id_ferc1=308
Found report_year=1996 2 times in plant_id_ferc1=308
Found report_year=1997 2 times in plant_id_ferc1=308
Found report_year=1998 2 times in plant_id_ferc1=308
Found report_year=1999 2 times in plant_id_ferc1=308
Found report_year=2000 2 times in plant_id_ferc1=308
Found report_year=2001 2 times in plant_id_ferc1=308
Found report_year=2002 2 times in plant_id_ferc1=308
Found report_year=2003 2 times in plant_id_ferc1=308
Found report_year=2004 2 times in plant_id_ferc1=308
Found report_year=2005 2 times in plant_id_ferc1=308
Found report_

  warn(msg)


Transforming raw FERC Form 1 dataframe for loading into plants_hydro_ferc1
Transforming raw FERC Form 1 dataframe for loading into plants_pumped_storage_ferc1
Transforming raw FERC Form 1 dataframe for loading into plant_in_service_ferc1
0.0240% of unpacked records were duplicates, and discarded.
Col: begin_yr_bal, Cat: starting_balance
Col: addition, Cat: additions
Col: retirements, Cat: retirements
Col: adjustments, Cat: adjustments
Col: transfers, Cat: transfers
Col: yr_end_bal, Cat: ending_balance
Transforming raw FERC Form 1 dataframe for loading into purchased_power_ferc1
7 duplicate record_id values found in pre-transform table f1_purchased_pwr: ['f1_purchased_pwr_1998_12_238_0_1' 'f1_purchased_pwr_1998_12_238_0_2'
 'f1_purchased_pwr_1998_12_238_0_3' 'f1_purchased_pwr_1998_12_238_0_15'
 'f1_purchased_pwr_1998_12_238_0_4' 'f1_purchased_pwr_1998_12_238_0_5'
 'f1_purchased_pwr_2000_12_148_6_5'].


In [91]:
# Inspect the raw purchased power rows whose record_id occurs more than once
# (keep=False marks every member of a duplicate group), showing the last four
# after sorting by record_id.
test = ferc1_raw_dfs['purchased_power_ferc1']
(
    test
    .loc[lambda df: df['record_id'].duplicated(keep=False)]
    .sort_values('record_id')
    .tail(4)
)

Unnamed: 0,respondent_id,report_year,spplmnt_num,row_number,row_seq,row_prvlg,athrty_co_name,sttstcl_clssfctn,rtsched_trffnbr,avgmth_bill_dmnd,avgmth_ncp_dmnd,avgmth_cp_dmnd,mwh_purchased,mwh_recv,mwh_delvd,dmnd_charges,erg_charges,othr_charges,settlement_tot,athrty_co_name_f,sttstcl_clssfctn_f,rtsched_trffnbr_f,avgmth_bill_dmnd_f,avgmth_ncp_dmnd_f,avgmth_cp_dmnd_f,mwh_purchased_f,mwh_recv_f,mwh_delvd_f,dmnd_charges_f,erg_charges_f,othr_charges_f,settlement_tot_f,report_prd,record_id
37287,238,1998,0,5,5,N,Pacificorp,RQ,Contract,,,,220764.0,,,,4625760.0,,4625760.0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,f1_purchased_pwr_1998_12_238_0_5
37935,238,1998,0,5,5,N,Pacificorp,RQ,Contract,,,,220764.0,,,,4625760.0,,4625760.0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,f1_purchased_pwr_1998_12_238_0_5
51389,148,2000,6,5,5,N,TXU Marketing,OS,WSPP,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,12,f1_purchased_pwr_2000_12_148_6_5
51391,148,2000,6,5,5,N,"Enron Power Marketing, Inc.",OS,WSPP,,,,7234.0,,,,338776.0,,338776.0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,f1_purchased_pwr_2000_12_148_6_5


In [92]:
# Check how the transform step handled the duplicate record_id values that were
# reported for the raw purchased power table.
test = ferc1_transformed_dfs['purchased_power_ferc1']

# A notebook cell only renders its final expression, so this intermediate check
# has to be displayed explicitly or its result is silently discarded.
# `display` is provided by IPython in notebook sessions without an import.
display(test[test['record_id'].duplicated()].sort_values('record_id'))

# Rows that remain for one of the record_id values flagged as duplicated.
test[test['record_id'] == 'f1_purchased_pwr_2000_12_148_6_5']

Unnamed: 0,utility_id_ferc1,report_year,seller_name,purchase_type,tariff,billing_demand_mw,non_coincident_peak_demand_mw,coincident_peak_demand_mw,purchased_mwh,received_mwh,delivered_mwh,demand_charges,energy_charges,other_charges,total_settlement,record_id
51391,148,2000,"Enron Power Marketing, Inc.",other_service,WSPP,,,,7234.0,0.0,0.0,0.0,338776.0,0.0,338776.0,f1_purchased_pwr_2000_12_148_6_5


AttributeError: 'DataFrame' object has no attribute 'plant_id_pudl'