From 97f803561dce96f683caf07c2b7ce232913b852a Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Mon, 17 Jun 2019 18:23:25 -0600 Subject: [PATCH] Revert to fully specifying pudl.helpers for clarity --- pudl/transform/eia923.py | 38 ++++++++++++++++++-------------------- pudl/transform/ferc1.py | 25 +++++++++++++------------ 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/pudl/transform/eia923.py b/pudl/transform/eia923.py index 8a169537ec..0cf10d7e11 100644 --- a/pudl/transform/eia923.py +++ b/pudl/transform/eia923.py @@ -6,8 +6,6 @@ import numpy as np import pudl import pudl.constants as pc -from pudl.helpers import fix_int_na, fix_eia_na, cleanstrings, strip_lower -from pudl.helpers import convert_to_date, cleanstrings_series logger = logging.getLogger(__name__) ############################################################################### @@ -112,7 +110,7 @@ def _coalmine_cleanup(cmi_df): # Transform coalmine names to a canonical form to reduce duplicates: # No leading or trailing whitespace: - cmi_df = strip_lower(cmi_df, columns=['mine_name']) + cmi_df = pudl.helpers.strip_lower(cmi_df, columns=['mine_name']) # remove all internal non-alphanumeric characters: cmi_df['mine_name'] = \ cmi_df['mine_name'].replace('[^a-zA-Z0-9 -]', '', regex=True) @@ -227,17 +225,17 @@ def generation_fuel(eia923_dfs, eia923_transformed_dfs): # Convert the EIA923 DataFrame from yearly to monthly records. gf_df = _yearly_to_monthly_records(gf_df, pc.month_dict_eia923) # Replace the EIA923 NA value ('.') with a real NA value. - gf_df = fix_eia_na(gf_df) + gf_df = pudl.helpers.fix_eia_na(gf_df) # Remove "State fuel-level increment" records... which don't pertain to # any particular plant (they have plant_id_eia == operator_id == 99999) gf_df = gf_df[gf_df.plant_id_eia != 99999] gf_df['fuel_type_code_pudl'] = \ - cleanstrings_series(gf_df.fuel_type, - pc.fuel_type_eia923_gen_fuel_simple_map) + pudl.helpers.cleanstrings_series(gf_df.fuel_type, + pc.fuel_type_eia923_gen_fuel_simple_map) # Convert Year/Month columns into a single Date column... - gf_df = convert_to_date(gf_df) + gf_df = pudl.helpers.convert_to_date(gf_df) eia923_transformed_dfs['generation_fuel_eia923'] = gf_df @@ -279,14 +277,14 @@ def boiler_fuel(eia923_dfs, eia923_transformed_dfs): bf_df = _yearly_to_monthly_records( bf_df, pc.month_dict_eia923) bf_df['fuel_type_code_pudl'] = \ - cleanstrings_series( + pudl.helpers.cleanstrings_series( bf_df.fuel_type_code, pc.fuel_type_eia923_boiler_fuel_simple_map) # Replace the EIA923 NA value ('.') with a real NA value. - bf_df = fix_eia_na(bf_df) + bf_df = pudl.helpers.fix_eia_na(bf_df) # Convert Year/Month columns into a single Date column... - bf_df = convert_to_date(bf_df) + bf_df = pudl.helpers.convert_to_date(bf_df) eia923_transformed_dfs['boiler_fuel_eia923'] = bf_df @@ -330,10 +328,10 @@ def generation(eia923_dfs, eia923_transformed_dfs): generation_df = _yearly_to_monthly_records( generation_df, pc.month_dict_eia923) # Replace the EIA923 NA value ('.') with a real NA value. - generation_df = fix_eia_na(generation_df) + generation_df = pudl.helpers.fix_eia_na(generation_df) # Convert Year/Month columns into a single Date column... - generation_df = convert_to_date(generation_df) + generation_df = pudl.helpers.convert_to_date(generation_df) eia923_transformed_dfs['generation_eia923'] = generation_df @@ -500,19 +498,19 @@ def fuel_reciepts_costs(eia923_dfs, eia923_transformed_dfs): 'mine_type_code', 'county_id_fips']). drop(cols_to_drop, axis=1). # Replace the EIA923 NA value ('.') with a real NA value. - pipe(fix_eia_na). + pipe(pudl.helpers.fix_eia_na). # These come in ALL CAPS from EIA... - pipe(strip_lower, columns=['supplier_name']). - pipe(fix_int_na, columns=['contract_expiration_date', ]). + pipe(pudl.helpers.strip_lower, columns=['supplier_name']). + pipe(pudl.helpers.fix_int_na, columns=['contract_expiration_date', ]). assign( # Standardize case on transportaion codes -- all upper case! primary_transportation_mode_code=lambda x: x.primary_transportation_mode_code.str.upper(), secondary_transportation_mode_code=lambda x: x.secondary_transportation_mode_code.str.upper(), fuel_cost_per_mmbtu=lambda x: x.fuel_cost_per_mmbtu / 100, fuel_group_code=lambda x: x.fuel_group_code.str.lower().str.replace(' ', '_'), - fuel_type_code_pudl=lambda x: cleanstrings_series( + fuel_type_code_pudl=lambda x: pudl.helpers.cleanstrings_series( x.energy_source_code, pc.energy_source_eia_simple_map), - fuel_group_code_simple=lambda x: cleanstrings_series( + fuel_group_code_simple=lambda x: pudl.helpers.cleanstrings_series( x.fuel_group_code, pc.fuel_group_eia923_simple_map), contract_expiration_month=lambda x: x.contract_expiration_date.apply( lambda y: y[:-2] if y != '' else y)). @@ -524,12 +522,12 @@ def fuel_reciepts_costs(eia923_dfs, eia923_transformed_dfs): lambda y: '20' + y[-2:] if y != '' else y)). # Now that we will create our own real date field, so chuck this one. drop('contract_expiration_date', axis=1). - pipe(convert_to_date, + pipe(pudl.helpers.convert_to_date, date_col='contract_expiration_date', year_col='contract_expiration_year', month_col='contract_expiration_month'). - pipe(convert_to_date). - pipe(cleanstrings, + pipe(pudl.helpers.convert_to_date). + pipe(pudl.helpers.cleanstrings, ['natural_gas_transport_code', 'natural_gas_delivery_contract_type_code'], [{'firm': ['F'], 'interruptible': ['I']}, diff --git a/pudl/transform/ferc1.py b/pudl/transform/ferc1.py index fbd6052abb..77c9b4b537 100644 --- a/pudl/transform/ferc1.py +++ b/pudl/transform/ferc1.py @@ -31,7 +31,6 @@ import pudl import pudl.constants as pc from pudl.settings import SETTINGS -from pudl.helpers import strip_lower, cleanstrings, fix_int_na logger = logging.getLogger(__name__) @@ -212,11 +211,11 @@ def _plants_steam_clean(ferc1_steam_df): ferc1_steam_df = ( ferc1_steam_df. pipe(_clean_cols, 'f1_steam'). - pipe(strip_lower, ['plant_name']). + pipe(pudl.helpers.strip_lower, ['plant_name']). # Take the messy free-form construction_type and plant_kind fields, and # do our best to map them to some canonical categories... this is # necessarily imperfect: - pipe(cleanstrings, ['type_const', 'plant_kind'], + pipe(pudl.helpers.cleanstrings, ['type_const', 'plant_kind'], [pc.ferc1_const_type_strings, pc.ferc1_plant_kind_strings], unmapped='') ) @@ -292,7 +291,8 @@ def _plants_steam_assign_plant_ids(ferc1_steam_df, ferc1_fuel_df): # scikit-learn still doesn't deal well with NA values (this will be fixed # eventually) We need to massage the type and missing data for the # Classifier to work. - ferc1_steam_df = fix_int_na(ferc1_steam_df, columns=['construction_year']) + ferc1_steam_df = pudl.helpers.fix_int_na( + ferc1_steam_df, columns=['construction_year']) # Grab fuel consumption proportions for use in assigning plant IDs: fuel_fractions = fuel_by_plant_ferc1(ferc1_fuel_df) @@ -467,11 +467,11 @@ def fuel(ferc1_raw_dfs, ferc1_transformed_dfs): _clean_cols(ferc1_raw_dfs['fuel_ferc1'], 'f1_fuel'). # Standardize plant_name capitalization and remove leading/trailing # white space -- necesary b/c plant_name is part of many foreign keys. - pipe(strip_lower, ['plant_name']). + pipe(pudl.helpers.strip_lower, ['plant_name']). # Take the messy free-form fuel & fuel_unit fields, and do our best to # map them to some canonical categories... this is necessarily # imperfect: - pipe(cleanstrings, ['fuel', 'fuel_unit'], + pipe(pudl.helpers.cleanstrings, ['fuel', 'fuel_unit'], [pc.ferc1_fuel_strings, pc.ferc1_fuel_unit_strings], unmapped=''). # Fuel cost per kWh is a per-unit value that doesn't make sense to @@ -594,7 +594,7 @@ def plants_small(ferc1_raw_dfs, ferc1_transformed_dfs): ferc1_small_df = ferc1_raw_dfs['plants_small_ferc1'] # Standardize plant_name_raw capitalization and remove leading/trailing # white space -- necesary b/c plant_name_raw is part of many foreign keys. - ferc1_small_df = strip_lower( + ferc1_small_df = pudl.helpers.strip_lower( ferc1_small_df, ['plant_name', 'kind_of_fuel'] ) @@ -646,7 +646,8 @@ def plants_small(ferc1_raw_dfs, ferc1_transformed_dfs): # Standardize plant_name capitalization and remove leading/trailing white # space, so that plant_name matches formatting of plant_name_raw - ferc1_small_df = strip_lower(ferc1_small_df, ['plant_name_clean']) + ferc1_small_df = pudl.helpers.strip_lower( + ferc1_small_df, ['plant_name_clean']) # in order to create one complete column of plant names, we have to use the # cleaned plant names when available and the orignial plant names when the @@ -708,8 +709,8 @@ def plants_hydro(ferc1_raw_dfs, ferc1_transformed_dfs): _clean_cols(ferc1_raw_dfs['plants_hydro_ferc1'], 'f1_hydro'). # Standardize plant_name capitalization and remove leading/trailing # white space -- necesary b/c plant_name is part of many foreign keys. - pipe(strip_lower, ['plant_name']). - pipe(cleanstrings, ['plant_const'], [pc.ferc1_const_type_strings], + pipe(pudl.helpers.strip_lower, ['plant_name']). + pipe(pudl.helpers.cleanstrings, ['plant_const'], [pc.ferc1_const_type_strings], unmapped=''). assign( # Converting kWh to MWh @@ -793,9 +794,9 @@ def plants_pumped_storage(ferc1_raw_dfs, ferc1_transformed_dfs): 'f1_pumped_storage'). # Standardize plant_name capitalization and remove leading/trailing # white space -- necesary b/c plant_name is part of many foreign keys. - pipe(strip_lower, ['plant_name']). + pipe(pudl.helpers.strip_lower, ['plant_name']). # Clean up the messy plant construction type column: - pipe(cleanstrings, ['plant_kind'], [pc.ferc1_const_type_strings], + pipe(pudl.helpers.cleanstrings, ['plant_kind'], [pc.ferc1_const_type_strings], unmapped=''). assign( # Converting from kW/kWh to MW/MWh