Skip to content

Commit

Permalink
Revert to fully specifying pudl.helpers for clarity
Browse files Browse the repository at this point in the history
  • Loading branch information
zaneselvans committed Jun 18, 2019
1 parent 209e8d5 commit 97f8035
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 32 deletions.
38 changes: 18 additions & 20 deletions pudl/transform/eia923.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import numpy as np
import pudl
import pudl.constants as pc
from pudl.helpers import fix_int_na, fix_eia_na, cleanstrings, strip_lower
from pudl.helpers import convert_to_date, cleanstrings_series

logger = logging.getLogger(__name__)
###############################################################################
Expand Down Expand Up @@ -112,7 +110,7 @@ def _coalmine_cleanup(cmi_df):

# Transform coalmine names to a canonical form to reduce duplicates:
# No leading or trailing whitespace:
cmi_df = strip_lower(cmi_df, columns=['mine_name'])
cmi_df = pudl.helpers.strip_lower(cmi_df, columns=['mine_name'])
# remove all internal non-alphanumeric characters:
cmi_df['mine_name'] = \
cmi_df['mine_name'].replace('[^a-zA-Z0-9 -]', '', regex=True)
Expand Down Expand Up @@ -227,17 +225,17 @@ def generation_fuel(eia923_dfs, eia923_transformed_dfs):
# Convert the EIA923 DataFrame from yearly to monthly records.
gf_df = _yearly_to_monthly_records(gf_df, pc.month_dict_eia923)
# Replace the EIA923 NA value ('.') with a real NA value.
gf_df = fix_eia_na(gf_df)
gf_df = pudl.helpers.fix_eia_na(gf_df)
# Remove "State fuel-level increment" records... which don't pertain to
# any particular plant (they have plant_id_eia == operator_id == 99999)
gf_df = gf_df[gf_df.plant_id_eia != 99999]

gf_df['fuel_type_code_pudl'] = \
cleanstrings_series(gf_df.fuel_type,
pc.fuel_type_eia923_gen_fuel_simple_map)
pudl.helpers.cleanstrings_series(gf_df.fuel_type,
pc.fuel_type_eia923_gen_fuel_simple_map)

# Convert Year/Month columns into a single Date column...
gf_df = convert_to_date(gf_df)
gf_df = pudl.helpers.convert_to_date(gf_df)

eia923_transformed_dfs['generation_fuel_eia923'] = gf_df

Expand Down Expand Up @@ -279,14 +277,14 @@ def boiler_fuel(eia923_dfs, eia923_transformed_dfs):
bf_df = _yearly_to_monthly_records(
bf_df, pc.month_dict_eia923)
bf_df['fuel_type_code_pudl'] = \
cleanstrings_series(
pudl.helpers.cleanstrings_series(
bf_df.fuel_type_code,
pc.fuel_type_eia923_boiler_fuel_simple_map)
# Replace the EIA923 NA value ('.') with a real NA value.
bf_df = fix_eia_na(bf_df)
bf_df = pudl.helpers.fix_eia_na(bf_df)

# Convert Year/Month columns into a single Date column...
bf_df = convert_to_date(bf_df)
bf_df = pudl.helpers.convert_to_date(bf_df)

eia923_transformed_dfs['boiler_fuel_eia923'] = bf_df

Expand Down Expand Up @@ -330,10 +328,10 @@ def generation(eia923_dfs, eia923_transformed_dfs):
generation_df = _yearly_to_monthly_records(
generation_df, pc.month_dict_eia923)
# Replace the EIA923 NA value ('.') with a real NA value.
generation_df = fix_eia_na(generation_df)
generation_df = pudl.helpers.fix_eia_na(generation_df)

# Convert Year/Month columns into a single Date column...
generation_df = convert_to_date(generation_df)
generation_df = pudl.helpers.convert_to_date(generation_df)

eia923_transformed_dfs['generation_eia923'] = generation_df

Expand Down Expand Up @@ -500,19 +498,19 @@ def fuel_reciepts_costs(eia923_dfs, eia923_transformed_dfs):
'mine_type_code', 'county_id_fips']).
drop(cols_to_drop, axis=1).
# Replace the EIA923 NA value ('.') with a real NA value.
pipe(fix_eia_na).
pipe(pudl.helpers.fix_eia_na).
# These come in ALL CAPS from EIA...
pipe(strip_lower, columns=['supplier_name']).
pipe(fix_int_na, columns=['contract_expiration_date', ]).
pipe(pudl.helpers.strip_lower, columns=['supplier_name']).
pipe(pudl.helpers.fix_int_na, columns=['contract_expiration_date', ]).
assign(
# Standardize case on transportation codes -- all upper case!
primary_transportation_mode_code=lambda x: x.primary_transportation_mode_code.str.upper(),
secondary_transportation_mode_code=lambda x: x.secondary_transportation_mode_code.str.upper(),
fuel_cost_per_mmbtu=lambda x: x.fuel_cost_per_mmbtu / 100,
fuel_group_code=lambda x: x.fuel_group_code.str.lower().str.replace(' ', '_'),
fuel_type_code_pudl=lambda x: cleanstrings_series(
fuel_type_code_pudl=lambda x: pudl.helpers.cleanstrings_series(
x.energy_source_code, pc.energy_source_eia_simple_map),
fuel_group_code_simple=lambda x: cleanstrings_series(
fuel_group_code_simple=lambda x: pudl.helpers.cleanstrings_series(
x.fuel_group_code, pc.fuel_group_eia923_simple_map),
contract_expiration_month=lambda x: x.contract_expiration_date.apply(
lambda y: y[:-2] if y != '' else y)).
Expand All @@ -524,12 +522,12 @@ def fuel_reciepts_costs(eia923_dfs, eia923_transformed_dfs):
lambda y: '20' + y[-2:] if y != '' else y)).
# We will create our own real date field, so chuck this one.
drop('contract_expiration_date', axis=1).
pipe(convert_to_date,
pipe(pudl.helpers.convert_to_date,
date_col='contract_expiration_date',
year_col='contract_expiration_year',
month_col='contract_expiration_month').
pipe(convert_to_date).
pipe(cleanstrings,
pipe(pudl.helpers.convert_to_date).
pipe(pudl.helpers.cleanstrings,
['natural_gas_transport_code',
'natural_gas_delivery_contract_type_code'],
[{'firm': ['F'], 'interruptible': ['I']},
Expand Down
25 changes: 13 additions & 12 deletions pudl/transform/ferc1.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import pudl
import pudl.constants as pc
from pudl.settings import SETTINGS
from pudl.helpers import strip_lower, cleanstrings, fix_int_na

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -212,11 +211,11 @@ def _plants_steam_clean(ferc1_steam_df):
ferc1_steam_df = (
ferc1_steam_df.
pipe(_clean_cols, 'f1_steam').
pipe(strip_lower, ['plant_name']).
pipe(pudl.helpers.strip_lower, ['plant_name']).
# Take the messy free-form construction_type and plant_kind fields, and
# do our best to map them to some canonical categories... this is
# necessarily imperfect:
pipe(cleanstrings, ['type_const', 'plant_kind'],
pipe(pudl.helpers.cleanstrings, ['type_const', 'plant_kind'],
[pc.ferc1_const_type_strings, pc.ferc1_plant_kind_strings],
unmapped='')
)
Expand Down Expand Up @@ -292,7 +291,8 @@ def _plants_steam_assign_plant_ids(ferc1_steam_df, ferc1_fuel_df):
# scikit-learn still doesn't deal well with NA values (this will be fixed
# eventually). We need to massage the type and missing data for the
# Classifier to work.
ferc1_steam_df = fix_int_na(ferc1_steam_df, columns=['construction_year'])
ferc1_steam_df = pudl.helpers.fix_int_na(
ferc1_steam_df, columns=['construction_year'])

# Grab fuel consumption proportions for use in assigning plant IDs:
fuel_fractions = fuel_by_plant_ferc1(ferc1_fuel_df)
Expand Down Expand Up @@ -467,11 +467,11 @@ def fuel(ferc1_raw_dfs, ferc1_transformed_dfs):
_clean_cols(ferc1_raw_dfs['fuel_ferc1'], 'f1_fuel').
# Standardize plant_name capitalization and remove leading/trailing
# white space -- necessary b/c plant_name is part of many foreign keys.
pipe(strip_lower, ['plant_name']).
pipe(pudl.helpers.strip_lower, ['plant_name']).
# Take the messy free-form fuel & fuel_unit fields, and do our best to
# map them to some canonical categories... this is necessarily
# imperfect:
pipe(cleanstrings, ['fuel', 'fuel_unit'],
pipe(pudl.helpers.cleanstrings, ['fuel', 'fuel_unit'],
[pc.ferc1_fuel_strings, pc.ferc1_fuel_unit_strings],
unmapped='').
# Fuel cost per kWh is a per-unit value that doesn't make sense to
Expand Down Expand Up @@ -594,7 +594,7 @@ def plants_small(ferc1_raw_dfs, ferc1_transformed_dfs):
ferc1_small_df = ferc1_raw_dfs['plants_small_ferc1']
# Standardize plant_name_raw capitalization and remove leading/trailing
# white space -- necessary b/c plant_name_raw is part of many foreign keys.
ferc1_small_df = strip_lower(
ferc1_small_df = pudl.helpers.strip_lower(
ferc1_small_df, ['plant_name', 'kind_of_fuel']
)

Expand Down Expand Up @@ -646,7 +646,8 @@ def plants_small(ferc1_raw_dfs, ferc1_transformed_dfs):

# Standardize plant_name capitalization and remove leading/trailing white
# space, so that plant_name matches formatting of plant_name_raw
ferc1_small_df = strip_lower(ferc1_small_df, ['plant_name_clean'])
ferc1_small_df = pudl.helpers.strip_lower(
ferc1_small_df, ['plant_name_clean'])

# in order to create one complete column of plant names, we have to use the
# cleaned plant names when available and the original plant names when the
Expand Down Expand Up @@ -708,8 +709,8 @@ def plants_hydro(ferc1_raw_dfs, ferc1_transformed_dfs):
_clean_cols(ferc1_raw_dfs['plants_hydro_ferc1'], 'f1_hydro').
# Standardize plant_name capitalization and remove leading/trailing
# white space -- necessary b/c plant_name is part of many foreign keys.
pipe(strip_lower, ['plant_name']).
pipe(cleanstrings, ['plant_const'], [pc.ferc1_const_type_strings],
pipe(pudl.helpers.strip_lower, ['plant_name']).
pipe(pudl.helpers.cleanstrings, ['plant_const'], [pc.ferc1_const_type_strings],
unmapped='').
assign(
# Converting kWh to MWh
Expand Down Expand Up @@ -793,9 +794,9 @@ def plants_pumped_storage(ferc1_raw_dfs, ferc1_transformed_dfs):
'f1_pumped_storage').
# Standardize plant_name capitalization and remove leading/trailing
# white space -- necessary b/c plant_name is part of many foreign keys.
pipe(strip_lower, ['plant_name']).
pipe(pudl.helpers.strip_lower, ['plant_name']).
# Clean up the messy plant construction type column:
pipe(cleanstrings, ['plant_kind'], [pc.ferc1_const_type_strings],
pipe(pudl.helpers.cleanstrings, ['plant_kind'], [pc.ferc1_const_type_strings],
unmapped='').
assign(
# Converting from kW/kWh to MW/MWh
Expand Down

0 comments on commit 97f8035

Please sign in to comment.