## Load the data

In [2]:
from dagster import AssetKey
from pudl.etl import defs
import pandas as pd
from pudl.helpers import zero_pad_numeric_string, standardize_phone_column, standardize_na_values
import numpy as np
import re
from pudl.metadata.dfs import POLITICAL_SUBDIVISIONS

In [3]:
raw_df = defs.load_asset_value(AssetKey("raw_phmsagas__yearly_distribution"))

No dagster instance configuration file (dagster.yaml) found at /Users/sam/Documents/pudl-data/dagster_home. Defaulting to loading and storing all metadata with /Users/sam/Documents/pudl-data/dagster_home. If this is the desired behavior, create an empty dagster.yaml file in /Users/sam/Documents/pudl-data/dagster_home.
2024-11-01 08:12:44 -0400 - dagster - DEBUG - system - Loading file from: /Users/sam/Documents/pudl-data/dagster_home/storage/raw_phmsagas__yearly_distribution using PickledObjectFilesystemIOManager...


## core_phmsagas__yearly_distribution_operators

In [4]:
df = raw_df[[
    "report_date",
    "report_number",
    "report_submission_type",
    "report_year",
    "operator_id_phmsa",
    "operator_name_phmsa",
    "office_address_street",
    "office_address_city",
    "office_address_state",
    "office_address_zip",
    "office_address_county",
    "headquarters_address_street",
    "headquarters_address_city",
    "headquarters_address_state",
    "headquarters_address_zip",
    "headquarters_address_county",
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
    "federal_land_leaks_repaired_or_scheduled",
    "percent_unaccounted_for_gas",
    "additional_information",
    "preparer_email",
    "preparer_fax",
    "preparer_name",
    "preparer_phone",
    "preparer_title"
]]

In [5]:
# Convert columns to ints
# Excluded "federal_land_leaks_repaired_or_scheduled" from this list since there were a couple rows with decimal values
columns_to_convert = [
    "report_year",
    "report_number",
    "operator_id_phmsa",
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed"
]
df[columns_to_convert] = df[columns_to_convert].astype("Int64")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_convert] = df[columns_to_convert].astype("Int64")


In [6]:
# Ensure all "report_year" values have four digits
mask = df["report_year"] < 100

# Convert 2-digit years to appropriate 4-digit format (assume cutoff at year 50)
# We could also use the first 4 digits of the "report_number" but there was at least one anomaly here with an invalid year
df.loc[mask, "report_year"] = df.loc[mask, "report_year"].apply(
    lambda x: 2000 + x if x < 50 else 1900 + x
)

### Operator Table Tasks

#### Standardize NAs

In [7]:
# Fill NA values with zeroes because these columns are simply counts.
# Note that "excavation_damage..." columns should sum up to the value in "excavation_damage_total". However, many rows
# (on the scale of thousands) do not actually sum up to "excavation_damage_total".
columns_to_fill = [
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
    "federal_land_leaks_repaired_or_scheduled"
]
df[columns_to_fill] = df[columns_to_fill].fillna(0)

# Fill in bad strings
df = standardize_na_values(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_fill] = df[columns_to_fill].fillna(0)
  return df.replace(regex=r"(^\.$|^\s*$|^-+$)", value=np.nan)


#### Standardize case for city, county, operator name, etc.

In [8]:
# Capitalize the first letter of each word in all object-type columns except the excluded ones
exclude_columns = ['headquarters_address_state', 'office_address_state']
df[df.select_dtypes(include=['object']).columns.difference(exclude_columns)] = \
    df[df.select_dtypes(include=['object']).columns.difference(exclude_columns)].apply(lambda col: col.str.title())

In [9]:
# Standardize state abbreviations
# First create a dictionary of state names to abbreviations
state_to_abbr = {
    x.subdivision_name: x.subdivision_code
    for x in POLITICAL_SUBDIVISIONS.itertuples()
    if x.country_code == "USA" and x.subdivision_type == "state"
}
# Add abbreviations to the dictionary
state_to_abbr.update({
    x.subdivision_code: x.subdivision_code
    for x in POLITICAL_SUBDIVISIONS.itertuples()
    if x.country_code == "USA" and x.subdivision_type == "state"
})

def standardize_state(state):
    if pd.isna(state):
        return state
    state = state.strip()
    standardized_state = state_to_abbr.get(state, state)
    if standardized_state not in state_to_abbr.values():
        return np.nan
    return standardized_state

In [10]:
df["headquarters_address_state"] = df["headquarters_address_state"].apply(standardize_state)
df["office_address_state"] = df["office_address_state"].apply(standardize_state)

In [11]:
# Trim all the object-type columns
df[df.select_dtypes(include=['object']).columns] = df.select_dtypes(include=['object']).applymap(lambda x: x.strip() if isinstance(x, str) else x)

  df[df.select_dtypes(include=['object']).columns] = df.select_dtypes(include=['object']).applymap(lambda x: x.strip() if isinstance(x, str) else x)


#### Standardize telephone and fax number format and drop (000)-000-0000


In [12]:
df = standardize_phone_column(df, ["preparer_phone", "preparer_fax"])

### Other cleanup

In [13]:
# Standardize zip codes
df["office_address_zip"] = zero_pad_numeric_string(df["office_address_zip"], n_digits=5)
df["headquarters_address_zip"] = zero_pad_numeric_string(df["headquarters_address_zip"], n_digits=5)