## Load the data

In [1]:
from dagster import AssetKey
from pudl.etl import defs
import pandas as pd
from pudl.helpers import zero_pad_numeric_string, organize_cols, standardize_phone_column, fix_na, analyze_missing_values, standardize_state_columns
import numpy as np
import re

In [2]:
raw_df = defs.load_asset_value(AssetKey("raw_phmsagas__yearly_distribution"))

No dagster instance configuration file (dagster.yaml) found at /Users/sam/Documents/pudl-data/dagster_home. Defaulting to loading and storing all metadata with /Users/sam/Documents/pudl-data/dagster_home. If this is the desired behavior, create an empty dagster.yaml file in /Users/sam/Documents/pudl-data/dagster_home.
2024-10-24 08:29:46 -0400 - dagster - DEBUG - system - Loading file from: /Users/sam/Documents/pudl-data/dagster_home/storage/raw_phmsagas__yearly_distribution using PickledObjectFilesystemIOManager...


## core_phmsagas__yearly_distribution_operators

In [3]:
df = raw_df[[
    "report_date",
    "report_number",
    "report_submission_type",
    "report_year",
    "operator_id_phmsa",
    "operator_name_phmsa",
    "office_address_street",
    "office_address_city",
    "office_address_state",
    "office_address_zip",
    "office_address_county",
    "headquarters_address_street",
    "headquarters_address_city",
    "headquarters_address_state",
    "headquarters_address_zip",
    "headquarters_address_county",
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
    "federal_land_leaks_repaired_or_scheduled",
    "percent_unaccounted_for_gas",
    "additional_information",
    "preparer_email",
    "preparer_fax",
    "preparer_name",
    "preparer_phone",
    "preparer_title"
]]

In [4]:
# Convert columns to ints
# Excluded "federal_land_leaks_repaired_or_scheduled" from this list since there were a couple rows with decimal values
columns_to_convert = [
    "report_year",
    "report_number",
    "operator_id_phmsa",
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed"
]
df[columns_to_convert] = df[columns_to_convert].astype("Int64")

# Ensure all "report_year" values have four digits
mask = df["report_year"] < 100

# Convert 2-digit years to appropriate 4-digit format (assume cutoff at year 50)
# We could also use the first 4 digits of the "report_number" but there was at least one anomaly here with an invalid year
df.loc[mask, "report_year"] = df.loc[mask, "report_year"].apply(
    lambda x: 2000 + x if x < 50 else 1900 + x
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_convert] = df[columns_to_convert].astype("Int64")


### Operator Table Tasks

#### Standardize NAs

In [5]:
# Fill NA values with zeroes because these columns are simply counts.
# Note that "excavation_damage..." columns should sum up to the value in "excavation_damage_total". However, many rows
# (on the scale of thousands) do not actually sum up to "excavation_damage_total".
columns_to_fill = [
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
    "federal_land_leaks_repaired_or_scheduled"
]
df[columns_to_fill] = df[columns_to_fill].fillna(0)

# Fill in bad strings
df = fix_na(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[columns_to_fill] = df[columns_to_fill].fillna(0)
  return df.replace(regex=r"(^\.$|^\s*$|^-+$)", value=np.nan)


#### Standardize case for city, county, operator name, etc.

In [6]:
# Capitalize the first letter of each word in all object-type columns except the excluded ones
exclude_columns = ['headquarters_address_state', 'office_address_state']
df[df.select_dtypes(include=['object']).columns.difference(exclude_columns)] = \
    df[df.select_dtypes(include=['object']).columns.difference(exclude_columns)].apply(lambda col: col.str.title())

# List of state columns to standardize
state_columns_to_standardize = ['headquarters_address_state', 'office_address_state']
df = standardize_state_columns(df, state_columns_to_standardize)

# Trim all the object-type columns
df[df.select_dtypes(include=['object']).columns] = df.select_dtypes(include=['object']).applymap(lambda x: x.strip() if isinstance(x, str) else x)

  df[df.select_dtypes(include=['object']).columns] = df.select_dtypes(include=['object']).applymap(lambda x: x.strip() if isinstance(x, str) else x)


#### Standardize telephone and fax number format and drop (000)-000-0000


In [7]:
df = standardize_phone_column(df, ["preparer_phone", "preparer_fax"])

In [8]:
df.dtypes

report_date                                 datetime64[ns]
report_number                                        Int64
report_submission_type                              object
report_year                                          Int64
operator_id_phmsa                                    Int64
operator_name_phmsa                                 object
office_address_street                               object
office_address_city                                 object
office_address_state                                object
office_address_zip                                  object
office_address_county                               object
headquarters_address_street                         object
headquarters_address_city                           object
headquarters_address_state                          object
headquarters_address_zip                            object
headquarters_address_county                         object
excavation_damage_excavation_practices               Int

In [10]:
df[df.report_date.notna()].head()

Unnamed: 0,report_date,report_number,report_submission_type,report_year,operator_id_phmsa,operator_name_phmsa,office_address_street,office_address_city,office_address_state,office_address_zip,office_address_county,headquarters_address_street,headquarters_address_city,headquarters_address_state,headquarters_address_zip,headquarters_address_county,excavation_damage_excavation_practices,excavation_damage_locating_practices,excavation_damage_one_call_notification,excavation_damage_other,excavation_damage_total,excavation_tickets,services_efv_in_system,services_efv_installed,services_shutoff_valve_in_system,services_shutoff_valve_installed,federal_land_leaks_repaired_or_scheduled,percent_unaccounted_for_gas,additional_information,preparer_email,preparer_fax,preparer_name,preparer_phone,preparer_title
15329,2000-12-28,20000002,,2000,21170,Velma Public Works Auth,"Po Box 5, Main & Hargrove",Velma,OK,,Stephens,,,OK,,,0,0,0,0,0,0,0,0,0,0,0.0,4.0,,,,Paula Lynn,,
15330,2000-12-27,20000003,,2000,30584,Pentex Pipeline Company,579 Brighton Way,Phoenixville,PA,,,3131 Mckinney Ave #100,Dallas,TX,,,0,0,0,0,0,0,0,0,0,0,0.0,,"1.5 Mile 3"" Pipeline Service Customer At A Cus...",,,Mark D Casaday,,
15331,2000-12-27,20000004,,2000,22588,"Whittemore, Town Of",310 Fourth Street,Whittemore,IA,,Kossuth,,,,,,0,0,0,0,0,0,0,0,0,0,0.0,4.0,,,,Linda K. Farrell,,
15332,2000-12-26,20000005,,2000,18676,Sturgis Natural Gas System,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0.0,,,,,Mike Demoss,,
15333,2000-12-26,20000006,,2000,30865,"St. Robert, City Of",115 Plattner St.,St Robert,MO,,Pulaski,,,,,,0,0,0,0,0,0,0,0,0,0,0.0,1.0,,,,Matt F Wood,,
