## Load the data

In [None]:
from dagster import AssetKey
from pudl.etl import defs
import pandas as pd
from pudl.helpers import zero_pad_numeric_string, organize_cols, standardize_phone_column, fix_na, analyze_missing_values
import numpy as np
import re

# Load the `raw_phmsagas__yearly_distribution` asset through the Dagster
# definitions object imported from pudl.etl.
raw_df = defs.load_asset_value(AssetKey("raw_phmsagas__yearly_distribution"))

## core_phmsagas__yearly_distribution_operators

In [7]:
# Select the operator-level columns from the raw table.
# The explicit .copy() gives `df` its own data, so the column assignments made
# in later cells modify an independent frame instead of a selection of
# `raw_df` (which is what triggers pandas' SettingWithCopyWarning).
df = raw_df[[
    "report_date",
    "report_number",
    "report_submission_type",
    "report_year",
    "operator_id_phmsa",
    "operator_name_phmsa",
    "office_address_street",
    "office_address_city",
    "office_address_state",
    "office_address_zip",
    "office_address_county",
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
    "federal_land_leaks_repaired_or_scheduled",
    "percent_unaccounted_for_gas",
    "additional_information",
    "preparer_email",
    "preparer_fax",
    "preparer_name",
    "preparer_phone",
    "preparer_title",
    # Adding these fields temporarily for transformation cleanup
    "headquarters_address_city",
    "headquarters_address_county",
    "headquarters_address_state",
    "headquarters_address_street",
    "headquarters_address_zip",
]].copy()

In [None]:
# Clean up office zip codes by running them through zero_pad_numeric_string
# with a target length of 5.
# .assign() returns a new frame rather than writing a column into `df` in
# place, so no chained-assignment warning is raised. The former mid-cell
# `df.head()` was dropped: only a cell's last expression is displayed, so it
# had no effect.
df = df.assign(
    office_address_zip=zero_pad_numeric_string(df["office_address_zip"], 5)
)

# Convert columns to nullable integers.
# "federal_land_leaks_repaired_or_scheduled" is deliberately left out of this
# list because a couple of rows hold decimal values.
columns_to_convert = [
    "report_year",
    "report_number",
    "operator_id_phmsa",
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
]
# astype() with a column -> dtype mapping returns a new frame, which avoids
# assigning a block of columns into `df` in place. "Int64" (capital I) is the
# pandas nullable integer dtype, so missing values survive the cast as <NA>.
df = df.astype(dict.fromkeys(columns_to_convert, "Int64"))

# Ensure all "report_year" values have four digits.
# The earlier version cast the two-digit years to str, sliced [:4] and cast
# back to int, which returns every value unchanged. Expand them to full years
# instead.
# NOTE(review): the century pivot is an assumption (00-49 -> 2000s,
# 50-99 -> 1900s) — confirm against the reporting years present in the data.
TWO_DIGIT_YEAR_PIVOT = 50

# A missing year compares as <NA>; count it as "not two-digit" so the mask is
# strictly boolean before it is used for indexing.
mask = (df["report_year"] < 100).fillna(False)
short_years = df.loc[mask, "report_year"]
# 2000 for years below the pivot, 1900 otherwise.
century = 1900 + 100 * (short_years < TWO_DIGIT_YEAR_PIVOT).astype("Int64")
df.loc[mask, "report_year"] = short_years + century

### Operator Table Tasks

#### Standardize NAs

In [None]:
# These columns are plain counts, so a missing value is replaced with zero.
# Caveat: the individual "excavation_damage..." columns should add up to
# "excavation_damage_total", yet thousands of rows do not.
columns_to_fill = [
    "excavation_damage_excavation_practices",
    "excavation_damage_locating_practices",
    "excavation_damage_one_call_notification",
    "excavation_damage_other",
    "excavation_damage_total",
    "excavation_tickets",
    "services_efv_in_system",
    "services_efv_installed",
    "services_shutoff_valve_in_system",
    "services_shutoff_valve_installed",
    "federal_land_leaks_repaired_or_scheduled",
]
# A column -> fill-value mapping restricts the fill to the listed columns.
df = df.fillna(dict.fromkeys(columns_to_fill, 0))

#### Standardize usage of office vs HQ address over time

- **TODO:** Use `clean_eia_counties` from `pudl.helpers` at some point.
- **TODO:** Possibly also use `zero_pad_numeric_string`.


In [18]:
# Columns needed to compare each operator's office and headquarters addresses
# across report years.
address_cols = [
    'operator_id_phmsa', 'report_year',
    'office_address_street', 'office_address_city', 'office_address_state', 'office_address_zip',
    'headquarters_address_street', 'headquarters_address_city', 'headquarters_address_state', 'headquarters_address_zip'
]

# Take an explicit copy: later cells add columns to df_addresses, and writing
# into a bare selection of `df` triggers pandas' SettingWithCopyWarning.
df_addresses = df[address_cols].copy()

In [None]:
# Inspect the dtype of every column in the address subset.
df_addresses.dtypes

In [None]:
# Preview the first few office zip codes.
df["office_address_zip"].head()

In [None]:
# Build a single "street, city, state zip" string for the office address.
# NOTE(review): the next cell recomputes this same column, so this cell looks
# redundant and is a candidate for removal.
df_addresses['office_address'] = (
    df_addresses['office_address_street']
    + ', ' + df_addresses['office_address_city']
    + ', ' + df_addresses['office_address_state']
    + ' ' + df_addresses['office_address_zip']
)

In [None]:
def format_address(frame: pd.DataFrame, prefix: str) -> pd.Series:
    """Join the street, city, state and zip columns for ``prefix`` into one string.

    Args:
        frame: Table holding the ``{prefix}_street``, ``{prefix}_city``,
            ``{prefix}_state`` and ``{prefix}_zip`` columns.
        prefix: Column-name prefix, e.g. ``"office_address"``.

    Returns:
        A Series formatted as ``"street, city, state zip"``, indexed like
        ``frame``. Assumes all four columns hold strings — TODO confirm.
    """
    return (
        frame[f"{prefix}_street"]
        + ", " + frame[f"{prefix}_city"]
        + ", " + frame[f"{prefix}_state"]
        + " " + frame[f"{prefix}_zip"]
    )


# Create a concatenated address string for easier comparison, then order each
# operator's reports by year so "previous report" is well defined below.
# NOTE(review): only office_address_zip is zero-padded earlier in this
# notebook; headquarters_address_zip is used as loaded — confirm it is a
# string column in the same format.
df_addresses = df_addresses.assign(
    office_address=format_address(df_addresses, "office_address"),
    headquarters_address=format_address(df_addresses, "headquarters_address"),
).sort_values(by=["operator_id_phmsa", "report_year"])

# Overlap: office and headquarters addresses are identical within one report.
df_addresses["address_overlap"] = (
    df_addresses["office_address"] == df_addresses["headquarters_address"]
)

# Switch: the two addresses traded places relative to the same operator's
# previous report. groupby(...).shift() keeps the frame's own index, so the
# flags stay on the rows they were computed for. The earlier
# groupby.apply(...).reset_index(drop=True) produced a fresh 0..n-1 index that
# was then label-aligned against the original row labels, putting flags on
# unrelated rows.
by_operator = df_addresses.groupby("operator_id_phmsa")
previous_office = by_operator["office_address"].shift()
previous_headquarters = by_operator["headquarters_address"].shift()
df_addresses["address_switch"] = (
    (previous_office == df_addresses["headquarters_address"])
    & (previous_headquarters == df_addresses["office_address"])
)

# Display the results where overlap or switch occurred
df_overlap_switch = df_addresses[
    df_addresses["address_overlap"] | df_addresses["address_switch"]
]
df_overlap_switch.head()


#### Standardize case for city, county, operator name, etc.

#### Standardize telephone and fax number format and drop (000)-000-0000


In [None]:
# Run the "preparer_phone" column through standardize_phone_column, then show
# only the rows that still have a phone value afterwards.
df = standardize_phone_column(df, "preparer_phone")
df[df["preparer_phone"].notna()]